diff --git a/config/config.md b/config/config.md
index 0e395dc4458d..1b9cdb522b2a 100644
--- a/config/config.md
+++ b/config/config.md
@@ -193,11 +193,6 @@
| `region_engine.mito.bloom_filter_index.create_on_compaction` | String | `auto` | Whether to create the bloom filter on compaction.
- `auto`: automatically (default)
- `disable`: never |
| `region_engine.mito.bloom_filter_index.apply_on_query` | String | `auto` | Whether to apply the bloom filter on query
- `auto`: automatically (default)
- `disable`: never |
| `region_engine.mito.bloom_filter_index.mem_threshold_on_create` | String | `auto` | Memory threshold for bloom filter creation.
- `auto`: automatically determine the threshold based on the system memory size (default)
- `unlimited`: no memory limit
- `[size]` e.g. `64MB`: fixed memory threshold |
-| `region_engine.mito.memtable` | -- | -- | -- |
-| `region_engine.mito.memtable.type` | String | `time_series` | Memtable type.
- `time_series`: time-series memtable
- `partition_tree`: partition tree memtable (experimental) |
-| `region_engine.mito.memtable.index_max_keys_per_shard` | Integer | `8192` | The max number of keys in one shard.
Only available for `partition_tree` memtable. |
-| `region_engine.mito.memtable.data_freeze_threshold` | Integer | `32768` | The max rows of data inside the actively writing buffer in one shard.
Only available for `partition_tree` memtable. |
-| `region_engine.mito.memtable.fork_dictionary_bytes` | String | `1GiB` | Max dictionary bytes.
Only available for `partition_tree` memtable. |
| `region_engine.file` | -- | -- | Enable the file engine. |
| `region_engine.metric` | -- | -- | Metric engine options. |
| `region_engine.metric.sparse_primary_key_encoding` | Bool | `true` | Whether to use sparse primary key encoding. |
@@ -591,11 +586,6 @@
| `region_engine.mito.bloom_filter_index.create_on_compaction` | String | `auto` | Whether to create the index on compaction.
- `auto`: automatically (default)
- `disable`: never |
| `region_engine.mito.bloom_filter_index.apply_on_query` | String | `auto` | Whether to apply the index on query
- `auto`: automatically (default)
- `disable`: never |
| `region_engine.mito.bloom_filter_index.mem_threshold_on_create` | String | `auto` | Memory threshold for the index creation.
- `auto`: automatically determine the threshold based on the system memory size (default)
- `unlimited`: no memory limit
- `[size]` e.g. `64MB`: fixed memory threshold |
-| `region_engine.mito.memtable` | -- | -- | -- |
-| `region_engine.mito.memtable.type` | String | `time_series` | Memtable type.
- `time_series`: time-series memtable
- `partition_tree`: partition tree memtable (experimental) |
-| `region_engine.mito.memtable.index_max_keys_per_shard` | Integer | `8192` | The max number of keys in one shard.
Only available for `partition_tree` memtable. |
-| `region_engine.mito.memtable.data_freeze_threshold` | Integer | `32768` | The max rows of data inside the actively writing buffer in one shard.
Only available for `partition_tree` memtable. |
-| `region_engine.mito.memtable.fork_dictionary_bytes` | String | `1GiB` | Max dictionary bytes.
Only available for `partition_tree` memtable. |
| `region_engine.mito.gc` | -- | -- | -- |
| `region_engine.mito.gc.enable` | Bool | `false` | Whether GC is enabled. Need to be the same with metasrv's `gc.enable` or unexpected behavior will occur |
| `region_engine.mito.gc.lingering_time` | String | `1m` | Lingering time before deleting files.
Should be long enough to allow long running queries to finish.
If set to None, then unused files will be deleted immediately. |
diff --git a/config/datanode.example.toml b/config/datanode.example.toml
index 6effec4c8798..d74b02084996 100644
--- a/config/datanode.example.toml
+++ b/config/datanode.example.toml
@@ -654,24 +654,6 @@ apply_on_query = "auto"
## - `[size]` e.g. `64MB`: fixed memory threshold
mem_threshold_on_create = "auto"
-[region_engine.mito.memtable]
-## Memtable type.
-## - `time_series`: time-series memtable
-## - `partition_tree`: partition tree memtable (experimental)
-type = "time_series"
-
-## The max number of keys in one shard.
-## Only available for `partition_tree` memtable.
-index_max_keys_per_shard = 8192
-
-## The max rows of data inside the actively writing buffer in one shard.
-## Only available for `partition_tree` memtable.
-data_freeze_threshold = 32768
-
-## Max dictionary bytes.
-## Only available for `partition_tree` memtable.
-fork_dictionary_bytes = "1GiB"
-
[region_engine.mito.gc]
## Whether GC is enabled. Need to be the same with metasrv's `gc.enable` or unexpected behavior will occur
enable = false
diff --git a/config/standalone.example.toml b/config/standalone.example.toml
index aa745bb6ba49..5e790749feca 100644
--- a/config/standalone.example.toml
+++ b/config/standalone.example.toml
@@ -762,24 +762,6 @@ apply_on_query = "auto"
## - `[size]` e.g. `64MB`: fixed memory threshold
mem_threshold_on_create = "auto"
-[region_engine.mito.memtable]
-## Memtable type.
-## - `time_series`: time-series memtable
-## - `partition_tree`: partition tree memtable (experimental)
-type = "time_series"
-
-## The max number of keys in one shard.
-## Only available for `partition_tree` memtable.
-index_max_keys_per_shard = 8192
-
-## The max rows of data inside the actively writing buffer in one shard.
-## Only available for `partition_tree` memtable.
-data_freeze_threshold = 32768
-
-## Max dictionary bytes.
-## Only available for `partition_tree` memtable.
-fork_dictionary_bytes = "1GiB"
-
[[region_engine]]
## Enable the file engine.
[region_engine.file]
diff --git a/src/metric-engine/src/engine/bulk_insert.rs b/src/metric-engine/src/engine/bulk_insert.rs
index 24c9e7934c8d..ad01ec6f5480 100644
--- a/src/metric-engine/src/engine/bulk_insert.rs
+++ b/src/metric-engine/src/engine/bulk_insert.rs
@@ -14,20 +14,14 @@
use std::collections::HashSet;
-use api::v1::{ArrowIpc, ColumnDataType, SemanticType};
+use api::v1::{ArrowIpc, SemanticType};
use bytes::Bytes;
-use common_error::ext::ErrorExt;
-use common_error::status_code::StatusCode;
use common_grpc::flight::{FlightEncoder, FlightMessage};
-use common_query::prelude::{greptime_timestamp, greptime_value};
-use datatypes::arrow::array::{Array, Float64Array, StringArray, TimestampMillisecondArray};
use datatypes::arrow::record_batch::RecordBatch;
use snafu::{OptionExt, ensure};
use store_api::codec::PrimaryKeyEncoding;
use store_api::metadata::RegionMetadataRef;
-use store_api::region_request::{
- AffectedRows, RegionBulkInsertsRequest, RegionPutRequest, RegionRequest,
-};
+use store_api::region_request::{AffectedRows, RegionBulkInsertsRequest, RegionRequest};
use store_api::storage::RegionId;
use crate::batch_modifier::{TagColumnInfo, modify_batch_sparse};
@@ -42,8 +36,7 @@ impl MetricEngineInner {
/// **Logical region path:** The request payload is a logical `RecordBatch`
/// (timestamp, value and tag columns). It is transformed to physical format
/// via `modify_batch_sparse`, encoded to Arrow IPC, and forwarded as a
- /// `BulkInserts` request to the data region. If mito reports
- /// `StatusCode::Unsupported`, the request is transparently retried as a `Put`.
+ /// `BulkInserts` request to the data region.
///
/// **Physical region path:** The request payload is already in physical format
/// (produced by the batcher's `flush_batch_physical`). It is forwarded directly
@@ -134,27 +127,9 @@ impl MetricEngineInner {
},
partition_expr_version,
};
- match self
- .data_region
+ self.data_region
.write_data(data_region_id, RegionRequest::BulkInserts(request))
.await
- {
- Ok(affected_rows) => Ok(affected_rows),
- Err(err) if err.status_code() == StatusCode::Unsupported => {
- // todo(hl): fallback path for PartitionTreeMemtable, remove this once we remove it
- let rows = record_batch_to_rows(&batch, region_id)?;
- self.put_region(
- region_id,
- RegionPutRequest {
- rows,
- hint: None,
- partition_expr_version,
- },
- )
- .await
- }
- Err(err) => Err(err),
- }
}
fn resolve_tag_columns_from_metadata(
@@ -214,174 +189,6 @@ impl MetricEngineInner {
}
}
-fn record_batch_to_rows(batch: &RecordBatch, logical_region_id: RegionId) -> Result<api::v1::Rows> {
- let schema_ref = batch.schema();
- let fields = schema_ref.fields();
-
- let mut ts_idx = None;
- let mut val_idx = None;
- let mut tag_indices = Vec::new();
-
- for (idx, field) in fields.iter().enumerate() {
- if field.name() == greptime_timestamp() {
- ts_idx = Some(idx);
- if !matches!(
- field.data_type(),
- datatypes::arrow::datatypes::DataType::Timestamp(
- datatypes::arrow::datatypes::TimeUnit::Millisecond,
- _
- )
- ) {
- return error::UnexpectedRequestSnafu {
- reason: format!(
- "Timestamp column '{}' in region {:?} has incompatible type: {:?}",
- field.name(),
- logical_region_id,
- field.data_type()
- ),
- }
- .fail();
- }
- } else if field.name() == greptime_value() {
- val_idx = Some(idx);
- if !matches!(
- field.data_type(),
- datatypes::arrow::datatypes::DataType::Float64
- ) {
- return error::UnexpectedRequestSnafu {
- reason: format!(
- "Value column '{}' in region {:?} has incompatible type: {:?}",
- field.name(),
- logical_region_id,
- field.data_type()
- ),
- }
- .fail();
- }
- } else {
- if !matches!(
- field.data_type(),
- datatypes::arrow::datatypes::DataType::Utf8
- ) {
- return error::UnexpectedRequestSnafu {
- reason: format!(
- "Tag column '{}' in region {:?} must be Utf8, found: {:?}",
- field.name(),
- logical_region_id,
- field.data_type()
- ),
- }
- .fail();
- }
- tag_indices.push(idx);
- }
- }
-
- let ts_idx = ts_idx.with_context(|| error::UnexpectedRequestSnafu {
- reason: format!(
- "Timestamp column '{}' not found in RecordBatch for region {:?}",
- greptime_timestamp(),
- logical_region_id
- ),
- })?;
- let val_idx = val_idx.with_context(|| error::UnexpectedRequestSnafu {
- reason: format!(
- "Value column '{}' not found in RecordBatch for region {:?}",
- greptime_value(),
- logical_region_id
- ),
- })?;
-
- let mut schema = Vec::with_capacity(2 + tag_indices.len());
- schema.push(api::v1::ColumnSchema {
- column_name: greptime_timestamp().to_string(),
- datatype: ColumnDataType::TimestampMillisecond as i32,
- semantic_type: SemanticType::Timestamp as i32,
- datatype_extension: None,
- options: None,
- });
- schema.push(api::v1::ColumnSchema {
- column_name: greptime_value().to_string(),
- datatype: ColumnDataType::Float64 as i32,
- semantic_type: SemanticType::Field as i32,
- datatype_extension: None,
- options: None,
- });
- for &idx in &tag_indices {
- let field = &fields[idx];
- schema.push(api::v1::ColumnSchema {
- column_name: field.name().clone(),
- datatype: ColumnDataType::String as i32,
- semantic_type: SemanticType::Tag as i32,
- datatype_extension: None,
- options: None,
- });
- }
-
- let ts_array = batch
- .column(ts_idx)
- .as_any()
- .downcast_ref::<TimestampMillisecondArray>()
- .expect("validated as TimestampMillisecond");
- let val_array = batch
- .column(val_idx)
- .as_any()
- .downcast_ref::<Float64Array>()
- .expect("validated as Float64");
- let tag_arrays: Vec<&StringArray> = tag_indices
- .iter()
- .map(|&idx| {
- batch
- .column(idx)
- .as_any()
- .downcast_ref::<StringArray>()
- .expect("validated as Utf8")
- })
- .collect();
-
- let num_rows = batch.num_rows();
- let mut rows = Vec::with_capacity(num_rows);
- for row_idx in 0..num_rows {
- let mut values = Vec::with_capacity(2 + tag_arrays.len());
-
- if ts_array.is_null(row_idx) {
- values.push(api::v1::Value { value_data: None });
- } else {
- values.push(api::v1::Value {
- value_data: Some(api::v1::value::ValueData::TimestampMillisecondValue(
- ts_array.value(row_idx),
- )),
- });
- }
-
- if val_array.is_null(row_idx) {
- values.push(api::v1::Value { value_data: None });
- } else {
- values.push(api::v1::Value {
- value_data: Some(api::v1::value::ValueData::F64Value(
- val_array.value(row_idx),
- )),
- });
- }
-
- for arr in &tag_arrays {
- if arr.is_null(row_idx) {
- values.push(api::v1::Value { value_data: None });
- } else {
- values.push(api::v1::Value {
- value_data: Some(api::v1::value::ValueData::StringValue(
- arr.value(row_idx).to_string(),
- )),
- });
- }
- }
-
- rows.push(api::v1::Row { values });
- }
-
- Ok(api::v1::Rows { schema, rows })
-}
-
fn record_batch_to_ipc(record_batch: &RecordBatch) -> Result<(Bytes, Bytes, Bytes)> {
let mut encoder = FlightEncoder::default();
let schema = encoder.encode_schema(record_batch.schema().as_ref());
@@ -810,65 +617,4 @@ mod tests {
assert_eq!(put_output, bulk_output);
}
-
- #[test]
- fn test_record_batch_to_rows_with_null_values() {
- use datatypes::arrow::array::{Float64Array, StringArray, TimestampMillisecondArray};
- use datatypes::arrow::datatypes::{DataType, Field, Schema as ArrowSchema, TimeUnit};
- use datatypes::arrow::record_batch::RecordBatch;
- use store_api::storage::RegionId;
-
- use crate::engine::bulk_insert::record_batch_to_rows;
-
- let schema = Arc::new(ArrowSchema::new(vec![
- Field::new(
- greptime_timestamp(),
- DataType::Timestamp(TimeUnit::Millisecond, None),
- true,
- ),
- Field::new(greptime_value(), DataType::Float64, true),
- Field::new("job", DataType::Utf8, true),
- Field::new("host", DataType::Utf8, true),
- ]));
-
- let ts_array = TimestampMillisecondArray::from(vec![Some(1000), None, Some(3000)]);
- let val_array = Float64Array::from(vec![Some(1.0), Some(2.0), None]);
- let job_array = StringArray::from(vec![Some("job1"), None, Some("job3")]);
- let host_array = StringArray::from(vec![None, Some("host2"), Some("host3")]);
-
- let batch = RecordBatch::try_new(
- schema,
- vec![
- Arc::new(ts_array),
- Arc::new(val_array),
- Arc::new(job_array),
- Arc::new(host_array),
- ],
- )
- .unwrap();
-
- let region_id = RegionId::new(1, 1);
- let rows = record_batch_to_rows(&batch, region_id).unwrap();
-
- assert_eq!(rows.rows.len(), 3);
- assert_eq!(rows.schema.len(), 4);
-
- // Row 0: all non-null except host
- assert!(rows.rows[0].values[0].value_data.is_some());
- assert!(rows.rows[0].values[1].value_data.is_some());
- assert!(rows.rows[0].values[2].value_data.is_some());
- assert!(rows.rows[0].values[3].value_data.is_none());
-
- // Row 1: null timestamp, null job
- assert!(rows.rows[1].values[0].value_data.is_none());
- assert!(rows.rows[1].values[1].value_data.is_some());
- assert!(rows.rows[1].values[2].value_data.is_none());
- assert!(rows.rows[1].values[3].value_data.is_some());
-
- // Row 2: null value
- assert!(rows.rows[2].values[0].value_data.is_some());
- assert!(rows.rows[2].values[1].value_data.is_none());
- assert!(rows.rows[2].values[2].value_data.is_some());
- assert!(rows.rows[2].values[3].value_data.is_some());
- }
}
diff --git a/src/metric-engine/src/engine/options.rs b/src/metric-engine/src/engine/options.rs
index 232d3e93c50a..1c623611b7d4 100644
--- a/src/metric-engine/src/engine/options.rs
+++ b/src/metric-engine/src/engine/options.rs
@@ -67,7 +67,7 @@ pub fn set_data_region_options(
SEG_ROW_COUNT_FOR_DATA_REGION.to_string(),
);
// Set memtable options for the data region.
- options.insert("memtable.type".to_string(), "partition_tree".to_string());
+ options.insert("memtable.type".to_string(), "bulk".to_string());
if sparse_primary_key_encoding_if_absent
&& !options.contains_key(MEMTABLE_PARTITION_TREE_PRIMARY_KEY_ENCODING)
{
@@ -213,6 +213,7 @@ mod tests {
let mut options = HashMap::new();
set_data_region_options(&mut options, false);
+ assert_eq!(options.get("memtable.type"), Some(&"bulk".to_string()));
assert_eq!(
options.get(COMPACTION_TYPE),
Some(&COMPACTION_TYPE_TWCS.to_string())
@@ -220,6 +221,18 @@ mod tests {
assert_eq!(options.get(TWCS_TIME_WINDOW), Some(&"1d".to_string()));
}
+ #[test]
+ fn test_set_data_region_options_sparse_primary_key_encoding() {
+ let mut options = HashMap::new();
+ set_data_region_options(&mut options, true);
+
+ assert_eq!(options.get("memtable.type"), Some(&"bulk".to_string()));
+ assert_eq!(
+ options.get(MEMTABLE_PARTITION_TREE_PRIMARY_KEY_ENCODING),
+ Some(&"sparse".to_string())
+ );
+ }
+
#[test]
fn test_set_data_region_options_respects_user_compaction_time_window() {
// Test that user-specified time window is preserved
diff --git a/src/mito-codec/benches/bench_primary_key_filter.rs b/src/mito-codec/benches/bench_primary_key_filter.rs
index 528c374761a1..ee06ac28e16b 100644
--- a/src/mito-codec/benches/bench_primary_key_filter.rs
+++ b/src/mito-codec/benches/bench_primary_key_filter.rs
@@ -247,12 +247,12 @@ fn bench_primary_key_filter(c: &mut Criterion) {
let dense_pk = encode_dense_pk(&metadata, &row);
let dense_codec = DensePrimaryKeyCodec::new(&metadata);
- let mut dense_fast = dense_codec.primary_key_filter(&metadata, filters.clone(), false);
+ let mut dense_fast = dense_codec.primary_key_filter(&metadata, filters.clone());
let mut dense_offsets = Vec::new();
let sparse_pk = encode_sparse_pk(&metadata, &row);
let sparse_codec = SparsePrimaryKeyCodec::new(&metadata);
- let mut sparse_fast = sparse_codec.primary_key_filter(&metadata, filters.clone(), false);
+ let mut sparse_fast = sparse_codec.primary_key_filter(&metadata, filters.clone());
let mut sparse_offsets = SparseOffsetsCache::new();
let mut group = c.benchmark_group(format!("primary_key_filter/{case_name}"));
diff --git a/src/mito-codec/src/primary_key_filter.rs b/src/mito-codec/src/primary_key_filter.rs
index 2450a6e44af6..f6f9d78b6be3 100644
--- a/src/mito-codec/src/primary_key_filter.rs
+++ b/src/mito-codec/src/primary_key_filter.rs
@@ -21,7 +21,6 @@ use datatypes::value::Value;
use memcomparable::Serializer;
use snafu::ResultExt;
use store_api::metadata::RegionMetadataRef;
-use store_api::metric_engine_consts::DATA_SCHEMA_TABLE_ID_COLUMN_NAME;
use store_api::storage::ColumnId;
use crate::error::{EvaluateFilterSnafu, Result};
@@ -29,11 +28,6 @@ use crate::row_converter::{
DensePrimaryKeyCodec, PrimaryKeyFilter, SortField, SparseOffsetsCache, SparsePrimaryKeyCodec,
};
-/// Returns true if this is a partition column for metrics in the memtable.
-pub fn is_partition_column(name: &str) -> bool {
- name == DATA_SCHEMA_TABLE_ID_COLUMN_NAME
-}
-
#[derive(Clone)]
struct PrimaryKeyFilterInner {
filters: Arc<Vec<SimpleFilterEvaluator>>,
@@ -41,12 +35,8 @@ struct PrimaryKeyFilterInner {
}
impl PrimaryKeyFilterInner {
- fn new(
- metadata: RegionMetadataRef,
- filters: Arc<Vec<SimpleFilterEvaluator>>,
- skip_partition_column: bool,
- ) -> Self {
- let compiled_filters = Self::compile_filters(&metadata, &filters, skip_partition_column);
+ fn new(metadata: RegionMetadataRef, filters: Arc<Vec<SimpleFilterEvaluator>>) -> Self {
+ let compiled_filters = Self::compile_filters(&metadata, &filters);
Self {
filters,
compiled_filters,
@@ -56,7 +46,6 @@ impl PrimaryKeyFilterInner {
fn compile_filters(
metadata: &RegionMetadataRef,
filters: &[SimpleFilterEvaluator],
- skip_partition_column: bool,
) -> Vec {
if filters.is_empty() || metadata.primary_key.is_empty() {
return Vec::new();
@@ -64,10 +53,6 @@ impl PrimaryKeyFilterInner {
let mut compiled_filters = Vec::with_capacity(filters.len());
for (filter_idx, filter) in filters.iter().enumerate() {
- if skip_partition_column && is_partition_column(filter.column_name()) {
- continue;
- }
-
let Some(column) = metadata.column_by_name(filter.column_name()) else {
continue;
};
@@ -256,10 +241,9 @@ impl DensePrimaryKeyFilter {
metadata: RegionMetadataRef,
filters: Arc<Vec<SimpleFilterEvaluator>>,
codec: DensePrimaryKeyCodec,
- skip_partition_column: bool,
) -> Self {
Self {
- inner: PrimaryKeyFilterInner::new(metadata, filters, skip_partition_column),
+ inner: PrimaryKeyFilterInner::new(metadata, filters),
codec,
offsets_buf: Vec::new(),
}
@@ -310,10 +294,9 @@ impl SparsePrimaryKeyFilter {
metadata: RegionMetadataRef,
filters: Arc<Vec<SimpleFilterEvaluator>>,
codec: SparsePrimaryKeyCodec,
- skip_partition_column: bool,
) -> Self {
Self {
- inner: PrimaryKeyFilterInner::new(metadata, filters, skip_partition_column),
+ inner: PrimaryKeyFilterInner::new(metadata, filters),
codec,
offsets_cache: SparseOffsetsCache::new(),
}
@@ -513,7 +496,7 @@ mod tests {
)]);
let pk = encode_sparse_pk(&metadata, 1, 0, create_test_row());
let codec = SparsePrimaryKeyCodec::new(&metadata);
- let mut filter = SparsePrimaryKeyFilter::new(metadata, filters, codec, false);
+ let mut filter = SparsePrimaryKeyFilter::new(metadata, filters, codec);
assert!(filter.matches(&pk).unwrap());
}
@@ -526,7 +509,7 @@ mod tests {
)]);
let pk = encode_sparse_pk(&metadata, 1, 0, create_test_row());
let codec = SparsePrimaryKeyCodec::new(&metadata);
- let mut filter = SparsePrimaryKeyFilter::new(metadata, filters, codec, false);
+ let mut filter = SparsePrimaryKeyFilter::new(metadata, filters, codec);
assert!(!filter.matches(&pk).unwrap());
}
@@ -539,7 +522,7 @@ mod tests {
)]);
let pk = encode_sparse_pk(&metadata, 1, 0, create_test_row());
let codec = SparsePrimaryKeyCodec::new(&metadata);
- let mut filter = SparsePrimaryKeyFilter::new(metadata, filters, codec, false);
+ let mut filter = SparsePrimaryKeyFilter::new(metadata, filters, codec);
assert!(filter.matches(&pk).unwrap());
}
@@ -552,7 +535,7 @@ mod tests {
)]);
let pk = encode_dense_pk(&metadata, create_test_row());
let codec = DensePrimaryKeyCodec::new(&metadata);
- let mut filter = DensePrimaryKeyFilter::new(metadata, filters, codec, false);
+ let mut filter = DensePrimaryKeyFilter::new(metadata, filters, codec);
assert!(filter.matches(&pk).unwrap());
}
@@ -565,7 +548,7 @@ mod tests {
)]);
let pk = encode_dense_pk(&metadata, create_test_row());
let codec = DensePrimaryKeyCodec::new(&metadata);
- let mut filter = DensePrimaryKeyFilter::new(metadata, filters, codec, false);
+ let mut filter = DensePrimaryKeyFilter::new(metadata, filters, codec);
assert!(!filter.matches(&pk).unwrap());
}
@@ -578,7 +561,7 @@ mod tests {
)]);
let pk = encode_dense_pk(&metadata, create_test_row());
let codec = DensePrimaryKeyCodec::new(&metadata);
- let mut filter = DensePrimaryKeyFilter::new(metadata, filters, codec, false);
+ let mut filter = DensePrimaryKeyFilter::new(metadata, filters, codec);
assert!(filter.matches(&pk).unwrap());
}
@@ -597,8 +580,7 @@ mod tests {
for (op, value, expected) in cases {
let filters = Arc::new(vec![create_filter_with_op("pod", op, value)]);
- let mut filter =
- DensePrimaryKeyFilter::new(metadata.clone(), filters, codec.clone(), false);
+ let mut filter = DensePrimaryKeyFilter::new(metadata.clone(), filters, codec.clone());
assert_eq!(expected, filter.matches(&pk).unwrap());
}
}
@@ -618,8 +600,7 @@ mod tests {
for (op, value, expected) in cases {
let filters = Arc::new(vec![create_filter_with_op("pod", op, value)]);
- let mut filter =
- SparsePrimaryKeyFilter::new(metadata.clone(), filters, codec.clone(), false);
+ let mut filter = SparsePrimaryKeyFilter::new(metadata.clone(), filters, codec.clone());
assert_eq!(expected, filter.matches(&pk).unwrap());
}
}
@@ -652,7 +633,7 @@ mod tests {
.unwrap();
let filters = Arc::new(vec![create_filter_with_op("f", Operator::Eq, 0.0_f64)]);
- let mut filter = DensePrimaryKeyFilter::new(metadata, filters, codec, false);
+ let mut filter = DensePrimaryKeyFilter::new(metadata, filters, codec);
assert!(filter.matches(&pk).unwrap());
}
@@ -674,29 +655,7 @@ mod tests {
Operator::Eq,
42_u32,
)]);
- let mut filter = DensePrimaryKeyFilter::new(metadata, filters, codec, false);
-
- assert!(filter.matches(&pk).unwrap());
- }
-
- #[test]
- fn test_dense_primary_key_filter_can_skip_partition_column() {
- let metadata = setup_partitioned_metadata();
- let codec = DensePrimaryKeyCodec::new(&metadata);
- let mut pk = Vec::new();
- codec
- .encode_to_vec(
- [ValueRef::UInt32(42), ValueRef::String("host-a")].into_iter(),
- &mut pk,
- )
- .unwrap();
-
- let filters = Arc::new(vec![create_filter_with_op(
- DATA_SCHEMA_TABLE_ID_COLUMN_NAME,
- Operator::Eq,
- 7_u32,
- )]);
- let mut filter = DensePrimaryKeyFilter::new(metadata, filters, codec, true);
+ let mut filter = DensePrimaryKeyFilter::new(metadata, filters, codec);
assert!(filter.matches(&pk).unwrap());
}
diff --git a/src/mito-codec/src/row_converter.rs b/src/mito-codec/src/row_converter.rs
index 0a3205ce9ec1..fae2997182cc 100644
--- a/src/mito-codec/src/row_converter.rs
+++ b/src/mito-codec/src/row_converter.rs
@@ -124,7 +124,6 @@ pub trait PrimaryKeyCodec: Send + Sync + Debug {
&self,
metadata: &RegionMetadataRef,
filters: Arc<Vec<SimpleFilterEvaluator>>,
- skip_partition_column: bool,
) -> Box<dyn PrimaryKeyFilter>;
/// Returns the estimated size of the primary key.
diff --git a/src/mito-codec/src/row_converter/dense.rs b/src/mito-codec/src/row_converter/dense.rs
index 4bc774c94196..6cc70feaeaf2 100644
--- a/src/mito-codec/src/row_converter/dense.rs
+++ b/src/mito-codec/src/row_converter/dense.rs
@@ -556,13 +556,11 @@ impl PrimaryKeyCodec for DensePrimaryKeyCodec {
&self,
metadata: &RegionMetadataRef,
filters: Arc<Vec<SimpleFilterEvaluator>>,
- skip_partition_column: bool,
) -> Box<dyn PrimaryKeyFilter> {
Box::new(DensePrimaryKeyFilter::new(
metadata.clone(),
filters,
self.clone(),
- skip_partition_column,
))
}
diff --git a/src/mito-codec/src/row_converter/sparse.rs b/src/mito-codec/src/row_converter/sparse.rs
index 9c0e488576b7..37e18134702a 100644
--- a/src/mito-codec/src/row_converter/sparse.rs
+++ b/src/mito-codec/src/row_converter/sparse.rs
@@ -507,13 +507,11 @@ impl PrimaryKeyCodec for SparsePrimaryKeyCodec {
&self,
metadata: &RegionMetadataRef,
filters: Arc<Vec<SimpleFilterEvaluator>>,
- skip_partition_column: bool,
) -> Box<dyn PrimaryKeyFilter> {
Box::new(SparsePrimaryKeyFilter::new(
metadata.clone(),
filters,
self.clone(),
- skip_partition_column,
))
}
diff --git a/src/mito2/benches/memtable_bench.rs b/src/mito2/benches/memtable_bench.rs
index 8336625e3c3f..23ba1411f525 100644
--- a/src/mito2/benches/memtable_bench.rs
+++ b/src/mito2/benches/memtable_bench.rs
@@ -28,7 +28,6 @@ use mito2::memtable::bulk::context::BulkIterContext;
use mito2::memtable::bulk::part::BulkPartConverter;
use mito2::memtable::bulk::part_reader::BulkPartBatchIter;
use mito2::memtable::bulk::{BulkMemtable, BulkMemtableConfig};
-use mito2::memtable::partition_tree::{PartitionTreeConfig, PartitionTreeMemtable};
use mito2::memtable::time_series::TimeSeriesMemtable;
use mito2::memtable::{IterBuilder, Memtable, RangesOptions};
use mito2::read::flat_merge::FlatMergeIterator;
@@ -45,21 +44,6 @@ fn write_rows(c: &mut Criterion) {
// Note that this test only generate one time series.
let mut group = c.benchmark_group("write");
- group.bench_function("partition_tree", |b| {
- let codec = Arc::new(DensePrimaryKeyCodec::new(&metadata));
- let memtable = PartitionTreeMemtable::new(
- 1,
- codec,
- metadata.clone(),
- None,
- &PartitionTreeConfig::default(),
- );
- let kvs =
- memtable_util::build_key_values(&metadata, "hello".to_string(), 42, ×tamps, 1);
- b.iter(|| {
- memtable.write(&kvs).unwrap();
- });
- });
group.bench_function("time_series", |b| {
let memtable = TimeSeriesMemtable::new(metadata.clone(), 1, None, true, MergeMode::LastRow);
let kvs =
@@ -73,26 +57,11 @@ fn write_rows(c: &mut Criterion) {
/// Scans all rows.
fn full_scan(c: &mut Criterion) {
let metadata = Arc::new(cpu_metadata());
- let config = PartitionTreeConfig::default();
let start_sec = 1710043200;
let generator = CpuDataGenerator::new(metadata.clone(), 4000, start_sec, start_sec + 3600 * 2);
let mut group = c.benchmark_group("full_scan");
group.sample_size(10);
- group.bench_function("partition_tree", |b| {
- let codec = Arc::new(DensePrimaryKeyCodec::new(&metadata));
- let memtable = PartitionTreeMemtable::new(1, codec, metadata.clone(), None, &config);
- for kvs in generator.iter() {
- memtable.write(&kvs).unwrap();
- }
-
- b.iter(|| {
- let iter = memtable.iter(None, None, None).unwrap();
- for batch in iter {
- let _batch = batch.unwrap();
- }
- });
- });
group.bench_function("time_series", |b| {
let memtable = TimeSeriesMemtable::new(metadata.clone(), 1, None, true, MergeMode::LastRow);
for kvs in generator.iter() {
@@ -115,27 +84,11 @@ fn full_scan(c: &mut Criterion) {
/// Filters 1 host.
fn filter_1_host(c: &mut Criterion) {
let metadata = Arc::new(cpu_metadata());
- let config = PartitionTreeConfig::default();
let start_sec = 1710043200;
let generator = CpuDataGenerator::new(metadata.clone(), 4000, start_sec, start_sec + 3600 * 2);
let mut group = c.benchmark_group("filter_1_host");
group.sample_size(10);
- group.bench_function("partition_tree", |b| {
- let codec = Arc::new(DensePrimaryKeyCodec::new(&metadata));
- let memtable = PartitionTreeMemtable::new(1, codec, metadata.clone(), None, &config);
- for kvs in generator.iter() {
- memtable.write(&kvs).unwrap();
- }
- let predicate = generator.random_host_filter();
-
- b.iter(|| {
- let iter = memtable.iter(None, Some(predicate.clone()), None).unwrap();
- for batch in iter {
- let _batch = batch.unwrap();
- }
- });
- });
group.bench_function("time_series", |b| {
let memtable = TimeSeriesMemtable::new(metadata.clone(), 1, None, true, MergeMode::LastRow);
for kvs in generator.iter() {
diff --git a/src/mito2/src/compaction/window.rs b/src/mito2/src/compaction/window.rs
index 6fc416050dd7..34d736331232 100644
--- a/src/mito2/src/compaction/window.rs
+++ b/src/mito2/src/compaction/window.rs
@@ -252,6 +252,7 @@ mod tests {
memtable: None,
merge_mode: None,
sst_format: None,
+ primary_key_encoding: None,
},
compaction_time_window: None,
}
diff --git a/src/mito2/src/config.rs b/src/mito2/src/config.rs
index 120b5adbe315..98e97fca85f0 100644
--- a/src/mito2/src/config.rs
+++ b/src/mito2/src/config.rs
@@ -29,7 +29,6 @@ use serde_with::serde_as;
use crate::cache::file_cache::DEFAULT_INDEX_CACHE_PERCENT;
use crate::error::Result;
use crate::gc::GcConfig;
-use crate::memtable::MemtableConfig;
use crate::sst::DEFAULT_WRITE_BUFFER_SIZE;
const MULTIPART_UPLOAD_MINIMUM_SIZE: ReadableSize = ReadableSize::mb(5);
@@ -167,9 +166,6 @@ pub struct MitoConfig {
#[cfg(feature = "vector_index")]
pub vector_index: VectorIndexConfig,
- /// Memtable config
- pub memtable: MemtableConfig,
-
/// Minimum time interval between two compactions.
/// To align with the old behavior, the default value is 0 (no restrictions).
#[serde(with = "humantime_serde")]
@@ -225,7 +221,6 @@ impl Default for MitoConfig {
bloom_filter_index: BloomFilterConfig::default(),
#[cfg(feature = "vector_index")]
vector_index: VectorIndexConfig::default(),
- memtable: MemtableConfig::default(),
min_compaction_interval: Duration::from_secs(0),
default_flat_format: true,
gc: GcConfig::default(),
@@ -700,25 +695,3 @@ fn divide_num_cpus(divisor: usize) -> usize {
cores.div_ceil(divisor)
}
-
-#[cfg(test)]
-mod tests {
- use super::*;
-
- #[test]
- fn test_deserialize_config() {
- let s = r#"
-[memtable]
-type = "partition_tree"
-index_max_keys_per_shard = 8192
-data_freeze_threshold = 1024
-dedup = true
-fork_dictionary_bytes = "512MiB"
-"#;
- let config: MitoConfig = toml::from_str(s).unwrap();
- let MemtableConfig::PartitionTree(config) = &config.memtable else {
- unreachable!()
- };
- assert_eq!(1024, config.data_freeze_threshold);
- }
-}
diff --git a/src/mito2/src/error.rs b/src/mito2/src/error.rs
index b822c1797621..9ae748205e21 100644
--- a/src/mito2/src/error.rs
+++ b/src/mito2/src/error.rs
@@ -58,13 +58,6 @@ pub enum Error {
location: Location,
},
- #[snafu(display("Failed to encode sparse primary key, reason: {}", reason))]
- EncodeSparsePrimaryKey {
- reason: String,
- #[snafu(implicit)]
- location: Location,
- },
-
#[snafu(display("OpenDAL operator failed"))]
OpenDal {
#[snafu(implicit)]
@@ -1346,7 +1339,6 @@ impl ErrorExt for Error {
WriteParquet { .. } => StatusCode::StorageUnavailable,
WriteGroup { source, .. } => source.status_code(),
- EncodeSparsePrimaryKey { .. } => StatusCode::Unexpected,
InvalidBatch { .. } => StatusCode::InvalidArguments,
InvalidRecordBatch { .. } => StatusCode::InvalidArguments,
ConvertVector { source, .. } => source.status_code(),
diff --git a/src/mito2/src/memtable.rs b/src/mito2/src/memtable.rs
index e1494aa47be9..1371c9a5782a 100644
--- a/src/mito2/src/memtable.rs
+++ b/src/mito2/src/memtable.rs
@@ -27,8 +27,8 @@ use datatypes::arrow::record_batch::RecordBatch;
use mito_codec::key_values::KeyValue;
pub use mito_codec::key_values::KeyValues;
use mito_codec::row_converter::{PrimaryKeyCodec, build_primary_key_codec};
-use serde::{Deserialize, Serialize};
use snafu::ensure;
+use store_api::codec::PrimaryKeyEncoding;
use store_api::metadata::RegionMetadataRef;
use store_api::storage::{ColumnId, SequenceNumber, SequenceRange};
@@ -36,7 +36,6 @@ use crate::config::MitoConfig;
use crate::error::{Result, UnsupportedOperationSnafu};
use crate::flush::WriteBufferManagerRef;
use crate::memtable::bulk::{BulkMemtableBuilder, CompactDispatcher};
-use crate::memtable::partition_tree::{PartitionTreeConfig, PartitionTreeMemtableBuilder};
use crate::memtable::time_series::TimeSeriesMemtableBuilder;
use crate::metrics::WRITE_BUFFER_BYTES;
use crate::read::Batch;
@@ -51,7 +50,6 @@ use crate::sst::parquet::file_range::PreFilterMode;
mod builder;
pub mod bulk;
-pub mod partition_tree;
pub mod simple_bulk_memtable;
mod stats;
pub mod time_partition;
@@ -70,15 +68,6 @@ pub use time_partition::filter_record_batch;
/// Should be unique under the same region.
pub type MemtableId = u32;
-/// Config for memtables.
-#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq, Default)]
-#[serde(tag = "type", rename_all = "snake_case")]
-pub enum MemtableConfig {
- PartitionTree(PartitionTreeConfig),
- #[default]
- TimeSeries,
-}
-
/// Options for querying ranges from a memtable.
#[derive(Clone)]
pub struct RangesOptions {
@@ -418,70 +407,89 @@ impl MemtableBuilderProvider {
pub(crate) fn builder_for_options(&self, options: &RegionOptions) -> MemtableBuilderRef {
let dedup = options.need_dedup();
let merge_mode = options.merge_mode();
+ let primary_key_encoding = options.primary_key_encoding();
let flat_format = options
.sst_format
.map(|format| format == FormatType::Flat)
.unwrap_or(self.config.default_flat_format);
if flat_format {
- if options.memtable.is_some() {
+ if options.memtable.is_some()
+ && !matches!(&options.memtable, Some(MemtableOptions::Bulk(_)))
+ {
common_telemetry::info!(
"Overriding memtable config, use BulkMemtable under flat format"
);
}
- return Arc::new(
- BulkMemtableBuilder::new(
- self.write_buffer_manager.clone(),
- !dedup, // append_mode: true if not dedup, false if dedup
- merge_mode,
- )
- .with_compact_dispatcher(self.compact_dispatcher.clone()),
- );
+ return Arc::new(self.bulk_memtable_builder(dedup, merge_mode, options));
+ }
+
+ if primary_key_encoding == PrimaryKeyEncoding::Sparse {
+ if options.memtable.is_some()
+ && !matches!(&options.memtable, Some(MemtableOptions::Bulk(_)))
+ {
+ common_telemetry::info!(
+ "Overriding memtable config, use BulkMemtable for sparse primary key encoding"
+ );
+ }
+ return Arc::new(self.bulk_memtable_builder(dedup, merge_mode, options));
}
// The format is not flat.
match &options.memtable {
+ Some(MemtableOptions::Bulk(config)) => Arc::new(
+ BulkMemtableBuilder::new(self.write_buffer_manager.clone(), !dedup, merge_mode)
+ .with_config(config.clone())
+ .with_compact_dispatcher(self.compact_dispatcher.clone()),
+ ),
Some(MemtableOptions::TimeSeries) => Arc::new(TimeSeriesMemtableBuilder::new(
self.write_buffer_manager.clone(),
dedup,
merge_mode,
)),
- Some(MemtableOptions::PartitionTree(opts)) => {
- Arc::new(PartitionTreeMemtableBuilder::new(
- PartitionTreeConfig {
- index_max_keys_per_shard: opts.index_max_keys_per_shard,
- data_freeze_threshold: opts.data_freeze_threshold,
- fork_dictionary_bytes: opts.fork_dictionary_bytes,
- dedup,
- merge_mode,
- },
- self.write_buffer_manager.clone(),
- ))
+ Some(MemtableOptions::PartitionTree(_)) => {
+ common_telemetry::warn!(
+ "PartitionTreeMemtable is deprecated, falling back to BulkMemtable"
+ );
+ Arc::new(
+ BulkMemtableBuilder::new(self.write_buffer_manager.clone(), !dedup, merge_mode)
+ .with_compact_dispatcher(self.compact_dispatcher.clone()),
+ )
}
None => self.default_primary_key_memtable_builder(dedup, merge_mode),
}
}
+ fn bulk_memtable_builder(
+ &self,
+ dedup: bool,
+ merge_mode: MergeMode,
+ options: &RegionOptions,
+ ) -> BulkMemtableBuilder {
+ let mut builder = BulkMemtableBuilder::new(
+ self.write_buffer_manager.clone(),
+ !dedup, // append_mode: true if not dedup, false if dedup
+ merge_mode,
+ )
+ .with_compact_dispatcher(self.compact_dispatcher.clone());
+
+ if let Some(MemtableOptions::Bulk(config)) = &options.memtable {
+ builder = builder.with_config(config.clone());
+ }
+
+ builder
+ }
+
fn default_primary_key_memtable_builder(
&self,
dedup: bool,
merge_mode: MergeMode,
) -> MemtableBuilderRef {
- match &self.config.memtable {
- MemtableConfig::PartitionTree(config) => {
- let mut config = config.clone();
- config.dedup = dedup;
- Arc::new(PartitionTreeMemtableBuilder::new(
- config,
- self.write_buffer_manager.clone(),
- ))
- }
- MemtableConfig::TimeSeries => Arc::new(TimeSeriesMemtableBuilder::new(
- self.write_buffer_manager.clone(),
- dedup,
- merge_mode,
- )),
- }
+ Arc::new(TimeSeriesMemtableBuilder::new(
+ self.write_buffer_manager.clone(),
+ dedup,
+ merge_mode,
+ ))
}
}
@@ -749,29 +757,9 @@ impl MemtableRange {
mod tests {
use std::sync::Arc;
- use common_base::readable_size::ReadableSize;
-
use super::*;
use crate::flush::{WriteBufferManager, WriteBufferManagerImpl};
-
- #[test]
- fn test_deserialize_memtable_config() {
- let s = r#"
-type = "partition_tree"
-index_max_keys_per_shard = 8192
-data_freeze_threshold = 1024
-dedup = true
-fork_dictionary_bytes = "512MiB"
-"#;
- let config: MemtableConfig = toml::from_str(s).unwrap();
- let MemtableConfig::PartitionTree(memtable_config) = config else {
- unreachable!()
- };
- assert!(memtable_config.dedup);
- assert_eq!(8192, memtable_config.index_max_keys_per_shard);
- assert_eq!(1024, memtable_config.data_freeze_threshold);
- assert_eq!(ReadableSize::mb(512), memtable_config.fork_dictionary_bytes);
- }
+ use crate::memtable::bulk::BulkMemtableConfig;
#[test]
fn test_alloc_tracker_without_manager() {
@@ -824,4 +812,25 @@ fork_dictionary_bytes = "512MiB"
assert_eq!(0, manager.memory_usage());
assert_eq!(0, manager.mutable_usage());
}
+
+ #[test]
+ fn test_forced_bulk_memtable_preserves_bulk_config() {
+ let provider = MemtableBuilderProvider::new(None, Arc::new(MitoConfig::default()));
+ let config = BulkMemtableConfig {
+ merge_threshold: 7,
+ encode_row_threshold: 11,
+ encode_bytes_threshold: 13,
+ max_merge_groups: 17,
+ };
+ let options = RegionOptions {
+ memtable: Some(MemtableOptions::Bulk(config.clone())),
+ primary_key_encoding: Some(PrimaryKeyEncoding::Sparse),
+ ..Default::default()
+ };
+
+ let builder =
+ provider.bulk_memtable_builder(options.need_dedup(), options.merge_mode(), &options);
+
+ assert_eq!(&config, builder.config());
+ }
}
diff --git a/src/mito2/src/memtable/bulk.rs b/src/mito2/src/memtable/bulk.rs
index 50330dd3aca3..0113b8f2dfa8 100644
--- a/src/mito2/src/memtable/bulk.rs
+++ b/src/mito2/src/memtable/bulk.rs
@@ -37,6 +37,8 @@ use common_time::Timestamp;
use datatypes::arrow::datatypes::SchemaRef;
use mito_codec::key_values::KeyValue;
use rayon::prelude::*;
+use serde::{Deserialize, Serialize};
+use serde_with::{DisplayFromStr, serde_as};
use store_api::metadata::RegionMetadataRef;
use store_api::storage::{ColumnId, FileId, RegionId, SequenceRange};
use tokio::sync::Semaphore;
@@ -98,15 +100,21 @@ static ENCODE_BYTES_THRESHOLD: LazyLock = LazyLock::new(|| {
});
/// Configuration for bulk memtable.
-#[derive(Debug, Clone)]
+#[serde_as]
+#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
+#[serde(default)]
pub struct BulkMemtableConfig {
/// Threshold for triggering merge of parts.
+ #[serde_as(as = "DisplayFromStr")]
pub merge_threshold: usize,
/// Row threshold for encoding parts.
+ #[serde_as(as = "DisplayFromStr")]
pub encode_row_threshold: usize,
/// Bytes threshold for encoding parts.
+ #[serde_as(as = "DisplayFromStr")]
pub encode_bytes_threshold: usize,
/// Maximum number of groups for parallel merging.
+ #[serde_as(as = "DisplayFromStr")]
pub max_merge_groups: usize,
}
@@ -1349,11 +1357,22 @@ impl BulkMemtableBuilder {
}
}
+ /// Sets the bulk memtable config.
+ pub fn with_config(mut self, config: BulkMemtableConfig) -> Self {
+ self.config = config;
+ self
+ }
+
/// Sets the compact dispatcher.
pub fn with_compact_dispatcher(mut self, compact_dispatcher: Arc) -> Self {
self.compact_dispatcher = Some(compact_dispatcher);
self
}
+
+ #[cfg(test)]
+ pub(crate) fn config(&self) -> &BulkMemtableConfig {
+ &self.config
+ }
}
impl MemtableBuilder for BulkMemtableBuilder {
diff --git a/src/mito2/src/memtable/bulk/context.rs b/src/mito2/src/memtable/bulk/context.rs
index 3ac8e009fe0a..46713acad410 100644
--- a/src/mito2/src/memtable/bulk/context.rs
+++ b/src/mito2/src/memtable/bulk/context.rs
@@ -143,11 +143,10 @@ impl BulkIterContext {
pub(crate) fn build_pk_filter(&self) -> Option {
let pk_filters = self.pk_filters.as_ref()?;
let metadata = self.base.read_format.metadata();
- // Parquet PK prefilter always supports the partition column.
let inner = self
.base
.codec
- .primary_key_filter(metadata, Arc::clone(pk_filters), false);
+ .primary_key_filter(metadata, Arc::clone(pk_filters));
Some(CachedPrimaryKeyFilter::new(inner))
}
diff --git a/src/mito2/src/memtable/partition_tree.rs b/src/mito2/src/memtable/partition_tree.rs
deleted file mode 100644
index 662bfd99f648..000000000000
--- a/src/mito2/src/memtable/partition_tree.rs
+++ /dev/null
@@ -1,1037 +0,0 @@
-// Copyright 2023 Greptime Team
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-//! Memtable implementation based on a partition tree.
-
-pub(crate) mod data;
-mod dedup;
-mod dict;
-mod merger;
-mod partition;
-mod shard;
-mod shard_builder;
-mod tree;
-
-use std::fmt;
-use std::sync::Arc;
-use std::sync::atomic::{AtomicI64, AtomicU64, AtomicUsize, Ordering};
-
-use common_base::readable_size::ReadableSize;
-use common_stat::get_total_memory_readable;
-use mito_codec::key_values::KeyValue;
-use mito_codec::row_converter::{PrimaryKeyCodec, build_primary_key_codec};
-use serde::{Deserialize, Serialize};
-use store_api::metadata::RegionMetadataRef;
-use store_api::storage::{ColumnId, SequenceRange};
-use table::predicate::Predicate;
-
-use crate::error::{Result, UnsupportedOperationSnafu};
-use crate::flush::WriteBufferManagerRef;
-use crate::memtable::bulk::part::BulkPart;
-use crate::memtable::partition_tree::tree::PartitionTree;
-use crate::memtable::stats::WriteMetrics;
-use crate::memtable::{
- AllocTracker, BatchToRecordBatchContext, BoxedBatchIterator, IterBuilder, KeyValues,
- MemScanMetrics, Memtable, MemtableBuilder, MemtableId, MemtableRange, MemtableRangeContext,
- MemtableRanges, MemtableRef, MemtableStats, RangesOptions, read_column_ids_from_projection,
-};
-use crate::region::options::MergeMode;
-
-/// Use `1/DICTIONARY_SIZE_FACTOR` of OS memory as dictionary size.
-pub(crate) const DICTIONARY_SIZE_FACTOR: u64 = 8;
-pub(crate) const DEFAULT_MAX_KEYS_PER_SHARD: usize = 8192;
-pub(crate) const DEFAULT_FREEZE_THRESHOLD: usize = 131072;
-
-/// Id of a shard, only unique inside a partition.
-type ShardId = u32;
-/// Index of a primary key in a shard.
-type PkIndex = u16;
-
-/// Id of a primary key inside a tree.
-#[derive(Debug, Clone, Copy, PartialEq, Eq)]
-struct PkId {
- shard_id: ShardId,
- pk_index: PkIndex,
-}
-
-// TODO(yingwen): `fork_dictionary_bytes` is per region option, if we have multiple partition tree
-// memtable then we will use a lot memory. We should find a better way to control the
-// dictionary size.
-/// Config for the partition tree memtable.
-#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
-#[serde(default)]
-pub struct PartitionTreeConfig {
- /// Max keys in an index shard.
- pub index_max_keys_per_shard: usize,
- /// Number of rows to freeze a data part.
- pub data_freeze_threshold: usize,
- /// Whether to delete duplicates rows.
- ///
- /// Skips deserializing as it should be determined by whether the
- /// table is append only.
- #[serde(skip_deserializing)]
- pub dedup: bool,
- /// Total bytes of dictionary to keep in fork.
- pub fork_dictionary_bytes: ReadableSize,
- /// Merge mode of the tree.
- #[serde(skip_deserializing)]
- pub merge_mode: MergeMode,
-}
-
-impl Default for PartitionTreeConfig {
- fn default() -> Self {
- let mut fork_dictionary_bytes = ReadableSize::mb(512);
- if let Some(total_memory) = get_total_memory_readable() {
- let adjust_dictionary_bytes =
- std::cmp::min(total_memory / DICTIONARY_SIZE_FACTOR, fork_dictionary_bytes);
- if adjust_dictionary_bytes.0 > 0 {
- fork_dictionary_bytes = adjust_dictionary_bytes;
- }
- }
-
- Self {
- index_max_keys_per_shard: 8192,
- data_freeze_threshold: 131072,
- dedup: true,
- fork_dictionary_bytes,
- merge_mode: MergeMode::LastRow,
- }
- }
-}
-
-/// Memtable based on a partition tree.
-pub struct PartitionTreeMemtable {
- id: MemtableId,
- tree: Arc,
- alloc_tracker: AllocTracker,
- max_timestamp: AtomicI64,
- min_timestamp: AtomicI64,
- max_sequence: AtomicU64,
- /// Total written rows in memtable. This also includes deleted and duplicated rows.
- num_rows: AtomicUsize,
-}
-
-impl fmt::Debug for PartitionTreeMemtable {
- fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
- f.debug_struct("PartitionTreeMemtable")
- .field("id", &self.id)
- .finish()
- }
-}
-
-impl Memtable for PartitionTreeMemtable {
- fn id(&self) -> MemtableId {
- self.id
- }
-
- fn write(&self, kvs: &KeyValues) -> Result<()> {
- if kvs.is_empty() {
- return Ok(());
- }
-
- // TODO(yingwen): Validate schema while inserting rows.
-
- let mut metrics = WriteMetrics::default();
- let mut pk_buffer = Vec::new();
- // Ensures the memtable always updates stats.
- let res = self.tree.write(kvs, &mut pk_buffer, &mut metrics);
-
- if res.is_ok() {
- metrics.max_sequence = kvs.max_sequence();
- metrics.num_rows = kvs.num_rows();
- self.update_stats(&metrics);
- }
- res
- }
-
- fn write_one(&self, key_value: KeyValue) -> Result<()> {
- let mut metrics = WriteMetrics::default();
- let mut pk_buffer = Vec::new();
- // Ensures the memtable always updates stats.
- let res = self.tree.write_one(key_value, &mut pk_buffer, &mut metrics);
-
- // update max_sequence
- if res.is_ok() {
- metrics.max_sequence = metrics.max_sequence.max(key_value.sequence());
- metrics.num_rows = 1;
- self.update_stats(&metrics);
- }
- res
- }
-
- fn write_bulk(&self, _part: BulkPart) -> Result<()> {
- UnsupportedOperationSnafu {
- err_msg: "PartitionTreeMemtable does not support write_bulk",
- }
- .fail()
- }
-
- fn ranges(
- &self,
- projection: Option<&[ColumnId]>,
- options: RangesOptions,
- ) -> Result {
- let predicate = options.predicate;
- let sequence = options.sequence;
- let read_column_ids = read_column_ids_from_projection(&self.tree.metadata, projection);
- let projection = projection.map(|ids| ids.to_vec());
- let builder = Box::new(PartitionTreeIterBuilder {
- tree: self.tree.clone(),
- projection,
- predicate: predicate.predicate().cloned(),
- sequence,
- });
- let adapter_context = Arc::new(BatchToRecordBatchContext::new(
- self.tree.metadata.clone(),
- read_column_ids,
- ));
- let context = Arc::new(MemtableRangeContext::new_with_batch_to_record_batch(
- self.id,
- builder,
- predicate,
- Some(adapter_context),
- ));
-
- let range_stats = self.stats();
- let range = MemtableRange::new(context, range_stats);
- Ok(MemtableRanges {
- ranges: [(0, range)].into(),
- })
- }
-
- fn is_empty(&self) -> bool {
- self.tree.is_empty()
- }
-
- fn freeze(&self) -> Result<()> {
- self.alloc_tracker.done_allocating();
-
- self.tree.freeze()
- }
-
- fn stats(&self) -> MemtableStats {
- let estimated_bytes = self.alloc_tracker.bytes_allocated();
-
- if estimated_bytes == 0 {
- // no rows ever written
- return MemtableStats {
- estimated_bytes,
- time_range: None,
- num_rows: 0,
- num_ranges: 0,
- max_sequence: 0,
- series_count: 0,
- };
- }
-
- let ts_type = self
- .tree
- .metadata
- .time_index_column()
- .column_schema
- .data_type
- .clone()
- .as_timestamp()
- .expect("Timestamp column must have timestamp type");
- let max_timestamp = ts_type.create_timestamp(self.max_timestamp.load(Ordering::Relaxed));
- let min_timestamp = ts_type.create_timestamp(self.min_timestamp.load(Ordering::Relaxed));
- let series_count = self.tree.series_count();
- MemtableStats {
- estimated_bytes,
- time_range: Some((min_timestamp, max_timestamp)),
- num_rows: self.num_rows.load(Ordering::Relaxed),
- num_ranges: 1,
- max_sequence: self.max_sequence.load(Ordering::Relaxed),
- series_count,
- }
- }
-
- fn fork(&self, id: MemtableId, metadata: &RegionMetadataRef) -> MemtableRef {
- let tree = self.tree.fork(metadata.clone());
-
- let memtable = PartitionTreeMemtable::with_tree(id, tree);
- Arc::new(memtable)
- }
-}
-
-impl PartitionTreeMemtable {
- /// Returns a new memtable.
- pub fn new(
- id: MemtableId,
- row_codec: Arc,
- metadata: RegionMetadataRef,
- write_buffer_manager: Option,
- config: &PartitionTreeConfig,
- ) -> Self {
- Self::with_tree(
- id,
- PartitionTree::new(row_codec, metadata, config, write_buffer_manager.clone()),
- )
- }
-
- /// Creates a mutable memtable from the tree.
- ///
- /// It also adds the bytes used by shared parts (e.g. index) to the memory usage.
- fn with_tree(id: MemtableId, tree: PartitionTree) -> Self {
- let alloc_tracker = AllocTracker::new(tree.write_buffer_manager());
-
- Self {
- id,
- tree: Arc::new(tree),
- alloc_tracker,
- max_timestamp: AtomicI64::new(i64::MIN),
- min_timestamp: AtomicI64::new(i64::MAX),
- num_rows: AtomicUsize::new(0),
- max_sequence: AtomicU64::new(0),
- }
- }
-
- /// Updates stats of the memtable.
- fn update_stats(&self, metrics: &WriteMetrics) {
- // Only let the tracker tracks value bytes.
- self.alloc_tracker.on_allocation(metrics.value_bytes);
- self.max_timestamp
- .fetch_max(metrics.max_ts, Ordering::SeqCst);
- self.min_timestamp
- .fetch_min(metrics.min_ts, Ordering::SeqCst);
- self.num_rows.fetch_add(metrics.num_rows, Ordering::SeqCst);
- self.max_sequence
- .fetch_max(metrics.max_sequence, Ordering::SeqCst);
- }
-
- #[cfg(any(test, feature = "test"))]
- pub fn iter(
- &self,
- projection: Option<&[ColumnId]>,
- predicate: Option,
- sequence: Option,
- ) -> Result {
- self.tree.read(projection, predicate, sequence, None)
- }
-}
-
-/// Builder to build a [PartitionTreeMemtable].
-#[derive(Debug, Default)]
-pub struct PartitionTreeMemtableBuilder {
- config: PartitionTreeConfig,
- write_buffer_manager: Option,
-}
-
-impl PartitionTreeMemtableBuilder {
- /// Creates a new builder with specific `write_buffer_manager`.
- pub fn new(
- config: PartitionTreeConfig,
- write_buffer_manager: Option,
- ) -> Self {
- Self {
- config,
- write_buffer_manager,
- }
- }
-}
-
-impl MemtableBuilder for PartitionTreeMemtableBuilder {
- fn build(&self, id: MemtableId, metadata: &RegionMetadataRef) -> MemtableRef {
- let codec = build_primary_key_codec(metadata);
- Arc::new(PartitionTreeMemtable::new(
- id,
- codec,
- metadata.clone(),
- self.write_buffer_manager.clone(),
- &self.config,
- ))
- }
-
- fn use_bulk_insert(&self, _metadata: &RegionMetadataRef) -> bool {
- false
- }
-}
-
-struct PartitionTreeIterBuilder {
- tree: Arc,
- projection: Option>,
- predicate: Option,
- sequence: Option,
-}
-
-impl IterBuilder for PartitionTreeIterBuilder {
- fn build(&self, metrics: Option) -> Result {
- self.tree.read(
- self.projection.as_deref(),
- self.predicate.clone(),
- self.sequence,
- metrics,
- )
- }
-}
-
-#[cfg(test)]
-mod tests {
- use std::collections::HashMap;
- use std::sync::Arc;
-
- use api::v1::helper::{field_column_schema, row, tag_column_schema, time_index_column_schema};
- use api::v1::value::ValueData;
- use api::v1::{Mutation, OpType, Rows, SemanticType};
- use common_query::prelude::{greptime_timestamp, greptime_value};
- use common_time::Timestamp;
- use datatypes::data_type::ConcreteDataType;
- use datatypes::prelude::Vector;
- use datatypes::scalars::ScalarVector;
- use datatypes::schema::ColumnSchema;
- use datatypes::value::Value;
- use datatypes::vectors::{Int64Vector, StringVector};
- use mito_codec::row_converter::DensePrimaryKeyCodec;
- use store_api::metadata::{ColumnMetadata, RegionMetadataBuilder};
- use store_api::storage::RegionId;
-
- use super::*;
- use crate::test_util::memtable_util::{
- self, collect_iter_timestamps, region_metadata_to_row_schema,
- };
-
- #[test]
- fn test_memtable_sorted_input() {
- write_iter_sorted_input(true);
- write_iter_sorted_input(false);
- }
-
- fn write_iter_sorted_input(has_pk: bool) {
- let metadata = if has_pk {
- Arc::new(memtable_util::metadata_with_primary_key(vec![1, 0], true))
- } else {
- Arc::new(memtable_util::metadata_with_primary_key(vec![], false))
- };
- let timestamps = (0..100).collect::>();
- let kvs =
- memtable_util::build_key_values(&metadata, "hello".to_string(), 42, ×tamps, 1);
- let codec = Arc::new(DensePrimaryKeyCodec::new(&metadata));
- let memtable = PartitionTreeMemtable::new(
- 1,
- codec,
- metadata.clone(),
- None,
- &PartitionTreeConfig::default(),
- );
- memtable.write(&kvs).unwrap();
-
- let expected_ts = kvs
- .iter()
- .map(|kv| {
- kv.timestamp()
- .try_into_timestamp()
- .unwrap()
- .unwrap()
- .value()
- })
- .collect::>();
-
- let iter = memtable.iter(None, None, None).unwrap();
- let read = collect_iter_timestamps(iter);
- assert_eq!(expected_ts, read);
-
- let stats = memtable.stats();
- assert!(stats.bytes_allocated() > 0);
- assert_eq!(
- Some((
- Timestamp::new_millisecond(0),
- Timestamp::new_millisecond(99)
- )),
- stats.time_range()
- );
- }
-
- #[test]
- fn test_memtable_unsorted_input() {
- write_iter_unsorted_input(true);
- write_iter_unsorted_input(false);
- }
-
- fn write_iter_unsorted_input(has_pk: bool) {
- let metadata = if has_pk {
- Arc::new(memtable_util::metadata_with_primary_key(vec![1, 0], true))
- } else {
- Arc::new(memtable_util::metadata_with_primary_key(vec![], false))
- };
- let codec = Arc::new(DensePrimaryKeyCodec::new(&metadata));
- let memtable = PartitionTreeMemtable::new(
- 1,
- codec,
- metadata.clone(),
- None,
- &PartitionTreeConfig::default(),
- );
-
- let kvs = memtable_util::build_key_values(
- &metadata,
- "hello".to_string(),
- 0,
- &[1, 3, 7, 5, 6],
- 0, // sequence 0, 1, 2, 3, 4
- );
- memtable.write(&kvs).unwrap();
-
- let kvs = memtable_util::build_key_values(
- &metadata,
- "hello".to_string(),
- 0,
- &[5, 2, 4, 0, 7],
- 5, // sequence 5, 6, 7, 8, 9
- );
- memtable.write(&kvs).unwrap();
-
- let iter = memtable.iter(None, None, None).unwrap();
- let read = collect_iter_timestamps(iter);
- assert_eq!(vec![0, 1, 2, 3, 4, 5, 6, 7], read);
-
- let iter = memtable.iter(None, None, None).unwrap();
- let read = iter
- .flat_map(|batch| {
- batch
- .unwrap()
- .sequences()
- .iter_data()
- .collect::>()
- .into_iter()
- })
- .map(|v| v.unwrap())
- .collect::>();
- assert_eq!(vec![8, 0, 6, 1, 7, 5, 4, 9], read);
-
- let stats = memtable.stats();
- assert!(stats.bytes_allocated() > 0);
- assert_eq!(
- Some((Timestamp::new_millisecond(0), Timestamp::new_millisecond(7))),
- stats.time_range()
- );
- }
-
- #[test]
- fn test_memtable_projection() {
- write_iter_projection(true);
- write_iter_projection(false);
- }
-
- fn write_iter_projection(has_pk: bool) {
- let metadata = if has_pk {
- Arc::new(memtable_util::metadata_with_primary_key(vec![1, 0], true))
- } else {
- Arc::new(memtable_util::metadata_with_primary_key(vec![], false))
- };
- // Try to build a memtable via the builder.
- let memtable = PartitionTreeMemtableBuilder::new(PartitionTreeConfig::default(), None)
- .build(1, &metadata);
-
- let expect = (0..100).collect::>();
- let kvs = memtable_util::build_key_values(&metadata, "hello".to_string(), 10, &expect, 1);
- memtable.write(&kvs).unwrap();
- let ranges = memtable
- .ranges(Some(&[3]), RangesOptions::default())
- .unwrap();
- let iter = ranges.build(None).unwrap();
-
- let mut v0_all = vec![];
- for res in iter {
- let batch = res.unwrap();
- assert_eq!(1, batch.fields().len());
- let v0 = batch
- .fields()
- .first()
- .unwrap()
- .data
- .as_any()
- .downcast_ref::()
- .unwrap();
- v0_all.extend(v0.iter_data().map(|v| v.unwrap()));
- }
- assert_eq!(expect, v0_all);
- }
-
- #[test]
- fn test_write_iter_multi_keys() {
- write_iter_multi_keys(1, 100);
- write_iter_multi_keys(2, 100);
- write_iter_multi_keys(4, 100);
- write_iter_multi_keys(8, 5);
- write_iter_multi_keys(2, 10);
- }
-
- fn write_iter_multi_keys(max_keys: usize, freeze_threshold: usize) {
- let metadata = Arc::new(memtable_util::metadata_with_primary_key(vec![1, 0], true));
- let codec = Arc::new(DensePrimaryKeyCodec::new(&metadata));
- let memtable = PartitionTreeMemtable::new(
- 1,
- codec,
- metadata.clone(),
- None,
- &PartitionTreeConfig {
- index_max_keys_per_shard: max_keys,
- data_freeze_threshold: freeze_threshold,
- ..Default::default()
- },
- );
-
- let mut data = Vec::new();
- // 4 partitions, each partition 4 pks.
- for i in 0..4 {
- for j in 0..4 {
- // key: i, a{j}
- let timestamps = [11, 13, 1, 5, 3, 7, 9];
- let key = format!("a{j}");
- let kvs =
- memtable_util::build_key_values(&metadata, key.clone(), i, ×tamps, 0);
- memtable.write(&kvs).unwrap();
- for ts in timestamps {
- data.push((i, key.clone(), ts));
- }
- }
- for j in 0..4 {
- // key: i, a{j}
- let timestamps = [10, 2, 4, 8, 6];
- let key = format!("a{j}");
- let kvs =
- memtable_util::build_key_values(&metadata, key.clone(), i, ×tamps, 200);
- memtable.write(&kvs).unwrap();
- for ts in timestamps {
- data.push((i, key.clone(), ts));
- }
- }
- }
- data.sort_unstable();
-
- let expect = data.into_iter().map(|x| x.2).collect::>();
- let iter = memtable.iter(None, None, None).unwrap();
- let read = collect_iter_timestamps(iter);
- assert_eq!(expect, read);
- }
-
- #[test]
- fn test_deserialize_config() {
- let config = PartitionTreeConfig {
- dedup: false,
- ..Default::default()
- };
- // Creates a json with dedup = false.
- let json = serde_json::to_string(&config).unwrap();
- let config: PartitionTreeConfig = serde_json::from_str(&json).unwrap();
- assert!(config.dedup);
- assert_eq!(PartitionTreeConfig::default(), config);
- }
-
- fn metadata_for_metric_engine() -> RegionMetadataRef {
- let mut builder = RegionMetadataBuilder::new(RegionId::new(123, 456));
- builder
- .push_column_metadata(ColumnMetadata {
- column_schema: ColumnSchema::new(
- "__table_id",
- ConcreteDataType::uint32_datatype(),
- false,
- ),
- semantic_type: SemanticType::Tag,
- column_id: 2147483652,
- })
- .push_column_metadata(ColumnMetadata {
- column_schema: ColumnSchema::new(
- "__tsid",
- ConcreteDataType::uint64_datatype(),
- false,
- ),
- semantic_type: SemanticType::Tag,
- column_id: 2147483651,
- })
- .push_column_metadata(ColumnMetadata {
- column_schema: ColumnSchema::new(
- "test_label",
- ConcreteDataType::string_datatype(),
- false,
- ),
- semantic_type: SemanticType::Tag,
- column_id: 2,
- })
- .push_column_metadata(ColumnMetadata {
- column_schema: ColumnSchema::new(
- greptime_timestamp(),
- ConcreteDataType::timestamp_millisecond_datatype(),
- false,
- ),
- semantic_type: SemanticType::Timestamp,
- column_id: 0,
- })
- .push_column_metadata(ColumnMetadata {
- column_schema: ColumnSchema::new(
- greptime_value(),
- ConcreteDataType::float64_datatype(),
- true,
- ),
- semantic_type: SemanticType::Field,
- column_id: 1,
- })
- .primary_key(vec![2147483652, 2147483651, 2]);
- let region_metadata = builder.build().unwrap();
- Arc::new(region_metadata)
- }
-
- fn build_key_values(
- metadata: RegionMetadataRef,
- labels: &[&str],
- table_id: &[u32],
- ts_id: &[u64],
- ts: &[i64],
- values: &[f64],
- sequence: u64,
- ) -> KeyValues {
- let column_schema = region_metadata_to_row_schema(&metadata);
-
- let rows = ts
- .iter()
- .zip(table_id.iter())
- .zip(ts_id.iter())
- .zip(labels.iter())
- .zip(values.iter())
- .map(|((((ts, table_id), ts_id), label), val)| {
- row(vec![
- ValueData::U32Value(*table_id),
- ValueData::U64Value(*ts_id),
- ValueData::StringValue(label.to_string()),
- ValueData::TimestampMillisecondValue(*ts),
- ValueData::F64Value(*val),
- ])
- })
- .collect();
- let mutation = api::v1::Mutation {
- op_type: 1,
- sequence,
- rows: Some(Rows {
- schema: column_schema,
- rows,
- }),
- write_hint: None,
- };
- KeyValues::new(metadata.as_ref(), mutation).unwrap()
- }
-
- #[test]
- fn test_write_freeze() {
- let metadata = metadata_for_metric_engine();
- let memtable = PartitionTreeMemtableBuilder::new(
- PartitionTreeConfig {
- index_max_keys_per_shard: 40,
- ..Default::default()
- },
- None,
- )
- .build(1, &metadata);
-
- let codec = DensePrimaryKeyCodec::new(&metadata);
-
- memtable
- .write(&build_key_values(
- metadata.clone(),
- &["daily", "10min", "daily", "10min"],
- &[1025, 1025, 1025, 1025],
- &[
- 16442255374049317291,
- 5686004715529701024,
- 16442255374049317291,
- 5686004715529701024,
- ],
- &[1712070000000, 1712717731000, 1712761200000, 1712761200000],
- &[0.0, 0.0, 0.0, 0.0],
- 1,
- ))
- .unwrap();
-
- memtable.freeze().unwrap();
- let new_memtable = memtable.fork(2, &metadata);
-
- new_memtable
- .write(&build_key_values(
- metadata.clone(),
- &["10min"],
- &[1025],
- &[5686004715529701024],
- &[1714643131000],
- &[0.1],
- 2,
- ))
- .unwrap();
-
- let mut reader = new_memtable
- .ranges(None, RangesOptions::default())
- .unwrap()
- .build(None)
- .unwrap();
- let batch = reader.next().unwrap().unwrap();
- let pk = codec.decode(batch.primary_key()).unwrap().into_dense();
- if let Value::String(s) = &pk[2] {
- assert_eq!("10min", s.as_utf8());
- } else {
- unreachable!()
- }
- }
-
- fn kv_region_metadata() -> RegionMetadataRef {
- let mut builder = RegionMetadataBuilder::new(RegionId::new(123, 456));
- builder
- .push_column_metadata(ColumnMetadata {
- column_schema: ColumnSchema::new(
- "ts",
- ConcreteDataType::timestamp_millisecond_datatype(),
- false,
- ),
- semantic_type: SemanticType::Timestamp,
- column_id: 0,
- })
- .push_column_metadata(ColumnMetadata {
- column_schema: ColumnSchema::new("k", ConcreteDataType::string_datatype(), false),
- semantic_type: SemanticType::Tag,
- column_id: 1,
- })
- .push_column_metadata(ColumnMetadata {
- column_schema: ColumnSchema::new("v", ConcreteDataType::string_datatype(), false),
- semantic_type: SemanticType::Field,
- column_id: 2,
- })
- .primary_key(vec![1]);
- let region_metadata = builder.build().unwrap();
- Arc::new(region_metadata)
- }
-
- fn kv_column_schemas() -> Vec {
- vec![
- time_index_column_schema("ts", api::v1::ColumnDataType::TimestampMillisecond),
- tag_column_schema("k", api::v1::ColumnDataType::String),
- field_column_schema("v", api::v1::ColumnDataType::String),
- ]
- }
-
- fn key_values>(
- metadata: &RegionMetadataRef,
- keys: impl Iterator- ,
- ) -> KeyValues {
- let rows = keys
- .map(|c| {
- row(vec![
- ValueData::TimestampMillisecondValue(0),
- ValueData::StringValue(c.as_ref().to_string()),
- ValueData::StringValue(c.as_ref().to_string()),
- ])
- })
- .collect();
- let mutation = Mutation {
- op_type: OpType::Put as i32,
- sequence: 0,
- rows: Some(Rows {
- schema: kv_column_schemas(),
- rows,
- }),
- write_hint: None,
- };
- KeyValues::new(metadata, mutation).unwrap()
- }
-
- fn collect_kvs(
- iter: BoxedBatchIterator,
- region_meta: &RegionMetadataRef,
- ) -> HashMap {
- let decoder = DensePrimaryKeyCodec::new(region_meta);
- let mut res = HashMap::new();
- for v in iter {
- let batch = v.unwrap();
- let values = decoder.decode(batch.primary_key()).unwrap().into_dense();
- let field_vector = batch.fields()[0]
- .data
- .as_any()
- .downcast_ref::()
- .unwrap();
- for row in 0..batch.num_rows() {
- res.insert(
- values[0].as_string().unwrap(),
- field_vector.get(row).as_string().unwrap(),
- );
- }
- }
- res
- }
-
- #[test]
- fn test_reorder_insert_key_values() {
- let metadata = kv_region_metadata();
- let memtable = PartitionTreeMemtableBuilder::new(PartitionTreeConfig::default(), None)
- .build(1, &metadata);
-
- memtable
- .write(&key_values(&metadata, ('a'..'h').map(|c| c.to_string())))
- .unwrap();
- memtable.freeze().unwrap();
- assert_eq!(
- collect_kvs(
- memtable
- .ranges(None, RangesOptions::default())
- .unwrap()
- .build(None)
- .unwrap(),
- &metadata
- ),
- ('a'..'h').map(|c| (c.to_string(), c.to_string())).collect()
- );
- let forked = memtable.fork(2, &metadata);
-
- let keys = ["c", "f", "i", "h", "b", "e", "g"];
- forked.write(&key_values(&metadata, keys.iter())).unwrap();
- forked.freeze().unwrap();
- assert_eq!(
- collect_kvs(
- forked
- .ranges(None, RangesOptions::default())
- .unwrap()
- .build(None)
- .unwrap(),
- &metadata
- ),
- keys.iter()
- .map(|c| (c.to_string(), c.to_string()))
- .collect()
- );
-
- let forked2 = forked.fork(3, &metadata);
-
- let keys = ["g", "e", "a", "f", "b", "c", "h"];
- forked2.write(&key_values(&metadata, keys.iter())).unwrap();
-
- let kvs = collect_kvs(
- forked2
- .ranges(None, RangesOptions::default())
- .unwrap()
- .build(None)
- .unwrap(),
- &metadata,
- );
- let expected = keys
- .iter()
- .map(|c| (c.to_string(), c.to_string()))
- .collect::>();
- assert_eq!(kvs, expected);
- }
-
- #[test]
- fn test_build_record_batch_iter_from_memtable() {
- let metadata = Arc::new(memtable_util::metadata_with_primary_key(vec![1, 0], true));
- let codec = Arc::new(DensePrimaryKeyCodec::new(&metadata));
- let memtable = PartitionTreeMemtable::new(
- 1,
- codec,
- metadata.clone(),
- None,
- &PartitionTreeConfig::default(),
- );
-
- let kvs =
- memtable_util::build_key_values(&metadata, "hello".to_string(), 42, &[1, 2, 3], 0);
- memtable.write(&kvs).unwrap();
-
- let read_column_ids: Vec = metadata
- .column_metadatas
- .iter()
- .map(|c| c.column_id)
- .collect();
- let ranges = memtable
- .ranges(Some(&read_column_ids), RangesOptions::default())
- .unwrap();
- assert!(!ranges.ranges.is_empty());
-
- let mut total_rows = 0;
- for range in ranges.ranges.into_values() {
- let mut iter = range.build_record_batch_iter(None, None).unwrap();
- while let Some(rb) = iter.next().transpose().unwrap() {
- total_rows += rb.num_rows();
- let schema = rb.schema();
- let column_names: Vec<_> =
- schema.fields().iter().map(|f| f.name().as_str()).collect();
- assert_eq!(
- column_names,
- vec![
- "__table_id",
- "k0",
- "v0",
- "v1",
- "ts",
- "__primary_key",
- "__sequence",
- "__op_type",
- ]
- );
- }
- }
- assert_eq!(3, total_rows);
- }
-
- #[test]
- fn test_build_record_batch_iter_with_time_range() {
- let metadata = Arc::new(memtable_util::metadata_with_primary_key(vec![1, 0], true));
- let codec = Arc::new(DensePrimaryKeyCodec::new(&metadata));
- let memtable = PartitionTreeMemtable::new(
- 1,
- codec,
- metadata.clone(),
- None,
- &PartitionTreeConfig::default(),
- );
-
- let kvs = memtable_util::build_key_values(
- &metadata,
- "hello".to_string(),
- 42,
- &[1, 2, 3, 4, 5],
- 0,
- );
- memtable.write(&kvs).unwrap();
-
- let read_column_ids: Vec = metadata
- .column_metadatas
- .iter()
- .map(|c| c.column_id)
- .collect();
- let ranges = memtable
- .ranges(Some(&read_column_ids), RangesOptions::default())
- .unwrap();
- assert!(!ranges.ranges.is_empty());
-
- let time_range = (Timestamp::new_millisecond(2), Timestamp::new_millisecond(4));
-
- let mut total_rows = 0;
- let mut all_timestamps = Vec::new();
- for range in ranges.ranges.into_values() {
- let mut iter = range
- .build_record_batch_iter(Some(time_range), None)
- .unwrap();
- while let Some(rb) = iter.next().transpose().unwrap() {
- total_rows += rb.num_rows();
- // ts column is at index 4 (after __table_id, k0, v0, v1)
- let ts_col = rb
- .column_by_name("ts")
- .unwrap()
- .as_any()
- .downcast_ref::()
- .unwrap();
- for i in 0..ts_col.len() {
- all_timestamps.push(ts_col.value(i));
- }
- }
- }
- assert_eq!(3, total_rows);
- all_timestamps.sort();
- assert_eq!(vec![2, 3, 4], all_timestamps);
- }
-}
diff --git a/src/mito2/src/memtable/partition_tree/data.rs b/src/mito2/src/memtable/partition_tree/data.rs
deleted file mode 100644
index f6e2a59becce..000000000000
--- a/src/mito2/src/memtable/partition_tree/data.rs
+++ /dev/null
@@ -1,1398 +0,0 @@
-// Copyright 2023 Greptime Team
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-//! Data part of a shard.
-
-use std::cmp::{Ordering, Reverse};
-use std::fmt::{Debug, Formatter};
-use std::ops::Range;
-use std::sync::Arc;
-use std::time::{Duration, Instant};
-
-use bytes::Bytes;
-use datatypes::arrow;
-use datatypes::arrow::array::{ArrayRef, RecordBatch, UInt16Array, UInt32Array, UInt64Array};
-use datatypes::arrow::datatypes::{Field, Schema, SchemaRef};
-use datatypes::data_type::DataType;
-use datatypes::prelude::{ConcreteDataType, MutableVector, ScalarVectorBuilder, Vector, VectorRef};
-use datatypes::schema::ColumnSchema;
-use datatypes::types::TimestampType;
-use datatypes::vectors::{
- TimestampMicrosecondVector, TimestampMillisecondVector, TimestampNanosecondVector,
- TimestampSecondVector, UInt8Vector, UInt8VectorBuilder, UInt16Vector, UInt16VectorBuilder,
- UInt64Vector, UInt64VectorBuilder,
-};
-use mito_codec::key_values::KeyValue;
-use parquet::arrow::ArrowWriter;
-use parquet::arrow::arrow_reader::{ParquetRecordBatchReader, ParquetRecordBatchReaderBuilder};
-use parquet::basic::{Compression, Encoding, ZstdLevel};
-use parquet::file::properties::{EnabledStatistics, WriterProperties};
-use parquet::schema::types::ColumnPath;
-use snafu::ResultExt;
-use store_api::metadata::RegionMetadataRef;
-use store_api::storage::consts::{OP_TYPE_COLUMN_NAME, SEQUENCE_COLUMN_NAME};
-
-use crate::error;
-use crate::error::Result;
-use crate::memtable::partition_tree::PkIndex;
-use crate::memtable::partition_tree::merger::{DataBatchKey, DataNode, DataSource, Merger};
-use crate::metrics::{
- PARTITION_TREE_DATA_BUFFER_FREEZE_STAGE_ELAPSED, PARTITION_TREE_READ_STAGE_ELAPSED,
-};
-use crate::sst::parquet::DEFAULT_READ_BATCH_SIZE;
-
-const PK_INDEX_COLUMN_NAME: &str = "__pk_index";
-
-/// Initial capacity for the data buffer.
-pub(crate) const DATA_INIT_CAP: usize = 8;
-
-/// Range of a data batch.
-#[derive(Debug, Clone, Copy)]
-pub(crate) struct DataBatchRange {
- /// Primary key index of this batch.
- pub(crate) pk_index: PkIndex,
- /// Start of current primary key inside record batch.
- pub(crate) start: usize,
- /// End of current primary key inside record batch.
- pub(crate) end: usize,
-}
-
-impl DataBatchRange {
- pub(crate) fn len(&self) -> usize {
- self.end - self.start
- }
-}
-
-/// Data part batches returns by `DataParts::read`.
-#[derive(Debug, Clone, Copy)]
-pub struct DataBatch<'a> {
- /// Record batch of data.
- rb: &'a RecordBatch,
- /// Range of current primary key inside record batch
- range: DataBatchRange,
-}
-
-impl<'a> DataBatch<'a> {
- pub(crate) fn pk_index(&self) -> PkIndex {
- self.range.pk_index
- }
-
- pub(crate) fn range(&self) -> DataBatchRange {
- self.range
- }
-
- pub(crate) fn slice_record_batch(&self) -> RecordBatch {
- self.rb.slice(self.range.start, self.range.len())
- }
-
- pub(crate) fn first_row(&self) -> (i64, u64) {
- let ts_values = timestamp_array_to_i64_slice(self.rb.column(1));
- let sequence_values = self
- .rb
- .column(2)
- .as_any()
- .downcast_ref::()
- .unwrap()
- .values();
- (
- ts_values[self.range.start],
- sequence_values[self.range.start],
- )
- }
-
- pub(crate) fn last_row(&self) -> (i64, u64) {
- let ts_values = timestamp_array_to_i64_slice(self.rb.column(1));
- let sequence_values = self
- .rb
- .column(2)
- .as_any()
- .downcast_ref::()
- .unwrap()
- .values();
- (
- ts_values[self.range.end - 1],
- sequence_values[self.range.end - 1],
- )
- }
-
- pub(crate) fn first_key(&self) -> DataBatchKey {
- let pk_index = self.pk_index();
- let ts_array = self.rb.column(1);
-
- // maybe safe the result somewhere.
- let ts_values = timestamp_array_to_i64_slice(ts_array);
- let timestamp = ts_values[self.range.start];
- DataBatchKey {
- pk_index,
- timestamp,
- }
- }
-
- pub(crate) fn search_key(&self, key: &DataBatchKey) -> Result {
- let DataBatchKey {
- pk_index,
- timestamp,
- } = key;
- assert_eq!(*pk_index, self.range.pk_index);
- let ts_values = timestamp_array_to_i64_slice(self.rb.column(1));
- let ts_values = &ts_values[self.range.start..self.range.end];
- ts_values.binary_search(timestamp)
- }
-
- pub(crate) fn slice(self, offset: usize, length: usize) -> DataBatch<'a> {
- let start = self.range.start + offset;
- let end = start + length;
- DataBatch {
- rb: self.rb,
- range: DataBatchRange {
- pk_index: self.range.pk_index,
- start,
- end,
- },
- }
- }
-
- pub(crate) fn num_rows(&self) -> usize {
- self.range.len()
- }
-}
-
-/// Buffer for the value part (pk_index, ts, sequence, op_type, field columns) in a shard.
-pub struct DataBuffer {
- metadata: RegionMetadataRef,
- /// Schema for data part (primary keys are replaced with pk_index)
- data_part_schema: SchemaRef,
- /// Builder for primary key index.
- pk_index_builder: UInt16VectorBuilder,
- /// Builder for timestamp column.
- ts_builder: Box,
- /// Builder for sequence column.
- sequence_builder: UInt64VectorBuilder,
- /// Builder for op_type column.
- op_type_builder: UInt8VectorBuilder,
- /// Builders for field columns.
- field_builders: Vec,
-
- dedup: bool,
-}
-
-impl DataBuffer {
- /// Creates a `DataBuffer` instance with given schema and capacity.
- pub fn with_capacity(metadata: RegionMetadataRef, init_capacity: usize, dedup: bool) -> Self {
- let ts_builder = metadata
- .time_index_column()
- .column_schema
- .data_type
- .create_mutable_vector(init_capacity);
-
- let pk_id_builder = UInt16VectorBuilder::with_capacity(init_capacity);
- let sequence_builder = UInt64VectorBuilder::with_capacity(init_capacity);
- let op_type_builder = UInt8VectorBuilder::with_capacity(init_capacity);
-
- let field_builders = metadata
- .field_columns()
- .map(|c| LazyMutableVectorBuilder::new(c.column_schema.data_type.clone()))
- .collect::>();
-
- let data_part_schema = memtable_schema_to_encoded_schema(&metadata);
- Self {
- metadata,
- data_part_schema,
- pk_index_builder: pk_id_builder,
- ts_builder,
- sequence_builder,
- op_type_builder,
- field_builders,
- dedup,
- }
- }
-
- /// Writes a row to data buffer.
- pub fn write_row(&mut self, pk_index: PkIndex, kv: &KeyValue) {
- self.ts_builder.push_value_ref(&kv.timestamp());
- self.pk_index_builder.push(Some(pk_index));
- self.sequence_builder.push(Some(kv.sequence()));
- self.op_type_builder.push(Some(kv.op_type() as u8));
-
- debug_assert_eq!(self.field_builders.len(), kv.num_fields());
-
- for (idx, field) in kv.fields().enumerate() {
- self.field_builders[idx]
- .get_or_create_builder(self.ts_builder.len())
- .push_value_ref(&field);
- }
- }
-
- /// Freezes `DataBuffer` to bytes.
- /// If `pk_weights` is present, it will be used to sort rows.
- ///
- /// `freeze` clears the buffers of builders.
- pub fn freeze(
- &mut self,
- pk_weights: Option<&[u16]>,
- replace_pk_index: bool,
- ) -> Result {
- let timestamp_col_name = self.metadata.time_index_column().column_schema.name.clone();
- let encoder = DataPartEncoder::new(
- &self.metadata,
- pk_weights,
- None,
- timestamp_col_name,
- replace_pk_index,
- self.dedup,
- );
- let parts = encoder.write(self)?;
- Ok(parts)
- }
-
- /// Builds a lazily initialized data buffer reader from [DataBuffer]
- pub fn read(&self) -> Result {
- let _timer = PARTITION_TREE_READ_STAGE_ELAPSED
- .with_label_values(&["read_data_buffer"])
- .start_timer();
-
- let (pk_index, timestamp, sequence, op_type) = (
- self.pk_index_builder.finish_cloned(),
- self.ts_builder.to_vector_cloned(),
- self.sequence_builder.finish_cloned(),
- self.op_type_builder.finish_cloned(),
- );
-
- let mut fields = Vec::with_capacity(self.field_builders.len());
- for b in self.field_builders.iter() {
- let field = match b {
- LazyMutableVectorBuilder::Type(ty) => LazyFieldVector::Type(ty.clone()),
- LazyMutableVectorBuilder::Builder(builder) => {
- LazyFieldVector::Vector(builder.to_vector_cloned())
- }
- };
- fields.push(field);
- }
-
- Ok(DataBufferReaderBuilder {
- schema: self.data_part_schema.clone(),
- pk_index,
- timestamp,
- sequence,
- op_type,
- fields,
- dedup: self.dedup,
- })
- }
-
- /// Returns num of rows in data buffer.
- pub fn num_rows(&self) -> usize {
- self.ts_builder.len()
- }
-
- /// Returns whether the buffer is empty.
- pub fn is_empty(&self) -> bool {
- self.num_rows() == 0
- }
-}
-
-enum LazyMutableVectorBuilder {
- Type(ConcreteDataType),
- Builder(Box),
-}
-
-impl LazyMutableVectorBuilder {
- fn new(ty: ConcreteDataType) -> Self {
- Self::Type(ty)
- }
-
- fn get_or_create_builder(&mut self, init_capacity: usize) -> &mut Box {
- match self {
- LazyMutableVectorBuilder::Type(ty) => {
- let builder = ty.create_mutable_vector(init_capacity);
- *self = LazyMutableVectorBuilder::Builder(builder);
- self.get_or_create_builder(init_capacity)
- }
- LazyMutableVectorBuilder::Builder(builder) => builder,
- }
- }
-}
-
-/// Converts `DataBuffer` to record batches, with rows sorted according to pk_weights.
-/// `dedup`: whether to true to remove the duplicated rows inside `DataBuffer`.
-/// `replace_pk_index`: whether to replace the pk_index values with corresponding pk weight.
-fn drain_data_buffer_to_record_batches(
- schema: SchemaRef,
- buffer: &mut DataBuffer,
- pk_weights: Option<&[u16]>,
- dedup: bool,
- replace_pk_index: bool,
-) -> Result {
- let num_rows = buffer.ts_builder.len();
-
- let (pk_index_v, ts_v, sequence_v, op_type_v) = (
- buffer.pk_index_builder.finish(),
- buffer.ts_builder.to_vector(),
- buffer.sequence_builder.finish(),
- buffer.op_type_builder.finish(),
- );
-
- let (indices_to_take, mut columns) = build_row_sort_indices_and_columns(
- pk_weights,
- pk_index_v,
- ts_v,
- sequence_v,
- op_type_v,
- replace_pk_index,
- dedup,
- buffer.field_builders.len() + 4,
- )?;
-
- for b in buffer.field_builders.iter_mut() {
- let array = match b {
- LazyMutableVectorBuilder::Type(ty) => {
- let mut single_null = ty.create_mutable_vector(num_rows);
- single_null.push_nulls(num_rows);
- single_null.to_vector().to_arrow_array()
- }
- LazyMutableVectorBuilder::Builder(builder) => builder.to_vector().to_arrow_array(),
- };
- columns.push(
- arrow::compute::take(&array, &indices_to_take, None)
- .context(error::ComputeArrowSnafu)?,
- );
- }
-
- RecordBatch::try_new(schema, columns).context(error::NewRecordBatchSnafu)
-}
-
-#[allow(clippy::too_many_arguments)]
-fn build_row_sort_indices_and_columns(
- pk_weights: Option<&[u16]>,
- pk_index: UInt16Vector,
- ts: VectorRef,
- sequence: UInt64Vector,
- op_type: UInt8Vector,
- replace_pk_index: bool,
- dedup: bool,
- column_num: usize,
-) -> Result<(UInt32Array, Vec)> {
- let mut rows = build_rows_to_sort(pk_weights, &pk_index, &ts, &sequence);
-
- let pk_array = if replace_pk_index {
- // replace pk index values with pk weights.
- Arc::new(UInt16Array::from_iter_values(
- rows.iter().map(|(_, key)| key.pk_weight),
- )) as Arc<_>
- } else {
- pk_index.to_arrow_array()
- };
-
- // sort and dedup
- rows.sort_unstable_by(|l, r| l.1.cmp(&r.1));
- if dedup {
- rows.dedup_by(|l, r| l.1.pk_weight == r.1.pk_weight && l.1.timestamp == r.1.timestamp);
- }
-
- let indices_to_take = UInt32Array::from_iter_values(rows.iter().map(|(idx, _)| *idx as u32));
-
- let mut columns = Vec::with_capacity(column_num);
-
- columns.push(
- arrow::compute::take(&pk_array, &indices_to_take, None)
- .context(error::ComputeArrowSnafu)?,
- );
-
- columns.push(
- arrow::compute::take(&ts.to_arrow_array(), &indices_to_take, None)
- .context(error::ComputeArrowSnafu)?,
- );
-
- columns.push(
- arrow::compute::take(&sequence.as_arrow(), &indices_to_take, None)
- .context(error::ComputeArrowSnafu)?,
- );
-
- columns.push(
- arrow::compute::take(&op_type.as_arrow(), &indices_to_take, None)
- .context(error::ComputeArrowSnafu)?,
- );
-
- Ok((indices_to_take, columns))
-}
-
-pub(crate) fn timestamp_array_to_i64_slice(arr: &ArrayRef) -> &[i64] {
- use datatypes::arrow::array::{
- TimestampMicrosecondArray, TimestampMillisecondArray, TimestampNanosecondArray,
- TimestampSecondArray,
- };
- use datatypes::arrow::datatypes::{DataType, TimeUnit};
-
- match arr.data_type() {
- DataType::Timestamp(t, _) => match t {
- TimeUnit::Second => arr
- .as_any()
- .downcast_ref::()
- .unwrap()
- .values(),
- TimeUnit::Millisecond => arr
- .as_any()
- .downcast_ref::()
- .unwrap()
- .values(),
- TimeUnit::Microsecond => arr
- .as_any()
- .downcast_ref::()
- .unwrap()
- .values(),
- TimeUnit::Nanosecond => arr
- .as_any()
- .downcast_ref::()
- .unwrap()
- .values(),
- },
- _ => unreachable!(),
- }
-}
-
-enum LazyFieldVector {
- Type(ConcreteDataType),
- Vector(VectorRef),
-}
-
-pub(crate) struct DataBufferReaderBuilder {
- schema: SchemaRef,
- pk_index: UInt16Vector,
- timestamp: VectorRef,
- sequence: UInt64Vector,
- op_type: UInt8Vector,
- fields: Vec,
- dedup: bool,
-}
-
-impl DataBufferReaderBuilder {
- fn build_record_batch(self, pk_weights: Option<&[u16]>) -> Result {
- let num_rows = self.timestamp.len();
- let (indices_to_take, mut columns) = build_row_sort_indices_and_columns(
- pk_weights,
- self.pk_index,
- self.timestamp,
- self.sequence,
- self.op_type,
- // replace_pk_index is always set to false since:
- // - for DataBuffer in ShardBuilder, pk dict is not frozen
- // - for DataBuffer in Shard, values in pk_index column has already been replaced during `freeze`.
- false,
- self.dedup,
- self.fields.len() + 4,
- )?;
-
- for b in self.fields.iter() {
- let array = match b {
- LazyFieldVector::Type(ty) => {
- let mut single_null = ty.create_mutable_vector(num_rows);
- single_null.push_nulls(num_rows);
- single_null.to_vector().to_arrow_array()
- }
- LazyFieldVector::Vector(vector) => vector.to_arrow_array(),
- };
- columns.push(
- arrow::compute::take(&array, &indices_to_take, None)
- .context(error::ComputeArrowSnafu)?,
- );
- }
- RecordBatch::try_new(self.schema, columns).context(error::NewRecordBatchSnafu)
- }
-
- pub fn build(self, pk_weights: Option<&[u16]>) -> Result {
- self.build_record_batch(pk_weights)
- .and_then(DataBufferReader::new)
- }
-}
-
-#[derive(Debug)]
-pub(crate) struct DataBufferReader {
- batch: RecordBatch,
- offset: usize,
- current_range: Option,
- elapsed_time: Duration,
-}
-
-impl Drop for DataBufferReader {
- fn drop(&mut self) {
- PARTITION_TREE_READ_STAGE_ELAPSED
- .with_label_values(&["read_data_buffer"])
- .observe(self.elapsed_time.as_secs_f64())
- }
-}
-
-impl DataBufferReader {
- pub(crate) fn new(batch: RecordBatch) -> Result {
- let mut reader = Self {
- batch,
- offset: 0,
- current_range: None,
- elapsed_time: Duration::default(),
- };
- reader.next()?; // fill data batch for comparison and merge.
- Ok(reader)
- }
-
- pub(crate) fn is_valid(&self) -> bool {
- self.current_range.is_some()
- }
-
- /// Returns current data batch.
- /// # Panics
- /// If Current reader is exhausted.
- pub(crate) fn current_data_batch(&self) -> DataBatch<'_> {
- let range = self.current_range.unwrap();
- DataBatch {
- rb: &self.batch,
- range,
- }
- }
-
- /// Advances reader to next data batch.
- pub(crate) fn next(&mut self) -> Result<()> {
- if self.offset >= self.batch.num_rows() {
- self.current_range = None;
- return Ok(());
- }
- let start = Instant::now();
- let pk_index_array = pk_index_array(&self.batch);
- if let Some((next_pk, range)) = search_next_pk_range(pk_index_array, self.offset) {
- self.offset = range.end;
- self.current_range = Some(DataBatchRange {
- pk_index: next_pk,
- start: range.start,
- end: range.end,
- });
- } else {
- self.current_range = None;
- }
- self.elapsed_time += start.elapsed();
- Ok(())
- }
-}
-
-/// Gets `pk_index` array from record batch.
-/// # Panics
-/// If pk index column is not the first column or the type is not `UInt16Array`.
-fn pk_index_array(batch: &RecordBatch) -> &UInt16Array {
- batch
- .column(0)
- .as_any()
- .downcast_ref::()
- .unwrap()
-}
-
-/// Searches for next pk index, and it's offset range in a sorted `UInt16Array`.
-fn search_next_pk_range(array: &UInt16Array, start: usize) -> Option<(PkIndex, Range)> {
- let num_rows = array.len();
- if start >= num_rows {
- return None;
- }
-
- let values = array.values();
- let next_pk = values[start];
-
- for idx in start..num_rows {
- if values[idx] != next_pk {
- return Some((next_pk, start..idx));
- }
- }
- Some((next_pk, start..num_rows))
-}
-
-#[derive(Eq, PartialEq)]
-struct InnerKey {
- pk_weight: u16,
- timestamp: i64,
- sequence: u64,
-}
-
-impl PartialOrd for InnerKey {
- fn partial_cmp(&self, other: &Self) -> Option {
- Some(self.cmp(other))
- }
-}
-
-impl Ord for InnerKey {
- fn cmp(&self, other: &Self) -> Ordering {
- (self.pk_weight, self.timestamp, Reverse(self.sequence)).cmp(&(
- other.pk_weight,
- other.timestamp,
- Reverse(other.sequence),
- ))
- }
-}
-
-fn build_rows_to_sort(
- pk_weights: Option<&[u16]>,
- pk_index: &UInt16Vector,
- ts: &VectorRef,
- sequence: &UInt64Vector,
-) -> Vec<(usize, InnerKey)> {
- let ts_values = match ts.data_type() {
- ConcreteDataType::Timestamp(t) => match t {
- TimestampType::Second(_) => ts
- .as_any()
- .downcast_ref::()
- .unwrap()
- .as_arrow()
- .values(),
- TimestampType::Millisecond(_) => ts
- .as_any()
- .downcast_ref::()
- .unwrap()
- .as_arrow()
- .values(),
- TimestampType::Microsecond(_) => ts
- .as_any()
- .downcast_ref::()
- .unwrap()
- .as_arrow()
- .values(),
- TimestampType::Nanosecond(_) => ts
- .as_any()
- .downcast_ref::()
- .unwrap()
- .as_arrow()
- .values(),
- },
- other => unreachable!("Unexpected type {:?}", other),
- };
- let pk_index_values = pk_index.as_arrow().values();
- let sequence_values = sequence.as_arrow().values();
- debug_assert_eq!(ts_values.len(), pk_index_values.len());
- debug_assert_eq!(ts_values.len(), sequence_values.len());
-
- ts_values
- .iter()
- .zip(pk_index_values.iter())
- .zip(sequence_values.iter())
- .enumerate()
- .map(|(idx, ((timestamp, pk_index), sequence))| {
- let pk_weight = if let Some(weights) = pk_weights {
- weights[*pk_index as usize] // if pk_weights is present, sort according to weight.
- } else {
- *pk_index // otherwise pk_index has already been replaced by weights.
- };
- (
- idx,
- InnerKey {
- timestamp: *timestamp,
- pk_weight,
- sequence: *sequence,
- },
- )
- })
- .collect()
-}
-
-fn memtable_schema_to_encoded_schema(schema: &RegionMetadataRef) -> SchemaRef {
- use datatypes::arrow::datatypes::DataType;
- let ColumnSchema {
- name: ts_name,
- data_type: ts_type,
- ..
- } = &schema.time_index_column().column_schema;
-
- let mut fields = vec![
- Field::new(PK_INDEX_COLUMN_NAME, DataType::UInt16, false),
- Field::new(ts_name, ts_type.as_arrow_type(), false),
- Field::new(SEQUENCE_COLUMN_NAME, DataType::UInt64, false),
- Field::new(OP_TYPE_COLUMN_NAME, DataType::UInt8, false),
- ];
-
- fields.extend(schema.field_columns().map(|c| {
- Field::new(
- &c.column_schema.name,
- c.column_schema.data_type.as_arrow_type(),
- c.column_schema.is_nullable(),
- )
- }));
-
- Arc::new(Schema::new(fields))
-}
-
-struct DataPartEncoder<'a> {
- schema: SchemaRef,
- pk_weights: Option<&'a [u16]>,
- row_group_size: Option,
- timestamp_column_name: String,
- replace_pk_index: bool,
- dedup: bool,
-}
-
-impl<'a> DataPartEncoder<'a> {
- pub fn new(
- metadata: &RegionMetadataRef,
- pk_weights: Option<&'a [u16]>,
- row_group_size: Option,
- timestamp_column_name: String,
- replace_pk_index: bool,
- dedup: bool,
- ) -> DataPartEncoder<'a> {
- let schema = memtable_schema_to_encoded_schema(metadata);
- Self {
- schema,
- pk_weights,
- row_group_size,
- timestamp_column_name,
- replace_pk_index,
- dedup,
- }
- }
-
- // todo(hl): more customized config according to region options.
- fn writer_props(self) -> WriterProperties {
- let mut builder = WriterProperties::builder();
- if let Some(row_group_size) = self.row_group_size {
- builder = builder.set_max_row_group_size(row_group_size)
- }
-
- let ts_col = ColumnPath::new(vec![self.timestamp_column_name]);
- let pk_index_col = ColumnPath::new(vec![PK_INDEX_COLUMN_NAME.to_string()]);
- let sequence_col = ColumnPath::new(vec![SEQUENCE_COLUMN_NAME.to_string()]);
- let op_type_col = ColumnPath::new(vec![OP_TYPE_COLUMN_NAME.to_string()]);
-
- builder = builder
- .set_compression(Compression::ZSTD(ZstdLevel::default()))
- .set_statistics_enabled(EnabledStatistics::None);
- builder = builder
- .set_column_encoding(ts_col.clone(), Encoding::DELTA_BINARY_PACKED)
- .set_column_dictionary_enabled(ts_col, false)
- .set_column_encoding(pk_index_col.clone(), Encoding::DELTA_BINARY_PACKED)
- .set_column_dictionary_enabled(pk_index_col, true)
- .set_column_encoding(sequence_col.clone(), Encoding::DELTA_BINARY_PACKED)
- .set_column_dictionary_enabled(sequence_col, false)
- .set_column_encoding(op_type_col.clone(), Encoding::DELTA_BINARY_PACKED)
- .set_column_dictionary_enabled(op_type_col, true)
- .set_column_index_truncate_length(None)
- .set_statistics_truncate_length(None);
- builder.build()
- }
-
- pub fn write(self, source: &mut DataBuffer) -> Result {
- let mut bytes = Vec::with_capacity(1024);
-
- let rb = {
- let _timer = PARTITION_TREE_DATA_BUFFER_FREEZE_STAGE_ELAPSED
- .with_label_values(&["drain_data_buffer_to_batch"])
- .start_timer();
- drain_data_buffer_to_record_batches(
- self.schema.clone(),
- source,
- self.pk_weights,
- self.dedup,
- self.replace_pk_index,
- )?
- };
-
- {
- let _timer = PARTITION_TREE_DATA_BUFFER_FREEZE_STAGE_ELAPSED
- .with_label_values(&["encode"])
- .start_timer();
- let mut writer =
- ArrowWriter::try_new(&mut bytes, self.schema.clone(), Some(self.writer_props()))
- .context(error::EncodeMemtableSnafu)?;
- writer.write(&rb).context(error::EncodeMemtableSnafu)?;
- let _metadata = writer.close().context(error::EncodeMemtableSnafu)?;
- }
- Ok(DataPart::Parquet(ParquetPart {
- data: Bytes::from(bytes),
- }))
- }
-}
-
-/// Format of immutable data part.
-pub enum DataPart {
- Parquet(ParquetPart),
-}
-
-impl DataPart {
- /// Reads frozen data part and yields [DataBatch]es.
- pub fn read(&self) -> Result {
- match self {
- // Keep encoded memtable scans aligned with mito/DataFusion batch sizing instead of
- // parquet-rs's implicit 1024-row default.
- DataPart::Parquet(data_bytes) => {
- DataPartReader::new(data_bytes.data.clone(), Some(DEFAULT_READ_BATCH_SIZE))
- }
- }
- }
-
- fn is_empty(&self) -> bool {
- match self {
- DataPart::Parquet(p) => p.data.is_empty(),
- }
- }
-}
-
-pub struct DataPartReader {
- inner: ParquetRecordBatchReader,
- current_batch: Option,
- current_range: Option,
- elapsed: Duration,
-}
-
-impl Drop for DataPartReader {
- fn drop(&mut self) {
- PARTITION_TREE_READ_STAGE_ELAPSED
- .with_label_values(&["read_data_part"])
- .observe(self.elapsed.as_secs_f64());
- }
-}
-
-impl Debug for DataPartReader {
- fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
- f.debug_struct("DataPartReader")
- .field("current_range", &self.current_range)
- .finish()
- }
-}
-
-impl DataPartReader {
- pub fn new(data: Bytes, batch_size: Option) -> Result {
- let mut builder =
- ParquetRecordBatchReaderBuilder::try_new(data).context(error::ReadDataPartSnafu)?;
- if let Some(batch_size) = batch_size {
- builder = builder.with_batch_size(batch_size);
- }
- let parquet_reader = builder.build().context(error::ReadDataPartSnafu)?;
- let mut reader = Self {
- inner: parquet_reader,
- current_batch: None,
- current_range: None,
- elapsed: Default::default(),
- };
- reader.next()?;
- Ok(reader)
- }
-
- /// Returns false if current reader is exhausted.
- pub(crate) fn is_valid(&self) -> bool {
- self.current_range.is_some()
- }
-
- /// Returns current data batch of reader.
- /// # Panics
- /// If reader is exhausted.
- pub(crate) fn current_data_batch(&self) -> DataBatch<'_> {
- let range = self.current_range.unwrap();
- DataBatch {
- rb: self.current_batch.as_ref().unwrap(),
- range,
- }
- }
-
- pub(crate) fn next(&mut self) -> Result<()> {
- let start = Instant::now();
- if let Some((next_pk, range)) = self.search_next_pk_range() {
- // first try to search next pk in current record batch.
- self.current_range = Some(DataBatchRange {
- pk_index: next_pk,
- start: range.start,
- end: range.end,
- });
- } else {
- // current record batch reaches eof, fetch next record batch from parquet reader.
- if let Some(rb) = self.inner.next() {
- let rb = rb.context(error::ComputeArrowSnafu)?;
- self.current_batch = Some(rb);
- self.current_range = None;
- return self.next();
- } else {
- // parquet is also exhausted
- self.current_batch = None;
- self.current_range = None;
- }
- }
- self.elapsed += start.elapsed();
- Ok(())
- }
-
- /// Searches next primary key along with it's offset range inside record batch.
- fn search_next_pk_range(&self) -> Option<(PkIndex, Range)> {
- self.current_batch.as_ref().and_then(|b| {
- // safety: PK_INDEX_COLUMN_NAME must present in record batch yielded by data part.
- let pk_array = pk_index_array(b);
- let start = self
- .current_range
- .as_ref()
- .map(|range| range.end)
- .unwrap_or(0);
- search_next_pk_range(pk_array, start)
- })
- }
-}
-
-/// Parquet-encoded `DataPart`.
-pub struct ParquetPart {
- data: Bytes,
-}
-
-/// Data parts under a shard.
-pub struct DataParts {
- /// The active writing buffer.
- active: DataBuffer,
- /// immutable (encoded) parts.
- frozen: Vec,
-}
-
-impl DataParts {
- pub(crate) fn new(metadata: RegionMetadataRef, capacity: usize, dedup: bool) -> Self {
- Self {
- active: DataBuffer::with_capacity(metadata, capacity, dedup),
- frozen: Vec::new(),
- }
- }
-
- pub(crate) fn with_frozen(mut self, frozen: Vec) -> Self {
- self.frozen = frozen;
- self
- }
-
- /// Writes a row into parts.
- pub fn write_row(&mut self, pk_index: PkIndex, kv: &KeyValue) {
- self.active.write_row(pk_index, kv)
- }
-
- /// Returns the number of rows in the active buffer.
- pub fn num_active_rows(&self) -> usize {
- self.active.num_rows()
- }
-
- /// Freezes active buffer and creates a new active buffer.
- pub fn freeze(&mut self) -> Result<()> {
- let part = self.active.freeze(None, false)?;
- self.frozen.push(part);
- Ok(())
- }
-
- /// Reads data from all parts including active and frozen parts.
- /// The returned iterator yields a record batch of one primary key at a time.
- /// The order of yielding primary keys is determined by provided weights.
- pub fn read(&self) -> Result {
- let _timer = PARTITION_TREE_READ_STAGE_ELAPSED
- .with_label_values(&["build_data_parts_reader"])
- .start_timer();
-
- let buffer = self.active.read()?;
- let mut parts = Vec::with_capacity(self.frozen.len());
- for p in &self.frozen {
- parts.push(p.read()?);
- }
- Ok(DataPartsReaderBuilder { buffer, parts })
- }
-
- pub(crate) fn is_empty(&self) -> bool {
- self.active.is_empty() && self.frozen.iter().all(|part| part.is_empty())
- }
-
- #[cfg(test)]
- pub(crate) fn frozen_len(&self) -> usize {
- self.frozen.len()
- }
-}
-
-pub struct DataPartsReaderBuilder {
- buffer: DataBufferReaderBuilder,
- parts: Vec,
-}
-
-impl DataPartsReaderBuilder {
- pub(crate) fn build(self) -> Result {
- let mut nodes = Vec::with_capacity(self.parts.len() + 1);
- nodes.push(DataNode::new(DataSource::Buffer(
- // `DataPars::read` ensures that all pk_index inside `DataBuffer` are replaced by weights.
- // then we pass None to sort rows directly according to pk_index.
- self.buffer.build(None)?,
- )));
- for p in self.parts {
- nodes.push(DataNode::new(DataSource::Part(p)));
- }
- let num_parts = nodes.len();
- let merger = Merger::try_new(nodes)?;
- Ok(DataPartsReader {
- merger,
- num_parts,
- elapsed: Default::default(),
- })
- }
-}
-
-/// Reader for all parts inside a `DataParts`.
-pub struct DataPartsReader {
- merger: Merger,
- num_parts: usize,
- elapsed: Duration,
-}
-
-impl Drop for DataPartsReader {
- fn drop(&mut self) {
- PARTITION_TREE_READ_STAGE_ELAPSED
- .with_label_values(&["read_data_parts"])
- .observe(self.elapsed.as_secs_f64())
- }
-}
-
-impl DataPartsReader {
- pub(crate) fn current_data_batch(&self) -> DataBatch<'_> {
- let batch = self.merger.current_node().current_data_batch();
- batch.slice(0, self.merger.current_rows())
- }
-
- pub(crate) fn next(&mut self) -> Result<()> {
- let start = Instant::now();
- let result = self.merger.next();
- self.elapsed += start.elapsed();
- result
- }
-
- pub(crate) fn is_valid(&self) -> bool {
- self.merger.is_valid()
- }
-
- pub(crate) fn num_parts(&self) -> usize {
- self.num_parts
- }
-}
-
-#[cfg(test)]
-mod tests {
- use datafusion::arrow::array::Float64Array;
- use datatypes::arrow::array::UInt16Array;
- use parquet::arrow::arrow_reader::ParquetRecordBatchReaderBuilder;
- use parquet::data_type::AsBytes;
-
- use super::*;
- use crate::test_util::memtable_util::{
- extract_data_batch, metadata_for_test, write_rows_to_buffer,
- };
-
- #[test]
- fn test_lazy_mutable_vector_builder() {
- let mut builder = LazyMutableVectorBuilder::new(ConcreteDataType::boolean_datatype());
- match builder {
- LazyMutableVectorBuilder::Type(ref t) => {
- assert_eq!(&ConcreteDataType::boolean_datatype(), t);
- }
- LazyMutableVectorBuilder::Builder(_) => {
- unreachable!()
- }
- }
- builder.get_or_create_builder(1);
- match builder {
- LazyMutableVectorBuilder::Type(_) => {
- unreachable!()
- }
- LazyMutableVectorBuilder::Builder(_) => {}
- }
- }
-
- fn check_data_buffer_dedup(dedup: bool) {
- let metadata = metadata_for_test();
- let mut buffer = DataBuffer::with_capacity(metadata.clone(), 10, dedup);
- write_rows_to_buffer(
- &mut buffer,
- &metadata,
- 0,
- vec![2, 3],
- vec![Some(1.0), Some(2.0)],
- 0,
- );
- write_rows_to_buffer(
- &mut buffer,
- &metadata,
- 0,
- vec![1, 2],
- vec![Some(1.1), Some(2.1)],
- 2,
- );
-
- let mut reader = buffer.read().unwrap().build(Some(&[0])).unwrap();
- let mut res = vec![];
- while reader.is_valid() {
- let batch = reader.current_data_batch();
- res.push(extract_data_batch(&batch));
- reader.next().unwrap();
- }
- if dedup {
- assert_eq!(vec![(0, vec![(1, 2), (2, 3), (3, 1)])], res);
- } else {
- assert_eq!(vec![(0, vec![(1, 2), (2, 3), (2, 0), (3, 1)])], res);
- }
- }
-
- #[test]
- fn test_data_buffer_dedup() {
- check_data_buffer_dedup(true);
- check_data_buffer_dedup(false);
- }
-
- fn check_data_buffer_freeze(
- pk_weights: Option<&[u16]>,
- replace_pk_weights: bool,
- expected: &[(u16, Vec<(i64, u64)>)],
- ) {
- let meta = metadata_for_test();
- let mut buffer = DataBuffer::with_capacity(meta.clone(), 10, true);
-
- // write rows with null values.
- write_rows_to_buffer(
- &mut buffer,
- &meta,
- 0,
- vec![0, 1, 2],
- vec![Some(1.0), None, Some(3.0)],
- 0,
- );
- write_rows_to_buffer(&mut buffer, &meta, 1, vec![1], vec![Some(2.0)], 3);
-
- let mut res = Vec::with_capacity(3);
- let mut reader = buffer
- .freeze(pk_weights, replace_pk_weights)
- .unwrap()
- .read()
- .unwrap();
- while reader.is_valid() {
- let batch = reader.current_data_batch();
- res.push(extract_data_batch(&batch));
- reader.next().unwrap();
- }
- assert_eq!(expected, res);
- }
-
- #[test]
- fn test_data_buffer_freeze() {
- check_data_buffer_freeze(
- None,
- false,
- &[(0, vec![(0, 0), (1, 1), (2, 2)]), (1, vec![(1, 3)])],
- );
-
- check_data_buffer_freeze(
- Some(&[1, 2]),
- false,
- &[(0, vec![(0, 0), (1, 1), (2, 2)]), (1, vec![(1, 3)])],
- );
-
- check_data_buffer_freeze(
- Some(&[3, 2]),
- true,
- &[(2, vec![(1, 3)]), (3, vec![(0, 0), (1, 1), (2, 2)])],
- );
-
- check_data_buffer_freeze(
- Some(&[3, 2]),
- false,
- &[(1, vec![(1, 3)]), (0, vec![(0, 0), (1, 1), (2, 2)])],
- );
- }
-
- #[test]
- fn test_encode_data_buffer() {
- let meta = metadata_for_test();
- let mut buffer = DataBuffer::with_capacity(meta.clone(), 10, true);
-
- // write rows with null values.
- write_rows_to_buffer(
- &mut buffer,
- &meta,
- 2,
- vec![0, 1, 2],
- vec![Some(1.0), None, Some(3.0)],
- 2,
- );
-
- assert_eq!(3, buffer.num_rows());
-
- write_rows_to_buffer(&mut buffer, &meta, 2, vec![1], vec![Some(2.0)], 3);
-
- assert_eq!(4, buffer.num_rows());
-
- let encoder = DataPartEncoder::new(
- &meta,
- Some(&[0, 1, 2]),
- None,
- meta.time_index_column().column_schema.name.clone(),
- true,
- true,
- );
- let encoded = match encoder.write(&mut buffer).unwrap() {
- DataPart::Parquet(data) => data.data,
- };
-
- let s = String::from_utf8_lossy(encoded.as_bytes());
- assert!(s.starts_with("PAR1"));
- assert!(s.ends_with("PAR1"));
-
- let builder = ParquetRecordBatchReaderBuilder::try_new(encoded).unwrap();
- let mut reader = builder.build().unwrap();
- let batch = reader.next().unwrap().unwrap();
- assert_eq!(3, batch.num_rows());
- }
-
- fn check_buffer_values_equal(reader: &mut DataBufferReader, expected_values: &[Vec]) {
- let mut output = Vec::with_capacity(expected_values.len());
- while reader.is_valid() {
- let batch = reader.current_data_batch().slice_record_batch();
- let values = batch
- .column_by_name("v1")
- .unwrap()
- .as_any()
- .downcast_ref::()
- .unwrap()
- .iter()
- .map(|v| v.unwrap())
- .collect::>();
- output.push(values);
- reader.next().unwrap();
- }
- assert_eq!(expected_values, output);
- }
-
- #[test]
- fn test_search_next_pk_range() {
- let a = UInt16Array::from_iter_values([1, 1, 3, 3, 4, 6]);
- assert_eq!((1, 0..2), search_next_pk_range(&a, 0).unwrap());
- assert_eq!((3, 2..4), search_next_pk_range(&a, 2).unwrap());
- assert_eq!((4, 4..5), search_next_pk_range(&a, 4).unwrap());
- assert_eq!((6, 5..6), search_next_pk_range(&a, 5).unwrap());
-
- assert_eq!(None, search_next_pk_range(&a, 6));
- }
-
- fn check_iter_data_buffer(pk_weights: Option<&[u16]>, expected: &[Vec]) {
- let meta = metadata_for_test();
- let mut buffer = DataBuffer::with_capacity(meta.clone(), 10, true);
-
- write_rows_to_buffer(
- &mut buffer,
- &meta,
- 3,
- vec![1, 2, 3],
- vec![Some(1.1), Some(2.1), Some(3.1)],
- 3,
- );
-
- write_rows_to_buffer(
- &mut buffer,
- &meta,
- 2,
- vec![0, 1, 2],
- vec![Some(1.0), Some(2.0), Some(3.0)],
- 2,
- );
-
- let mut iter = buffer.read().unwrap().build(pk_weights).unwrap();
- check_buffer_values_equal(&mut iter, expected);
- }
-
- #[test]
- fn test_iter_data_buffer() {
- check_iter_data_buffer(None, &[vec![1.0, 2.0, 3.0], vec![1.1, 2.1, 3.1]]);
- check_iter_data_buffer(
- Some(&[0, 1, 2, 3]),
- &[vec![1.0, 2.0, 3.0], vec![1.1, 2.1, 3.1]],
- );
- check_iter_data_buffer(
- Some(&[3, 2, 1, 0]),
- &[vec![1.1, 2.1, 3.1], vec![1.0, 2.0, 3.0]],
- );
- }
-
- #[test]
- fn test_iter_empty_data_buffer() {
- let meta = metadata_for_test();
- let buffer = DataBuffer::with_capacity(meta.clone(), 10, true);
- let mut iter = buffer.read().unwrap().build(Some(&[0, 1, 3, 2])).unwrap();
- check_buffer_values_equal(&mut iter, &[]);
- }
-
- fn check_part_values_equal(iter: &mut DataPartReader, expected_values: &[Vec]) {
- let mut output = Vec::with_capacity(expected_values.len());
- while iter.is_valid() {
- let batch = iter.current_data_batch().slice_record_batch();
- let values = batch
- .column_by_name("v1")
- .unwrap()
- .as_any()
- .downcast_ref::()
- .unwrap()
- .iter()
- .map(|v| v.unwrap())
- .collect::>();
- output.push(values);
- iter.next().unwrap();
- }
- assert_eq!(expected_values, output);
- }
-
- fn check_iter_data_part(weights: &[u16], expected_values: &[Vec]) {
- let meta = metadata_for_test();
- let mut buffer = DataBuffer::with_capacity(meta.clone(), 10, true);
-
- write_rows_to_buffer(
- &mut buffer,
- &meta,
- 2,
- vec![0, 1, 2],
- vec![Some(1.0), Some(2.0), Some(3.0)],
- 2,
- );
-
- write_rows_to_buffer(
- &mut buffer,
- &meta,
- 3,
- vec![1, 2, 3],
- vec![Some(1.1), Some(2.1), Some(3.1)],
- 3,
- );
-
- write_rows_to_buffer(
- &mut buffer,
- &meta,
- 2,
- vec![2, 3],
- vec![Some(2.2), Some(2.3)],
- 4,
- );
-
- let encoder = DataPartEncoder::new(
- &meta,
- Some(weights),
- Some(4),
- meta.time_index_column().column_schema.name.clone(),
- true,
- true,
- );
- let encoded = encoder.write(&mut buffer).unwrap();
-
- let mut iter = encoded.read().unwrap();
- check_part_values_equal(&mut iter, expected_values);
- }
-
- #[test]
- fn test_iter_data_part() {
- check_iter_data_part(
- &[0, 1, 2, 3],
- &[vec![1.0, 2.0, 3.0, 2.3], vec![1.1, 2.1, 3.1]],
- );
-
- check_iter_data_part(
- &[3, 2, 1, 0],
- &[vec![1.1, 2.1, 3.1], vec![1.0, 2.0, 3.0, 2.3]],
- );
- }
-}
diff --git a/src/mito2/src/memtable/partition_tree/dedup.rs b/src/mito2/src/memtable/partition_tree/dedup.rs
deleted file mode 100644
index a010a1eaf5b6..000000000000
--- a/src/mito2/src/memtable/partition_tree/dedup.rs
+++ /dev/null
@@ -1,221 +0,0 @@
-// Copyright 2023 Greptime Team
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-use std::ops::Range;
-
-use crate::error::Result;
-use crate::memtable::partition_tree::PkId;
-use crate::memtable::partition_tree::data::DataBatch;
-use crate::memtable::partition_tree::shard::DataBatchSource;
-
-/// A reader that dedup sorted batches from a merger.
-pub struct DedupReader {
- prev_batch_last_row: Option<(PkId, i64)>,
- current_batch_range: Option>,
- inner: T,
-}
-
-impl DedupReader {
- /// Creates a new dedup reader.
- pub fn try_new(inner: T) -> Result {
- let mut res = Self {
- prev_batch_last_row: None,
- current_batch_range: None,
- inner,
- };
- res.next()?;
- Ok(res)
- }
-}
-
-impl DataBatchSource for DedupReader {
- fn is_valid(&self) -> bool {
- self.current_batch_range.is_some()
- }
-
- fn next(&mut self) -> Result<()> {
- while self.inner.is_valid() {
- match &mut self.prev_batch_last_row {
- None => {
- // First shot, fill prev_batch_last_row and current_batch_range with first batch.
- let current_batch = self.inner.current_data_batch();
- let pk_id = self.inner.current_pk_id();
- let (last_ts, _) = current_batch.last_row();
- self.prev_batch_last_row = Some((pk_id, last_ts));
- self.current_batch_range = Some(0..current_batch.num_rows());
- break;
- }
- Some(prev_last_row) => {
- self.inner.next()?;
- if !self.inner.is_valid() {
- // Resets current_batch_range if inner reader is exhausted.
- self.current_batch_range = None;
- break;
- }
- let current_batch = self.inner.current_data_batch();
- let current_pk_id = self.inner.current_pk_id();
- let (first_ts, _) = current_batch.first_row();
- let rows_in_batch = current_batch.num_rows();
-
- let (start, end) = if &(current_pk_id, first_ts) == prev_last_row {
- // First row in this batch duplicated with the last row in previous batch
- if rows_in_batch == 1 {
- // If batch is exhausted, move to next batch.
- continue;
- } else {
- // Skip the first row, start from offset 1.
- (1, rows_in_batch)
- }
- } else {
- // No duplicates found, yield whole batch.
- (0, rows_in_batch)
- };
-
- let (last_ts, _) = current_batch.last_row();
- *prev_last_row = (current_pk_id, last_ts);
- self.current_batch_range = Some(start..end);
- break;
- }
- }
- }
- Ok(())
- }
-
- fn current_pk_id(&self) -> PkId {
- self.inner.current_pk_id()
- }
-
- fn current_key(&self) -> Option<&[u8]> {
- self.inner.current_key()
- }
-
- fn current_data_batch(&self) -> DataBatch<'_> {
- let range = self.current_batch_range.as_ref().unwrap();
- let data_batch = self.inner.current_data_batch();
- data_batch.slice(range.start, range.len())
- }
-}
-
-#[cfg(test)]
-mod tests {
- use store_api::metadata::RegionMetadataRef;
-
- use super::*;
- use crate::memtable::partition_tree::data::{DataBuffer, DataParts, DataPartsReader};
- use crate::test_util::memtable_util::{
- extract_data_batch, metadata_for_test, write_rows_to_buffer,
- };
-
- struct MockSource(DataPartsReader);
-
- impl DataBatchSource for MockSource {
- fn is_valid(&self) -> bool {
- self.0.is_valid()
- }
-
- fn next(&mut self) -> Result<()> {
- self.0.next()
- }
-
- fn current_pk_id(&self) -> PkId {
- PkId {
- shard_id: 0,
- pk_index: self.0.current_data_batch().pk_index(),
- }
- }
-
- fn current_key(&self) -> Option<&[u8]> {
- None
- }
-
- fn current_data_batch(&self) -> DataBatch<'_> {
- self.0.current_data_batch()
- }
- }
-
- fn build_data_buffer(
- meta: RegionMetadataRef,
- rows: Vec<(u16, Vec)>,
- seq: &mut u64,
- ) -> DataBuffer {
- let mut buffer = DataBuffer::with_capacity(meta.clone(), 10, true);
-
- for row in rows {
- let (pk_index, timestamps) = row;
- let num_rows = timestamps.len() as u64;
- let v = timestamps.iter().map(|v| Some(*v as f64)).collect();
-
- write_rows_to_buffer(&mut buffer, &meta, pk_index, timestamps, v, *seq);
- *seq += num_rows;
- }
- buffer
- }
-
- fn check_data_parts_reader_dedup(
- parts: Vec)>>,
- expected: Vec<(u16, Vec<(i64, u64)>)>,
- ) {
- let meta = metadata_for_test();
- let mut seq = 0;
-
- let mut frozens = Vec::with_capacity(parts.len());
- for part in parts {
- let mut buffer1 = build_data_buffer(meta.clone(), part, &mut seq);
- let part1 = buffer1.freeze(None, false).unwrap();
- frozens.push(part1);
- }
-
- let parts = DataParts::new(meta, 10, true).with_frozen(frozens);
-
- let mut res = Vec::with_capacity(expected.len());
- let mut reader =
- DedupReader::try_new(MockSource(parts.read().unwrap().build().unwrap())).unwrap();
- while reader.is_valid() {
- let batch = reader.current_data_batch();
- res.push(extract_data_batch(&batch));
- reader.next().unwrap();
- }
-
- assert_eq!(expected, res);
- }
-
- #[test]
- fn test_data_parts_reader_dedup() {
- check_data_parts_reader_dedup(vec![vec![(0, vec![1, 2])]], vec![(0, vec![(1, 0), (2, 1)])]);
-
- check_data_parts_reader_dedup(
- vec![
- vec![(0, vec![1, 2])],
- vec![(0, vec![1, 2])],
- vec![(0, vec![2, 3])],
- ],
- vec![(0, vec![(1, 2)]), (0, vec![(2, 4)]), (0, vec![(3, 5)])],
- );
-
- check_data_parts_reader_dedup(
- vec![vec![(0, vec![1])], vec![(0, vec![2])], vec![(0, vec![3])]],
- vec![(0, vec![(1, 0)]), (0, vec![(2, 1)]), (0, vec![(3, 2)])],
- );
-
- check_data_parts_reader_dedup(
- vec![vec![(0, vec![1])], vec![(0, vec![1])], vec![(0, vec![1])]],
- vec![(0, vec![(1, 2)])],
- );
-
- check_data_parts_reader_dedup(
- vec![vec![(0, vec![1])], vec![(1, vec![1])], vec![(2, vec![1])]],
- vec![(0, vec![(1, 0)]), (1, vec![(1, 1)]), (2, vec![(1, 2)])],
- );
- }
-}
diff --git a/src/mito2/src/memtable/partition_tree/dict.rs b/src/mito2/src/memtable/partition_tree/dict.rs
deleted file mode 100644
index 77cc835ea041..000000000000
--- a/src/mito2/src/memtable/partition_tree/dict.rs
+++ /dev/null
@@ -1,493 +0,0 @@
-// Copyright 2023 Greptime Team
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-//! Key dictionary of a shard.
-
-use std::collections::BTreeMap;
-use std::sync::Arc;
-
-use datatypes::arrow::array::{Array, ArrayBuilder, BinaryArray, BinaryBuilder};
-
-use crate::memtable::partition_tree::PkIndex;
-use crate::memtable::stats::WriteMetrics;
-use crate::metrics::MEMTABLE_DICT_BYTES;
-
-/// Maximum keys in a [DictBlock].
-const MAX_KEYS_PER_BLOCK: u16 = 256;
-
-/// The key is mcmp-encoded primary keys, while the values are the pk index and
-/// optionally sparsely encoded primary keys.
-type PkIndexMap = BTreeMap, (PkIndex, Option>)>;
-
-/// Builder to build a key dictionary.
-pub struct KeyDictBuilder {
- /// Max keys of the dictionary.
- capacity: usize,
- /// Number of keys in the builder.
- num_keys: usize,
- /// Maps primary key to pk index.
- pk_to_index: PkIndexMap,
- /// Buffer for active dict block.
- key_buffer: KeyBuffer,
- /// Dictionary blocks.
- dict_blocks: Vec,
- /// Bytes allocated by keys in the index.
- key_bytes_in_index: usize,
-}
-
-impl KeyDictBuilder {
- /// Creates a new builder that can hold up to `capacity` keys.
- pub fn new(capacity: usize) -> Self {
- Self {
- capacity,
- num_keys: 0,
- pk_to_index: BTreeMap::new(),
- key_buffer: KeyBuffer::new(MAX_KEYS_PER_BLOCK.into()),
- dict_blocks: Vec::with_capacity(capacity / MAX_KEYS_PER_BLOCK as usize + 1),
- key_bytes_in_index: 0,
- }
- }
-
- /// Returns true if the builder is full.
- pub fn is_full(&self) -> bool {
- self.num_keys >= self.capacity
- }
-
- /// Adds the key to the builder and returns its index if the builder is not full.
- ///
- /// # Panics
- /// Panics if the builder is full.
- pub fn insert_key(
- &mut self,
- full_primary_key: &[u8],
- sparse_key: Option<&[u8]>,
- metrics: &mut WriteMetrics,
- ) -> PkIndex {
- assert!(!self.is_full());
-
- if let Some(pk_index) = self.pk_to_index.get(full_primary_key).map(|v| v.0) {
- // Already in the builder.
- return pk_index;
- }
-
- if self.key_buffer.len() >= MAX_KEYS_PER_BLOCK.into() {
- // The write buffer is full. Freeze a dict block.
- let dict_block = self.key_buffer.finish(false);
- self.dict_blocks.push(dict_block);
- }
-
- // Safety: we have checked the buffer length.
- let pk_index = self.key_buffer.push_key(full_primary_key);
- let (sparse_key, sparse_key_len) = if let Some(sparse_key) = sparse_key {
- (Some(sparse_key.to_vec()), sparse_key.len())
- } else {
- (None, 0)
- };
- self.pk_to_index
- .insert(full_primary_key.to_vec(), (pk_index, sparse_key));
- self.num_keys += 1;
-
- // Since we store the key twice so the bytes usage doubled.
- metrics.key_bytes += full_primary_key.len() * 2 + sparse_key_len;
- self.key_bytes_in_index += full_primary_key.len() + sparse_key_len;
-
- // Adds key size of index to the metrics.
- MEMTABLE_DICT_BYTES.add((full_primary_key.len() + sparse_key_len) as i64);
-
- pk_index
- }
-
- /// Memory size of the builder.
- #[cfg(test)]
- pub fn memory_size(&self) -> usize {
- self.key_bytes_in_index
- + self.key_buffer.buffer_memory_size()
- + self
- .dict_blocks
- .iter()
- .map(|block| block.buffer_memory_size())
- .sum::()
- }
-
- /// Finishes the builder. The key of the second BTreeMap is sparse-encoded bytes.
- pub fn finish(&mut self) -> Option<(KeyDict, BTreeMap, PkIndex>)> {
- if self.key_buffer.is_empty() {
- return None;
- }
- let mut pk_to_index_map = BTreeMap::new();
-
- // Finishes current dict block and resets the pk index.
- let dict_block = self.key_buffer.finish(true);
- self.dict_blocks.push(dict_block);
- // Computes key position and then alter pk index.
- let mut key_positions = vec![0; self.pk_to_index.len()];
-
- for (i, (full_pk, (pk_index, sparse_key))) in (std::mem::take(&mut self.pk_to_index))
- .into_iter()
- .enumerate()
- {
- // The position of the i-th key is the old pk index.
- key_positions[i] = pk_index;
- if let Some(sparse_key) = sparse_key {
- pk_to_index_map.insert(sparse_key, i as PkIndex);
- }
- pk_to_index_map.insert(full_pk, i as PkIndex);
- }
-
- self.num_keys = 0;
- let key_bytes_in_index = self.key_bytes_in_index;
- self.key_bytes_in_index = 0;
-
- Some((
- KeyDict {
- dict_blocks: std::mem::take(&mut self.dict_blocks),
- key_positions,
- key_bytes_in_index,
- },
- pk_to_index_map,
- ))
- }
-
- /// Reads the builder.
- pub fn read(&self) -> DictBuilderReader {
- let sorted_pk_indices = self.pk_to_index.values().map(|v| v.0).collect();
- let block = self.key_buffer.finish_cloned();
- let mut blocks = Vec::with_capacity(self.dict_blocks.len() + 1);
- blocks.extend_from_slice(&self.dict_blocks);
- blocks.push(block);
-
- DictBuilderReader::new(blocks, sorted_pk_indices)
- }
-}
-
-impl Drop for KeyDictBuilder {
- fn drop(&mut self) {
- MEMTABLE_DICT_BYTES.sub(self.key_bytes_in_index as i64);
- }
-}
-
-/// Reader to scan the [KeyDictBuilder].
-#[derive(Default)]
-pub struct DictBuilderReader {
- blocks: Vec,
- sorted_pk_indices: Vec,
-}
-
-impl DictBuilderReader {
- fn new(blocks: Vec, sorted_pk_indices: Vec) -> Self {
- Self {
- blocks,
- sorted_pk_indices,
- }
- }
-
- /// Returns the number of keys.
- #[cfg(test)]
- pub fn num_keys(&self) -> usize {
- self.sorted_pk_indices.len()
- }
-
- /// Gets the i-th pk index.
- #[cfg(test)]
- pub fn pk_index(&self, offset: usize) -> PkIndex {
- self.sorted_pk_indices[offset]
- }
-
- /// Gets the i-th key.
- #[cfg(test)]
- pub fn key(&self, offset: usize) -> &[u8] {
- let pk_index = self.pk_index(offset);
- self.key_by_pk_index(pk_index)
- }
-
- /// Gets the key by the pk index.
- pub fn key_by_pk_index(&self, pk_index: PkIndex) -> &[u8] {
- let block_idx = pk_index / MAX_KEYS_PER_BLOCK;
- self.blocks[block_idx as usize].key_by_pk_index(pk_index)
- }
-
- /// Returns pk weights to sort a data part and replaces pk indices.
- pub(crate) fn pk_weights_to_sort_data(&self, pk_weights: &mut Vec) {
- compute_pk_weights(&self.sorted_pk_indices, pk_weights)
- }
-}
-
-/// Returns pk weights to sort a data part and replaces pk indices.
-fn compute_pk_weights(sorted_pk_indices: &[PkIndex], pk_weights: &mut Vec) {
- pk_weights.resize(sorted_pk_indices.len(), 0);
- for (weight, pk_index) in sorted_pk_indices.iter().enumerate() {
- pk_weights[*pk_index as usize] = weight as u16;
- }
-}
-
-/// A key dictionary.
-#[derive(Default)]
-pub struct KeyDict {
- // TODO(yingwen): We can use key_positions to do a binary search.
- /// Unsorted key blocks.
- dict_blocks: Vec,
- /// Maps pk index to position of the key in [Self::dict_blocks].
- key_positions: Vec,
- /// Bytes of keys in the index.
- key_bytes_in_index: usize,
-}
-
-pub type KeyDictRef = Arc;
-
-impl KeyDict {
- /// Gets the primary key by its index.
- ///
- /// # Panics
- /// Panics if the index is invalid.
- pub fn key_by_pk_index(&self, index: PkIndex) -> &[u8] {
- let position = self.key_positions[index as usize];
- let block_index = position / MAX_KEYS_PER_BLOCK;
- self.dict_blocks[block_index as usize].key_by_pk_index(position)
- }
-
- /// Returns pk weights to sort a data part and replaces pk indices.
- pub(crate) fn pk_weights_to_sort_data(&self) -> Vec {
- let mut pk_weights = Vec::with_capacity(self.key_positions.len());
- compute_pk_weights(&self.key_positions, &mut pk_weights);
- pk_weights
- }
-
- /// Returns the shared memory size.
- pub(crate) fn shared_memory_size(&self) -> usize {
- self.key_bytes_in_index
- + self
- .dict_blocks
- .iter()
- .map(|block| block.buffer_memory_size())
- .sum::()
- }
-}
-
-impl Drop for KeyDict {
- fn drop(&mut self) {
- MEMTABLE_DICT_BYTES.sub(self.key_bytes_in_index as i64);
- }
-}
-
-/// Buffer to store unsorted primary keys.
-struct KeyBuffer {
- key_builder: BinaryBuilder,
- next_pk_index: usize,
-}
-
-impl KeyBuffer {
- fn new(item_capacity: usize) -> Self {
- Self {
- key_builder: BinaryBuilder::with_capacity(item_capacity, 0),
- next_pk_index: 0,
- }
- }
-
- /// Pushes a new key and returns its pk index.
- ///
- /// # Panics
- /// Panics if the [PkIndex] type cannot represent the index.
- fn push_key(&mut self, key: &[u8]) -> PkIndex {
- let pk_index = self.next_pk_index.try_into().unwrap();
- self.next_pk_index += 1;
- self.key_builder.append_value(key);
-
- pk_index
- }
-
- /// Returns number of items in the buffer.
- fn len(&self) -> usize {
- self.key_builder.len()
- }
-
- /// Returns whether the buffer is empty.
- fn is_empty(&self) -> bool {
- self.key_builder.is_empty()
- }
-
- /// Returns the buffer size of the builder.
- #[cfg(test)]
- fn buffer_memory_size(&self) -> usize {
- self.key_builder.values_slice().len()
- + std::mem::size_of_val(self.key_builder.offsets_slice())
- + self
- .key_builder
- .validity_slice()
- .map(|v| v.len())
- .unwrap_or(0)
- }
-
- fn finish(&mut self, reset_index: bool) -> DictBlock {
- let primary_key = self.key_builder.finish();
- // Reserve capacity for the new builder. `finish()` the builder will leave the builder
- // empty with capacity 0.
- // TODO(yingwen): Do we need to reserve capacity for data?
- self.key_builder = BinaryBuilder::with_capacity(primary_key.len(), 0);
- if reset_index {
- self.next_pk_index = 0;
- }
-
- DictBlock::new(primary_key)
- }
-
- fn finish_cloned(&self) -> DictBlock {
- let primary_key = self.key_builder.finish_cloned();
-
- DictBlock::new(primary_key)
- }
-}
-
-/// A block in the key dictionary.
-///
-/// The block is cheap to clone. Keys in the block are unsorted.
-#[derive(Clone)]
-struct DictBlock {
- /// Container of keys in the block.
- keys: BinaryArray,
-}
-
-impl DictBlock {
- fn new(keys: BinaryArray) -> Self {
- let buffer_size = keys.get_buffer_memory_size();
- MEMTABLE_DICT_BYTES.add(buffer_size as i64);
-
- Self { keys }
- }
-
- fn key_by_pk_index(&self, index: PkIndex) -> &[u8] {
- let pos = index % MAX_KEYS_PER_BLOCK;
- self.keys.value(pos as usize)
- }
-
- fn buffer_memory_size(&self) -> usize {
- self.keys.get_buffer_memory_size()
- }
-}
-
-impl Drop for DictBlock {
- fn drop(&mut self) {
- let buffer_size = self.keys.get_buffer_memory_size();
- MEMTABLE_DICT_BYTES.sub(buffer_size as i64);
- }
-}
-
-#[cfg(test)]
-mod tests {
- use rand::Rng;
-
- use super::*;
-
- fn prepare_input_keys(num_keys: usize) -> Vec> {
- let prefix = ["a", "b", "c", "d", "e", "f"];
- let mut rng = rand::rng();
- let mut keys = Vec::with_capacity(num_keys);
- for i in 0..num_keys {
- let prefix_idx = rng.random_range(0..prefix.len());
- // We don't need to decode the primary key in index's test so we format the string
- // into the key.
- let key = format!("{}{}", prefix[prefix_idx], i);
- keys.push(key.into_bytes());
- }
-
- keys
- }
-
- #[test]
- fn test_write_scan_builder() {
- let num_keys = MAX_KEYS_PER_BLOCK * 2 + MAX_KEYS_PER_BLOCK / 2;
- let keys = prepare_input_keys(num_keys.into());
-
- let mut builder = KeyDictBuilder::new((MAX_KEYS_PER_BLOCK * 3).into());
- let mut last_pk_index = None;
- let mut metrics = WriteMetrics::default();
- for key in &keys {
- assert!(!builder.is_full());
- let pk_index = builder.insert_key(key, None, &mut metrics);
- last_pk_index = Some(pk_index);
- }
- assert_eq!(num_keys - 1, last_pk_index.unwrap());
- let key_bytes: usize = keys.iter().map(|key| key.len() * 2).sum();
- assert_eq!(key_bytes, metrics.key_bytes);
-
- let mut expect: Vec<_> = keys
- .into_iter()
- .enumerate()
- .map(|(i, key)| (key, i as PkIndex))
- .collect();
- expect.sort_unstable_by(|a, b| a.0.cmp(&b.0));
-
- let mut result = Vec::with_capacity(expect.len());
- let reader = builder.read();
- for i in 0..reader.num_keys() {
- result.push((reader.key(i).to_vec(), reader.pk_index(i)));
- }
- assert_eq!(expect, result);
- }
-
- #[test]
- fn test_dict_memory_size() {
- let mut builder = KeyDictBuilder::new((MAX_KEYS_PER_BLOCK * 3).into());
- let mut metrics = WriteMetrics::default();
- // 513 keys
- let num_keys = MAX_KEYS_PER_BLOCK * 2 + 1;
- // Writes 2 blocks
- for i in 0..num_keys {
- // Each key is 5 bytes.
- let key = format!("{i:05}");
- builder.insert_key(key.as_bytes(), None, &mut metrics);
- }
- let key_bytes = num_keys as usize * 5;
- assert_eq!(key_bytes * 2, metrics.key_bytes);
- assert_eq!(key_bytes, builder.key_bytes_in_index);
- assert_eq!(8730, builder.memory_size());
-
- let (dict, _) = builder.finish().unwrap();
- assert_eq!(0, builder.key_bytes_in_index);
- assert_eq!(key_bytes, dict.key_bytes_in_index);
- assert!(dict.shared_memory_size() > key_bytes);
- }
-
- #[test]
- fn test_builder_finish() {
- let mut builder = KeyDictBuilder::new((MAX_KEYS_PER_BLOCK * 2).into());
- let mut metrics = WriteMetrics::default();
- for i in 0..MAX_KEYS_PER_BLOCK * 2 {
- let key = format!("{i:010}");
- assert!(!builder.is_full());
- builder.insert_key(key.as_bytes(), None, &mut metrics);
- }
- assert!(builder.is_full());
- builder.finish();
-
- assert!(!builder.is_full());
- assert_eq!(0, builder.insert_key(b"a0", None, &mut metrics));
- }
-
- #[test]
- fn test_builder_finish_with_sparse_key() {
- let mut builder = KeyDictBuilder::new((MAX_KEYS_PER_BLOCK * 2).into());
- let mut metrics = WriteMetrics::default();
- let full_key = "42".to_string();
- let sparse_key = &[42u8];
-
- builder.insert_key(full_key.as_bytes(), Some(sparse_key), &mut metrics);
- let (dict, pk_to_pk_id) = builder.finish().unwrap();
- assert_eq!(dict.key_positions.len(), 1);
- assert_eq!(dict.dict_blocks.len(), 1);
- assert_eq!(
- pk_to_pk_id.get(sparse_key.as_slice()),
- pk_to_pk_id.get(full_key.as_bytes())
- );
- }
-}
diff --git a/src/mito2/src/memtable/partition_tree/merger.rs b/src/mito2/src/memtable/partition_tree/merger.rs
deleted file mode 100644
index 5a74934bdc6a..000000000000
--- a/src/mito2/src/memtable/partition_tree/merger.rs
+++ /dev/null
@@ -1,554 +0,0 @@
-// Copyright 2023 Greptime Team
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-use std::cmp::{Ordering, Reverse};
-use std::collections::BinaryHeap;
-use std::fmt::Debug;
-use std::ops::Range;
-
-use crate::error::Result;
-use crate::memtable::partition_tree::PkIndex;
-use crate::memtable::partition_tree::data::{DataBatch, DataBufferReader, DataPartReader};
-
-/// Nodes of merger's heap.
-pub trait Node: Ord {
- /// Returns true if current node is not exhausted.
- fn is_valid(&self) -> bool;
-
- /// Whether the other node is behind (exclusive) current node.
- fn is_behind(&self, other: &Self) -> bool;
-
- /// Advances `len` rows from current batch. If current batch is empty it fetches
- /// next batch from the node.
- ///
- /// # Panics
- /// If the node is invalid.
- fn advance(&mut self, len: usize) -> Result<()>;
-
- /// Length of current item.
- fn current_item_len(&self) -> usize;
-
- /// Searches first key of `other` in current item and returns the index.
- fn search_key_in_current_item(&self, other: &Self) -> Result;
-}
-
-pub struct Merger {
- /// Heap to find node to read.
- ///
- /// Nodes in the heap are always valid.
- heap: BinaryHeap,
- /// Current node to read.
- ///
- /// The node is always valid if it is not None.
- current_node: Option,
- /// The number of rows in current node that are valid to read.
- current_rows: usize,
-}
-
-impl Merger {
- pub(crate) fn try_new(nodes: Vec) -> Result {
- let mut heap = BinaryHeap::with_capacity(nodes.len());
- for node in nodes {
- if node.is_valid() {
- heap.push(node);
- }
- }
- let mut merger = Merger {
- heap,
- current_node: None,
- current_rows: 0,
- };
- merger.next()?;
- Ok(merger)
- }
-
- /// Returns true if current merger is still valid.
- pub(crate) fn is_valid(&self) -> bool {
- self.current_node.is_some()
- }
-
- /// Returns current node to read. Only [Self::current_rows] rows in current node
- /// are valid to read.
- ///
- /// # Panics
- /// Panics if the merger is invalid.
- pub(crate) fn current_node(&self) -> &T {
- self.current_node.as_ref().unwrap()
- }
-
- /// Returns rows of current node to read.
- pub(crate) fn current_rows(&self) -> usize {
- self.current_rows
- }
-
- /// Advances the merger to the next item.
- pub(crate) fn next(&mut self) -> Result<()> {
- self.maybe_advance_current_node()?;
- debug_assert!(self.current_node.is_none());
-
- // Finds node and range to read from the heap.
- let Some(top_node) = self.heap.pop() else {
- // Heap is empty.
- return Ok(());
- };
- if let Some(next_node) = self.heap.peek() {
- if next_node.is_behind(&top_node) {
- // Does not overlap.
- self.current_rows = top_node.current_item_len();
- } else {
- // Note that the heap ensures the top node always has the minimal row.
- match top_node.search_key_in_current_item(next_node) {
- Ok(pos) => {
- if pos == 0 {
- // If the first item of top node has duplicate key with the next node,
- // we can simply return the first row in the top node as it must be the one
- // with max sequence.
- self.current_rows = 1;
- } else {
- // We don't know which one has the larger sequence so we use the range before
- // the duplicate pos.
- self.current_rows = pos;
- }
- }
- Err(pos) => {
- // No duplication. Output rows before pos.
- debug_assert!(pos > 0);
- self.current_rows = pos;
- }
- }
- }
- } else {
- // Top is the only node left. We can read all rows in it.
- self.current_rows = top_node.current_item_len();
- }
- self.current_node = Some(top_node);
-
- Ok(())
- }
-
- fn maybe_advance_current_node(&mut self) -> Result<()> {
- let Some(mut node) = self.current_node.take() else {
- return Ok(());
- };
-
- // Advances current node.
- node.advance(self.current_rows)?;
- self.current_rows = 0;
- if !node.is_valid() {
- return Ok(());
- }
-
- // Puts the node into the heap.
- self.heap.push(node);
- Ok(())
- }
-}
-
-#[derive(Debug)]
-pub(crate) struct DataBatchKey {
- pub(crate) pk_index: PkIndex,
- pub(crate) timestamp: i64,
-}
-
-pub(crate) enum DataSource {
- Buffer(DataBufferReader),
- Part(DataPartReader),
-}
-
-impl DataSource {
- fn current_data_batch(&self) -> DataBatch<'_> {
- match self {
- DataSource::Buffer(buffer) => buffer.current_data_batch(),
- DataSource::Part(p) => p.current_data_batch(),
- }
- }
-
- fn is_valid(&self) -> bool {
- match self {
- DataSource::Buffer(b) => b.is_valid(),
- DataSource::Part(p) => p.is_valid(),
- }
- }
-
- fn next(&mut self) -> Result<()> {
- match self {
- DataSource::Buffer(b) => b.next(),
- DataSource::Part(p) => p.next(),
- }
- }
-}
-
-pub(crate) struct DataNode {
- source: DataSource,
- /// Current range of the batch in the source.
- current_range: Option>,
-}
-
-impl DataNode {
- pub(crate) fn new(source: DataSource) -> Self {
- let current_range = source
- .is_valid()
- .then(|| 0..source.current_data_batch().range().len());
-
- Self {
- source,
- current_range,
- }
- }
-
- pub(crate) fn current_data_batch(&self) -> DataBatch<'_> {
- let range = self.current_range();
- let batch = self.source.current_data_batch();
- batch.slice(range.start, range.len())
- }
-
- fn current_range(&self) -> Range {
- self.current_range.clone().unwrap()
- }
-}
-
-impl Ord for DataNode {
- fn cmp(&self, other: &Self) -> Ordering {
- let weight = self.current_data_batch().pk_index();
- let (ts_start, sequence) = self.current_data_batch().first_row();
- let other_weight = other.current_data_batch().pk_index();
- let (other_ts_start, other_sequence) = other.current_data_batch().first_row();
- (weight, ts_start, Reverse(sequence))
- .cmp(&(other_weight, other_ts_start, Reverse(other_sequence)))
- .reverse()
- }
-}
-
-impl Eq for DataNode {}
-
-impl PartialEq for DataNode {
- fn eq(&self, other: &Self) -> bool {
- self.current_data_batch()
- .first_row()
- .eq(&other.current_data_batch().first_row())
- }
-}
-
-impl PartialOrd for DataNode {
- fn partial_cmp(&self, other: &Self) -> Option {
- Some(self.cmp(other))
- }
-}
-
-impl Node for DataNode {
- fn is_valid(&self) -> bool {
- self.current_range.is_some()
- }
-
- fn is_behind(&self, other: &Self) -> bool {
- let pk_weight = self.current_data_batch().pk_index();
- let (start, seq) = self.current_data_batch().first_row();
- let other_pk_weight = other.current_data_batch().pk_index();
- let (other_end, other_seq) = other.current_data_batch().last_row();
- (pk_weight, start, Reverse(seq)) > (other_pk_weight, other_end, Reverse(other_seq))
- }
-
- fn advance(&mut self, len: usize) -> Result<()> {
- let mut range = self.current_range();
- debug_assert!(range.len() >= len);
-
- let remaining = range.len() - len;
- if remaining == 0 {
- // Nothing remains, we need to fetch next batch to ensure the current batch is not empty.
- self.source.next()?;
- if self.source.is_valid() {
- self.current_range = Some(0..self.source.current_data_batch().range().len());
- } else {
- // The node is exhausted.
- self.current_range = None;
- }
- } else {
- range.start += len;
- self.current_range = Some(range);
- }
-
- Ok(())
- }
-
- fn current_item_len(&self) -> usize {
- self.current_range.clone().unwrap().len()
- }
-
- fn search_key_in_current_item(&self, other: &Self) -> Result {
- let key = other.current_data_batch().first_key();
- self.current_data_batch().search_key(&key)
- }
-}
-
-#[cfg(test)]
-mod tests {
- use datatypes::arrow::array::UInt64Array;
- use store_api::metadata::RegionMetadataRef;
-
- use super::*;
- use crate::memtable::partition_tree::data::{DataBuffer, timestamp_array_to_i64_slice};
- use crate::test_util::memtable_util::{build_key_values_with_ts_seq_values, metadata_for_test};
-
- fn write_rows_to_buffer(
- buffer: &mut DataBuffer,
- schema: &RegionMetadataRef,
- pk_index: u16,
- ts: Vec,
- sequence: &mut u64,
- ) {
- let rows = ts.len() as u64;
- let v0 = ts.iter().map(|v| Some(*v as f64)).collect::>();
- let kvs = build_key_values_with_ts_seq_values(
- schema,
- "whatever".to_string(),
- 1,
- ts.into_iter(),
- v0.into_iter(),
- *sequence,
- );
-
- for kv in kvs.iter() {
- buffer.write_row(pk_index, &kv);
- }
-
- *sequence += rows;
- }
-
- fn check_merger_read(nodes: Vec, expected: &[(u16, Vec<(i64, u64)>)]) {
- let mut merger = Merger::try_new(nodes).unwrap();
-
- let mut res = vec![];
- while merger.is_valid() {
- let data_batch = merger.current_node().current_data_batch();
- let data_batch = data_batch.slice(0, merger.current_rows());
- let batch = data_batch.slice_record_batch();
- let ts_array = batch.column(1);
- let ts_values: Vec<_> = timestamp_array_to_i64_slice(ts_array).to_vec();
- let ts_and_seq = ts_values
- .into_iter()
- .zip(
- batch
- .column(2)
- .as_any()
- .downcast_ref::()
- .unwrap()
- .iter(),
- )
- .map(|(ts, seq)| (ts, seq.unwrap()))
- .collect::>();
-
- res.push((data_batch.pk_index(), ts_and_seq));
- merger.next().unwrap();
- }
- assert_eq!(expected, &res);
- }
-
- #[test]
- fn test_merger() {
- let metadata = metadata_for_test();
- let mut buffer1 = DataBuffer::with_capacity(metadata.clone(), 10, true);
- let weight = &[2, 1, 0];
- let mut seq = 0;
- write_rows_to_buffer(&mut buffer1, &metadata, 1, vec![2, 3], &mut seq);
- write_rows_to_buffer(&mut buffer1, &metadata, 0, vec![1, 2], &mut seq);
- let node1 = DataNode::new(DataSource::Part(
- buffer1.freeze(Some(weight), true).unwrap().read().unwrap(),
- ));
-
- let mut buffer2 = DataBuffer::with_capacity(metadata.clone(), 10, true);
- write_rows_to_buffer(&mut buffer2, &metadata, 1, vec![3], &mut seq);
- write_rows_to_buffer(&mut buffer2, &metadata, 0, vec![1], &mut seq);
- let node2 = DataNode::new(DataSource::Part(
- buffer2.freeze(Some(weight), true).unwrap().read().unwrap(),
- ));
-
- check_merger_read(
- vec![node1, node2],
- &[
- (1, vec![(2, 0)]),
- (1, vec![(3, 4)]),
- (1, vec![(3, 1)]),
- (2, vec![(1, 5)]),
- (2, vec![(1, 2), (2, 3)]),
- ],
- );
- }
-
- #[test]
- fn test_merger2() {
- let metadata = metadata_for_test();
- let mut buffer1 = DataBuffer::with_capacity(metadata.clone(), 10, true);
- let weight = &[2, 1, 0];
- let mut seq = 0;
- write_rows_to_buffer(&mut buffer1, &metadata, 1, vec![2, 3], &mut seq);
- write_rows_to_buffer(&mut buffer1, &metadata, 0, vec![1, 2], &mut seq);
- let node1 = DataNode::new(DataSource::Part(
- buffer1.freeze(Some(weight), true).unwrap().read().unwrap(),
- ));
-
- let mut buffer2 = DataBuffer::with_capacity(metadata.clone(), 10, true);
- write_rows_to_buffer(&mut buffer2, &metadata, 1, vec![3], &mut seq);
- let node2 = DataNode::new(DataSource::Part(
- buffer2.freeze(Some(weight), true).unwrap().read().unwrap(),
- ));
-
- let mut buffer3 = DataBuffer::with_capacity(metadata.clone(), 10, true);
- write_rows_to_buffer(&mut buffer3, &metadata, 0, vec![2, 3], &mut seq);
- let node3 = DataNode::new(DataSource::Part(
- buffer3.freeze(Some(weight), true).unwrap().read().unwrap(),
- ));
-
- check_merger_read(
- vec![node1, node3, node2],
- &[
- (1, vec![(2, 0)]),
- (1, vec![(3, 4)]),
- (1, vec![(3, 1)]),
- (2, vec![(1, 2)]),
- (2, vec![(2, 5)]),
- (2, vec![(2, 3)]),
- (2, vec![(3, 6)]),
- ],
- );
- }
-
- #[test]
- fn test_merger_overlapping() {
- let metadata = metadata_for_test();
- let mut buffer1 = DataBuffer::with_capacity(metadata.clone(), 10, true);
- let weight = &[0, 1, 2];
- let mut seq = 0;
- write_rows_to_buffer(&mut buffer1, &metadata, 0, vec![1, 2, 3], &mut seq);
- let node1 = DataNode::new(DataSource::Part(
- buffer1.freeze(Some(weight), true).unwrap().read().unwrap(),
- ));
-
- let mut buffer2 = DataBuffer::with_capacity(metadata.clone(), 10, true);
- write_rows_to_buffer(&mut buffer2, &metadata, 1, vec![2, 3], &mut seq);
- let node2 = DataNode::new(DataSource::Part(
- buffer2.freeze(Some(weight), true).unwrap().read().unwrap(),
- ));
-
- let mut buffer3 = DataBuffer::with_capacity(metadata.clone(), 10, true);
- write_rows_to_buffer(&mut buffer3, &metadata, 0, vec![2, 3], &mut seq);
- let node3 = DataNode::new(DataSource::Part(
- buffer3.freeze(Some(weight), true).unwrap().read().unwrap(),
- ));
-
- check_merger_read(
- vec![node1, node3, node2],
- &[
- (0, vec![(1, 0)]),
- (0, vec![(2, 5)]),
- (0, vec![(2, 1)]),
- (0, vec![(3, 6)]),
- (0, vec![(3, 2)]),
- (1, vec![(2, 3), (3, 4)]),
- ],
- );
- }
-
- #[test]
- fn test_merger_parts_and_buffer() {
- let metadata = metadata_for_test();
- let mut buffer1 = DataBuffer::with_capacity(metadata.clone(), 10, true);
- let weight = &[0, 1, 2];
- let mut seq = 0;
- write_rows_to_buffer(&mut buffer1, &metadata, 0, vec![1, 2, 3], &mut seq);
- let node1 = DataNode::new(DataSource::Buffer(
- buffer1.read().unwrap().build(Some(weight)).unwrap(),
- ));
-
- let mut buffer2 = DataBuffer::with_capacity(metadata.clone(), 10, true);
- write_rows_to_buffer(&mut buffer2, &metadata, 1, vec![2, 3], &mut seq);
- let node2 = DataNode::new(DataSource::Part(
- buffer2.freeze(Some(weight), true).unwrap().read().unwrap(),
- ));
-
- let mut buffer3 = DataBuffer::with_capacity(metadata.clone(), 10, true);
- write_rows_to_buffer(&mut buffer3, &metadata, 0, vec![2, 3], &mut seq);
- let node3 = DataNode::new(DataSource::Part(
- buffer3.freeze(Some(weight), true).unwrap().read().unwrap(),
- ));
-
- check_merger_read(
- vec![node1, node3, node2],
- &[
- (0, vec![(1, 0)]),
- (0, vec![(2, 5)]),
- (0, vec![(2, 1)]),
- (0, vec![(3, 6)]),
- (0, vec![(3, 2)]),
- (1, vec![(2, 3), (3, 4)]),
- ],
- );
- }
-
- #[test]
- fn test_merger_overlapping_2() {
- let metadata = metadata_for_test();
- let mut buffer1 = DataBuffer::with_capacity(metadata.clone(), 10, true);
- let weight = &[0, 1, 2];
- let mut seq = 0;
- write_rows_to_buffer(&mut buffer1, &metadata, 0, vec![1, 2, 2], &mut seq);
- let node1 = DataNode::new(DataSource::Part(
- buffer1.freeze(Some(weight), true).unwrap().read().unwrap(),
- ));
-
- let mut buffer2 = DataBuffer::with_capacity(metadata.clone(), 10, true);
- write_rows_to_buffer(&mut buffer2, &metadata, 0, vec![2], &mut seq);
- let node2 = DataNode::new(DataSource::Part(
- buffer2.freeze(Some(weight), true).unwrap().read().unwrap(),
- ));
-
- let mut buffer3 = DataBuffer::with_capacity(metadata.clone(), 10, true);
- write_rows_to_buffer(&mut buffer3, &metadata, 0, vec![2], &mut seq);
- let node3 = DataNode::new(DataSource::Part(
- buffer3.freeze(Some(weight), true).unwrap().read().unwrap(),
- ));
-
- check_merger_read(
- vec![node1, node2, node3],
- &[
- (0, vec![(1, 0)]),
- (0, vec![(2, 4)]),
- (0, vec![(2, 3)]),
- (0, vec![(2, 2)]),
- ],
- );
- }
-
- #[test]
- fn test_merger_overlapping_3() {
- let metadata = metadata_for_test();
- let mut buffer1 = DataBuffer::with_capacity(metadata.clone(), 10, true);
- let weight = &[0, 1, 2];
- let mut seq = 0;
- write_rows_to_buffer(&mut buffer1, &metadata, 0, vec![0, 1], &mut seq);
- let node1 = DataNode::new(DataSource::Part(
- buffer1.freeze(Some(weight), true).unwrap().read().unwrap(),
- ));
-
- let mut buffer2 = DataBuffer::with_capacity(metadata.clone(), 10, true);
- write_rows_to_buffer(&mut buffer2, &metadata, 0, vec![1], &mut seq);
- let node2 = DataNode::new(DataSource::Part(
- buffer2.freeze(Some(weight), true).unwrap().read().unwrap(),
- ));
-
- check_merger_read(
- vec![node1, node2],
- &[(0, vec![(0, 0)]), (0, vec![(1, 2)]), (0, vec![(1, 1)])],
- );
- }
-}
diff --git a/src/mito2/src/memtable/partition_tree/partition.rs b/src/mito2/src/memtable/partition_tree/partition.rs
deleted file mode 100644
index 0ffbce486761..000000000000
--- a/src/mito2/src/memtable/partition_tree/partition.rs
+++ /dev/null
@@ -1,590 +0,0 @@
-// Copyright 2023 Greptime Team
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-//! Partition of a partition tree.
-//!
-//! We only support partitioning the tree by pre-defined internal columns.
-
-use std::collections::{HashMap, HashSet};
-use std::sync::{Arc, RwLock};
-use std::time::{Duration, Instant};
-
-use api::v1::SemanticType;
-use common_recordbatch::filter::SimpleFilterEvaluator;
-use mito_codec::key_values::KeyValue;
-use mito_codec::primary_key_filter::is_partition_column;
-use mito_codec::row_converter::{PrimaryKeyCodec, PrimaryKeyFilter};
-use snafu::ResultExt;
-use store_api::codec::PrimaryKeyEncoding;
-use store_api::metadata::RegionMetadataRef;
-use store_api::metric_engine_consts::DATA_SCHEMA_TABLE_ID_COLUMN_NAME;
-use store_api::storage::ColumnId;
-
-use crate::error::{EncodeSnafu, Result};
-use crate::memtable::partition_tree::data::{DATA_INIT_CAP, DataBatch, DataParts};
-use crate::memtable::partition_tree::dedup::DedupReader;
-use crate::memtable::partition_tree::shard::{
- BoxedDataBatchSource, Shard, ShardMerger, ShardNode, ShardSource,
-};
-use crate::memtable::partition_tree::shard_builder::ShardBuilder;
-use crate::memtable::partition_tree::{PartitionTreeConfig, PkId};
-use crate::memtable::stats::WriteMetrics;
-use crate::metrics::PARTITION_TREE_READ_STAGE_ELAPSED;
-use crate::read::{Batch, BatchBuilder};
-
-/// Key of a partition.
-pub type PartitionKey = u32;
-
-/// A tree partition.
-pub struct Partition {
- inner: RwLock,
- /// Whether to dedup batches.
- dedup: bool,
-}
-
-pub type PartitionRef = Arc;
-
-impl Partition {
- /// Creates a new partition.
- pub fn new(metadata: RegionMetadataRef, config: &PartitionTreeConfig) -> Self {
- Partition {
- inner: RwLock::new(Inner::new(metadata, config)),
- dedup: config.dedup,
- }
- }
-
- /// Writes to the partition with a primary key.
- pub fn write_with_key(
- &self,
- primary_key: &mut Vec,
- row_codec: &dyn PrimaryKeyCodec,
- key_value: KeyValue,
- re_encode: bool,
- metrics: &mut WriteMetrics,
- ) -> Result<()> {
- let mut inner = self.inner.write().unwrap();
- // Freeze the shard builder if needed.
- if inner.shard_builder.should_freeze() {
- inner.freeze_active_shard()?;
- }
-
- // Finds key in shards, now we ensure one key only exists in one shard.
- if let Some(pk_id) = inner.find_key_in_shards(primary_key) {
- inner.write_to_shard(pk_id, &key_value)?;
- inner.num_rows += 1;
- return Ok(());
- }
-
- // Key does not yet exist in shard or builder, encode and insert the full primary key.
- if re_encode {
- match row_codec.encoding() {
- PrimaryKeyEncoding::Dense => {
- // `primary_key` is sparse, re-encode the full primary key.
- let sparse_key = primary_key.clone();
- primary_key.clear();
- row_codec
- .encode_key_value(&key_value, primary_key)
- .context(EncodeSnafu)?;
- let pk_id = inner.shard_builder.write_with_key(
- primary_key,
- Some(&sparse_key),
- &key_value,
- metrics,
- );
- inner.pk_to_pk_id.insert(sparse_key, pk_id);
- }
- PrimaryKeyEncoding::Sparse => {
- let sparse_key = primary_key.clone();
- let pk_id = inner.shard_builder.write_with_key(
- primary_key,
- Some(&sparse_key),
- &key_value,
- metrics,
- );
- inner.pk_to_pk_id.insert(sparse_key, pk_id);
- }
- }
- } else {
- // `primary_key` is already the full primary key.
- let pk_id = inner
- .shard_builder
- .write_with_key(primary_key, None, &key_value, metrics);
- inner.pk_to_pk_id.insert(std::mem::take(primary_key), pk_id);
- };
-
- inner.num_rows += 1;
- Ok(())
- }
-
- /// Writes to the partition without a primary key.
- pub fn write_no_key(&self, key_value: KeyValue) -> Result<()> {
- let mut inner = self.inner.write().unwrap();
- // If no primary key, always write to the first shard.
- debug_assert!(!inner.shards.is_empty());
- debug_assert_eq!(1, inner.shard_builder.current_shard_id());
-
- // A dummy pk id.
- let pk_id = PkId {
- shard_id: 0,
- pk_index: 0,
- };
- inner.shards[0].write_with_pk_id(pk_id, &key_value)?;
- inner.num_rows += 1;
-
- Ok(())
- }
-
- fn build_primary_key_filter(
- need_prune_key: bool,
- metadata: &RegionMetadataRef,
- row_codec: &dyn PrimaryKeyCodec,
- filters: &Arc>,
- ) -> Option> {
- if need_prune_key {
- // TODO(yingwen): Remove `skip_partition_column` after dropping PartitionTreeMemtable.
- let filter = row_codec.primary_key_filter(metadata, filters.clone(), true);
- Some(filter)
- } else {
- None
- }
- }
-
- /// Scans data in the partition.
- pub fn read(&self, mut context: ReadPartitionContext) -> Result {
- let start = Instant::now();
- let (builder_source, shard_reader_builders) = {
- let inner = self.inner.read().unwrap();
- let mut shard_source = Vec::with_capacity(inner.shards.len() + 1);
- let builder_reader = if !inner.shard_builder.is_empty() {
- let builder_reader = inner.shard_builder.read(&mut context.pk_weights)?;
- Some(builder_reader)
- } else {
- None
- };
- for shard in &inner.shards {
- if !shard.is_empty() {
- let shard_reader_builder = shard.read()?;
- shard_source.push(shard_reader_builder);
- }
- }
- (builder_reader, shard_source)
- };
-
- context.metrics.num_shards += shard_reader_builders.len();
-
- let mut nodes = shard_reader_builders
- .into_iter()
- .map(|builder| {
- let primary_key_filter = Self::build_primary_key_filter(
- context.need_prune_key,
- &context.metadata,
- context.row_codec.as_ref(),
- &context.filters,
- );
- Ok(ShardNode::new(ShardSource::Shard(
- builder.build(primary_key_filter)?,
- )))
- })
- .collect::>>()?;
-
- if let Some(builder) = builder_source {
- context.metrics.num_builder += 1;
- let primary_key_filter = Self::build_primary_key_filter(
- context.need_prune_key,
- &context.metadata,
- context.row_codec.as_ref(),
- &context.filters,
- );
- // Move the initialization of ShardBuilderReader out of read lock.
- let shard_builder_reader =
- builder.build(Some(&context.pk_weights), primary_key_filter)?;
- nodes.push(ShardNode::new(ShardSource::Builder(shard_builder_reader)));
- }
-
- // Creating a shard merger will invoke next so we do it outside the lock.
- let merger = ShardMerger::try_new(nodes)?;
- if self.dedup {
- let source = DedupReader::try_new(merger)?;
- context.metrics.build_partition_reader += start.elapsed();
- PartitionReader::new(context, Box::new(source))
- } else {
- context.metrics.build_partition_reader += start.elapsed();
- PartitionReader::new(context, Box::new(merger))
- }
- }
-
- /// Freezes the partition.
- pub fn freeze(&self) -> Result<()> {
- let mut inner = self.inner.write().unwrap();
- inner.freeze_active_shard()?;
- Ok(())
- }
-
- /// Forks the partition.
- ///
- /// Must freeze the partition before fork.
- pub fn fork(&self, metadata: &RegionMetadataRef, config: &PartitionTreeConfig) -> Partition {
- let (shards, shard_builder) = {
- let inner = self.inner.read().unwrap();
- debug_assert!(inner.shard_builder.is_empty());
- let shard_builder = ShardBuilder::new(
- metadata.clone(),
- config,
- inner.shard_builder.current_shard_id(),
- );
- let shards = inner
- .shards
- .iter()
- .map(|shard| shard.fork(metadata.clone()))
- .collect();
-
- (shards, shard_builder)
- };
- let pk_to_pk_id = {
- let mut inner = self.inner.write().unwrap();
- std::mem::take(&mut inner.pk_to_pk_id)
- };
-
- Partition {
- inner: RwLock::new(Inner {
- metadata: metadata.clone(),
- shard_builder,
- shards,
- num_rows: 0,
- pk_to_pk_id,
- frozen: false,
- }),
- dedup: self.dedup,
- }
- }
-
- /// Returns true if the partition has data.
- pub fn has_data(&self) -> bool {
- let inner = self.inner.read().unwrap();
- inner.num_rows > 0
- }
-
- /// Gets the stats of the partition.
- pub(crate) fn stats(&self) -> PartitionStats {
- let inner = self.inner.read().unwrap();
- let num_rows = inner.num_rows;
- let shard_num = inner.shards.len();
- let shared_memory_size = inner
- .shards
- .iter()
- .map(|shard| shard.shared_memory_size())
- .sum();
- PartitionStats {
- num_rows,
- shard_num,
- shared_memory_size,
- }
- }
-
- /// Get partition key from the key value.
- pub(crate) fn get_partition_key(key_value: &KeyValue, is_partitioned: bool) -> PartitionKey {
- if !is_partitioned {
- return PartitionKey::default();
- }
-
- key_value.partition_key()
- }
-
- /// Returns true if the region can be partitioned.
- pub(crate) fn has_multi_partitions(metadata: &RegionMetadataRef) -> bool {
- metadata
- .primary_key_columns()
- .next()
- .map(|meta| meta.column_schema.name == DATA_SCHEMA_TABLE_ID_COLUMN_NAME)
- .unwrap_or(false)
- }
-
- pub(crate) fn series_count(&self) -> usize {
- self.inner.read().unwrap().series_count()
- }
-}
-
-pub(crate) struct PartitionStats {
- pub(crate) num_rows: usize,
- pub(crate) shard_num: usize,
- pub(crate) shared_memory_size: usize,
-}
-
-#[derive(Default)]
-struct PartitionReaderMetrics {
- build_partition_reader: Duration,
- read_source: Duration,
- data_batch_to_batch: Duration,
- num_builder: usize,
- num_shards: usize,
-}
-
-/// Reader to scan rows in a partition.
-///
-/// It can merge rows from multiple shards.
-pub struct PartitionReader {
- context: ReadPartitionContext,
- source: BoxedDataBatchSource,
-}
-
-impl PartitionReader {
- fn new(context: ReadPartitionContext, source: BoxedDataBatchSource) -> Result {
- let reader = Self { context, source };
-
- Ok(reader)
- }
-
- /// Returns true if the reader is valid.
- pub fn is_valid(&self) -> bool {
- self.source.is_valid()
- }
-
- /// Advances the reader.
- ///
- /// # Panics
- /// Panics if the reader is invalid.
- pub fn next(&mut self) -> Result<()> {
- self.advance_source()
- }
-
- /// Converts current data batch into a [Batch].
- ///
- /// # Panics
- /// Panics if the reader is invalid.
- pub fn convert_current_batch(&mut self) -> Result {
- let start = Instant::now();
- let data_batch = self.source.current_data_batch();
- let batch = data_batch_to_batch(
- &self.context.metadata,
- &self.context.projection,
- self.source.current_key(),
- data_batch,
- )?;
- self.context.metrics.data_batch_to_batch += start.elapsed();
- Ok(batch)
- }
-
- pub(crate) fn into_context(self) -> ReadPartitionContext {
- self.context
- }
-
- fn advance_source(&mut self) -> Result<()> {
- let read_source = Instant::now();
- self.source.next()?;
- self.context.metrics.read_source += read_source.elapsed();
- Ok(())
- }
-}
-
-/// Structs to reuse across readers to avoid allocating for each reader.
-pub(crate) struct ReadPartitionContext {
- metadata: RegionMetadataRef,
- row_codec: Arc,
- projection: HashSet,
- filters: Arc>,
- /// Buffer to store pk weights.
- pk_weights: Vec,
- need_prune_key: bool,
- metrics: PartitionReaderMetrics,
-}
-
-impl Drop for ReadPartitionContext {
- fn drop(&mut self) {
- let partition_read_source = self.metrics.read_source.as_secs_f64();
- PARTITION_TREE_READ_STAGE_ELAPSED
- .with_label_values(&["partition_read_source"])
- .observe(partition_read_source);
- let partition_data_batch_to_batch = self.metrics.data_batch_to_batch.as_secs_f64();
- PARTITION_TREE_READ_STAGE_ELAPSED
- .with_label_values(&["partition_data_batch_to_batch"])
- .observe(partition_data_batch_to_batch);
-
- common_telemetry::debug!(
- "TreeIter partitions metrics, \
- num_builder: {}, \
- num_shards: {}, \
- build_partition_reader: {}s, \
- partition_read_source: {}s, \
- partition_data_batch_to_batch: {}s",
- self.metrics.num_builder,
- self.metrics.num_shards,
- self.metrics.build_partition_reader.as_secs_f64(),
- partition_read_source,
- partition_data_batch_to_batch,
- );
- }
-}
-
-impl ReadPartitionContext {
- pub(crate) fn new(
- metadata: RegionMetadataRef,
- row_codec: Arc,
- projection: HashSet,
- filters: Arc>,
- ) -> ReadPartitionContext {
- let need_prune_key = Self::need_prune_key(&metadata, &filters);
- ReadPartitionContext {
- metadata,
- row_codec,
- projection,
- filters,
- pk_weights: Vec::new(),
- need_prune_key,
- metrics: Default::default(),
- }
- }
-
- /// Does filter contain predicate on primary key columns after pruning the
- /// partition column.
- fn need_prune_key(metadata: &RegionMetadataRef, filters: &[SimpleFilterEvaluator]) -> bool {
- for filter in filters {
- // We already pruned partitions before so we skip the partition column.
- if is_partition_column(filter.column_name()) {
- continue;
- }
- let Some(column) = metadata.column_by_name(filter.column_name()) else {
- continue;
- };
- if column.semantic_type != SemanticType::Tag {
- continue;
- }
-
- return true;
- }
-
- false
- }
-}
-
-// TODO(yingwen): Pushdown projection to shard readers.
-/// Converts a [DataBatch] to a [Batch].
-fn data_batch_to_batch(
- metadata: &RegionMetadataRef,
- projection: &HashSet,
- key: Option<&[u8]>,
- data_batch: DataBatch,
-) -> Result {
- let record_batch = data_batch.slice_record_batch();
- let primary_key = key.map(|k| k.to_vec()).unwrap_or_default();
- let mut builder = BatchBuilder::new(primary_key);
- builder
- .timestamps_array(record_batch.column(1).clone())?
- .sequences_array(record_batch.column(2).clone())?
- .op_types_array(record_batch.column(3).clone())?;
-
- if record_batch.num_columns() <= 4 {
- // No fields.
- return builder.build();
- }
-
- // Iterate all field columns.
- for (array, field) in record_batch
- .columns()
- .iter()
- .zip(record_batch.schema().fields().iter())
- .skip(4)
- {
- // TODO(yingwen): Avoid finding column by name. We know the schema of a DataBatch.
- // Safety: metadata should contain all fields.
- let column_id = metadata.column_by_name(field.name()).unwrap().column_id;
- if !projection.contains(&column_id) {
- continue;
- }
- builder.push_field_array(column_id, array.clone())?;
- }
-
- builder.build()
-}
-
-/// Inner struct of the partition.
-///
-/// A key only exists in one shard.
-struct Inner {
- metadata: RegionMetadataRef,
- /// Map to index pk to pk id.
- pk_to_pk_id: HashMap, PkId>,
- /// Shard whose dictionary is active.
- shard_builder: ShardBuilder,
- /// Shards with frozen dictionary.
- shards: Vec,
- num_rows: usize,
- frozen: bool,
-}
-
-impl Inner {
- fn new(metadata: RegionMetadataRef, config: &PartitionTreeConfig) -> Self {
- let (shards, current_shard_id) = if metadata.primary_key.is_empty() {
- let data_parts = DataParts::new(metadata.clone(), DATA_INIT_CAP, config.dedup);
- (
- vec![Shard::new(
- 0,
- None,
- data_parts,
- config.dedup,
- config.data_freeze_threshold,
- )],
- 1,
- )
- } else {
- (Vec::new(), 0)
- };
- let shard_builder = ShardBuilder::new(metadata.clone(), config, current_shard_id);
- Self {
- metadata,
- pk_to_pk_id: HashMap::new(),
- shard_builder,
- shards,
- num_rows: 0,
- frozen: false,
- }
- }
-
- fn find_key_in_shards(&self, primary_key: &[u8]) -> Option {
- assert!(!self.frozen);
- self.pk_to_pk_id.get(primary_key).copied()
- }
-
- fn write_to_shard(&mut self, pk_id: PkId, key_value: &KeyValue) -> Result<()> {
- if pk_id.shard_id == self.shard_builder.current_shard_id() {
- self.shard_builder.write_with_pk_id(pk_id, key_value);
- return Ok(());
- }
-
- // Safety: We find the shard by shard id.
- let shard = self
- .shards
- .iter_mut()
- .find(|shard| shard.shard_id == pk_id.shard_id)
- .unwrap();
- shard.write_with_pk_id(pk_id, key_value)?;
- self.num_rows += 1;
-
- Ok(())
- }
-
- fn freeze_active_shard(&mut self) -> Result<()> {
- if let Some(shard) = self
- .shard_builder
- .finish(self.metadata.clone(), &mut self.pk_to_pk_id)?
- {
- self.shards.push(shard);
- }
- Ok(())
- }
-
- /// Returns count of timeseries.
- fn series_count(&self) -> usize {
- self.pk_to_pk_id.len()
- }
-}
diff --git a/src/mito2/src/memtable/partition_tree/shard.rs b/src/mito2/src/memtable/partition_tree/shard.rs
deleted file mode 100644
index c5dc25f573ac..000000000000
--- a/src/mito2/src/memtable/partition_tree/shard.rs
+++ /dev/null
@@ -1,580 +0,0 @@
-// Copyright 2023 Greptime Team
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-//! Shard in a partition.
-
-use std::cmp::Ordering;
-use std::time::{Duration, Instant};
-
-use mito_codec::key_values::KeyValue;
-use mito_codec::row_converter::PrimaryKeyFilter;
-use snafu::ResultExt;
-use store_api::metadata::RegionMetadataRef;
-
-use crate::error::{DecodeSnafu, Result};
-use crate::memtable::partition_tree::data::{
- DATA_INIT_CAP, DataBatch, DataParts, DataPartsReader, DataPartsReaderBuilder,
-};
-use crate::memtable::partition_tree::dict::KeyDictRef;
-use crate::memtable::partition_tree::merger::{Merger, Node};
-use crate::memtable::partition_tree::shard_builder::ShardBuilderReader;
-use crate::memtable::partition_tree::{PkId, PkIndex, ShardId};
-use crate::metrics::PARTITION_TREE_READ_STAGE_ELAPSED;
-
-/// Shard stores data related to the same key dictionary.
-pub struct Shard {
- pub(crate) shard_id: ShardId,
- /// Key dictionary of the shard. `None` if the schema of the tree doesn't have a primary key.
- key_dict: Option,
- /// Data in the shard.
- data_parts: DataParts,
- dedup: bool,
- /// Number of rows to freeze a data part.
- data_freeze_threshold: usize,
-}
-
-impl Shard {
- /// Returns a new shard.
- pub fn new(
- shard_id: ShardId,
- key_dict: Option,
- data_parts: DataParts,
- dedup: bool,
- data_freeze_threshold: usize,
- ) -> Shard {
- Shard {
- shard_id,
- key_dict,
- data_parts,
- dedup,
- data_freeze_threshold,
- }
- }
-
- /// Writes a key value into the shard.
- ///
- /// It will freezes the active buffer if it is full.
- pub fn write_with_pk_id(&mut self, pk_id: PkId, key_value: &KeyValue) -> Result<()> {
- debug_assert_eq!(self.shard_id, pk_id.shard_id);
-
- if self.data_parts.num_active_rows() >= self.data_freeze_threshold {
- self.data_parts.freeze()?;
- }
-
- self.data_parts.write_row(pk_id.pk_index, key_value);
- Ok(())
- }
-
- /// Scans the shard.
- // TODO(yingwen): Push down projection to data parts.
- pub fn read(&self) -> Result {
- let parts_reader = self.data_parts.read()?;
-
- Ok(ShardReaderBuilder {
- shard_id: self.shard_id,
- key_dict: self.key_dict.clone(),
- inner: parts_reader,
- })
- }
-
- /// Forks a shard.
- pub fn fork(&self, metadata: RegionMetadataRef) -> Shard {
- Shard {
- shard_id: self.shard_id,
- key_dict: self.key_dict.clone(),
- data_parts: DataParts::new(metadata, DATA_INIT_CAP, self.dedup),
- dedup: self.dedup,
- data_freeze_threshold: self.data_freeze_threshold,
- }
- }
-
- /// Returns true if the shard is empty (No data).
- pub fn is_empty(&self) -> bool {
- self.data_parts.is_empty()
- }
-
- /// Returns the memory size of the shard part.
- pub(crate) fn shared_memory_size(&self) -> usize {
- self.key_dict
- .as_ref()
- .map(|dict| dict.shared_memory_size())
- .unwrap_or(0)
- }
-}
-
-/// Source that returns [DataBatch].
-pub trait DataBatchSource {
- /// Returns whether current source is still valid.
- fn is_valid(&self) -> bool;
-
- /// Advances source to next data batch.
- fn next(&mut self) -> Result<()>;
-
- /// Returns current pk id.
- /// # Panics
- /// If source is not valid.
- fn current_pk_id(&self) -> PkId;
-
- /// Returns the current primary key bytes or None if it doesn't have primary key.
- ///
- /// # Panics
- /// If source is not valid.
- fn current_key(&self) -> Option<&[u8]>;
-
- /// Returns the data part.
- /// # Panics
- /// If source is not valid.
- fn current_data_batch(&self) -> DataBatch<'_>;
-}
-
-pub type BoxedDataBatchSource = Box;
-
-pub struct ShardReaderBuilder {
- shard_id: ShardId,
- key_dict: Option,
- inner: DataPartsReaderBuilder,
-}
-
-impl ShardReaderBuilder {
- pub(crate) fn build(
- self,
- key_filter: Option>,
- ) -> Result {
- let ShardReaderBuilder {
- shard_id,
- key_dict,
- inner,
- } = self;
- let now = Instant::now();
- let parts_reader = inner.build()?;
- ShardReader::new(shard_id, key_dict, parts_reader, key_filter, now.elapsed())
- }
-}
-
-/// Reader to read rows in a shard.
-pub struct ShardReader {
- shard_id: ShardId,
- key_dict: Option,
- parts_reader: DataPartsReader,
- key_filter: Option>,
- last_yield_pk_index: Option,
- keys_before_pruning: usize,
- keys_after_pruning: usize,
- prune_pk_cost: Duration,
- data_build_cost: Duration,
-}
-
-impl ShardReader {
- fn new(
- shard_id: ShardId,
- key_dict: Option,
- parts_reader: DataPartsReader,
- key_filter: Option>,
- data_build_cost: Duration,
- ) -> Result {
- let has_pk = key_dict.is_some();
- let mut reader = Self {
- shard_id,
- key_dict,
- parts_reader,
- key_filter: if has_pk { key_filter } else { None },
- last_yield_pk_index: None,
- keys_before_pruning: 0,
- keys_after_pruning: 0,
- prune_pk_cost: Duration::default(),
- data_build_cost,
- };
- reader.prune_batch_by_key()?;
-
- Ok(reader)
- }
-
- fn is_valid(&self) -> bool {
- self.parts_reader.is_valid()
- }
-
- fn next(&mut self) -> Result<()> {
- self.parts_reader.next()?;
- self.prune_batch_by_key()
- }
-
- fn current_key(&self) -> Option<&[u8]> {
- let pk_index = self.parts_reader.current_data_batch().pk_index();
- self.key_dict
- .as_ref()
- .map(|dict| dict.key_by_pk_index(pk_index))
- }
-
- fn current_pk_id(&self) -> PkId {
- let pk_index = self.parts_reader.current_data_batch().pk_index();
- PkId {
- shard_id: self.shard_id,
- pk_index,
- }
- }
-
- fn current_data_batch(&self) -> DataBatch<'_> {
- self.parts_reader.current_data_batch()
- }
-
- fn prune_batch_by_key(&mut self) -> Result<()> {
- let Some(key_filter) = &mut self.key_filter else {
- return Ok(());
- };
-
- while self.parts_reader.is_valid() {
- let pk_index = self.parts_reader.current_data_batch().pk_index();
- if let Some(yield_pk_index) = self.last_yield_pk_index
- && pk_index == yield_pk_index
- {
- break;
- }
- self.keys_before_pruning += 1;
- // Safety: `key_filter` is some so the shard has primary keys.
- let key = self.key_dict.as_ref().unwrap().key_by_pk_index(pk_index);
- let now = Instant::now();
- if key_filter.matches(key).context(DecodeSnafu)? {
- self.prune_pk_cost += now.elapsed();
- self.last_yield_pk_index = Some(pk_index);
- self.keys_after_pruning += 1;
- break;
- }
- self.prune_pk_cost += now.elapsed();
- self.parts_reader.next()?;
- }
-
- Ok(())
- }
-}
-
-impl Drop for ShardReader {
- fn drop(&mut self) {
- let shard_prune_pk = self.prune_pk_cost.as_secs_f64();
- PARTITION_TREE_READ_STAGE_ELAPSED
- .with_label_values(&["shard_prune_pk"])
- .observe(shard_prune_pk);
- if self.keys_before_pruning > 0 {
- common_telemetry::debug!(
- "ShardReader metrics, data parts: {}, before pruning: {}, after pruning: {}, prune cost: {}s, build cost: {}s",
- self.parts_reader.num_parts(),
- self.keys_before_pruning,
- self.keys_after_pruning,
- shard_prune_pk,
- self.data_build_cost.as_secs_f64(),
- );
- }
- }
-}
-
-/// A merger that merges batches from multiple shards.
-pub(crate) struct ShardMerger {
- merger: Merger,
-}
-
-impl ShardMerger {
- pub(crate) fn try_new(nodes: Vec) -> Result {
- let merger = Merger::try_new(nodes)?;
- Ok(ShardMerger { merger })
- }
-}
-
-impl DataBatchSource for ShardMerger {
- fn is_valid(&self) -> bool {
- self.merger.is_valid()
- }
-
- fn next(&mut self) -> Result<()> {
- self.merger.next()
- }
-
- fn current_pk_id(&self) -> PkId {
- self.merger.current_node().current_pk_id()
- }
-
- fn current_key(&self) -> Option<&[u8]> {
- self.merger.current_node().current_key()
- }
-
- fn current_data_batch(&self) -> DataBatch<'_> {
- let batch = self.merger.current_node().current_data_batch();
- batch.slice(0, self.merger.current_rows())
- }
-}
-
-pub(crate) enum ShardSource {
- Builder(ShardBuilderReader),
- Shard(ShardReader),
-}
-
-impl ShardSource {
- fn is_valid(&self) -> bool {
- match self {
- ShardSource::Builder(r) => r.is_valid(),
- ShardSource::Shard(r) => r.is_valid(),
- }
- }
-
- fn next(&mut self) -> Result<()> {
- match self {
- ShardSource::Builder(r) => r.next(),
- ShardSource::Shard(r) => r.next(),
- }
- }
-
- fn current_pk_id(&self) -> PkId {
- match self {
- ShardSource::Builder(r) => r.current_pk_id(),
- ShardSource::Shard(r) => r.current_pk_id(),
- }
- }
-
- fn current_key(&self) -> Option<&[u8]> {
- match self {
- ShardSource::Builder(r) => r.current_key(),
- ShardSource::Shard(r) => r.current_key(),
- }
- }
-
- fn current_data_batch(&self) -> DataBatch<'_> {
- match self {
- ShardSource::Builder(r) => r.current_data_batch(),
- ShardSource::Shard(r) => r.current_data_batch(),
- }
- }
-}
-
-/// Node for the merger to get items.
-pub(crate) struct ShardNode {
- source: ShardSource,
-}
-
-impl ShardNode {
- pub(crate) fn new(source: ShardSource) -> Self {
- Self { source }
- }
-
- fn current_pk_id(&self) -> PkId {
- self.source.current_pk_id()
- }
-
- fn current_key(&self) -> Option<&[u8]> {
- self.source.current_key()
- }
-
- fn current_data_batch(&self) -> DataBatch<'_> {
- self.source.current_data_batch()
- }
-}
-
-impl PartialEq for ShardNode {
- fn eq(&self, other: &Self) -> bool {
- self.source.current_key() == other.source.current_key()
- }
-}
-
-impl Eq for ShardNode {}
-
-impl Ord for ShardNode {
- fn cmp(&self, other: &Self) -> Ordering {
- self.source
- .current_key()
- .cmp(&other.source.current_key())
- .reverse()
- }
-}
-
-impl PartialOrd for ShardNode {
- fn partial_cmp(&self, other: &Self) -> Option {
- Some(self.cmp(other))
- }
-}
-
-impl Node for ShardNode {
- fn is_valid(&self) -> bool {
- self.source.is_valid()
- }
-
- fn is_behind(&self, other: &Self) -> bool {
- // We expect a key only belongs to one shard.
- debug_assert_ne!(self.source.current_key(), other.source.current_key());
- self.source.current_key() < other.source.current_key()
- }
-
- fn advance(&mut self, len: usize) -> Result<()> {
- debug_assert_eq!(self.source.current_data_batch().num_rows(), len);
- self.source.next()
- }
-
- fn current_item_len(&self) -> usize {
- self.current_data_batch().num_rows()
- }
-
- fn search_key_in_current_item(&self, _other: &Self) -> Result {
- Err(self.source.current_data_batch().num_rows())
- }
-}
-
-#[cfg(test)]
-mod tests {
- use std::sync::Arc;
-
- use super::*;
- use crate::memtable::KeyValues;
- use crate::memtable::partition_tree::PkIndex;
- use crate::memtable::partition_tree::data::timestamp_array_to_i64_slice;
- use crate::memtable::partition_tree::dict::KeyDictBuilder;
- use crate::memtable::stats::WriteMetrics;
- use crate::test_util::memtable_util::{
- build_key_values_with_ts_seq_values, encode_keys, metadata_for_test,
- };
-
- /// Returns key values and expect pk index.
- fn input_with_key(metadata: &RegionMetadataRef) -> Vec<(KeyValues, PkIndex)> {
- vec![
- (
- build_key_values_with_ts_seq_values(
- metadata,
- "shard".to_string(),
- 2,
- [20, 21].into_iter(),
- [Some(0.0), Some(1.0)].into_iter(),
- 0,
- ),
- 2,
- ),
- (
- build_key_values_with_ts_seq_values(
- metadata,
- "shard".to_string(),
- 0,
- [0, 1].into_iter(),
- [Some(0.0), Some(1.0)].into_iter(),
- 1,
- ),
- 0,
- ),
- (
- build_key_values_with_ts_seq_values(
- metadata,
- "shard".to_string(),
- 1,
- [10, 11].into_iter(),
- [Some(0.0), Some(1.0)].into_iter(),
- 2,
- ),
- 1,
- ),
- ]
- }
-
- fn new_shard_with_dict(
- shard_id: ShardId,
- metadata: RegionMetadataRef,
- input: &[(KeyValues, PkIndex)],
- data_freeze_threshold: usize,
- ) -> Shard {
- let mut dict_builder = KeyDictBuilder::new(1024);
- let mut metrics = WriteMetrics::default();
- let mut keys = Vec::with_capacity(input.len());
- for (kvs, _) in input {
- encode_keys(&metadata, kvs, &mut keys);
- }
- for key in &keys {
- dict_builder.insert_key(key, None, &mut metrics);
- }
-
- let (dict, _) = dict_builder.finish().unwrap();
- let data_parts = DataParts::new(metadata, DATA_INIT_CAP, true);
-
- Shard::new(
- shard_id,
- Some(Arc::new(dict)),
- data_parts,
- true,
- data_freeze_threshold,
- )
- }
-
- fn collect_timestamps(shard: &Shard) -> Vec {
- let mut reader = shard.read().unwrap().build(None).unwrap();
- let mut timestamps = Vec::new();
- while reader.is_valid() {
- let rb = reader.current_data_batch().slice_record_batch();
- let ts_array = rb.column(1);
- let ts_slice = timestamp_array_to_i64_slice(ts_array);
- timestamps.extend_from_slice(ts_slice);
-
- reader.next().unwrap();
- }
- timestamps
- }
-
- #[test]
- fn test_write_read_shard() {
- let metadata = metadata_for_test();
- let input = input_with_key(&metadata);
- let mut shard = new_shard_with_dict(8, metadata, &input, 100);
- assert!(shard.is_empty());
- for (key_values, pk_index) in &input {
- for kv in key_values.iter() {
- let pk_id = PkId {
- shard_id: shard.shard_id,
- pk_index: *pk_index,
- };
- shard.write_with_pk_id(pk_id, &kv).unwrap();
- }
- }
- assert!(!shard.is_empty());
-
- let timestamps = collect_timestamps(&shard);
- assert_eq!(vec![0, 1, 10, 11, 20, 21], timestamps);
- }
-
- #[test]
- fn test_shard_freeze() {
- let metadata = metadata_for_test();
- let kvs = build_key_values_with_ts_seq_values(
- &metadata,
- "shard".to_string(),
- 0,
- [0].into_iter(),
- [Some(0.0)].into_iter(),
- 0,
- );
- let mut shard = new_shard_with_dict(8, metadata.clone(), &[(kvs, 0)], 50);
- let expected: Vec<_> = (0..200).collect();
- for i in &expected {
- let kvs = build_key_values_with_ts_seq_values(
- &metadata,
- "shard".to_string(),
- 0,
- [*i].into_iter(),
- [Some(0.0)].into_iter(),
- *i as u64,
- );
- let pk_id = PkId {
- shard_id: shard.shard_id,
- pk_index: *i as PkIndex,
- };
- for kv in kvs.iter() {
- shard.write_with_pk_id(pk_id, &kv).unwrap();
- }
- }
- assert!(!shard.is_empty());
- assert_eq!(3, shard.data_parts.frozen_len());
-
- let timestamps = collect_timestamps(&shard);
- assert_eq!(expected, timestamps);
- }
-}
diff --git a/src/mito2/src/memtable/partition_tree/shard_builder.rs b/src/mito2/src/memtable/partition_tree/shard_builder.rs
deleted file mode 100644
index 78eeb463c63b..000000000000
--- a/src/mito2/src/memtable/partition_tree/shard_builder.rs
+++ /dev/null
@@ -1,418 +0,0 @@
-// Copyright 2023 Greptime Team
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-//! Builder of a shard.
-
-use std::collections::HashMap;
-use std::sync::Arc;
-use std::time::{Duration, Instant};
-
-use mito_codec::key_values::KeyValue;
-use mito_codec::row_converter::PrimaryKeyFilter;
-use snafu::ResultExt;
-use store_api::metadata::RegionMetadataRef;
-
-use crate::error::{DecodeSnafu, Result};
-use crate::memtable::partition_tree::data::{
- DATA_INIT_CAP, DataBatch, DataBuffer, DataBufferReader, DataBufferReaderBuilder, DataParts,
-};
-use crate::memtable::partition_tree::dict::{DictBuilderReader, KeyDictBuilder};
-use crate::memtable::partition_tree::shard::Shard;
-use crate::memtable::partition_tree::{PartitionTreeConfig, PkId, PkIndex, ShardId};
-use crate::memtable::stats::WriteMetrics;
-use crate::metrics::PARTITION_TREE_READ_STAGE_ELAPSED;
-
-/// Builder to write keys and data to a shard that the key dictionary
-/// is still active.
-pub struct ShardBuilder {
- /// Id of the current shard to build.
- current_shard_id: ShardId,
- /// Builder for the key dictionary.
- dict_builder: KeyDictBuilder,
- /// Buffer to store data.
- data_buffer: DataBuffer,
- /// Number of rows to freeze a data part.
- data_freeze_threshold: usize,
- dedup: bool,
-}
-
-impl ShardBuilder {
- /// Returns a new builder.
- pub fn new(
- metadata: RegionMetadataRef,
- config: &PartitionTreeConfig,
- shard_id: ShardId,
- ) -> ShardBuilder {
- ShardBuilder {
- current_shard_id: shard_id,
- dict_builder: KeyDictBuilder::new(config.index_max_keys_per_shard),
- data_buffer: DataBuffer::with_capacity(metadata, DATA_INIT_CAP, config.dedup),
- data_freeze_threshold: config.data_freeze_threshold,
- dedup: config.dedup,
- }
- }
-
- /// Write a key value with given pk_index (caller must ensure the pk_index exist in dict_builder)
- pub fn write_with_pk_id(&mut self, pk_id: PkId, key_value: &KeyValue) {
- assert_eq!(self.current_shard_id, pk_id.shard_id);
- self.data_buffer.write_row(pk_id.pk_index, key_value);
- }
-
- /// Write a key value with its encoded primary key.
- pub fn write_with_key(
- &mut self,
- full_primary_key: &[u8],
- sparse_key: Option<&[u8]>,
- key_value: &KeyValue,
- metrics: &mut WriteMetrics,
- ) -> PkId {
- // Safety: we check whether the builder need to freeze before.
- let pk_index = self
- .dict_builder
- .insert_key(full_primary_key, sparse_key, metrics);
- self.data_buffer.write_row(pk_index, key_value);
- PkId {
- shard_id: self.current_shard_id,
- pk_index,
- }
- }
-
- /// Returns true if the builder need to freeze.
- pub fn should_freeze(&self) -> bool {
- self.dict_builder.is_full() || self.data_buffer.num_rows() == self.data_freeze_threshold
- }
-
- /// Returns the current shard id of the builder.
- pub fn current_shard_id(&self) -> ShardId {
- self.current_shard_id
- }
-
- /// Builds a new shard and resets the builder.
- ///
- /// Returns `None` if the builder is empty.
- pub fn finish(
- &mut self,
- metadata: RegionMetadataRef,
- pk_to_pk_id: &mut HashMap, PkId>,
- ) -> Result