diff --git a/docs/admin-manual/system-tables/information_schema/catalog_meta_cache_statistics.md b/docs/admin-manual/system-tables/information_schema/catalog_meta_cache_statistics.md index 53b2af25b4358..69d9388eb079b 100644 --- a/docs/admin-manual/system-tables/information_schema/catalog_meta_cache_statistics.md +++ b/docs/admin-manual/system-tables/information_schema/catalog_meta_cache_statistics.md @@ -18,46 +18,48 @@ View the metadata cache information of the External Catalog in the currently con ## Table Information -| Column Name | Type | Description | -| ------------ | ---- | ----------------------- | -| CATALOG_NAME | text | The name of the Catalog | -| CACHE_NAME | text | The name of the cache | -| METRIC_NAME | text | The name of the metric | -| METRIC_VALUE | text | The value of the metric | +One row represents one cache entry on one FE for one external catalog. + +| Column Name | Type | Description | +| ------------ | ---- | ----------- | +| FE_HOST | text | FE host that reports the stats | +| CATALOG_NAME | text | Catalog name | +| ENGINE_NAME | text | Meta cache engine name, such as `hive`, `iceberg`, `paimon` | +| ENTRY_NAME | text | Cache entry name inside the engine, such as `schema`, `file`, `manifest` | +| EFFECTIVE_ENABLED | boolean | Whether the cache is effectively enabled after evaluating `enable` / `ttl-second` / `capacity` | +| CONFIG_ENABLED | boolean | Raw `enable` flag from the cache config | +| AUTO_REFRESH | boolean | Whether async refresh-after-write is enabled for this entry | +| TTL_SECOND | bigint | TTL in seconds. 
`0` means disabled; `-1` means no expiration | +| CAPACITY | bigint | Max entry count | +| ESTIMATED_SIZE | bigint | Estimated current cache size | +| REQUEST_COUNT | bigint | Total requests | +| HIT_COUNT | bigint | Cache hits | +| MISS_COUNT | bigint | Cache misses | +| HIT_RATE | double | Hit rate | +| LOAD_SUCCESS_COUNT | bigint | Successful loads | +| LOAD_FAILURE_COUNT | bigint | Failed loads | +| TOTAL_LOAD_TIME_MS | bigint | Total load time in milliseconds | +| AVG_LOAD_PENALTY_MS | double | Average load time in milliseconds | +| EVICTION_COUNT | bigint | Evicted entries | +| INVALIDATE_COUNT | bigint | Explicit invalidations | +| LAST_LOAD_SUCCESS_TIME | text | Last successful load time | +| LAST_LOAD_FAILURE_TIME | text | Last failed load time | +| LAST_ERROR | text | Latest load error message | ## Usage Example -```text -+----------------------+-----------------------------+----------------------+---------------------+ -| CATALOG_NAME | CACHE_NAME | METRIC_NAME | METRIC_VALUE | -+----------------------+-----------------------------+----------------------+---------------------+ -| hive_iceberg_minio | iceberg_table_cache | eviction_count | 0 | -| hive_iceberg_minio | iceberg_table_cache | hit_ratio | 0.8235294117647058 | -| hive_iceberg_minio | iceberg_table_cache | average_load_penalty | 5.480102048333334E8 | -| hive_iceberg_minio | iceberg_table_cache | estimated_size | 6 | -| hive_iceberg_minio | iceberg_table_cache | hit_count | 28 | -| hive_iceberg_minio | iceberg_table_cache | read_count | 34 | -| hive_iceberg_minio | iceberg_snapshot_list_cache | eviction_count | 0 | -| hive_iceberg_minio | iceberg_snapshot_list_cache | hit_ratio | 1.0 | -| hive_iceberg_minio | iceberg_snapshot_list_cache | average_load_penalty | 0.0 | -| hive_iceberg_minio | iceberg_snapshot_list_cache | estimated_size | 0 | -| hive_iceberg_minio | iceberg_snapshot_list_cache | hit_count | 0 | -| hive_iceberg_minio | iceberg_snapshot_list_cache | read_count | 0 | -| 
hive_iceberg_minio | iceberg_snapshot_cache | eviction_count | 0 | -| hive_iceberg_minio | iceberg_snapshot_cache | hit_ratio | 0.45454545454545453 | -| hive_iceberg_minio | iceberg_snapshot_cache | average_load_penalty | 5.604907246666666E8 | -| hive_iceberg_minio | iceberg_snapshot_cache | estimated_size | 6 | -| hive_iceberg_minio | iceberg_snapshot_cache | hit_count | 5 | -| hive_iceberg_minio | iceberg_snapshot_cache | read_count | 11 | +```sql +SELECT catalog_name, engine_name, entry_name, + effective_enabled, ttl_second, capacity, + estimated_size, hit_rate, last_error +FROM information_schema.catalog_meta_cache_statistics +ORDER BY catalog_name, engine_name, entry_name; ``` -The METRIC_NAME column contains the following Caffeine cache performance metrics: -- eviction_count: The number of entries that have been evicted from the cache -- hit_ratio: The ratio of cache requests which were hits (ranges from 0.0 to 1.0) -- average_load_penalty: The average time spent loading new values (in nanoseconds) -- estimated_size: The approximate number of entries in the cache -- hit_count: The number of times cache lookup methods have returned a cached value -- read_count: The total number of times cache lookup methods have been called +Typical usage: +- Use `ENGINE_NAME` + `ENTRY_NAME` to identify one logical cache entry. +- Use `EFFECTIVE_ENABLED`, `TTL_SECOND`, and `CAPACITY` to confirm the applied cache policy. +- Use `HIT_RATE`, `ESTIMATED_SIZE`, `LOAD_FAILURE_COUNT`, and `LAST_ERROR` to diagnose behavior. diff --git a/docs/lakehouse/catalogs/hive-catalog.mdx b/docs/lakehouse/catalogs/hive-catalog.mdx index 190dd3bce7c7d..2e936d43f72b7 100644 --- a/docs/lakehouse/catalogs/hive-catalog.mdx +++ b/docs/lakehouse/catalogs/hive-catalog.mdx @@ -76,6 +76,78 @@ CREATE CATALOG [IF NOT EXISTS] catalog_name PROPERTIES ( The CommonProperties section is for entering common attributes. Please see the "Common Properties" section in the [Catalog Overview](../catalog-overview.md). 
+## Metadata Cache {#meta-cache} + +To improve the performance of accessing external data sources, Apache Doris caches Hive metadata. Metadata includes table structure (Schema), partition lists, partition properties, and file lists. + +:::tip +For versions before Doris 4.1.x, metadata caching is mainly controlled globally by FE configuration items. For details, see [Metadata Cache](../meta-cache.md). +Starting from Doris 4.1.x, Hive Catalog's external metadata cache is configured using the unified `meta.cache.*` keys. +::: + +### Unified Property Model (4.1.x+) {#meta-cache-unified-model} + +Each engine's cache entry uses a unified configuration key format: `meta.cache.<engine>.<entry>.{enable,ttl-second,capacity}`. + +| Property | Example | Meaning | +|---|---|---| +| `enable` | `true/false` | Whether to enable this cache module. | +| `ttl-second` | `600`, `0`, `-1` | `0` means disable cache (takes effect immediately, can be used to see the latest data); `-1` means never expire; other positive integers mean TTL in seconds based on access time. | +| `capacity` | `10000` | Maximum number of cache entries (by count). `0` means disable. | + +**Effective Logic:** The module cache only takes effect when `enable=true`, `ttl-second != 0`, and `capacity > 0`. + +### Cache Modules {#meta-cache-unified-modules} + +Hive Catalog includes the following cache modules: + +| Module (`<entry>`) | Property Key Prefix | Cached Content and Impact | +|---|---|---| +| `schema` | `meta.cache.hive.schema.` | Caches table structure. Impact: Visibility of table column information. If disabled, the latest Schema is pulled for each query. | +| `partition_values` | `meta.cache.hive.partition_values.` | Caches partition values/names list. Impact: Partition pruning and enumeration. If disabled, new external partitions can be seen in real-time. | +| `partition` | `meta.cache.hive.partition.` | Caches partition properties (Location, input format, etc.). Impact: Specific metadata of partitions. 
| +| `file` | `meta.cache.hive.file.` | Caches file lists. Impact: Reduces remote LIST operation overhead. If disabled, file changes can be seen in real-time. | + +### Legacy Parameter Mapping and Conversion {#meta-cache-mapping} + +In version 4.1.x and later, unified keys are recommended. The following is the mapping between legacy Catalog properties and 4.1.x+ unified keys: + +| Legacy Property Key | 4.1.x+ Unified Key | Description | +|---|---|---| +| `schema.cache.ttl-second` | `meta.cache.hive.schema.ttl-second` | Expiration time of table structure cache | +| `partition.cache.ttl-second` | `meta.cache.hive.partition_values.ttl-second` | Expiration time of partition value cache | +| `file.meta.cache.ttl-second` | `meta.cache.hive.file.ttl-second` | Expiration time of file list cache | + +### Best Practices {#meta-cache-best-practices} + +* **Real-time access to the latest data**: If you want each query to see the latest partition or file changes in the external data source, you can set the corresponding `ttl-second` to `0`. + ```sql + -- Disable file list cache to see file changes in real-time + ALTER CATALOG hive_ctl SET PROPERTIES ("meta.cache.hive.file.ttl-second" = "0"); + -- Disable partition value cache to see new partitions in real-time + ALTER CATALOG hive_ctl SET PROPERTIES ("meta.cache.hive.partition_values.ttl-second" = "0"); + ``` +* **Performance optimization**: For scenarios where metadata changes are infrequent, it is recommended to appropriately increase `capacity` and `ttl-second` to reduce access pressure on Hive Metastore and file systems. + +:::caution +**Hive Catalog Note**: Changes to `meta.cache.hive.*` properties **do not support hot-reload**. To ensure new configurations take effect, you must recreate the catalog or restart the FE node. 
+::: + +### Observability {#meta-cache-unified-observability} + +Cache metrics can be observed through the `information_schema.catalog_meta_cache_statistics` system table: + +```sql +SELECT catalog_name, engine_name, entry_name, + effective_enabled, ttl_second, capacity, + estimated_size, hit_rate, load_failure_count, last_error +FROM information_schema.catalog_meta_cache_statistics +WHERE catalog_name = 'hive_ctl' AND engine_name = 'hive' +ORDER BY entry_name; +``` + +See the documentation for this system table: [catalog_meta_cache_statistics](../../admin-manual/system-tables/information_schema/catalog_meta_cache_statistics.md). + ### Supported Hive Versions Supports Hive 1.x, 2.x, 3.x, and 4.x. diff --git a/docs/lakehouse/catalogs/hudi-catalog.md b/docs/lakehouse/catalogs/hudi-catalog.md index 22b8f227ac30d..7ac444f9b5f1f 100644 --- a/docs/lakehouse/catalogs/hudi-catalog.md +++ b/docs/lakehouse/catalogs/hudi-catalog.md @@ -51,6 +51,70 @@ CREATE CATALOG [IF NOT EXISTS] catalog_name PROPERTIES ( | ------------------------------- | -------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------- | ------------- | | `hudi.use_hive_sync_partition` | `use_hive_sync_partition` | Whether to use the partition information already synchronized by Hive Metastore. If true, partition information will be obtained directly from Hive Metastore. Otherwise, it will be obtained from the metadata file of the file system. Obtaining information from Hive Metastore is more efficient, but users need to ensure that the latest metadata has been synchronized to Hive Metastore. | false | +## Metadata Cache {#meta-cache} + +To improve the performance of accessing external data sources, Apache Doris caches Hudi metadata. Metadata includes table structure (Schema), partition information, FS View, and Meta Client objects. 
+ +:::tip +For versions before Doris 4.1.x, metadata caching is mainly controlled globally by FE configuration items. For details, see [Metadata Cache](../meta-cache.md). +Starting from Doris 4.1.x, Hudi-related external metadata cache is configured using the unified `meta.cache.*` keys. +::: + +### Unified Property Model (4.1.x+) {#meta-cache-unified-model} + +Each engine's cache entry uses a unified configuration key format: `meta.cache.<engine>.<entry>.{enable,ttl-second,capacity}`. + +| Property | Example | Meaning | +|---|---|---| +| `enable` | `true/false` | Whether to enable this cache module. | +| `ttl-second` | `600`, `0`, `-1` | `0` means disable cache (takes effect immediately, can be used to see the latest data); `-1` means never expire; other positive integers mean TTL in seconds based on access time. | +| `capacity` | `10000` | Maximum number of cache entries (by count). `0` means disable. | + +**Effective Logic:** The module cache only takes effect when `enable=true`, `ttl-second != 0`, and `capacity > 0`. + +### Cache Modules {#meta-cache-unified-modules} + +Hudi Catalog includes the following cache modules: + +| Module (`<entry>`) | Property Key Prefix | Cached Content and Impact | +|---|---|---| +| `schema` | `meta.cache.hudi.schema.` | Caches table structure. Impact: Visibility of table column information. If disabled, the latest Schema is pulled for each query. | +| `partition` | `meta.cache.hudi.partition.` | Caches Hudi partition-related metadata. Impact: Used for partition discovery and pruning. | +| `fs_view` | `meta.cache.hudi.fs_view.` | Caches Hudi filesystem view related metadata. | +| `meta_client` | `meta.cache.hudi.meta_client.` | Caches Hudi Meta Client objects. Impact: Reduces redundant loading of Hudi metadata. | + +### Legacy Parameter Mapping and Conversion {#meta-cache-mapping} + +In version 4.1.x and later, unified keys are recommended. 
The following is the mapping between legacy Catalog properties and 4.1.x+ unified keys: + +| Legacy Property Key | 4.1.x+ Unified Key | Description | +|---|---|---| +| `schema.cache.ttl-second` | `meta.cache.hudi.schema.ttl-second` | Expiration time of table structure cache | + +### Best Practices {#meta-cache-best-practices} + +* **Real-time access to the latest data**: If you want each query to see the latest data changes or schema changes for Hudi tables, you can set the `ttl-second` for `schema` or `partition` to `0`. + ```sql + -- Disable partition metadata cache to detect the latest partition changes in Hudi tables + ALTER CATALOG hudi_ctl SET PROPERTIES ("meta.cache.hudi.partition.ttl-second" = "0"); + ``` +* **Hot-reload**: Changes made via `ALTER CATALOG ... SET PROPERTIES` are hot-reloaded for Hudi (via the HMS catalog property update path). + +### Observability {#meta-cache-unified-observability} + +Cache metrics can be observed through the `information_schema.catalog_meta_cache_statistics` system table: + +```sql +SELECT catalog_name, engine_name, entry_name, + effective_enabled, ttl_second, capacity, + estimated_size, hit_rate, load_failure_count, last_error +FROM information_schema.catalog_meta_cache_statistics +WHERE catalog_name = 'hudi_ctl' AND engine_name = 'hudi' +ORDER BY entry_name; +``` + +See the documentation for this system table: [catalog_meta_cache_statistics](../../admin-manual/system-tables/information_schema/catalog_meta_cache_statistics.md). + ### Supported Hudi Versions The current dependent Hudi version is 0.15. It is recommended to access Hudi data version 0.14 and above. 
diff --git a/docs/lakehouse/catalogs/iceberg-catalog.mdx b/docs/lakehouse/catalogs/iceberg-catalog.mdx index f8c2b78d05181..0b471a3b66548 100644 --- a/docs/lakehouse/catalogs/iceberg-catalog.mdx +++ b/docs/lakehouse/catalogs/iceberg-catalog.mdx @@ -85,6 +85,78 @@ CREATE CATALOG [IF NOT EXISTS] catalog_name PROPERTIES ( The CommonProperties section is for entering general properties. See the [Catalog Overview](../catalog-overview.md) for details on common properties. +## Metadata Cache {#meta-cache} + +To improve the performance of accessing external data sources, Apache Doris caches Iceberg metadata. Metadata includes table structure (Schema), table objects, view objects, and manifest details. + +:::tip +For versions before Doris 4.1.x, metadata caching is mainly controlled globally by FE configuration items. For details, see [Metadata Cache](../meta-cache.md). +Starting from Doris 4.1.x, Iceberg Catalog's external metadata cache is configured using the unified `meta.cache.*` keys. +::: + +### Unified Property Model (4.1.x+) {#meta-cache-unified-model} + +Each engine's cache entry uses a unified configuration key format: `meta.cache.<engine>.<entry>.{enable,ttl-second,capacity}`. + +| Property | Example | Meaning | +|---|---|---| +| `enable` | `true/false` | Whether to enable this cache module. | +| `ttl-second` | `600`, `0`, `-1` | `0` means disable cache (takes effect immediately, can be used to see the latest data); `-1` means never expire; other positive integers mean TTL in seconds based on access time. | +| `capacity` | `10000` | Maximum number of cache entries (by count). `0` means disable. | + +**Effective Logic:** The module cache only takes effect when `enable=true`, `ttl-second != 0`, and `capacity > 0`. + +### Cache Modules {#meta-cache-unified-modules} + +Iceberg Catalog includes the following cache modules: + +| Module (`<entry>`) | Property Key Prefix | Cached Content and Impact | +|---|---|---| +| `schema` | `meta.cache.iceberg.schema.` | Caches table structure. 
Impact: Visibility of table column information. If disabled, the latest Schema is pulled for each query. | +| `table` | `meta.cache.iceberg.table.` | Caches Iceberg table metadata objects. Impact: Reduces Catalog/Metastore round-trips. | +| `view` | `meta.cache.iceberg.view.` | Caches Iceberg View metadata objects. | +| `manifest` | `meta.cache.iceberg.manifest.` | Caches manifest details. Impact: Reduces repeated manifest access overhead. Note: This module is disabled by default and must be enabled manually. | + +### Legacy Parameter Mapping and Conversion {#meta-cache-mapping} + +In version 4.1.x and later, unified keys are recommended. The following is the mapping between legacy Catalog properties and 4.1.x+ unified keys: + +| Legacy Property Key | 4.1.x+ Unified Key | Description | +|---|---|---| +| `schema.cache.ttl-second` | `meta.cache.iceberg.schema.ttl-second` | Expiration time of table structure cache | + +### Best Practices {#meta-cache-best-practices} + +* **Real-time access to the latest data**: If you want each query to see the latest snapshots or schema changes for Iceberg tables, you can set the `ttl-second` for `schema` or `table` to `0`. + ```sql + -- Disable table object cache to detect snapshot changes + ALTER CATALOG iceberg_ctl SET PROPERTIES ("meta.cache.iceberg.table.ttl-second" = "0"); + ``` +* **Performance optimization**: + * Enabling manifest cache can significantly speed up query planning for large tables: + ```sql + ALTER CATALOG iceberg_ctl SET PROPERTIES ( + "meta.cache.iceberg.manifest.enable" = "true", + "meta.cache.iceberg.manifest.ttl-second" = "600" + ); + ``` + * Changes via `ALTER CATALOG ... SET PROPERTIES` support hot-reload in Iceberg Catalog. 
+ +### Observability {#meta-cache-unified-observability} + +Cache metrics can be observed through the `information_schema.catalog_meta_cache_statistics` system table: + +```sql +SELECT catalog_name, engine_name, entry_name, + effective_enabled, ttl_second, capacity, + estimated_size, hit_rate, load_failure_count, last_error +FROM information_schema.catalog_meta_cache_statistics +WHERE catalog_name = 'iceberg_ctl' AND engine_name = 'iceberg' +ORDER BY entry_name; +``` + +See the documentation for this system table: [catalog_meta_cache_statistics](../../admin-manual/system-tables/information_schema/catalog_meta_cache_statistics.md). + ### Supported Iceberg Versions | Doris Version | Iceberg SDK Version | diff --git a/docs/lakehouse/catalogs/maxcompute-catalog.md b/docs/lakehouse/catalogs/maxcompute-catalog.md index 1bea22a762869..c5c8b7fb2ab01 100644 --- a/docs/lakehouse/catalogs/maxcompute-catalog.md +++ b/docs/lakehouse/catalogs/maxcompute-catalog.md @@ -68,6 +68,68 @@ CREATE CATALOG [IF NOT EXISTS] catalog_name PROPERTIES ( The CommonProperties section is used to fill in common properties. Please refer to the "Common Properties" section in [Catalog Overview](../catalog-overview.md). +## Metadata Cache {#meta-cache} + +To improve the performance of accessing external data sources, Apache Doris caches MaxCompute metadata. Metadata includes table structure (Schema) and partition lists. + +:::tip +For versions before Doris 4.1.x, metadata caching is mainly controlled globally by FE configuration items. For details, see [Metadata Cache](../meta-cache.md). +Starting from Doris 4.1.x, MaxCompute Catalog's external metadata cache is configured using the unified `meta.cache.*` keys. +::: + +### Unified Property Model (4.1.x+) {#meta-cache-unified-model} + +Each engine's cache entry uses a unified configuration key format: `meta.cache.<engine>.<entry>.{enable,ttl-second,capacity}`. 
+ +| Property | Example | Meaning | +|---|---|---| +| `enable` | `true/false` | Whether to enable this cache module. | +| `ttl-second` | `600`, `0`, `-1` | `0` means disable cache (takes effect immediately, can be used to see the latest data); `-1` means never expire; other positive integers mean TTL in seconds based on access time. | +| `capacity` | `10000` | Maximum number of cache entries (by count). `0` means disable. | + +**Effective Logic:** The module cache only takes effect when `enable=true`, `ttl-second != 0`, and `capacity > 0`. + +### Cache Modules {#meta-cache-unified-modules} + +MaxCompute Catalog includes the following cache modules: + +| Module (`<entry>`) | Property Key Prefix | Cached Content and Impact | +|---|---|---| +| `schema` | `meta.cache.maxcompute.schema.` | Caches table structure. Impact: Visibility of table column information. If disabled, the latest Schema is pulled for each query. | +| `partition_values` | `meta.cache.maxcompute.partition_values.` | Caches partition value lists. Impact: Partition pruning and enumeration. If disabled, new external partitions can be seen in real-time. | + +### Legacy Parameter Mapping and Conversion {#meta-cache-mapping} + +In version 4.1.x and later, unified keys are recommended. The following is the mapping between legacy Catalog properties and 4.1.x+ unified keys: + +| Legacy Property Key | 4.1.x+ Unified Key | Description | +|---|---|---| +| `schema.cache.ttl-second` | `meta.cache.maxcompute.schema.ttl-second` | Expiration time of table structure cache | + +### Best Practices {#meta-cache-best-practices} + +* **Real-time access to the latest data**: If you want each query to see the latest partition or schema changes for MaxCompute tables, you can set the `ttl-second` for `schema` or `partition_values` to `0`. 
+ ```sql + -- Disable partition value cache to detect the latest partitions in MaxCompute tables + ALTER CATALOG mc_ctl SET PROPERTIES ("meta.cache.maxcompute.partition_values.ttl-second" = "0"); + ``` +* **Note**: `meta.cache.maxcompute.*` currently does not have a dedicated hot-reload hook. After changing the configuration, it is recommended to recreate the Catalog or restart FE to ensure it takes effect. + +### Observability {#meta-cache-unified-observability} + +Cache metrics can be observed through the `information_schema.catalog_meta_cache_statistics` system table: + +```sql +SELECT catalog_name, engine_name, entry_name, + effective_enabled, ttl_second, capacity, + estimated_size, hit_rate, load_failure_count, last_error +FROM information_schema.catalog_meta_cache_statistics +WHERE catalog_name = 'mc_ctl' AND engine_name = 'maxcompute' +ORDER BY entry_name; +``` + +See the documentation for this system table: [catalog_meta_cache_statistics](../../admin-manual/system-tables/information_schema/catalog_meta_cache_statistics.md). + ### Supported MaxCompute Versions Only the public cloud version of MaxCompute is supported. For private cloud version support, please contact Doris community support. diff --git a/docs/lakehouse/catalogs/paimon-catalog.mdx b/docs/lakehouse/catalogs/paimon-catalog.mdx index 9a07d2ed16268..f333e50eb6a99 100644 --- a/docs/lakehouse/catalogs/paimon-catalog.mdx +++ b/docs/lakehouse/catalogs/paimon-catalog.mdx @@ -88,6 +88,68 @@ CREATE CATALOG [IF NOT EXISTS] catalog_name PROPERTIES ( The CommonProperties section is used to fill in common properties. Please refer to the [Catalog Overview](../catalog-overview.md) section on [Common Properties]. +## Metadata Cache {#meta-cache} + +To improve the performance of accessing external data sources, Apache Doris caches Paimon metadata. Metadata includes table structure (Schema) and table objects. 
+ +:::tip +For versions before Doris 4.1.x, metadata caching is mainly controlled globally by FE configuration items. For details, see [Metadata Cache](../meta-cache.md). +Starting from Doris 4.1.x, Paimon Catalog's external metadata cache is configured using the unified `meta.cache.*` keys. +::: + +### Unified Property Model (4.1.x+) {#meta-cache-unified-model} + +Each engine's cache entry uses a unified configuration key format: `meta.cache.<engine>.<entry>.{enable,ttl-second,capacity}`. + +| Property | Example | Meaning | +|---|---|---| +| `enable` | `true/false` | Whether to enable this cache module. | +| `ttl-second` | `600`, `0`, `-1` | `0` means disable cache (takes effect immediately, can be used to see the latest data); `-1` means never expire; other positive integers mean TTL in seconds based on access time. | +| `capacity` | `10000` | Maximum number of cache entries (by count). `0` means disable. | + +**Effective Logic:** The module cache only takes effect when `enable=true`, `ttl-second != 0`, and `capacity > 0`. + +### Cache Modules {#meta-cache-unified-modules} + +Paimon Catalog includes the following cache modules: + +| Module (`<entry>`) | Property Key Prefix | Cached Content and Impact | +|---|---|---| +| `schema` | `meta.cache.paimon.schema.` | Caches table structure. Impact: Visibility of table column information. If disabled, the latest Schema is pulled for each query. | +| `table` | `meta.cache.paimon.table.` | Caches Paimon table metadata objects. Impact: Reduces metadata loading overhead during query planning. | + +### Legacy Parameter Mapping and Conversion {#meta-cache-mapping} + +In version 4.1.x and later, unified keys are recommended. 
The following is the mapping between legacy Catalog properties and 4.1.x+ unified keys: + +| Legacy Property Key | 4.1.x+ Unified Key | Description | +|---|---|---| +| `schema.cache.ttl-second` | `meta.cache.paimon.schema.ttl-second` | Expiration time of table structure cache | + +### Best Practices {#meta-cache-best-practices} + +* **Real-time access to the latest data**: If you want each query to see the latest data changes or schema changes for Paimon tables, you can set the `ttl-second` for `schema` or `table` to `0`. + ```sql + -- Disable table object cache to detect the latest snapshots of Paimon tables + ALTER CATALOG paimon_ctl SET PROPERTIES ("meta.cache.paimon.table.ttl-second" = "0"); + ``` +* **Hot-reload**: Changes made via `ALTER CATALOG ... SET PROPERTIES` are hot-reloaded in Paimon Catalog. + +### Observability {#meta-cache-unified-observability} + +Cache metrics can be observed through the `information_schema.catalog_meta_cache_statistics` system table: + +```sql +SELECT catalog_name, engine_name, entry_name, + effective_enabled, ttl_second, capacity, + estimated_size, hit_rate, load_failure_count, last_error +FROM information_schema.catalog_meta_cache_statistics +WHERE catalog_name = 'paimon_ctl' AND engine_name = 'paimon' +ORDER BY entry_name; +``` + +See the documentation for this system table: [catalog_meta_cache_statistics](../../admin-manual/system-tables/information_schema/catalog_meta_cache_statistics.md). + ### Supported Paimon Versions The currently dependent Paimon version is 1.0.0. diff --git a/docs/lakehouse/meta-cache.md b/docs/lakehouse/meta-cache.md index 16985562ea399..5a720093b96bf 100644 --- a/docs/lakehouse/meta-cache.md +++ b/docs/lakehouse/meta-cache.md @@ -6,18 +6,19 @@ } --- +:::tip +This document applies to versions before 4.1.x. +For Doris 4.1.x and later, external meta cache has been refactored with unified configuration keys `meta.cache.*`. 
Please refer to the "Metadata Cache" section in each [Catalog](./catalog-overview.md) document. +::: + To improve the performance of accessing external data sources, Apache Doris caches the **metadata** of external data sources. Metadata includes information such as databases, tables, columns, partitions, snapshots, file lists, etc. -This article details the types, strategies, and related parameter configurations of cached metadata. +This article details the types, strategies, and related parameter configurations of cached metadata for legacy versions (pre-4.1). For **data cache**, refer to the [data cache documentation](./data-cache.md). -:::tip -This document applies to versions after 2.1.6. -::: - ## Cache Strategies Most caches have the following three strategy indicators: @@ -52,6 +53,19 @@ Most caches have the following three strategy indicators: ## Cache Types +The following sections describe representative FE-level defaults and legacy cache controls. +They should not be read as the complete cache entry matrix for Doris 4.1.x+. 
+ +| Category | Scope | Main FE defaults | Notes | +|---|---|---|---| +| Database / table name lists | Per catalog / per database | `external_cache_expire_time_seconds_after_access`, `external_cache_refresh_time_minutes` | Used by `SHOW DATABASES` / `SHOW TABLES` | +| Database / table objects | Per catalog / per database | `max_meta_object_cache_num`, `external_cache_expire_time_seconds_after_access`, `external_cache_refresh_time_minutes` | Object cache can diverge temporarily from name-list cache | +| Table schema | Per catalog | `max_external_schema_cache_num`, `external_cache_expire_time_seconds_after_access`, `external_cache_refresh_time_minutes` | Legacy per-catalog override: `schema.cache.ttl-second` | +| Hive partition values | Per Hive catalog | `max_hive_partition_table_cache_num`, `external_cache_expire_time_seconds_after_access`, `external_cache_refresh_time_minutes` | Legacy per-catalog override: `partition.cache.ttl-second` | +| Hive partition properties | Per Hive catalog | `max_hive_partition_cache_num`, `external_cache_expire_time_seconds_after_access` | No legacy per-catalog TTL override | +| Hive file lists | Per Hive catalog | `max_external_file_cache_num`, `external_cache_expire_time_seconds_after_access`, `external_cache_refresh_time_minutes` | Legacy per-catalog override: `file.meta.cache.ttl-second` | +| Hudi / Iceberg / Paimon legacy table-level metadata | Per catalog | `max_external_table_cache_num`, `external_cache_expire_time_seconds_after_access`, `external_cache_refresh_time_minutes` | For Doris 4.1.x+, use the catalog pages for current cache entries such as `fs_view`, `meta_client`, `view`, and `manifest` | + ### Database and Table Name Lists The database name list refers to the list of all database names under a Catalog. @@ -84,7 +98,7 @@ Note that the list of objects in this cache may be inconsistent with the **datab For example, through the `SHOW TABLES` command, you get tables `A`, `B`, and `C` from the name list cache. 
Suppose table `D` is added to the external data source at this time, then `SELECT * FROM D` can access table `D`, and the [table object] cache will add the table `D` object, but the [table name list] cache may still be `A`, `B`, `C`. Only when the [table name list] cache is refreshed will it become `A`, `B`, `C`, `D`. -Each Catalog has a database name list cache. Each database has a table name list cache. +Each Catalog has a database object cache. Each database has a table object cache. - Maximum cache count @@ -104,7 +118,7 @@ Each Catalog has a database name list cache. Each database has a table name list Caches the schema information of tables, such as column names. This cache is mainly used to load the schema of accessed tables on demand, to prevent synchronizing a large number of unnecessary table schemas and occupying FE memory. -This cache is shared by all Catalogs and is globally unique. +This cache is managed per catalog. - Maximum cache count @@ -172,7 +186,7 @@ Used to cache the file list information under a single partition of a Hive table - Maximum cache count - Controlled by the FE configuration item `max_external_file_cache_num`, default is 100000. + Controlled by the FE configuration item `max_external_file_cache_num`, default is 10000. You can adjust this parameter appropriately according to the number of files to be accessed. @@ -186,13 +200,14 @@ Used to cache the file list information under a single partition of a Hive table - Minimum refresh time - Controlled by the FE configuration item `external_cache_expire_time_minutes_after_access`, in minutes. Default is 10 minutes. Reducing this time allows you to see the latest partition properties in Doris more in real time, but increases the frequency of accessing external data sources. + Controlled by the FE configuration item `external_cache_expire_time_minutes_after_access`, in minutes. Default is 10 minutes. 
Reducing this time allows you to see the latest file list in Doris more in real time, but increases the frequency of accessing external data sources. After version 3.0.7, the configuration item name is changed to `external_cache_refresh_time_minutes`. The default value remains unchanged. ### Hudi Table Partitions -Used to cache partition information of Hudi tables. +Legacy summary of Hudi partition metadata caching. +Current Hudi cache entries in Doris 4.1.x+ also include `fs_view` and `meta_client`; see [Hudi Catalog](./catalogs/hudi-catalog.md#meta-cache-unified). This cache, each Hudi Catalog has one. @@ -214,7 +229,8 @@ This cache, each Hudi Catalog has one. ### Iceberg Table Information -Used to cache Iceberg table objects. The object is loaded and constructed through the Iceberg API. +Legacy summary of Iceberg table metadata caching. The table object is loaded and constructed through the Iceberg API. +For Doris 4.1.x+, the current observable cache entries are documented in [Iceberg Catalog](./catalogs/iceberg-catalog.mdx#meta-cache-unified). This cache, each Iceberg Catalog has one. @@ -234,10 +250,10 @@ This cache, each Iceberg Catalog has one. After version 3.0.7, the configuration item name is changed to `external_cache_refresh_time_minutes`. The default value remains unchanged. -### Iceberg Table Snapshot +### Iceberg Snapshot-Related Metadata -Used to cache the snapshot list of Iceberg tables. The object is loaded and constructed through the Iceberg API. -This cache, each Iceberg Catalog has one. +Legacy summary of snapshot-related metadata derived from Iceberg table metadata. +In current implementations, this should not be read as a separate 4.1.x cache entry alongside `table`, `view`, or `manifest`. - Maximum cache count @@ -257,37 +273,19 @@ This cache, each Iceberg Catalog has one. ## Cache Refresh -In addition to the refresh and eviction strategies of each cache above, users can also directly refresh the metadata cache manually or on a schedule. 
+In addition to the refresh and eviction strategies above, users can also refresh metadata manually or on a schedule. ### Manual Refresh -Users can manually refresh metadata using the `REFRESH` command. +Use the `REFRESH` statement to invalidate catalog, database, or table metadata. +For current syntax, privileges, and examples, see [REFRESH](../sql-manual/sql-statements/catalog/REFRESH.md). -1. REFRESH CATALOG +Behavior summary: - Refresh the specified Catalog. - - `REFRESH CATALOG ctl1 PROPERTIES("invalid_cache" = "true");` - - - This command refreshes the database list, table column names, and all cache information of the specified Catalog. - - `invalid_cache` indicates whether to refresh caches such as partitions and file lists. The default is true. If false, only the database and table lists of the Catalog will be refreshed, but not caches such as partitions and file lists. This parameter is suitable for cases where the user only wants to synchronize newly added or deleted databases and tables, and can be set to false. - -2. REFRESH DATABASE - - Refresh the specified Database. - - `REFRESH DATABASE [ctl.]db1 PROPERTIES("invalid_cache" = "true");` - - - This command refreshes the table column names and all cache information under the specified Database. - - The meaning of the `invalid_cache` property is the same as above. The default is true. If false, only the table list of the Database will be refreshed, but not the cache information. This parameter is suitable for cases where the user only wants to synchronize newly added or deleted tables. - -3. REFRESH TABLE - - Refresh the specified Table. - - `REFRESH TABLE [ctl.][db.]tbl1;` - - - This command refreshes all cache information under the specified Table. +- `REFRESH CATALOG` invalidates catalog-level object caches and, by default, lower-level metadata caches. +- `REFRESH DATABASE` invalidates metadata under one database. +- `REFRESH TABLE` invalidates metadata for one table. 
+- For `REFRESH CATALOG`, `invalid_cache = false` keeps lower-level caches and refreshes only object/name lists. ### Scheduled Refresh @@ -303,7 +301,7 @@ CREATE CATALOG hive PROPERTIES ( In the above example, `metadata_refresh_interval_sec` means the Catalog is refreshed every 3600 seconds. This is equivalent to automatically executing once every 3600 seconds: -`REFRESH CATALOG ctl1 PROPERTIES("invalid_cache" = "true");` +`REFRESH CATALOG ctl1;` ## Best Practices @@ -328,11 +326,11 @@ For all types of External Catalogs, if you want to see the latest Table Schema i max_external_schema_cache_num=0 // Disable Schema cache. ``` -- Disable at Catalog level +- Legacy catalog-level property ```text -- Catalog property - "schema.cache.ttl-second" = "0" // For a specific Catalog, disable Schema cache (supported in 2.1.11, 3.0.6) + "schema.cache.ttl-second" = "0" // Legacy property, supported in 2.1.11 / 3.0.6 ``` After setting, Doris will see the latest Table Schema in real time. However, this setting may increase the pressure on the metadata service. @@ -347,20 +345,21 @@ For Hive Catalog, if you want to disable the cache to query real-time updated da -- fe.conf max_external_file_cache_num=0 // Disable file list cache max_hive_partition_table_cache_num=0 // Disable partition list cache + max_hive_partition_cache_num=0 // Disable partition property cache ``` -- Disable at Catalog level +- Legacy catalog-level properties ```text -- Catalog property - "file.meta.cache.ttl-second" = "0" // For a specific Catalog, disable file list cache - "partition.cache.ttl-second" = "0" // For a specific Catalog, disable partition list cache (supported in 2.1.11, 3.0.6) + "file.meta.cache.ttl-second" = "0" // Disable file list cache + "partition.cache.ttl-second" = "0" // Disable partition list cache (supported in 2.1.11 / 3.0.6) ``` After setting the above parameters: - New partitions in the external data source can be queried in real time. 
- Changes in partition data files can be queried in real time. +- Changes in partition properties require disabling the partition property cache as well. But this will increase the access pressure on external data sources (such as Hive Metastore and HDFS), which may cause unstable metadata access latency and other phenomena. - diff --git a/docs/sql-manual/sql-statements/catalog/REFRESH.md b/docs/sql-manual/sql-statements/catalog/REFRESH.md index 7c7b520211363..fcf2c613560bc 100644 --- a/docs/sql-manual/sql-statements/catalog/REFRESH.md +++ b/docs/sql-manual/sql-statements/catalog/REFRESH.md @@ -13,7 +13,8 @@ This statement refreshes the metadata of the specified Catalog/Database/Table. ## Syntax ```sql -REFRESH CATALOG <catalog_name>; +REFRESH CATALOG <catalog_name> + [PROPERTIES ("invalid_cache" = "true" | "false")]; REFRESH DATABASE [<catalog_name>.]<database_name>; REFRESH TABLE [[<catalog_name>.]<database_name>.]<table_name>; ``` @@ -41,6 +42,13 @@ The name of the table within the catalog that needs to be refreshed. ## Usage Notes When the Catalog is refreshed, the object-related Cache is forced to be invalidated. Including Partition Cache, Schema Cache, File Cache, etc. +`invalid_cache` controls whether lower-level metadata caches are invalidated during `REFRESH CATALOG`: + +- `true`: invalidate catalog object caches and lower-level caches such as partition, schema, and file caches. This is the default behavior. +- `false`: refresh only catalog-level object/name metadata and keep lower-level caches. + +`invalid_cache` currently applies to `REFRESH CATALOG`. + ## Example 1. Refresh hive catalog @@ -49,14 +57,20 @@ When the Catalog is refreshed, the object-related Cache is forced to be invalida REFRESH CATALOG hive; ``` -2. Refresh database1 +2. Refresh hive catalog without invalidating lower-level caches + + ```sql + REFRESH CATALOG hive PROPERTIES("invalid_cache" = "false"); + ``` + +3. Refresh database1 ```sql REFRESH DATABASE ctl.database1; REFRESH DATABASE database1; ``` -3. Refresh table1 +4.
Refresh table1 ```sql REFRESH TABLE ctl.db.table1; @@ -64,4 +78,3 @@ When the Catalog is refreshed, the object-related Cache is forced to be invalida REFRESH TABLE table1; ``` - diff --git a/i18n/zh-CN/docusaurus-plugin-content-docs/current/admin-manual/system-tables/information_schema/catalog_meta_cache_statistics.md b/i18n/zh-CN/docusaurus-plugin-content-docs/current/admin-manual/system-tables/information_schema/catalog_meta_cache_statistics.md index b2deb486b69d3..4aa3cff765b86 100644 --- a/i18n/zh-CN/docusaurus-plugin-content-docs/current/admin-manual/system-tables/information_schema/catalog_meta_cache_statistics.md +++ b/i18n/zh-CN/docusaurus-plugin-content-docs/current/admin-manual/system-tables/information_schema/catalog_meta_cache_statistics.md @@ -18,41 +18,47 @@ ## 表信息 -| 列名 | 类型 | 说明 | +该表中一行表示“某个 FE 上、某个 external catalog 的一个 cache entry”的统计快照。 + +| 列名 | 类型 | 说明 | | :----------- | :--- | :----------- | +| FE_HOST | text | 上报该统计的 FE 主机 | | CATALOG_NAME | text | Catalog 名字 | -| CACHE_NAME | text | 缓存名字 | -| METRIC_NAME | text | 指标名字 | -| METRIC_VALUE | text | 指标值 | +| ENGINE_NAME | text | Meta cache 引擎名,如 `hive`、`iceberg`、`paimon` | +| ENTRY_NAME | text | 引擎内部的 cache entry 名,如 `schema`、`file`、`manifest` | +| EFFECTIVE_ENABLED | boolean | 综合 `enable` / `ttl-second` / `capacity` 后,该缓存是否真正生效 | +| CONFIG_ENABLED | boolean | 配置中的原始 `enable` 值 | +| AUTO_REFRESH | boolean | 该 entry 是否启用异步 refresh-after-write | +| TTL_SECOND | bigint | TTL 秒数。`0` 表示关闭,`-1` 表示永不过期 | +| CAPACITY | bigint | 最大条目数 | +| ESTIMATED_SIZE | bigint | 当前缓存条目估计数 | +| REQUEST_COUNT | bigint | 总请求数 | +| HIT_COUNT | bigint | 命中次数 | +| MISS_COUNT | bigint | 未命中次数 | +| HIT_RATE | double | 命中率 | +| LOAD_SUCCESS_COUNT | bigint | 成功加载次数 | +| LOAD_FAILURE_COUNT | bigint | 失败加载次数 | +| TOTAL_LOAD_TIME_MS | bigint | 总加载耗时,单位毫秒 | +| AVG_LOAD_PENALTY_MS | double | 平均加载耗时,单位毫秒 | +| EVICTION_COUNT | bigint | 被驱逐条目数 | +| INVALIDATE_COUNT | bigint | 显式失效次数 | +| LAST_LOAD_SUCCESS_TIME | text | 最近一次成功加载时间 | +| 
LAST_LOAD_FAILURE_TIME | text | 最近一次失败加载时间 | +| LAST_ERROR | text | 最近一次加载失败错误信息 | ## 使用示例 -```text -mysql> select * from catalog_meta_cache_statistics; -+----------------------+-----------------------------+----------------------+----------------------+ -| CATALOG_NAME | CACHE_NAME | METRIC_NAME | METRIC_VALUE | -+----------------------+-----------------------------+----------------------+----------------------+ -| hive_iceberg_minio | iceberg_table_cache | eviction_count | 0 | -| hive_iceberg_minio | iceberg_table_cache | hit_ratio | 0.2413793103448276 | -| hive_iceberg_minio | iceberg_table_cache | average_load_penalty | 2.4654859845454547E8 | -| hive_iceberg_minio | iceberg_table_cache | estimated_size | 22 | -| hive_iceberg_minio | iceberg_table_cache | hit_count | 7 | -| hive_iceberg_minio | iceberg_table_cache | read_count | 29 | -| hive_iceberg_minio | iceberg_snapshot_cache | eviction_count | 0 | -| hive_iceberg_minio | iceberg_snapshot_cache | hit_ratio | 1.0 | -| hive_iceberg_minio | iceberg_snapshot_cache | average_load_penalty | 0.0 | -| hive_iceberg_minio | iceberg_snapshot_cache | estimated_size | 0 | -| hive_iceberg_minio | iceberg_snapshot_cache | hit_count | 0 | -| hive_iceberg_minio | iceberg_snapshot_cache | read_count | 0 | -+----------------------+-----------------------------+----------------------+----------------------+ +```sql +SELECT catalog_name, engine_name, entry_name, + effective_enabled, ttl_second, capacity, + estimated_size, hit_rate, last_error +FROM information_schema.catalog_meta_cache_statistics +ORDER BY catalog_name, engine_name, entry_name; ``` -METRIC_NAME 列包含以下 Caffeine 缓存性能指标: +常见用法: -- eviction_count:从缓存中驱逐的条目数量 -- hit_ratio:缓存命中率,范围从 0.0 到 1.0 -- average_load_penalty:加载新值的平均耗时(纳秒) -- estimated_size:缓存中条目的估计数量 -- hit_count:缓存查找方法返回缓存值的次数 -- read_count:缓存查找方法被调用的总次数 \ No newline at end of file +- 用 `ENGINE_NAME` + `ENTRY_NAME` 定位具体的逻辑缓存。 +- 用 `EFFECTIVE_ENABLED`、`TTL_SECOND`、`CAPACITY` 确认实际生效的缓存策略。 +- 用 
`HIT_RATE`、`ESTIMATED_SIZE`、`LOAD_FAILURE_COUNT`、`LAST_ERROR` 排查缓存行为。 diff --git a/i18n/zh-CN/docusaurus-plugin-content-docs/current/lakehouse/catalog-overview.md b/i18n/zh-CN/docusaurus-plugin-content-docs/current/lakehouse/catalog-overview.md index 5b096d18f0096..a071dbd83d2c8 100644 --- a/i18n/zh-CN/docusaurus-plugin-content-docs/current/lakehouse/catalog-overview.md +++ b/i18n/zh-CN/docusaurus-plugin-content-docs/current/lakehouse/catalog-overview.md @@ -324,7 +324,8 @@ REFRESH TABLE catalog_name.db_name.table_name; Doris 也支持关闭元数据缓存,以便能够实时访问到最新的元数据。 -关于元数据缓存的详细介绍和配置,请参阅:[元数据缓存](./meta-cache.md) +- Doris 4.1.x 之前:请参阅[元数据缓存](./meta-cache.md)。 +- Doris 4.1.x 及之后:请参阅各 Catalog 文档中的“元数据缓存”章节,例如 [Hive Catalog](./catalogs/hive-catalog.mdx#meta-cache)、[Iceberg Catalog](./catalogs/iceberg-catalog.mdx#meta-cache)、[Hudi Catalog](./catalogs/hudi-catalog.md#meta-cache)、[Paimon Catalog](./catalogs/paimon-catalog.mdx#meta-cache)、[MaxCompute Catalog](./catalogs/maxcompute-catalog.md#meta-cache)。 ## 修改数据目录 diff --git a/i18n/zh-CN/docusaurus-plugin-content-docs/current/lakehouse/catalogs/hive-catalog.mdx b/i18n/zh-CN/docusaurus-plugin-content-docs/current/lakehouse/catalogs/hive-catalog.mdx index 34ec561730e8d..2fef1adbd7a21 100644 --- a/i18n/zh-CN/docusaurus-plugin-content-docs/current/lakehouse/catalogs/hive-catalog.mdx +++ b/i18n/zh-CN/docusaurus-plugin-content-docs/current/lakehouse/catalogs/hive-catalog.mdx @@ -78,6 +78,82 @@ CREATE CATALOG [IF NOT EXISTS] catalog_name PROPERTIES ( CommonProperties 部分用于填写通用属性。请参阅[ 数据目录概述 ](../catalog-overview.md)中【通用属性】部分。 +## 元数据缓存 {#meta-cache} + +为了提升访问外部数据源的性能,Apache Doris 会对 Hive 的元数据进行缓存。元数据包括表结构(Schema)、分区列表、分区属性和文件列表等。 + +:::tip +对于 Doris 4.1.x 之前的版本,元数据缓存主要由 FE 配置项全局控制,详情请参阅[元数据缓存](../meta-cache.md)。 +从 Doris 4.1.x 开始,Hive Catalog 的外表元数据缓存使用统一键 `meta.cache.*` 进行配置。 +::: + +### 统一属性模型(4.1.x+) {#meta-cache-unified-model} + +各引擎 cache entry 使用统一的配置键格式:`meta.cache.<engine>.<entry>.{enable,ttl-second,capacity}`。 + +| 属性 | 示例 | 含义 | +|---|---|---| +|
`enable` | `true/false` | 是否启用该缓存模块。 | +| `ttl-second` | `600`、`0`、`-1` | `0` 表示关闭缓存(即刻生效,可用于查看最新数据);`-1` 表示永不过期;其他正整数表示按访问时间计算 TTL(秒)。 | +| `capacity` | `10000` | 最大缓存条目数(按条目数量计)。`0` 表示关闭。 | + +**生效逻辑说明:** 只有当 `enable=true` 且 `ttl-second != 0` 且 `capacity > 0` 时,该模块缓存才会生效。 + +### 缓存模块 {#meta-cache-unified-modules} + +Hive Catalog 包含以下缓存模块: + +| 模块 (`<entry>`) | 属性键前缀 | 缓存内容与影响 | +|---|---|---| +| `schema` | `meta.cache.hive.schema.` | 缓存表结构。影响:列新增、删除、类型变更在 Doris 中的可见性。若关闭,每次查询都会拉取最新 Schema。 | +| `partition_values` | `meta.cache.hive.partition_values.` | 缓存分区值/分区名称列表。影响:`SHOW PARTITIONS`、分区枚举、分区裁剪,以及外部新增/删除分区何时在 Doris 中可见。若关闭,可实时查看到分区变动。 | +| `partition` | `meta.cache.hive.partition.` | 缓存分区属性(Location、InputFormat、Serde 等)。影响:单个分区位置、格式、Serde 等属性变更的可见性。 | +| `file` | `meta.cache.hive.file.` | 缓存文件列表。影响:新增/删除文件、文件大小变化被 Doris 感知的时效性,同时减少远端 LIST 操作开销。若关闭,每次查询都会重新加载文件列表。 | + +### 旧参数映射与转换 {#meta-cache-mapping} + +在 4.1.x 之前,Hive 元数据缓存一部分通过 Catalog 兼容属性控制,一部分仍受 FE 全局缓存参数控制,详见[元数据缓存](../meta-cache.md)。升级到 4.1.x 后,建议显式改写为 `meta.cache.hive.*`,不要继续沿用旧键名。 + +| 4.1 前属性键 | 适用范围 | 4.1.x+ 统一键 | 升级建议与影响 | +|---|---|---|---| +| `schema.cache.ttl-second` | 4.1 前 Hive Catalog 兼容属性 | `meta.cache.hive.schema.ttl-second` | 仅对应 Schema 新鲜度。若希望列变更每次查询立即可见,设置为 `0`;`enable` 和 `capacity` 需按需单独配置。 | +| `partition.cache.ttl-second` | 4.1 前 Hive Catalog 兼容属性 | `meta.cache.hive.partition_values.ttl-second` | 仅对应分区列表新鲜度。若希望新增/删除分区每次查询立即可见,设置为 `0`。 | +| `file.meta.cache.ttl-second` | 4.1 前 Hive Catalog 兼容属性 | `meta.cache.hive.file.ttl-second` | 仅对应文件列表新鲜度。若希望新增/删除文件每次查询立即可见,设置为 `0`。 | + +`meta.cache.hive.partition.*` 是 4.1.x 中单独可调的新模块,4.1 前没有一一对应的 Catalog 级 TTL 键。若您关心分区 Location、Serde、InputFormat 等属性变更的可见性,需要在升级后单独设置它。 + +4.1.x 的统一模型把每个缓存拆分为 `enable`、`ttl-second`、`capacity` 三个维度;旧键只表达 TTL,不表达是否启用和容量上限。升级后如果只完成键名替换而不评估 `enable/capacity`,则其余行为会落到 4.1.x 的默认值。 + +### 最佳实践 {#meta-cache-best-practices} + +* **实时查看最新数据**:如果您希望每次查询都能看到外部数据源的最新分区或文件变动,可以将对应的 `ttl-second` 设置为 `0`。 + ```sql + -- 关闭文件列表缓存,实时看到文件变动 +
ALTER CATALOG hive_ctl SET PROPERTIES ("meta.cache.hive.file.ttl-second" = "0"); + -- 关闭分区值缓存,实时看到新增分区 + ALTER CATALOG hive_ctl SET PROPERTIES ("meta.cache.hive.partition_values.ttl-second" = "0"); + ``` +* **性能优化**:对于元数据变动不频繁的场景,建议适当增大 `capacity` 和 `ttl-second` 以减少对 Hive Metastore 和文件系统的访问压力。 + +:::caution +**Hive Catalog 注意事项**:Hive 的 `meta.cache.hive.*` 属性修改**不支持热生效**。修改配置后,必须重建 Catalog 或重启 FE 节点才能应用新的缓存配置。 +::: + +### 可观测性 {#meta-cache-unified-observability} + +可以通过 `information_schema.catalog_meta_cache_statistics` 系统表观测缓存指标: + +```sql +SELECT catalog_name, engine_name, entry_name, + effective_enabled, ttl_second, capacity, + estimated_size, hit_rate, load_failure_count, last_error +FROM information_schema.catalog_meta_cache_statistics +WHERE catalog_name = 'hive_ctl' AND engine_name = 'hive' +ORDER BY entry_name; +``` + +该系统表文档见:[catalog_meta_cache_statistics](../../admin-manual/system-tables/information_schema/catalog_meta_cache_statistics.md)。 + ### 支持的 Hive 版本 支持 Hive 1.x,2.x,3.x,4.x。 @@ -1101,4 +1177,3 @@ DROP DATABASE [IF EXISTS] hive_ctl.hive_db; | -------- | ------------------------------------ | | 2.1.6 | 支持 Hive 表数据写回 | | 3.0.4 | 支持 JsonSerDe 格式的 Hive 表。支持 Hive4 的事务表。 | - diff --git a/i18n/zh-CN/docusaurus-plugin-content-docs/current/lakehouse/catalogs/hudi-catalog.md b/i18n/zh-CN/docusaurus-plugin-content-docs/current/lakehouse/catalogs/hudi-catalog.md index cb4ac7cc702bc..d70163d1eb720 100644 --- a/i18n/zh-CN/docusaurus-plugin-content-docs/current/lakehouse/catalogs/hudi-catalog.md +++ b/i18n/zh-CN/docusaurus-plugin-content-docs/current/lakehouse/catalogs/hudi-catalog.md @@ -51,6 +51,74 @@ CREATE CATALOG [IF NOT EXISTS] catalog_name PROPERTIES ( | ------------------------------- | -------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------- | ----- | | `hudi.use_hive_sync_partition` | `use_hive_sync_partition` | 是否使用 Hive 
Metastore 已同步的分区信息。如果为 true,则会直接从 Hive Metastore 中获取分区信息。否则,会从文件系统的元数据文件中获取分区信息。通过 Hive Metastore 获取信息性能更好,但需要用户保证最新的元数据已经同步到了 Hive Metastore。 | false | +## 元数据缓存 {#meta-cache} + +为了提升访问外部数据源的性能,Apache Doris 会对 Hudi 的元数据进行缓存。元数据包括表结构(Schema)、分区信息、FS View 和 Meta Client 对象等。 + +:::tip +对于 Doris 4.1.x 之前的版本,元数据缓存主要由 FE 配置项全局控制,详情请参阅[元数据缓存](../meta-cache.md)。 +从 Doris 4.1.x 开始,Hudi 相关外表元数据缓存使用统一键 `meta.cache.*` 进行配置。 +::: + +### 统一属性模型(4.1.x+) {#meta-cache-unified-model} + +各引擎 cache entry 使用统一的配置键格式:`meta.cache.<engine>.<entry>.{enable,ttl-second,capacity}`。 + +| 属性 | 示例 | 含义 | +|---|---|---| +| `enable` | `true/false` | 是否启用该缓存模块。 | +| `ttl-second` | `600`、`0`、`-1` | `0` 表示关闭缓存(即刻生效,可用于查看最新数据);`-1` 表示永不过期;其他正整数表示按访问时间计算 TTL(秒)。 | +| `capacity` | `10000` | 最大缓存条目数(按条目数量计)。`0` 表示关闭。 | + +**生效逻辑说明:** 只有当 `enable=true` 且 `ttl-second != 0` 且 `capacity > 0` 时,该模块缓存才会生效。 + +### 缓存模块 {#meta-cache-unified-modules} + +Hudi Catalog 包含以下缓存模块: + +| 模块 (`<entry>`) | 属性键前缀 | 缓存内容与影响 | +|---|---|---| +| `schema` | `meta.cache.hudi.schema.` | 缓存表结构。影响:列新增、删除、类型变更在 Doris 中的可见性。若关闭,每次查询都会拉取最新 Schema。 | +| `partition` | `meta.cache.hudi.partition.` | 缓存 Hudi 分区相关元数据。影响:分区发现、分区裁剪,以及新增/删除分区何时在 Doris 中可见。 | +| `fs_view` | `meta.cache.hudi.fs_view.` | 缓存 Hudi 文件系统视图相关元数据。影响:查询规划时选择到的最新 base file / log file 以及 file slice 视图的新鲜度。 | +| `meta_client` | `meta.cache.hudi.meta_client.` | 缓存 Hudi Meta Client 对象。影响:时间线(timeline)、表配置等底层元数据重新加载的频率,以及提交/表配置变更何时被感知。 | + +### 旧参数映射与转换 {#meta-cache-mapping} + +在 4.1.x 之前,Hudi 的 Schema 缓存有 Catalog 兼容属性,分区与表级元数据则主要遵循旧的 FE 全局缓存模型,详见[元数据缓存](../meta-cache.md)。升级到 4.1.x 后,建议统一改写为 `meta.cache.hudi.*`,并分别配置分区、FS View 和 Meta Client。 + +| 4.1 前属性键/旧模型 | 适用范围 | 4.1.x+ 统一键 | 升级建议与影响 | +|---|---|---|---| +| `schema.cache.ttl-second` | 4.1 前 Hudi Catalog 兼容属性 | `meta.cache.hudi.schema.ttl-second` | 控制 Schema 新鲜度。若希望列变更每次查询立即可见,设置为 `0`。 | +| Hudi 分区旧模型 | 4.1 前 FE 全局缓存策略(见旧版元数据缓存文档) | `meta.cache.hudi.partition.ttl-second` | 控制分区发现与分区可见性。若希望新增/删除分区每次查询立即可见,设置为 `0`。 | +| 无一一对应的旧 Catalog 键 |
4.1 前未单独暴露 `fs_view` / `meta_client` TTL | `meta.cache.hudi.fs_view.*`、`meta.cache.hudi.meta_client.*` | 这是 4.1.x 中拆分出的新模块。若希望更快感知最新 file slice 或提交时间线,分别调低对应 `ttl-second`。 | + +4.1.x 的统一模型把缓存拆分为 `enable`、`ttl-second`、`capacity` 三个维度;旧模型主要描述 TTL/全局缓存行为。升级后如果仍沿用旧理解,容易遗漏 `fs_view`、`meta_client` 这类新模块的单独配置。 + +### 最佳实践 {#meta-cache-best-practices} + +* **实时查看最新数据**:如果您希望每次查询都能看到 Hudi 表的最新数据变动或 Schema 变更,可以将 `schema` 或 `partition` 的 `ttl-second` 设置为 `0`。 + ```sql + -- 关闭分区元数据缓存,以感知 Hudi 表的最新分区变动 + ALTER CATALOG hudi_ctl SET PROPERTIES ("meta.cache.hudi.partition.ttl-second" = "0"); + ``` +* **性能优化**:`ALTER CATALOG ... SET PROPERTIES` 的修改在 Hudi 中支持热生效(通过 HMS catalog 属性更新路径)。 + +### 可观测性 {#meta-cache-unified-observability} + +可以通过 `information_schema.catalog_meta_cache_statistics` 系统表观测缓存指标: + +```sql +SELECT catalog_name, engine_name, entry_name, + effective_enabled, ttl_second, capacity, + estimated_size, hit_rate, load_failure_count, last_error +FROM information_schema.catalog_meta_cache_statistics +WHERE catalog_name = 'hudi_ctl' AND engine_name = 'hudi' +ORDER BY entry_name; +``` + +该系统表文档见:[catalog_meta_cache_statistics](../../admin-manual/system-tables/information_schema/catalog_meta_cache_statistics.md)。 + ### 支持的 Hudi 版本 当前依赖的 Hudi 版本为 0.15。推荐访问 0.14 版本以上的 Hudi 数据。 @@ -226,4 +294,3 @@ SELECT * from hudi_table@incr('beginTime'='xxx', ['endTime'='xxx'], ['hoodie.rea | Doris 版本 | 功能支持 | | ----------- | ----------------------------------------- | | 2.1.8/3.0.4 | Hudi 依赖升级到 0.15。新增 Hadoop Hudi JNI Scanner。 | - diff --git a/i18n/zh-CN/docusaurus-plugin-content-docs/current/lakehouse/catalogs/iceberg-catalog.mdx b/i18n/zh-CN/docusaurus-plugin-content-docs/current/lakehouse/catalogs/iceberg-catalog.mdx index b41d97363498f..8b1cd5f1e91e9 100644 --- a/i18n/zh-CN/docusaurus-plugin-content-docs/current/lakehouse/catalogs/iceberg-catalog.mdx +++ b/i18n/zh-CN/docusaurus-plugin-content-docs/current/lakehouse/catalogs/iceberg-catalog.mdx @@ -87,6 +87,82 @@ CREATE CATALOG [IF NOT 
EXISTS] catalog_name PROPERTIES ( CommonProperties 部分用于填写通用属性。请参阅[数据目录概述](../catalog-overview.md)中【通用属性】部分。 +## 元数据缓存 {#meta-cache} + +为了提升访问外部数据源的性能,Apache Doris 会对 Iceberg 的元数据进行缓存。元数据包括表结构(Schema)、表对象、View 对象和 Manifest 详情等。 + +:::tip +对于 Doris 4.1.x 之前的版本,元数据缓存主要由 FE 配置项全局控制,详情请参阅[元数据缓存](../meta-cache.md)。 +从 Doris 4.1.x 开始,Iceberg Catalog 的外表元数据缓存使用统一键 `meta.cache.*` 进行配置。 +::: + +### 统一属性模型(4.1.x+) {#meta-cache-unified-model} + +各引擎 cache entry 使用统一的配置键格式:`meta.cache.<engine>.<entry>.{enable,ttl-second,capacity}`。 + +| 属性 | 示例 | 含义 | +|---|---|---| +| `enable` | `true/false` | 是否启用该缓存模块。 | +| `ttl-second` | `600`、`0`、`-1` | `0` 表示关闭缓存(即刻生效,可用于查看最新数据);`-1` 表示永不过期;其他正整数表示按访问时间计算 TTL(秒)。 | +| `capacity` | `10000` | 最大缓存条目数(按条目数量计)。`0` 表示关闭。 | + +**生效逻辑说明:** 只有当 `enable=true` 且 `ttl-second != 0` 且 `capacity > 0` 时,该模块缓存才会生效。 + +### 缓存模块 {#meta-cache-unified-modules} + +Iceberg Catalog 包含以下缓存模块: + +| 模块 (`<entry>`) | 属性键前缀 | 缓存内容与影响 | +|---|---|---| +| `schema` | `meta.cache.iceberg.schema.` | 缓存表结构。影响:列新增、删除、类型变更在 Doris 中的可见性。若关闭,每次查询都会拉取最新 Schema。 | +| `table` | `meta.cache.iceberg.table.` | 缓存 Iceberg 表元数据对象。影响:最新 Snapshot、Partition Spec、Sort Order、表属性等表级元数据在 Doris 中的可见性;若关闭,每次规划都会重新加载表元数据。 | +| `view` | `meta.cache.iceberg.view.` | 缓存 Iceberg View 元数据对象。影响:View 定义、Schema、属性变更在 Doris 中的可见性。 | +| `manifest` | `meta.cache.iceberg.manifest.` | 缓存 Manifest 详情。主要影响查询规划时重复读取 Manifest 文件的开销,通常不直接决定表或 Snapshot 是否可见。注意:该模块默认关闭,需手动启用。 | + +### 旧参数映射与转换 {#meta-cache-mapping} + +在 4.1.x 之前,Iceberg 表级元数据主要受 FE 全局缓存策略控制,`schema.cache.ttl-second` 是常见的 Catalog 兼容属性;详见[元数据缓存](../meta-cache.md)。升级到 4.1.x 后,建议改写为 `meta.cache.iceberg.*`,并按需要分别配置表、View 和 Manifest 缓存。 + +| 4.1 前属性键/旧模型 | 适用范围 | 4.1.x+ 统一键 | 升级建议与影响 | +|---|---|---|---| +| `schema.cache.ttl-second` | 4.1 前 Iceberg Catalog 兼容属性 | `meta.cache.iceberg.schema.ttl-second` | 控制 Schema 新鲜度。若希望列变更每次查询立即可见,设置为 `0`。 | +| Iceberg 表信息旧模型 | 4.1 前 FE 全局缓存策略(见旧版元数据缓存文档) | `meta.cache.iceberg.table.ttl-second` | 控制表级元数据新鲜度。若希望每次查询都读取最新 Snapshot/表属性,设置为
`0`。 | +| 无一一对应的旧 Catalog 键 | 4.1 前未单独暴露 View / Manifest TTL | `meta.cache.iceberg.view.*`、`meta.cache.iceberg.manifest.*` | 这是 4.1.x 中拆分出的新模块。升级后如需保证最新 View 定义,单独调低 `view.ttl-second`;`manifest` 主要用于性能优化。 | + +4.1.x 的统一模型把缓存拆分为 `enable`、`ttl-second`、`capacity` 三个维度;旧模型主要描述 TTL/全局缓存行为,不覆盖这些新模块的独立开关和容量上限。升级时建议一并评估是否需要补充 `enable/capacity`。 + +### 最佳实践 {#meta-cache-best-practices} + +* **实时查看最新数据**:如果您希望每次查询都能看到 Iceberg 表的最新快照或 Schema 变动,可以将 `schema` 或 `table` 的 `ttl-second` 设置为 `0`。 + ```sql + -- 关闭表对象缓存,以便感知快照变动 + ALTER CATALOG iceberg_ctl SET PROPERTIES ("meta.cache.iceberg.table.ttl-second" = "0"); + ``` +* **性能优化**: + * 启用 Manifest 缓存可以显著提升大表的查询规划速度: + ```sql + ALTER CATALOG iceberg_ctl SET PROPERTIES ( + "meta.cache.iceberg.manifest.enable" = "true", + "meta.cache.iceberg.manifest.ttl-second" = "600" + ); + ``` + * `ALTER CATALOG ... SET PROPERTIES` 的修改在 Iceberg Catalog 中支持热生效。 + +### 可观测性 {#meta-cache-unified-observability} + +可以通过 `information_schema.catalog_meta_cache_statistics` 系统表观测缓存指标: + +```sql +SELECT catalog_name, engine_name, entry_name, + effective_enabled, ttl_second, capacity, + estimated_size, hit_rate, load_failure_count, last_error +FROM information_schema.catalog_meta_cache_statistics +WHERE catalog_name = 'iceberg_ctl' AND engine_name = 'iceberg' +ORDER BY entry_name; +``` + +该系统表文档见:[catalog_meta_cache_statistics](../../admin-manual/system-tables/information_schema/catalog_meta_cache_statistics.md)。 + ### 支持的 Iceberg 版本 | Doris 版本 | Iceberg SDK 版本 | diff --git a/i18n/zh-CN/docusaurus-plugin-content-docs/current/lakehouse/catalogs/maxcompute-catalog.md b/i18n/zh-CN/docusaurus-plugin-content-docs/current/lakehouse/catalogs/maxcompute-catalog.md index 856633f99447e..4329bc9b0073c 100644 --- a/i18n/zh-CN/docusaurus-plugin-content-docs/current/lakehouse/catalogs/maxcompute-catalog.md +++ b/i18n/zh-CN/docusaurus-plugin-content-docs/current/lakehouse/catalogs/maxcompute-catalog.md @@ -68,6 +68,71 @@ CREATE CATALOG [IF NOT EXISTS] catalog_name 
PROPERTIES ( CommonProperties 部分用于填写通用属性。请参阅[数据目录概述](../catalog-overview.md)中「通用属性」部分。 +## 元数据缓存 {#meta-cache} + +为了提升访问外部数据源的性能,Apache Doris 会对 MaxCompute 的元数据进行缓存。元数据包括表结构(Schema)和分区列表等。 + +:::tip +对于 Doris 4.1.x 之前的版本,元数据缓存主要由 FE 配置项全局控制,详情请参阅[元数据缓存](../meta-cache.md)。 +从 Doris 4.1.x 开始,MaxCompute Catalog 的外表元数据缓存使用统一键 `meta.cache.*` 进行配置。 +::: + +### 统一属性模型(4.1.x+) {#meta-cache-unified-model} + +各引擎 cache entry 使用统一的配置键格式:`meta.cache.<engine>.<entry>.{enable,ttl-second,capacity}`。 + +| 属性 | 示例 | 含义 | +|---|---|---| +| `enable` | `true/false` | 是否启用该缓存模块。 | +| `ttl-second` | `600`、`0`、`-1` | `0` 表示关闭缓存(即刻生效,可用于查看最新数据);`-1` 表示永不过期;其他正整数表示按访问时间计算 TTL(秒)。 | +| `capacity` | `10000` | 最大缓存条目数(按条目数量计)。`0` 表示关闭。 | + +**生效逻辑说明:** 只有当 `enable=true` 且 `ttl-second != 0` 且 `capacity > 0` 时,该模块缓存才会生效。 + +### 缓存模块 {#meta-cache-unified-modules} + +MaxCompute Catalog 包含以下缓存模块: + +| 模块 (`<entry>`) | 属性键前缀 | 缓存内容与影响 | +|---|---|---| +| `schema` | `meta.cache.maxcompute.schema.` | 缓存表结构。影响:列新增、删除、类型变更在 Doris 中的可见性。若关闭,每次查询都会拉取最新 Schema。 | +| `partition_values` | `meta.cache.maxcompute.partition_values.` | 缓存分区值列表。影响:分区裁剪、分区枚举,以及新增/删除分区何时在 Doris 中可见。若关闭,可实时查看到分区变动。 | + +### 旧参数映射与转换 {#meta-cache-mapping} + +在 4.1.x 之前,MaxCompute 的 Schema 和分区相关缓存主要通过 Catalog 兼容属性或 FE 全局缓存策略控制。升级到 4.1.x 后,建议统一改写为 `meta.cache.maxcompute.*`。 + +| 4.1 前属性键/旧模型 | 适用范围 | 4.1.x+ 统一键 | 升级建议与影响 | +|---|---|---|---| +| `schema.cache.ttl-second` | 4.1 前 MaxCompute Catalog 兼容属性 | `meta.cache.maxcompute.schema.ttl-second` | 控制 Schema 新鲜度。若希望列变更每次查询立即可见,设置为 `0`。 | +| MaxCompute 分区值旧模型 | 4.1 前 FE 全局缓存策略 | `meta.cache.maxcompute.partition_values.ttl-second` | 控制分区枚举与分区可见性。若希望新增/删除分区每次查询立即可见,设置为 `0`。 | + +4.1.x 的统一模型把缓存拆分为 `enable`、`ttl-second`、`capacity` 三个维度;旧模型主要表达 TTL/全局缓存行为。升级后如果只迁移 TTL 而不评估 `enable/capacity`,其余行为会使用 4.1.x 的默认值。 + +### 最佳实践 {#meta-cache-best-practices} + +* **实时查看最新数据**:如果您希望每次查询都能看到 MaxCompute 表的最新分区变动或 Schema 变更,可以将 `schema` 或 `partition_values` 的 `ttl-second` 设置为 `0`。 + ```sql + -- 关闭分区值缓存,以感知 MaxCompute 表的最新分区 +
ALTER CATALOG mc_ctl SET PROPERTIES ("meta.cache.maxcompute.partition_values.ttl-second" = "0"); + ``` +* **注意**:`meta.cache.maxcompute.*` 目前没有专门的热生效 hook。修改配置后,建议重建 Catalog 或重启 FE 以确保生效。 + +### 可观测性 {#meta-cache-unified-observability} + +可以通过 `information_schema.catalog_meta_cache_statistics` 系统表观测缓存指标: + +```sql +SELECT catalog_name, engine_name, entry_name, + effective_enabled, ttl_second, capacity, + estimated_size, hit_rate, load_failure_count, last_error +FROM information_schema.catalog_meta_cache_statistics +WHERE catalog_name = 'mc_ctl' AND engine_name = 'maxcompute' +ORDER BY entry_name; +``` + +该系统表文档见:[catalog_meta_cache_statistics](../../admin-manual/system-tables/information_schema/catalog_meta_cache_statistics.md)。 + ### 支持的 MaxCompute 版本 仅支持公有云版本的 MaxCompute。私有云版本支持请联系 Doris 社区支持。 diff --git a/i18n/zh-CN/docusaurus-plugin-content-docs/current/lakehouse/catalogs/paimon-catalog.mdx b/i18n/zh-CN/docusaurus-plugin-content-docs/current/lakehouse/catalogs/paimon-catalog.mdx index 3f209206b15f6..75e96499a32a1 100644 --- a/i18n/zh-CN/docusaurus-plugin-content-docs/current/lakehouse/catalogs/paimon-catalog.mdx +++ b/i18n/zh-CN/docusaurus-plugin-content-docs/current/lakehouse/catalogs/paimon-catalog.mdx @@ -88,6 +88,71 @@ CREATE CATALOG [IF NOT EXISTS] catalog_name PROPERTIES ( CommonProperties 部分用于填写通用属性。请参阅[数据目录概述](../catalog-overview.md)中【通用属性】部分。 +## 元数据缓存 {#meta-cache} + +为了提升访问外部数据源的性能,Apache Doris 会对 Paimon 的元数据进行缓存。元数据包括表结构(Schema)和表对象等。 + +:::tip +对于 Doris 4.1.x 之前的版本,元数据缓存主要由 FE 配置项全局控制,详情请参阅[元数据缓存](../meta-cache.md)。 +从 Doris 4.1.x 开始,Paimon Catalog 的外表元数据缓存使用统一键 `meta.cache.*` 进行配置。 +::: + +### 统一属性模型(4.1.x+) {#meta-cache-unified-model} + +各引擎 cache entry 使用统一的配置键格式:`meta.cache.<engine>.<entry>.{enable,ttl-second,capacity}`。 + +| 属性 | 示例 | 含义 | +|---|---|---| +| `enable` | `true/false` | 是否启用该缓存模块。 | +| `ttl-second` | `600`、`0`、`-1` | `0` 表示关闭缓存(即刻生效,可用于查看最新数据);`-1` 表示永不过期;其他正整数表示按访问时间计算 TTL(秒)。 | +| `capacity` | `10000` | 最大缓存条目数(按条目数量计)。`0` 表示关闭。 | +
+**生效逻辑说明:** 只有当 `enable=true` 且 `ttl-second != 0` 且 `capacity > 0` 时,该模块缓存才会生效。 + +### 缓存模块 {#meta-cache-unified-modules} + +Paimon Catalog 包含以下缓存模块: + +| 模块 (`<entry>`) | 属性键前缀 | 缓存内容与影响 | +|---|---|---| +| `schema` | `meta.cache.paimon.schema.` | 缓存表结构。影响:列新增、删除、类型变更在 Doris 中的可见性。若关闭,每次查询都会拉取最新 Schema。 | +| `table` | `meta.cache.paimon.table.` | 缓存 Paimon 表元数据对象。影响:最新 Snapshot、Schema 演进、分支/标签引用等表级元数据在 Doris 中的可见性,同时减少查询规划时的元数据加载开销。 | + +### 旧参数映射与转换 {#meta-cache-mapping} + +在 4.1.x 之前,Paimon 的 Schema 与表级元数据主要遵循旧的 Catalog 兼容属性或 FE 全局缓存模型,详见[元数据缓存](../meta-cache.md)。升级到 4.1.x 后,建议改写为 `meta.cache.paimon.*`,并单独评估表级缓存是否需要更强的新鲜度。 + +| 4.1 前属性键/旧模型 | 适用范围 | 4.1.x+ 统一键 | 升级建议与影响 | +|---|---|---|---| +| `schema.cache.ttl-second` | 4.1 前 Paimon Catalog 兼容属性 | `meta.cache.paimon.schema.ttl-second` | 控制 Schema 新鲜度。若希望列变更每次查询立即可见,设置为 `0`。 | +| Paimon 表级旧模型 | 4.1 前 FE 全局缓存策略(见旧版元数据缓存文档中的旧表级元数据模型说明) | `meta.cache.paimon.table.ttl-second` | 控制最新 Snapshot/表级元数据的可见性。若希望每次查询都读取最新表快照,设置为 `0`。 | + +4.1.x 的统一模型把缓存拆分为 `enable`、`ttl-second`、`capacity` 三个维度;旧模型主要表达 TTL/全局缓存行为。升级后建议不要只替换键名,还要同时评估是否需要显式配置 `enable/capacity`。 + +### 最佳实践 {#meta-cache-best-practices} + +* **实时查看最新数据**:如果您希望每次查询都能看到 Paimon 表的最新数据变动或 Schema 变更,可以将 `schema` 或 `table` 的 `ttl-second` 设置为 `0`。 + ```sql + -- 关闭表对象缓存,以感知 Paimon 表的最新快照 + ALTER CATALOG paimon_ctl SET PROPERTIES ("meta.cache.paimon.table.ttl-second" = "0"); + ``` +* **性能优化**:`ALTER CATALOG ...
SET PROPERTIES` 的修改在 Paimon Catalog 中支持热生效。 + +### 可观测性 {#meta-cache-unified-observability} + +可以通过 `information_schema.catalog_meta_cache_statistics` 系统表观测缓存指标: + +```sql +SELECT catalog_name, engine_name, entry_name, + effective_enabled, ttl_second, capacity, + estimated_size, hit_rate, load_failure_count, last_error +FROM information_schema.catalog_meta_cache_statistics +WHERE catalog_name = 'paimon_ctl' AND engine_name = 'paimon' +ORDER BY entry_name; +``` + +该系统表文档见:[catalog_meta_cache_statistics](../../admin-manual/system-tables/information_schema/catalog_meta_cache_statistics.md)。 + ### 支持的 Paimon 版本 当前依赖的 Paimon 版本为 1.0.0。 diff --git a/i18n/zh-CN/docusaurus-plugin-content-docs/current/lakehouse/meta-cache.md b/i18n/zh-CN/docusaurus-plugin-content-docs/current/lakehouse/meta-cache.md index 1c35945043b6d..cf6145a5925a9 100644 --- a/i18n/zh-CN/docusaurus-plugin-content-docs/current/lakehouse/meta-cache.md +++ b/i18n/zh-CN/docusaurus-plugin-content-docs/current/lakehouse/meta-cache.md @@ -6,18 +6,20 @@ } --- +:::tip +该文档主要适用于 Doris 4.1.x 之前的版本。 +对于 Doris 4.1.x 及之后版本,外表元数据缓存已重构并使用统一配置键 `meta.cache.*`,请直接参阅各 [Catalog](./catalog-overview.md) 文档中的“元数据缓存”章节。 +如果您正在从 4.1.x 之前的版本升级,请以各 Catalog 页中的“旧参数映射与转换”为准,将旧参数改写为 `meta.cache.*` 统一键。 +::: + 为了提升访问外部数据源的性能,Apache Doris 会对外部数据源的**元数据**进行缓存。 元数据包括库、表、列信息、分区信息、快照信息、文件列表等。 -本文详细介绍缓存的元数据的种类、策略和相关参数配置。 +本文详细介绍旧版本(pre-4.1)中缓存的元数据的种类、策略和相关参数配置。 关于**数据缓存**,可参阅[数据缓存文档](./data-cache.md)。 -:::tip -该文档适用于 2.1.6 之后的版本。 -::: - ## 缓存策略 大多数缓存都有如下三个策略指标: @@ -52,15 +54,27 @@ ## 缓存类型 +下面的内容主要描述代表性的 FE 默认值与旧模型兼容参数,不应理解为 Doris 4.1.x+ 的完整 cache entry 列表。 + +| 类别 | 作用域 | 主要 FE 默认值 | 说明 | +|---|---|---|---| +| 库 / 表名称列表 | 每个 catalog / 每个 database | `external_cache_expire_time_seconds_after_access`、`external_cache_refresh_time_minutes` | 用于 `SHOW DATABASES` / `SHOW TABLES` | +| 库 / 表对象 | 每个 catalog / 每个 database | `max_meta_object_cache_num`、`external_cache_expire_time_seconds_after_access`、`external_cache_refresh_time_minutes` | 
对象缓存与名称列表缓存可能短暂不一致 | +| 表 schema | 每个 catalog | `max_external_schema_cache_num`、`external_cache_expire_time_seconds_after_access`、`external_cache_refresh_time_minutes` | 旧的 catalog 级兼容参数:`schema.cache.ttl-second` | +| Hive 分区值 | 每个 Hive catalog | `max_hive_partition_table_cache_num`、`external_cache_expire_time_seconds_after_access`、`external_cache_refresh_time_minutes` | 旧的 catalog 级兼容参数:`partition.cache.ttl-second` | +| Hive 分区属性 | 每个 Hive catalog | `max_hive_partition_cache_num`、`external_cache_expire_time_seconds_after_access` | 没有旧的 catalog 级 TTL 覆盖参数 | +| Hive 文件列表 | 每个 Hive catalog | `max_external_file_cache_num`、`external_cache_expire_time_seconds_after_access`、`external_cache_refresh_time_minutes` | 旧的 catalog 级兼容参数:`file.meta.cache.ttl-second` | +| Hudi / Iceberg / Paimon 旧表级元数据 | 每个 catalog | `max_external_table_cache_num`、`external_cache_expire_time_seconds_after_access`、`external_cache_refresh_time_minutes` | Doris 4.1.x+ 下的 `fs_view`、`meta_client`、`view`、`manifest` 等请看对应 Catalog 页 | + ### 库、表名称列表 库名称列表(Database name list)指的是一个 Catalog 下所有库的名称的列表。 表名称列表(Table name list)指的是一个库下所有表的名称列表。 -名称列表仅用于需要列举名称得操作,如 `SHOW TABLES` 或 `SHOW DATABASES` 语句。 +名称列表仅用于需要列举名称的操作,如 `SHOW TABLES` 或 `SHOW DATABASES` 语句。 -每个 Catalog 下都一个库名称列表缓存。每个库下都有一个表名称列表缓存。 +每个 Catalog 下都有一个库名称列表缓存。每个库下都有一个表名称列表缓存。 - 最大缓存数量 @@ -84,7 +98,7 @@ 比如通过 `SHOW TABLES` 命令,从名称列表缓存中获取到 `A`、`B`、`C` 三个表。假设此时外部数据源增加了表 `D`,那么 `SELECT * FROM D` 可以访问到表 `D`,同时【表对象】缓存里会增加表 `D` 对象,但【表名称列表】缓存中可能依然是 `A`、`B`、`C`。只有当【表名称列表】缓存刷新后,才会变成 `A`、`B`、`C`、`D`。 -每个 Catalog 下都一个库名称列表缓存。每个库下都有一个表名称列表缓存。 +每个 Catalog 下都有一个库对象缓存。每个库下都有一个表对象缓存。 - 最大缓存数量 @@ -96,7 +110,7 @@ - 最短刷新时间 - 由 FE 配置项 `external_cache_expire_time_minutes_after_access` 控制。单位为分钟。默认 10 分钟。减少该时间,可以更实时的在 Doris 中到最新的库或表,但会增加访问外部数据源的频率。 + 由 FE 配置项 `external_cache_expire_time_minutes_after_access` 控制。单位为分钟。默认 10 分钟。减少该时间,可以更实时地在 Doris 中看到最新的库或表,但会增加访问外部数据源的频率。 3.0.7 版本后,配置项名称修改为 `external_cache_refresh_time_minutes`。默认值不变。 @@ -104,7 +118,7 @@ 缓存表的 Schema 
信息,如列名等。该缓存主要用于按需加载被访问到的表的 Schema,以防止同步大量不需要被访问的表的 Schema 而占用 FE 的内存。 -该缓存由所有 Catalog 共享,全局唯一。 +该缓存按 catalog 维度管理。 - 最大缓存数量 @@ -172,7 +186,7 @@ - 最大缓存数量 - 由 FE 配置项 `max_external_file_cache_num` 控制,默认为 100000。 + 由 FE 配置项 `max_external_file_cache_num` 控制,默认为 10000。 可以根据所需要访问的文件数量,适当调整这个参数。 @@ -186,13 +200,14 @@ - 最短刷新时间 - 由 FE 配置项 `external_cache_expire_time_minutes_after_access` 控制。单位为分钟。默认 10 分钟。减少该时间,可以更实时的在 Doris 中访问到最新的分区属性,但会增加访问外部数据源的频率。 + 由 FE 配置项 `external_cache_expire_time_minutes_after_access` 控制。单位为分钟。默认 10 分钟。减少该时间,可以更实时地在 Doris 中访问到最新的文件列表,但会增加访问外部数据源的频率。 3.0.7 版本后,配置项名称修改为 `external_cache_refresh_time_minutes`。默认值不变。 ### Hudi 表分区 -用于缓存 Hudi 表的分区信息。 +这里描述的是 Hudi 分区元数据缓存的旧模型摘要。 +对于 Doris 4.1.x+ 的当前 Hudi cache entry(如 `fs_view`、`meta_client`),请参阅 [Hudi Catalog](./catalogs/hudi-catalog.md#meta-cache)。 该缓存,每个 Hudi Catalog 有一个。 @@ -214,7 +229,8 @@ ### Iceberg 表信息 -用于缓存 Iceberg 表对象。该对象通过 Iceberg API 加载并构建。 +这里描述的是 Iceberg 表元数据缓存的旧模型摘要。表对象通过 Iceberg API 加载并构建。 +对于 Doris 4.1.x+ 的当前可观测 cache entry,请参阅 [Iceberg Catalog](./catalogs/iceberg-catalog.mdx#meta-cache)。 该缓存,每个 Iceberg Catalog 有一个。 @@ -234,10 +250,10 @@ 3.0.7 版本后,配置项名称修改为 `external_cache_refresh_time_minutes`。默认值不变。 -### Iceberg 表 Snapshot +### Iceberg Snapshot 相关元数据 -用于缓存 Iceberg 表的 Snapshot 列表。该对象通过 Iceberg API 加载并构建。 -该缓存,每个 Iceberg Catalog 有一个。 +这里描述的是从 Iceberg 表元数据派生出的 snapshot 相关缓存行为。 +在当前实现里,不应将它理解为 Doris 4.1.x 下和 `table`、`view`、`manifest` 并列的独立 cache entry。 - 最大缓存数量 @@ -257,37 +273,19 @@ ## 缓存刷新 -除了上述每个缓存各自的刷新和淘汰策略外,用户也可以通过手动或定时的方式直接刷新元数据缓存。 +除了上述刷新和淘汰策略外,用户也可以通过手动或定时方式刷新元数据。 ### 手动刷新 -用户可以通过 `REFRESH` 命令手动刷新元数据。 - -1. REFRESH CATALOG - - 刷新指定 Catalog。 - - `REFRESH CATALOG ctl1 PROPERTIES("invalid_cache" = "true");` - - - 该命令会刷新指定 Catalog 的库列表,表列名以及所有缓存信息等。 - - `invalid_cache` 表示是否要刷新分区和文件列表等缓存。默认为 true。如果为 false,则只会刷新 Catalog 的库、表列表,而不会刷新分区和文件列表等缓存信息。该参数适用于,用户只想同步新增删的库表信息时,可以设置为 false。 - -2. 
REFRESH DATABASE - - 刷新指定 Database。 - - `REFRESH DATABASE [ctl.]db1 PROPERTIES("invalid_cache" = "true");` - - - 该命令会刷新指定 Database 的表列名以及 Database 下的所有缓存信息等。 - - `invalid_cache` 属性含义同上。默认为 true。如果为 false,则只会刷新 Database 的表列表,而不会刷新缓存信息。该参数适用于,用户只想同步新增删的表信息时。 - -3. REFRESH TABLE - - 刷新指定 Table。 +使用 `REFRESH` 语句可以失效 catalog、database 或 table 级元数据。 +当前语法、权限要求与示例请参阅 [REFRESH](../sql-manual/sql-statements/catalog/REFRESH.md)。 - `REFRESH TABLE [ctl.][db.]tbl1;` +行为摘要: - - 该命令会刷新指定 Table 下的所有缓存信息等。 +- `REFRESH CATALOG` 会刷新 catalog 级对象缓存,并默认继续失效更细粒度的元数据缓存。 +- `REFRESH DATABASE` 会刷新一个 database 下的元数据。 +- `REFRESH TABLE` 会刷新单表元数据。 +- 对 `REFRESH CATALOG`,若设置 `invalid_cache = false`,则只刷新对象/名称列表,不继续失效更细粒度缓存。 ### 定时刷新 @@ -303,7 +301,7 @@ CREATE CATALOG hive PROPERTIES ( 在上例中,`metadata_refresh_interval_sec` 表示每 3600 秒刷新一次 Catalog。相当于每隔 3600 秒,自动执行一次: -`REFRESH CATALOG ctl1 PROPERTIES("invalid_cache" = "true");` +`REFRESH CATALOG hive;` ## 最佳实践 @@ -328,11 +326,11 @@ CREATE CATALOG hive PROPERTIES ( max_external_schema_cache_num=0 // 关闭 Schema 缓存。 ``` -- Catalog 级别关闭 +- 旧的 catalog 级兼容参数 ```text -- Catalog property - "schema.cache.ttl-second" = "0" // 针对某个 Catalog,关闭 Schema 缓存(2.1.11, 3.0.6 支持) + "schema.cache.ttl-second" = "0" // 旧参数,2.1.11 / 3.0.6 支持 ``` 设置完成后,Doris 会实时可见最新的 Table Schema。但此设置可能会增加元数据服务的压力。 @@ -347,19 +345,21 @@ CREATE CATALOG hive PROPERTIES ( -- fe.conf max_external_file_cache_num=0 // 关闭文件列表缓存 max_hive_partition_table_cache_num=0 // 关闭分区列表缓存 + max_hive_partition_cache_num=0 // 关闭分区属性缓存 ``` -- Catalog 级别关闭 +- 旧的 catalog 级兼容参数 ```text -- Catalog property - "file.meta.cache.ttl-second" = "0" // 针对某个 Catalog,关闭文件列表缓存 - "partition.cache.ttl-second" = "0" // 针对某个 Catalog,关闭分区列表缓存(2.1.11, 3.0.6 支持) + "file.meta.cache.ttl-second" = "0" // 关闭文件列表缓存 + "partition.cache.ttl-second" = "0" // 关闭分区列表缓存(2.1.11 / 3.0.6 支持) ``` 设置以上参数后: - 外部数据源新增分区可以实时查询到。 - 分区数据文件变动可以实时查询到。 +- 如果希望实时看到分区属性变化,也需要同时关闭分区属性缓存。 但会增加外部数据源(如 Hive Metastore 和 HDFS)的访问压力,可能导致元数据访问延迟不稳定等现象。 diff --git
a/i18n/zh-CN/docusaurus-plugin-content-docs/current/sql-manual/sql-statements/catalog/REFRESH.md b/i18n/zh-CN/docusaurus-plugin-content-docs/current/sql-manual/sql-statements/catalog/REFRESH.md index 47e18785778bb..dfd23011ff3b1 100644 --- a/i18n/zh-CN/docusaurus-plugin-content-docs/current/sql-manual/sql-statements/catalog/REFRESH.md +++ b/i18n/zh-CN/docusaurus-plugin-content-docs/current/sql-manual/sql-statements/catalog/REFRESH.md @@ -13,7 +13,8 @@ ## 语法 ```sql -REFRESH CATALOG <catalog_name>; +REFRESH CATALOG <catalog_name> + [PROPERTIES ("invalid_cache" = "true" | "false")]; REFRESH DATABASE [<catalog_name>.]<database_name>; REFRESH TABLE [[<catalog_name>.]<database_name>.]<table_name>; ``` @@ -42,6 +43,13 @@ REFRESH TABLE [[<catalog_name>.]<database_name>.]<table_name>; ## 注意事项 刷新 Catalog 的同时,会强制使对象相关的 Cache 失效。包括 Partition Cache、Schema Cache、File Cache 等。 +`invalid_cache` 用于控制 `REFRESH CATALOG` 时是否继续失效更细粒度的元数据缓存: + +- `true`:失效 catalog 对象缓存,以及分区、schema、文件列表等更细粒度缓存。这也是默认行为。 +- `false`:只刷新 catalog 级对象/名称元数据,保留更细粒度缓存。 + +`invalid_cache` 当前用于 `REFRESH CATALOG`。 + ## 示例 1. 刷新 hive catalog @@ -50,14 +58,20 @@ REFRESH TABLE [[<catalog_name>.]<database_name>.]<table_name>; REFRESH CATALOG hive; ``` -2. 刷新 database1 +2. 刷新 hive catalog,但不失效更细粒度缓存 + + ```sql + REFRESH CATALOG hive PROPERTIES("invalid_cache" = "false"); + ``` + +3. 刷新 database1 ```sql REFRESH DATABASE ctl.database1; REFRESH DATABASE database1; ``` -3. 刷新 table1 +4. 刷新 table1 ```sql REFRESH TABLE ctl.db.table1; @@ -65,4 +79,3 @@ REFRESH TABLE [[<catalog_name>.]<database_name>.]<table_name>; REFRESH TABLE table1; ``` - diff --git a/sidebars.ts b/sidebars.ts index 9da150df0faab..bbb428d54062d 100644 --- a/sidebars.ts +++ b/sidebars.ts @@ -446,6 +446,7 @@ const sidebars: SidebarsConfig = { }, 'lakehouse/data-cache', 'lakehouse/meta-cache', + 'lakehouse/meta-cache/unified-meta-cache', 'lakehouse/compute-node', 'lakehouse/statistics', {