diff --git a/src/content/docs/guides/testing/write-tests.mdx b/src/content/docs/guides/testing/write-tests.mdx index ef087b8b7..30c9632d3 100644 --- a/src/content/docs/guides/testing/write-tests.mdx +++ b/src/content/docs/guides/testing/write-tests.mdx @@ -354,7 +354,7 @@ key and opt into graceful skipping with `skip: {on: capability-unavailable}`: # tests/gcs/test.yaml suite: gcs-smoke requires: - operators: [from_gcs] + operators: [from_google_cloud_storage] skip: on: capability-unavailable ``` @@ -382,7 +382,7 @@ You now have a project that owns its inputs, tests, fixtures, and baselines. From here you can: - testing/run-tests to learn about executing the suite, - selecting tests, and setting up CI. +selecting tests, and setting up CI. - Add custom runners under `runners/` when you need specialized logic around `tenzir` invocations. - Build Python fixtures that publish or verify data through the helper APIs in diff --git a/src/content/docs/integrations/google/cloud-storage.mdx b/src/content/docs/integrations/google/cloud-storage.mdx index 849436bb9..f6aad2b66 100644 --- a/src/content/docs/integrations/google/cloud-storage.mdx +++ b/src/content/docs/integrations/google/cloud-storage.mdx @@ -14,20 +14,20 @@ Default Credentials](https://google.aip.dev/auth/4110). ## Examples -Use from_gcs to read files from Cloud +Use from_google_cloud_storage to read files from Cloud Storage. It supports glob patterns and automatic format detection. For writing, use save_gcs with a print operator. ### Read events from a file in a bucket ```tql -from_gcs "gs://bucket/path/to/file.json" +from_google_cloud_storage "gs://bucket/path/to/file.json" ``` ### Read all JSON files from a bucket ```tql -from_gcs "gs://bucket/logs/**.json" +from_google_cloud_storage "gs://bucket/logs/**.json" ``` ### Write an event to a file in a bucket diff --git a/src/content/docs/reference/operators.mdx b/src/content/docs/reference/operators.mdx index d5390fbc9..58de31782 100644 --- a/src/content/docs/reference/operators.mdx +++ b/src/content/docs/reference/operators.mdx @@ -363,14 +363,14 @@ operators: description: 'Receives events via Fluent Bit.' example: 'from_fluent_bit "opentelemetry"' path: 'reference/operators/from_fluent_bit' - - name: 'from_gcs' - description: 'Reads one or multiple files from Google Cloud Storage.' - example: 'from_gcs "gs://my-bucket/data/**.json"' - path: 'reference/operators/from_gcs' - name: 'from_google_cloud_pubsub' description: 'Subscribes to a Google Cloud Pub/Sub subscription and yields events.' example: 'from_google_cloud_pubsub project_id="my-project", subscription_id="my-sub"' path: 'reference/operators/from_google_cloud_pubsub' + - name: 'from_google_cloud_storage' + description: 'Reads one or multiple files from Google Cloud Storage.' + example: 'from_google_cloud_storage "gs://my-bucket/data/**.json"' + path: 'reference/operators/from_google_cloud_storage' - name: 'from_http' description: 'Sends and receives HTTP/1.1 requests.' example: 'from_http "0.0.0.0:8080"' @@ -735,6 +735,10 @@ operators: description: 'Sends OCSF events to Amazon Security Lake.' example: 'to_amazon_security_lake "s3://…"' path: 'reference/operators/to_amazon_security_lake' + - name: 'to_azure_blob_storage' + description: 'Writes events to one or multiple blobs in Azure Blob Storage.' 
+ example: 'to_azure_blob_storage "abfs://container/data/{uuid}.json" { write_ndjson }' + path: 'reference/operators/to_azure_blob_storage' - name: 'to_azure_log_analytics' description: 'Sends events to the Microsoft Azure Logs Ingestion API.' example: 'to_azure_log_analytics tenant_id="...", workspace_id="..."' @@ -743,6 +747,10 @@ operators: description: 'Sends events to a ClickHouse table.' example: 'to_clickhouse table="my_table"' path: 'reference/operators/to_clickhouse' + - name: 'to_file' + description: 'Writes events to one or multiple files on a filesystem.' + example: 'to_file "/tmp/out.json" { write_ndjson }' + path: 'reference/operators/to_file' - name: 'to_fluent_bit' description: 'Sends events via Fluent Bit.' example: 'to_fluent_bit "elasticsearch" …' @@ -755,6 +763,10 @@ operators: description: 'Publishes events to a Google Cloud Pub/Sub topic.' example: 'to_google_cloud_pubsub project_id="my-project", topic_id="alerts", message=text' path: 'reference/operators/to_google_cloud_pubsub' + - name: 'to_google_cloud_storage' + description: 'Writes events to one or multiple objects in Google Cloud Storage.' + example: 'to_google_cloud_storage "gs://my-bucket/data/{uuid}.json" { write_ndjson }' + path: 'reference/operators/to_google_cloud_storage' - name: 'to_google_secops' description: 'Sends unstructured events to a Google SecOps Chronicle instance.' example: 'to_google_secops …' @@ -775,6 +787,10 @@ operators: description: 'Sends events to an OpenSearch-compatible Bulk API.' example: 'to_opensearch "localhost:9200", …' path: 'reference/operators/to_opensearch' + - name: 'to_s3' + description: 'Writes events to one or multiple objects in Amazon S3.' + example: 'to_s3 "s3://my-bucket/data/{uuid}.json" { write_ndjson }' + path: 'reference/operators/to_s3' - name: 'to_sentinelone_data_lake' description: 'Sends security events to SentinelOne Singularity Data Lake via REST API.' example: 'to_sentinelone_data_lake "https://…", …' @@ -2239,18 +2255,18 @@ from_fluent_bit "opentelemetry" - + ```tql -from_gcs "gs://my-bucket/data/**.json" +from_google_cloud_pubsub project_id="my-project", subscription_id="my-sub" ``` - + ```tql -from_google_cloud_pubsub project_id="my-project", subscription_id="my-sub" +from_google_cloud_storage "gs://my-bucket/data/**.json" ``` @@ -2595,6 +2611,14 @@ to_amazon_security_lake "s3://…" + + +```tql +to_azure_blob_storage "abfs://container/data/{uuid}.json" { write_ndjson } +``` + + + ```tql @@ -2611,6 +2635,14 @@ to_clickhouse table="my_table" + + +```tql +to_file "/tmp/out.json" { write_ndjson } +``` + + + ```tql @@ -2635,6 +2667,14 @@ to_google_cloud_pubsub project_id="my-project", topic_id="alerts", message=text + + +```tql +to_google_cloud_storage "gs://my-bucket/data/{uuid}.json" { write_ndjson } +``` + + + ```tql @@ -2675,6 +2715,14 @@ to_opensearch "localhost:9200", … + + +```tql +to_s3 "s3://my-bucket/data/{uuid}.json" { write_ndjson } +``` + + + ```tql diff --git a/src/content/docs/reference/operators/from_azure_blob_storage.mdx b/src/content/docs/reference/operators/from_azure_blob_storage.mdx index 40f2d49a2..811525086 100644 --- a/src/content/docs/reference/operators/from_azure_blob_storage.mdx +++ b/src/content/docs/reference/operators/from_azure_blob_storage.mdx @@ -10,7 +10,7 @@ Reads one or multiple files from Azure Blob Storage. 
```tql from_azure_blob_storage url:string, [account_key=string, watch=bool, - remove=bool, rename=string->string, path_field=field, max_age=duration] { … } + remove=bool, rename=string->string, max_age=duration] { … } ``` ## Description @@ -90,7 +90,10 @@ from_azure_blob_storage "abfs://input/**.json", ### Add source path to events ```tql -from_azure_blob_storage "abfs://data/**.json", path_field=source_file +from_azure_blob_storage "abfs://data/**.json" { + read_json + source_file = $file.path +} ``` ## See Also diff --git a/src/content/docs/reference/operators/from_file.mdx b/src/content/docs/reference/operators/from_file.mdx index 10ded2a93..5119ed4f3 100644 --- a/src/content/docs/reference/operators/from_file.mdx +++ b/src/content/docs/reference/operators/from_file.mdx @@ -10,7 +10,7 @@ Reads one or multiple files from a filesystem. ```tql from_file url:string, [watch=bool, remove=bool, rename=string->string, - path_field=field, max_age=duration, mmap=bool] { … } + max_age=duration, mmap=bool] { … } ``` ## Description diff --git a/src/content/docs/reference/operators/from_gcs.mdx b/src/content/docs/reference/operators/from_google_cloud_storage.mdx similarity index 74% rename from src/content/docs/reference/operators/from_gcs.mdx rename to src/content/docs/reference/operators/from_google_cloud_storage.mdx index f2748a5f6..bcd8949a4 100644 --- a/src/content/docs/reference/operators/from_gcs.mdx +++ b/src/content/docs/reference/operators/from_google_cloud_storage.mdx @@ -1,7 +1,7 @@ --- -title: from_gcs +title: from_google_cloud_storage category: Inputs/Events -example: 'from_gcs "gs://my-bucket/data/**.json"' +example: 'from_google_cloud_storage "gs://my-bucket/data/**.json"' --- import FromFileCommonParams from '@partials/operators/FromFileCommonParams.mdx'; @@ -9,13 +9,13 @@ import FromFileCommonParams from '@partials/operators/FromFileCommonParams.mdx'; Reads one or multiple files from Google Cloud Storage. ```tql -from_gcs url:string, [anonymous=bool, watch=bool, remove=bool, - rename=string->string, path_field=field, max_age=duration] { … } +from_google_cloud_storage url:string, [anonymous=bool, watch=bool, remove=bool, + rename=string->string, max_age=duration] { … } ``` ## Description -The `from_gcs` operator reads files from Google Cloud Storage, with support for +The `from_google_cloud_storage` operator reads files from Google Cloud Storage, with support for glob patterns, automatic format detection, and file monitoring. By default, authentication is handled by Google's Application Default @@ -56,19 +56,19 @@ Defaults to `false`. 
### Read every JSON file from a bucket ```tql -from_gcs "gs://my-bucket/data/**.json" +from_google_cloud_storage "gs://my-bucket/data/**.json" ``` ### Read CSV files from a public bucket ```tql -from_gcs "gs://public-dataset/data.csv", anonymous=true +from_google_cloud_storage "gs://public-dataset/data.csv", anonymous=true ``` ### Read Zeek logs continuously ```tql -from_gcs "gs://logs/zeek/**.log", watch=true { +from_google_cloud_storage "gs://logs/zeek/**.log", watch=true { read_zeek_tsv } ``` @@ -76,13 +76,16 @@ from_gcs "gs://logs/zeek/**.log", watch=true { ### Add source path to events ```tql -from_gcs "gs://data-bucket/**.json", path_field=source_file +from_google_cloud_storage "gs://data-bucket/**.json" { + read_json + source_file = $file.path +} ``` ### Read Suricata EVE JSON logs with custom parsing ```tql -from_gcs "gs://security-logs/suricata/**.json" { +from_google_cloud_storage "gs://security-logs/suricata/**.json" { read_suricata } ``` diff --git a/src/content/docs/reference/operators/from_s3.mdx b/src/content/docs/reference/operators/from_s3.mdx index c91d5ce4b..c51ad9b7b 100644 --- a/src/content/docs/reference/operators/from_s3.mdx +++ b/src/content/docs/reference/operators/from_s3.mdx @@ -11,7 +11,7 @@ Reads one or multiple files from Amazon S3. ```tql from_s3 url:string, [anonymous=bool, aws_iam=record, watch=bool, - remove=bool, rename=string->string, path_field=field, max_age=duration] { … } + remove=bool, rename=string->string, max_age=duration] { … } ``` ## Description @@ -98,7 +98,10 @@ from_s3 "s3://input-bucket/**.json", ### Add source path to events ```tql -from_s3 "s3://data-bucket/**.json", path_field=source_file +from_s3 "s3://data-bucket/**.json" { + read_json + source_file = $file.path +} ``` ### Read Zeek logs with anonymous access diff --git a/src/content/docs/reference/operators/to_azure_blob_storage.mdx b/src/content/docs/reference/operators/to_azure_blob_storage.mdx new file mode 100644 index 000000000..520440dba --- /dev/null +++ b/src/content/docs/reference/operators/to_azure_blob_storage.mdx @@ -0,0 +1,110 @@ +--- +title: to_azure_blob_storage +category: Outputs/Events +example: 'to_azure_blob_storage "abfs://container/data/{uuid}.json" { write_ndjson }' +--- + +import ToArrowFsCommonParams from '@partials/operators/ToArrowFsCommonParams.mdx'; + +Writes events to one or multiple blobs in Azure Blob Storage. + +```tql +to_azure_blob_storage url:string, [account_key=string, max_size=int, + timeout=duration, partition_by=list] { … } +``` + +## Description + +The `to_azure_blob_storage` operator writes events to Azure Blob Storage, +automatically opening new blobs when a rotation condition triggers. It +supports hive-style partitioning through a `**` placeholder in the URL and +per-partition unique blob names through a `{uuid}` placeholder. + +By default, authentication is handled by the Azure SDK's credential chain which +may read from multiple environment variables, such as: + +- `AZURE_TENANT_ID` +- `AZURE_CLIENT_ID` +- `AZURE_CLIENT_SECRET` +- `AZURE_AUTHORITY_HOST` +- `AZURE_CLIENT_CERTIFICATE_PATH` +- `AZURE_FEDERATED_TOKEN_FILE` + +### `url: string` + +URL identifying the Azure Blob Storage location where data should be written. + +Supported URI formats: + +1. `abfs[s]://.blob.core.windows.net[/[/]]` +2. `abfs[s]://@.dfs.core.windows.net[/]` +3. `abfs[s]://[@][.][:][/[/]]` +4. 
`abfs[s]://[@][/]` + +(1) and (2) are compatible with the Azure Data Lake Storage Gen2 URIs, (3) is +for Azure Blob Storage compatible services including Azurite, and (4) is a +shorter version of (1) and (2). + +The path portion may contain two placeholders: + +- `**` marks the location where hive partition segments are inserted. When + present, `partition_by` must also be set, and vice versa. +- `{uuid}` expands to a fresh UUIDv7 for every blob. This is required when + partitioning or when rotation can produce multiple blobs, so that rotated + or per-partition blobs do not overwrite each other. + +:::tip[Authenticate with the Azure CLI] +Run `az login` on the command-line to authenticate the current user with Azure's +command-line arguments. +::: + +### `account_key = string (optional)` + +Account key for authenticating with Azure Blob Storage. + + + +## Examples + +### Write a single NDJSON blob + +```tql +to_azure_blob_storage "abfs://my-container/events/out_{uuid}.json" { + write_ndjson +} +``` + +### Partition events into Parquet blobs by date + +```tql +to_azure_blob_storage "abfs://my-container/events/**/data_{uuid}.parquet", + partition_by=[year, month] { + write_parquet +} +``` + +### Write with explicit account key + +```tql +to_azure_blob_storage "abfs://container/data/out_{uuid}.json", + account_key="your-account-key" { + write_ndjson +} +``` + +### Rotate to a new blob every 5 minutes + +```tql +to_azure_blob_storage "abfs://my-container/logs/events_{uuid}.json", + timeout=5min { + write_ndjson +} +``` + +## See Also + +- from_azure_blob_storage +- save_azure_blob_storage +- to_file +- to_hive +- microsoft/azure-blob-storage diff --git a/src/content/docs/reference/operators/to_file.mdx b/src/content/docs/reference/operators/to_file.mdx new file mode 100644 index 000000000..95a6b28b0 --- /dev/null +++ b/src/content/docs/reference/operators/to_file.mdx @@ -0,0 +1,99 @@ +--- +title: to_file +category: Outputs/Events +example: 'to_file "/tmp/out.json" { write_ndjson }' +--- + +import ToArrowFsCommonParams from '@partials/operators/ToArrowFsCommonParams.mdx'; + +Writes events to one or multiple files on a filesystem. + +```tql +to_file url:string, [max_size=int, timeout=duration, + partition_by=list] { … } +``` + +## Description + +The `to_file` operator writes events to local filesystems, automatically +opening new files when a rotation condition triggers. It supports hive-style +partitioning through a `**` placeholder in the URL and per-partition unique +filenames through a `{uuid}` placeholder. + +### `url: string` + +Path or URL identifying where files should be written. + +When `~` is the first character, it is substituted with the value of the +`$HOME` environment variable. Relative paths are resolved against the current +working directory. + +The path may contain two placeholders: + +- `**` marks the location where hive partition segments are inserted. When + present, `partition_by` must also be set, and vice versa. +- `{uuid}` expands to a fresh UUIDv7 for every file. This is required when + partitioning or when rotation can produce multiple files, so that rotated + or per-partition files do not overwrite each other. 
+ + + +## Examples + +### Write all events to a single NDJSON file + +```tql +from {msg: "hello"}, {msg: "world"} +to_file "/tmp/out.json" { + write_ndjson +} +``` + +### Partition events by a field into separate files + +```tql +from {region: "us", n: 1}, + {region: "eu", n: 2}, + {region: "us", n: 3} +to_file "/tmp/events/**/data_{uuid}.json", partition_by=[region] { + write_ndjson +} +// Produces files under: +// /tmp/events/region=us/data_.json +// /tmp/events/region=eu/data_.json +``` + +### Partition by multiple fields + +```tql +to_file "/tmp/events/**/data_{uuid}.parquet", + partition_by=[year, month] { + write_parquet +} +// Produces files under: +// /tmp/events/year=/month=/data_.parquet +``` + +### Rotate files when they exceed a size + +```tql +to_file "/tmp/logs/events_{uuid}.json", max_size=10M { + write_ndjson +} +``` + +### Rotate files after a fixed duration + +```tql +to_file "/tmp/logs/events_{uuid}.json", timeout=1min { + write_ndjson +} +``` + +## See Also + +- from_file +- save_file +- to +- to_hive +- file diff --git a/src/content/docs/reference/operators/to_google_cloud_storage.mdx b/src/content/docs/reference/operators/to_google_cloud_storage.mdx new file mode 100644 index 000000000..146becef9 --- /dev/null +++ b/src/content/docs/reference/operators/to_google_cloud_storage.mdx @@ -0,0 +1,96 @@ +--- +title: to_google_cloud_storage +category: Outputs/Events +example: 'to_google_cloud_storage "gs://my-bucket/data/{uuid}.json" { write_ndjson }' +--- + +import ToArrowFsCommonParams from '@partials/operators/ToArrowFsCommonParams.mdx'; + +Writes events to one or multiple objects in Google Cloud Storage. + +```tql +to_google_cloud_storage url:string, [anonymous=bool, max_size=int, + timeout=duration, partition_by=list] { … } +``` + +## Description + +The `to_google_cloud_storage` operator writes events to Google Cloud Storage, +automatically opening new objects when a rotation condition triggers. It +supports hive-style partitioning through a `**` placeholder in the URL and +per-partition unique object names through a `{uuid}` placeholder. + +By default, authentication is handled by Google's Application Default +Credentials (ADC) chain, which may read from multiple sources: + +- `GOOGLE_APPLICATION_CREDENTIALS` environment variable pointing to a service + account key file +- User credentials from `gcloud auth application-default login` +- Service account attached to the compute instance (Compute Engine, GKE) +- Google Cloud SDK credentials + +### `url: string` + +URL identifying the Google Cloud Storage location where data should be written. + +The syntax is `gs:///(?)`. The +`` are query parameters. Per the [Arrow +documentation](https://arrow.apache.org/docs/r/articles/fs.html#connecting-directly-with-a-uri), +the following options exist: + +> For GCS, the supported parameters are `scheme`, `endpoint_override`, and +> `retry_limit_seconds`. + +The path portion may contain two placeholders: + +- `**` marks the location where hive partition segments are inserted. When + present, `partition_by` must also be set, and vice versa. +- `{uuid}` expands to a fresh UUIDv7 for every object. This is required when + partitioning or when rotation can produce multiple objects, so that rotated + or per-partition objects do not overwrite each other. + +### `anonymous = bool (optional)` + +Use anonymous credentials instead of any configured authentication. This only +works for publicly writable buckets. + +Defaults to `false`. 
+ + + +## Examples + +### Write a single NDJSON object + +```tql +to_google_cloud_storage "gs://my-bucket/events/out_{uuid}.json" { + write_ndjson +} +``` + +### Partition events into Parquet objects by date + +```tql +to_google_cloud_storage "gs://my-bucket/events/**/data_{uuid}.parquet", + partition_by=[year, month] { + write_parquet +} +``` + +### Rotate to a new object every 100 MB + +```tql +to_google_cloud_storage "gs://my-bucket/logs/events_{uuid}.json.gz", + max_size=100M { + write_ndjson + compress_gzip +} +``` + +## See Also + +- from_google_cloud_storage +- save_gcs +- to_file +- to_hive +- google/cloud-storage diff --git a/src/content/docs/reference/operators/to_s3.mdx b/src/content/docs/reference/operators/to_s3.mdx new file mode 100644 index 000000000..26e677d64 --- /dev/null +++ b/src/content/docs/reference/operators/to_s3.mdx @@ -0,0 +1,122 @@ +--- +title: to_s3 +category: Outputs/Events +example: 'to_s3 "s3://my-bucket/data/{uuid}.json" { write_ndjson }' +--- + +import ToArrowFsCommonParams from '@partials/operators/ToArrowFsCommonParams.mdx'; +import AWSIAMOptions from '@partials/operators/AWSIAMOptions.mdx'; + +Writes events to one or multiple objects in Amazon S3. + +```tql +to_s3 url:string, [anonymous=bool, aws_iam=record, max_size=int, + timeout=duration, partition_by=list] { … } +``` + +## Description + +The `to_s3` operator writes events to Amazon S3, automatically opening new +objects when a rotation condition triggers. It supports hive-style +partitioning through a `**` placeholder in the URL and per-partition unique +object keys through a `{uuid}` placeholder. + +By default, authentication is handled by AWS's default credentials provider +chain, which may read from multiple environment variables and credential files: + +- `~/.aws/credentials` and `~/.aws/config` +- `AWS_ACCESS_KEY_ID` and `AWS_SECRET_ACCESS_KEY` +- `AWS_SESSION_TOKEN` +- EC2 instance metadata service +- ECS container credentials + +### `url: string` + +URL identifying the S3 location where data should be written. + +Supported URI format: +`s3://[:@]/(?)` + +Options can be appended to the path as query parameters: + +- `region`: AWS region (e.g., `us-east-1`) +- `scheme`: Connection scheme (`http` or `https`) +- `endpoint_override`: Custom S3-compatible endpoint +- `allow_bucket_creation`: Allow creating buckets if they don't exist +- `allow_bucket_deletion`: Allow deleting buckets + +The path portion may contain two placeholders: + +- `**` marks the location where hive partition segments are inserted. When + present, `partition_by` must also be set, and vice versa. +- `{uuid}` expands to a fresh UUIDv7 for every object. This is required when + partitioning or when rotation can produce multiple objects, so that rotated + or per-partition objects do not overwrite each other. + +### `anonymous = bool (optional)` + +Use anonymous credentials instead of any configured authentication. + +Defaults to `false`. 
+ + + + + +## Examples + +### Write a single NDJSON object to S3 + +```tql +to_s3 "s3://my-bucket/events/out_{uuid}.json" { + write_ndjson +} +``` + +### Partition events by date into Parquet objects + +```tql +to_s3 "s3://my-bucket/events/**/data_{uuid}.parquet", + partition_by=[year, month, day] { + write_parquet +} +// Produces objects under: +// s3://my-bucket/events/year=/month=/day=/data_.parquet +``` + +### Write with explicit credentials + +```tql +to_s3 "s3://my-bucket/data/out_{uuid}.json", + aws_iam={ + access_key_id: "AKIAIOSFODNN7EXAMPLE", + secret_access_key: "wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY", + } { + write_ndjson +} +``` + +### Write to an S3-compatible endpoint + +```tql +to_s3 "s3://my-bucket/data/out_{uuid}.json?endpoint_override=minio.example.com:9000&scheme=http" { + write_ndjson +} +``` + +### Rotate to a new object every 10 MB + +```tql +to_s3 "s3://my-bucket/logs/events_{uuid}.json.gz", max_size=10M { + write_ndjson + compress_gzip +} +``` + +## See Also + +- from_s3 +- save_s3 +- to_file +- to_hive +- amazon/s3 diff --git a/src/content/docs/reference/test-framework/index.mdx b/src/content/docs/reference/test-framework/index.mdx index 27e2e6366..802240a43 100644 --- a/src/content/docs/reference/test-framework/index.mdx +++ b/src/content/docs/reference/test-framework/index.mdx @@ -692,7 +692,7 @@ only supported category is `operators`: # tests/gcs/test.yaml suite: gcs-integration requires: - operators: [from_gcs, to_gcs] + operators: [from_google_cloud_storage, to_google_cloud_storage] skip: on: capability-unavailable reason: requires GCS operators diff --git a/src/partials/operators/FromFileCommonParams.mdx b/src/partials/operators/FromFileCommonParams.mdx index 17c868a00..ebff60947 100644 --- a/src/partials/operators/FromFileCommonParams.mdx +++ b/src/partials/operators/FromFileCommonParams.mdx @@ -25,13 +25,6 @@ The operator automatically creates any intermediate directories required for the target path. If the target path ends with a trailing slash (`/`), the original filename will be automatically appended to create the final path. -### `path_field = field (optional)` - -This makes the operator insert the path to the file where an event originated -from before emitting it. - -By default, paths will not be inserted into the outgoing events. - ### `max_age = duration (optional)` Only process files that were modified within the specified duration from the @@ -51,8 +44,11 @@ following fields: | `path` | `string` | The absolute path of the file being read | | `mtime` | `time` | The last modification time of the file | -For example, to add the source path to each event: +For example, to attach the source path to each event: ```tql -source = $file.path +from_file "/data/*.json" { + read_json + source = $file.path +} ``` diff --git a/src/partials/operators/ToArrowFsCommonParams.mdx b/src/partials/operators/ToArrowFsCommonParams.mdx new file mode 100644 index 000000000..f2c28ac50 --- /dev/null +++ b/src/partials/operators/ToArrowFsCommonParams.mdx @@ -0,0 +1,34 @@ +### `max_size = int (optional)` + +Rotates to a new file after the current file exceeds this size in bytes. +Because rotation only fires after the threshold is crossed, individual files +may be slightly larger than `max_size`. + +Defaults to `100M`. + +### `timeout = duration (optional)` + +Rotates to a new file after the current file has been open for this duration. +Rotation is measured from the time the file was first opened. + +Defaults to `5min`. 
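+
+For example, both rotation limits can be set at once (as they are by default).
+The sketch below uses `to_file` with illustrative values and assumes that
+whichever condition is reached first starts the next file:
+
+```tql
+to_file "/tmp/logs/events_{uuid}.json", max_size=500M, timeout=30min {
+  write_ndjson
+}
+```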
+ +### `partition_by = list (optional)` + +A list of fields used to partition events into separate files. For every +distinct combination of partition-field values, a separate file (or group of +rotated files) is written. The URL must contain a `**` placeholder, which is +replaced by the hive-style path `field1=value1/field2=value2/…`. + +Unlike to_hive, the partitioning fields are **not** stripped from +the written events — they remain in each record. + +### `{ … }` + +Pipeline that transforms the incoming events into the byte stream that is +written to each file. The pipeline must return bytes, so it must end with a +writer such as `write_ndjson`, `write_parquet`, or `write_csv`, optionally +followed by a compressor such as `compress_gzip`. + +The subpipeline runs once per output file. When rotation or partitioning +produces a new file, a new instance of the subpipeline is spawned for it.
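+
+For example, this sketch combines partitioning with a per-file subpipeline
+that ends in a writer followed by a compressor. It uses `to_file`, and the
+`region` partition field is only illustrative:
+
+```tql
+to_file "/tmp/events/**/part_{uuid}.json.gz", partition_by=[region] {
+  // This block runs once per partition (and once per rotated file within it).
+  write_ndjson
+  compress_gzip
+}
+```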