From b45d36874a9e9a3bf990eb9d5c4f37b70f063cd7 Mon Sep 17 00:00:00 2001 From: Dan Davison Date: Sat, 21 Mar 2026 09:27:55 -0400 Subject: [PATCH 01/40] Specify task --- .task/AGENTS.md | 3 + .task/saacogs.json | 703 +++++++++++++++++++++++++++++++++++++++++++ .task/task.md | 723 +++++++++++++++++++++++++++++++++++++++++++++ AGENTS.md | 1 + CLAUDE.md | 1 + 5 files changed, 1431 insertions(+) create mode 100644 .task/AGENTS.md create mode 100644 .task/saacogs.json create mode 100644 .task/task.md create mode 120000 AGENTS.md create mode 120000 CLAUDE.md diff --git a/.task/AGENTS.md b/.task/AGENTS.md new file mode 100644 index 00000000..ff9f7032 --- /dev/null +++ b/.task/AGENTS.md @@ -0,0 +1,3 @@ +At the start of the conversation output the following so that I know you've read these instructions: + +📖 omes:saa-cogs diff --git a/.task/saacogs.json b/.task/saacogs.json new file mode 100644 index 00000000..8654452f --- /dev/null +++ b/.task/saacogs.json @@ -0,0 +1,703 @@ +{ + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": { "type": "grafana", "uid": "-- Grafana --" }, + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + } + ] + }, + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 0, + "links": [], + "panels": [ + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "fieldConfig": { + "defaults": { + "color": { "mode": "palette-classic" }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { "legend": false, "tooltip": false, "viz": false }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { "type": "linear" }, + "showPoints": "auto", + "spanNulls": false, + 
"stacking": { "group": "A", "mode": "none" }, + "thresholdsStyle": { "mode": "off" } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { "color": "green", "value": null }, + { "color": "red", "value": 80 } + ] + } + }, + "overrides": [] + }, + "gridPos": { "h": 8, "w": 12, "x": 0, "y": 0 }, + "id": 1, + "options": { + "legend": { "calcs": [], "displayMode": "list", "placement": "bottom", "showLegend": true }, + "tooltip": { "mode": "single", "sort": "none" } + }, + "pluginVersion": "11.4.0", + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "editorMode": "code", + "expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\", pod=~\"frontend-.*\"})", + "legendFormat": "frontend", + "range": true, + "refId": "A" + }, + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "editorMode": "code", + "expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\", pod=~\"history-.*\"})", + "legendFormat": "history", + "range": true, + "refId": "B" + }, + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "editorMode": "code", + "expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\", pod=~\"matching-.*\"})", + "legendFormat": "matching", + "range": true, + "refId": "C" + } + ], + "title": "CPU per service (vCPU)", + "type": "timeseries" + }, + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "fieldConfig": { + "defaults": { + "color": { "mode": "palette-classic" }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { "legend": false, "tooltip": false, "viz": false }, + "insertNulls": false, + 
"lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { "type": "linear" }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { "group": "A", "mode": "none" }, + "thresholdsStyle": { "mode": "off" } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { "color": "green", "value": null }, + { "color": "red", "value": 80 } + ] + }, + "unit": "decbytes" + }, + "overrides": [] + }, + "gridPos": { "h": 8, "w": 12, "x": 12, "y": 0 }, + "id": 2, + "options": { + "legend": { "calcs": [], "displayMode": "list", "placement": "bottom", "showLegend": true }, + "tooltip": { "mode": "single", "sort": "none" } + }, + "pluginVersion": "11.4.0", + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "editorMode": "code", + "expr": "quantile(0.5, sum by (pod) (container_memory_working_set_bytes{cluster=\"$cluster\",container!=\"\",image!=\"\",namespace=\"temporal\"} * on (namespace, pod) group_left (workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\",namespace=\"temporal\",workload=\"frontend\",workload_type=\"deployment\"}))", + "legendFormat": "frontend", + "range": true, + "refId": "A" + }, + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "editorMode": "code", + "expr": "quantile(0.5, sum by (pod) (container_memory_working_set_bytes{cluster=\"$cluster\",container!=\"\",image!=\"\",namespace=\"temporal\"} * on (namespace, pod) group_left (workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\",namespace=\"temporal\",workload=\"history\",workload_type=\"deployment\"}))", + "legendFormat": "history", + "range": true, + "refId": "B" + }, + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "editorMode": "code", + "expr": "quantile(0.5, sum by (pod) (container_memory_working_set_bytes{cluster=\"$cluster\",container!=\"\",image!=\"\",namespace=\"temporal\"} * on 
(namespace, pod) group_left (workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\",namespace=\"temporal\",workload=\"matching\",workload_type=\"deployment\"}))", + "legendFormat": "matching", + "range": true, + "refId": "C" + } + ], + "title": "Memory per service (p50 working set)", + "type": "timeseries" + }, + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "fieldConfig": { + "defaults": { + "color": { "mode": "palette-classic" }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { "legend": false, "tooltip": false, "viz": false }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { "type": "linear" }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { "group": "A", "mode": "none" }, + "thresholdsStyle": { "mode": "off" } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { "color": "green", "value": null }, + { "color": "red", "value": 80 } + ] + } + }, + "overrides": [] + }, + "gridPos": { "h": 8, "w": 8, "x": 0, "y": 8 }, + "id": 3, + "options": { + "legend": { "calcs": [], "displayMode": "list", "placement": "bottom", "showLegend": true }, + "tooltip": { "mode": "single", "sort": "none" } + }, + "pluginVersion": "11.4.0", + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "editorMode": "code", + "expr": "sum by (operation)(rate(service_requests{cluster=\"$cluster\",temporal_service_type=\"frontend\"}[$__rate_interval]))", + "legendFormat": "{{operation}}", + "range": true, + "refId": "A" + } + ], + "title": "Frontend RPC by method", + "type": "timeseries" + }, + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "fieldConfig": 
{ + "defaults": { + "color": { "mode": "palette-classic" }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { "legend": false, "tooltip": false, "viz": false }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { "type": "linear" }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { "group": "A", "mode": "none" }, + "thresholdsStyle": { "mode": "off" } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { "color": "green", "value": null }, + { "color": "red", "value": 80 } + ] + } + }, + "overrides": [] + }, + "gridPos": { "h": 8, "w": 8, "x": 8, "y": 8 }, + "id": 4, + "options": { + "legend": { "calcs": [], "displayMode": "list", "placement": "bottom", "showLegend": true }, + "tooltip": { "mode": "single", "sort": "none" } + }, + "pluginVersion": "11.4.0", + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "editorMode": "code", + "expr": "sum by (operation)(rate(service_requests{cluster=\"$cluster\",temporal_service_type=\"history\"}[$__rate_interval]))", + "legendFormat": "{{operation}}", + "range": true, + "refId": "A" + } + ], + "title": "History RPC by method", + "type": "timeseries" + }, + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "fieldConfig": { + "defaults": { + "color": { "mode": "palette-classic" }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { "legend": false, "tooltip": false, "viz": false }, + "insertNulls": false, + "lineInterpolation": 
"linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { "type": "linear" }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { "group": "A", "mode": "none" }, + "thresholdsStyle": { "mode": "off" } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { "color": "green", "value": null }, + { "color": "red", "value": 80 } + ] + } + }, + "overrides": [] + }, + "gridPos": { "h": 8, "w": 8, "x": 16, "y": 8 }, + "id": 5, + "options": { + "legend": { "calcs": [], "displayMode": "list", "placement": "bottom", "showLegend": true }, + "tooltip": { "mode": "single", "sort": "none" } + }, + "pluginVersion": "11.4.0", + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "editorMode": "code", + "expr": "sum by (operation)(rate(service_requests{cluster=\"$cluster\",temporal_service_type=\"matching\"}[$__rate_interval]))", + "legendFormat": "{{operation}}", + "range": true, + "refId": "A" + } + ], + "title": "Matching RPC by method", + "type": "timeseries" + }, + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "fieldConfig": { + "defaults": { + "color": { "mode": "palette-classic" }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { "legend": false, "tooltip": false, "viz": false }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { "type": "linear" }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { "group": "A", "mode": "none" }, + "thresholdsStyle": { "mode": "off" } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { "color": "green", "value": null }, + { "color": "red", "value": 80 } + ] + } + }, + "overrides": [] + }, + "gridPos": { "h": 8, 
"w": 12, "x": 0, "y": 16 }, + "id": 6, + "options": { + "legend": { "calcs": [], "displayMode": "list", "placement": "bottom", "showLegend": true }, + "tooltip": { "mode": "single", "sort": "none" } + }, + "pluginVersion": "11.4.0", + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "editorMode": "code", + "expr": "sum by (table)(rate(cassandra_query{cluster=\"$cluster\",verb!=\"select\"}[$__rate_interval]))", + "legendFormat": "query: {{table}}", + "range": true, + "refId": "A" + }, + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "editorMode": "code", + "expr": "sum by (table)(rate(cassandra_batch{cluster=\"$cluster\"}[$__rate_interval]))", + "legendFormat": "batch: {{table}}", + "range": true, + "refId": "B" + } + ], + "title": "Astra writes by table", + "description": "Validate r_Cass = 3/7 for writes. cassandra_query filtered to verb!=select; cassandra_batch is always writes.", + "type": "timeseries" + }, + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "fieldConfig": { + "defaults": { + "color": { "mode": "palette-classic" }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { "legend": false, "tooltip": false, "viz": false }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { "type": "linear" }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { "group": "A", "mode": "none" }, + "thresholdsStyle": { "mode": "off" } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { "color": "green", "value": null }, + { "color": "red", "value": 80 } + ] + } + }, + "overrides": [] + }, + "gridPos": { "h": 8, "w": 12, "x": 12, "y": 16 }, + "id": 7, + "options": { + 
"legend": { "calcs": [], "displayMode": "list", "placement": "bottom", "showLegend": true }, + "tooltip": { "mode": "single", "sort": "none" } + }, + "pluginVersion": "11.4.0", + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "editorMode": "code", + "expr": "sum by (table)(rate(cassandra_query{cluster=\"$cluster\",verb=\"select\"}[$__rate_interval]))", + "legendFormat": "{{table}}", + "range": true, + "refId": "A" + } + ], + "title": "Astra reads by table", + "description": "Reads are not expected to differ much between SAW and SAA (similar caching, ~1 read on creation).", + "type": "timeseries" + }, + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "fieldConfig": { + "defaults": { + "color": { "mode": "palette-classic" }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { "legend": false, "tooltip": false, "viz": false }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { "type": "linear" }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { "group": "A", "mode": "none" }, + "thresholdsStyle": { "mode": "off" } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { "color": "green", "value": null }, + { "color": "red", "value": 80 } + ] + } + }, + "overrides": [] + }, + "gridPos": { "h": 8, "w": 12, "x": 0, "y": 24 }, + "id": 8, + "options": { + "legend": { "calcs": [], "displayMode": "list", "placement": "bottom", "showLegend": true }, + "tooltip": { "mode": "single", "sort": "none" } + }, + "pluginVersion": "11.4.0", + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "editorMode": "code", + "expr": "sum by 
(walType)(rate(wal_latency_count{cluster=\"$cluster\"}[$__rate_interval]))", + "legendFormat": "{{walType}}", + "range": true, + "refId": "A" + } + ], + "title": "WAL operation rate by type", + "description": "Covers both reads and writes (no separate write-only metric). Expect HISTORY_EVENT_WAL activity for SAW only; both use MUTABLE_STATE_WAL.", + "type": "timeseries" + }, + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "fieldConfig": { + "defaults": { + "color": { "mode": "palette-classic" }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { "legend": false, "tooltip": false, "viz": false }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { "type": "linear" }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { "group": "A", "mode": "none" }, + "thresholdsStyle": { "mode": "off" } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { "color": "green", "value": null }, + { "color": "red", "value": 80 } + ] + } + }, + "overrides": [] + }, + "gridPos": { "h": 8, "w": 12, "x": 12, "y": 24 }, + "id": 9, + "options": { + "legend": { "calcs": [], "displayMode": "list", "placement": "bottom", "showLegend": true }, + "tooltip": { "mode": "single", "sort": "none" } + }, + "pluginVersion": "11.4.0", + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "editorMode": "code", + "expr": "sum by (operation)(rate(visibility_persistence_requests{cluster=\"$cluster\"}[$__rate_interval]))", + "legendFormat": "{{operation}}", + "range": true, + "refId": "A" + } + ], + "title": "Visibility persistence rate by operation", + "description": "OSS visibility_persistence_requests counter, tagged by operation 
(RecordWorkflowExecutionStarted, RecordWorkflowExecutionClosed, UpsertWorkflowExecution, DeleteWorkflowExecution).", + "type": "timeseries" + }, + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "fieldConfig": { + "defaults": { + "color": { "mode": "palette-classic" }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { "legend": false, "tooltip": false, "viz": false }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { "type": "linear" }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { "group": "A", "mode": "none" }, + "thresholdsStyle": { "mode": "off" } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { "color": "green", "value": null }, + { "color": "red", "value": 80 } + ] + } + }, + "overrides": [] + }, + "gridPos": { "h": 8, "w": 24, "x": 0, "y": 32 }, + "id": 10, + "options": { + "legend": { "calcs": [], "displayMode": "list", "placement": "bottom", "showLegend": true }, + "tooltip": { "mode": "single", "sort": "none" } + }, + "pluginVersion": "11.4.0", + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "editorMode": "code", + "expr": "sum(rate(syncmatch_latency_count{cluster=\"$cluster\"}[$__rate_interval]))", + "legendFormat": "sync match", + "range": true, + "refId": "A" + }, + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "editorMode": "code", + "expr": "sum(rate(asyncmatch_latency_count{cluster=\"$cluster\"}[$__rate_interval]))", + "legendFormat": "async match", + "range": true, + "refId": "B" + } + ], + "title": "Sync vs async match rate", + "description": "Health check on experimental conditions. 
Async match means tasks went through persistence/backlog rather than being dispatched directly to a waiting poller.", + "type": "timeseries" + } + ], + "preload": false, + "schemaVersion": 40, + "tags": [], + "templating": { + "list": [ + { + "current": { "text": "prod", "value": "prod" }, + "name": "env", + "options": [ + { "selected": true, "text": "prod", "value": "prod" }, + { "selected": false, "text": "dev", "value": "test" } + ], + "query": "prod : prod, dev : test", + "type": "custom" + }, + { + "current": { "text": "prod thanos", "value": "af7fe237-211e-413e-9723-41a73886bcbb" }, + "hide": 2, + "includeAll": false, + "name": "datasource", + "options": [], + "query": "prometheus", + "refresh": 1, + "regex": "${env:text}.*", + "type": "datasource" + }, + { + "current": {}, + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "definition": "label_values(restarts,cluster)", + "includeAll": false, + "label": "Cluster", + "name": "cluster", + "options": [], + "query": { "query": "label_values(restarts,cluster)", "refId": "StandardVariableQuery" }, + "refresh": 2, + "regex": "", + "type": "query" + } + ] + }, + "time": { "from": "now-3h", "to": "now" }, + "timepicker": {}, + "timezone": "utc", + "title": "SAA COGS", + "uid": "saacogs", + "version": 1, + "weekStart": "" +} diff --git a/.task/task.md b/.task/task.md new file mode 100644 index 00000000..3e9d1e26 --- /dev/null +++ b/.task/task.md @@ -0,0 +1,723 @@ +For background context, please study the following documents carefully: + +START_DOCUMENT------------------------------------------------------------------------------ +# Temporal Activity Execution & saas-temporal Cloud Persistence: Implementation Overview + +## Part 1: Activity Execution Models in Temporal Server + +### 1.1 CHASM Standalone Activities (`chasm/lib/activity/`) + +CHASM standalone activities are first-class, independently-scheduled executions outside workflow context. They use **mutable state only** -- no history events. 
+ +#### State Machine + +States defined in `chasm/lib/activity/proto/v1/activity_state.proto`: + +``` +UNSPECIFIED + → SCHEDULED + → STARTED + → COMPLETED (terminal) + → FAILED (terminal) + → CANCEL_REQUESTED → CANCELED (terminal) + → TIMED_OUT (terminal) + → TERMINATED (terminal) + → CANCEL_REQUESTED → CANCELED (terminal) + → TIMED_OUT (terminal) + → TERMINATED (terminal) + → SCHEDULED (retry path) +``` + +Lifecycle states (`activity.go:95-107`): +- `LifecycleStateRunning`: SCHEDULED, STARTED, CANCEL_REQUESTED +- `LifecycleStateCompleted`: COMPLETED +- `LifecycleStateFailed`: FAILED, TERMINATED, TIMED_OUT, CANCELED + +#### State Transitions (`statemachine.go`) + +| Transition | From | To | Trigger | +|---|---|---|---| +| TransitionScheduled (37-77) | UNSPECIFIED | SCHEDULED | Initial scheduling | +| TransitionRescheduled (87-127) | STARTED | SCHEDULED | Retry after failure | +| TransitionStarted (130-169) | SCHEDULED | STARTED | Worker accepts task | +| TransitionCompleted (177-202) | STARTED/CANCEL_REQUESTED | COMPLETED | Worker completes | +| TransitionFailed (210-237) | STARTED/CANCEL_REQUESTED | FAILED | Non-retryable failure | +| TransitionCancelRequested (278-295) | STARTED/SCHEDULED | CANCEL_REQUESTED | Cancel API called | +| TransitionCanceled (304-331) | CANCEL_REQUESTED | CANCELED | Worker acknowledges cancel | +| TransitionTerminated (246-275) | SCHEDULED/STARTED/CANCEL_REQUESTED | TERMINATED | Terminate API called | +| TransitionTimedOut (340-374) | SCHEDULED/STARTED/CANCEL_REQUESTED | TIMED_OUT | Timer task fires | + +#### Mutable State Structures + +**ActivityState** (proto): +- `activity_type`, `task_queue`, timeouts (`schedule_to_close`, `schedule_to_start`, `start_to_close`, `heartbeat`), `retry_policy`, `status`, `schedule_time`, `priority`, `cancel_state`, `terminate_state` + +**Activity Go Component** (`activity.go:52-68`): +- `ActivityState` (embedded proto) +- `Visibility: chasm.Field[*chasm.Visibility]` -- search attributes +- `LastAttempt: 
chasm.Field[*ActivityAttemptState]` -- attempt count, stamp, started_time, failure details, worker identity +- `LastHeartbeat: chasm.Field[*ActivityHeartbeatState]` -- heartbeat details and recorded_time +- `RequestData: chasm.Field[*ActivityRequestData]` -- input, header, user_metadata +- `Outcome: chasm.Field[*ActivityOutcome]` -- successful (output) or failed (failure) +- `Store: chasm.ParentPtr[ActivityStore]` -- parent workflow (nil for standalone) + +#### Task Flow + +1. **Scheduling** (`handler.go:51-104`): `StartActivityExecution()` → creates Activity → applies TransitionScheduled +2. **Dispatch** (`activity_tasks.go:21-79`): `activityDispatchTaskExecutor` pushes to matching service via `AddActivityTask()` +3. **Start** (`activity.go:173-191`): `HandleStarted()` applies TransitionStarted, schedules start-to-close and heartbeat timeout tasks +4. **Completion** (`activity.go:259-280`): `HandleCompleted()` applies TransitionCompleted +5. **Failure** (`activity.go:284-323`): `HandleFailed()` checks retryability → either `tryReschedule()` or TransitionFailed +6. 
**Heartbeat** (`activity.go:559-586`): Updates LastHeartbeat, reschedules heartbeat timeout task + +#### Timeout Tasks + +- **ScheduleToStartTimeoutTask** (`activity_tasks.go:81-116`): Non-retryable → TIMED_OUT +- **ScheduleToCloseTimeoutTask** (`activity_tasks.go:118-150`): Non-retryable → TIMED_OUT +- **StartToCloseTimeoutTask** (`activity_tasks.go:152-198`): Attempts retry via `tryReschedule()`; if not retryable → TIMED_OUT +- **HeartbeatTimeoutTask** (`activity_tasks.go:200-276`): Validates heartbeat recency; attempts retry; if not retryable → TIMED_OUT + +#### Retry Logic + +- `shouldRetry()` (`activity.go:504-514`): Checks TransitionRescheduled possible, attempt < max, enough time remaining +- `hasEnoughTimeForRetry()` (`activity.go:518-534`): Exponential backoff calculation against schedule-to-close deadline +- `tryReschedule()` (`activity.go:489-502`): Applies TransitionRescheduled (increments attempt, schedules dispatch with backoff) + +#### Cancellation + +- `RequestCancelActivityExecution` (`handler.go:273-296`): Applies TransitionCancelRequested + - If SCHEDULED: immediately applies TransitionCanceled (`activity.go:414-433`) + - If STARTED: stays CANCEL_REQUESTED; worker receives cancellation on next interaction + +--- + +### 1.2 Legacy Workflow Activities + +Activities executed as part of a workflow use **mutable state (ActivityInfo) plus history events**. 
+ +#### History Events + +``` +EVENT_TYPE_ACTIVITY_TASK_SCHEDULED (10) +EVENT_TYPE_ACTIVITY_TASK_STARTED (11) +EVENT_TYPE_ACTIVITY_TASK_COMPLETED (12) +EVENT_TYPE_ACTIVITY_TASK_FAILED (13) +EVENT_TYPE_ACTIVITY_TASK_TIMED_OUT (14) +EVENT_TYPE_ACTIVITY_TASK_CANCEL_REQUESTED (15) +EVENT_TYPE_ACTIVITY_TASK_CANCELED (16) +``` + +#### ActivityInfo Mutable State (`persistence/v1/executions.proto:524-661`) + +Core: `activity_id`, `activity_type`, `task_queue`, `scheduled_time`, `started_time`, `started_event_id`, `scheduled_event_id` + +Timeouts: `schedule_to_close_timeout`, `schedule_to_start_timeout`, `start_to_close_timeout`, `heartbeat_timeout` + +Retry: `attempt`, `has_retry_policy`, `retry_initial_interval`, `retry_maximum_interval`, `retry_maximum_attempts`, `retry_backoff_coefficient`, `retry_expiration_time`, `retry_non_retryable_error_types`, `retry_last_failure` + +State flags: `cancel_requested`, `cancel_request_id`, `timer_task_status` (bit flags), `stamp`, `paused`, `pause_info` + +#### Pending Activity States (`activity.go:53-61`) + +- SCHEDULED: `StartedEventId == 0` +- STARTED: `StartedEventId != 0 && !CancelRequested` +- CANCEL_REQUESTED: `CancelRequested` +- PAUSED: `Paused && Scheduled` +- PAUSE_REQUESTED: `Paused && Started` + +#### Timer Task Status Flags + +```go +TimerTaskStatusCreatedScheduleToStart = 1 +TimerTaskStatusCreatedScheduleToClose = 2 +TimerTaskStatusCreatedStartToClose = 4 +TimerTaskStatusCreatedHeartbeat = 8 +``` + +#### Pause/Unpause/Reset (unique to legacy model) + +- **Pause** (`activity.go:254-284`): Sets `paused = true`, increments stamp if SCHEDULED +- **Unpause** (`activity.go:388-425`): Clears pause, regenerates retry task if SCHEDULED +- **Reset** (`activity.go:286-379`): Resets attempt to 1, optionally resets heartbeat/options + +#### API Handlers (`service/history/api/`) + +- `recordactivitytaskstarted/api.go`: Creates ActivityTaskStartedEvent +- `respondactivitytaskcompleted/api.go`: Creates ActivityTaskCompletedEvent +- 
`respondactivitytaskfailed/api.go`: Retry or ActivityTaskFailedEvent +- `respondactivitytaskcanceled/api.go`: Creates ActivityTaskCanceledEvent +- `recordactivitytaskheartbeat/api.go`: Updates heartbeat state, reschedules timeout + +--- + +### 1.3 Activity Metrics (Both Models) + +Defined in `common/metrics/metric_defs.go`. Both models emit the same metric names. + +**Counters:** +| Metric | Description | +|---|---| +| `activity_success` | Successful completions (excludes retries) | +| `activity_fail` | Final failures (retries exhausted) | +| `activity_task_fail` | Per-attempt failures (includes retries) | +| `activity_cancel` | Canceled activities | +| `activity_terminate` | Terminated activities (CHASM only) | +| `activity_timeout` | Terminal timeouts | +| `activity_task_timeout` | Per-timeout events (includes retries) | + +**Timers:** +| Metric | Description | +|---|---| +| `activity_start_to_close_latency` | StartedTime → completion/failure/timeout | +| `activity_schedule_to_close_latency` | ScheduleTime → completion/failure/timeout/cancel | + +**Tags:** `namespace`, `task_queue_family`, `operation`, `activity_type`, `versioning_behavior`, `workflow_type` (set to `__temporal_standalone_activity__` for CHASM). Timeout metrics additionally tagged with `timeout_type` (SCHEDULE_TO_START, SCHEDULE_TO_CLOSE, START_TO_CLOSE, HEARTBEAT). + +**Metric enrichment** (`activity.go:804-824`): `enrichMetricsHandler()` adds per-task-queue-family scoping via `metrics.GetPerTaskQueueFamilyScope()`. 
+ +--- + +### 1.4 Key Differences + +| Aspect | CHASM Standalone | Legacy Workflow | +|---|---|---| +| Persistence | Mutable state only | Mutable state + history events | +| Parent context | Standalone execution | Part of workflow execution | +| State tracking | ActivityState + sub-components | ActivityInfo in workflow | +| Task dispatch | Direct to matching service | Via workflow task completion | +| Completion storage | Outcome field | History events | +| Cancellation | Explicit CANCEL_REQUESTED state | Boolean flag in ActivityInfo | +| Pause support | Not yet implemented | Full (pause, unpause, reset) | +| Search attributes | Visibility component (chasm) | Workflow search attributes | + +--- + +## Part 2: saas-temporal Cloud Integration + +### 2.1 Architecture Overview + +saas-temporal wraps the Temporal server to run in Temporal Cloud cells by replacing core persistence with Cloud Data Storage (CDS), backed by: +- **Datastax Astra Cassandra** for durable storage +- **Write-Ahead Logs (WALs)** for durability before Cassandra persistence +- **OpenSearch/Elasticsearch** for workflow visibility +- **Tiered Storage** (S3/GCS/Azure) for history archival + +### 2.2 Entry Point and Server Construction + +**Main:** `cmd/temporal-service/main.go` + +The `start` command: +1. Loads OSS Temporal configuration from YAML +2. Injects secrets (Astra, Elasticsearch credentials) +3. Sets up dynamic configuration +4. Optionally enables cloud metrics handler (Chronicle) +5. Configures authorization (SaaS Auth0 JWT + Temporal JWT) +6. Configures custom datastore with CDS +7. Creates server via `cds.NewServer()` + +**Server creation:** `cds/export/cds/server.go`: +```go +func NewServer(serviceFxOpts FxOptions, opts ...temporal.ServerOption) (temporal.Server, error) { + return newServerFx(TopLevelModule, serviceFxOpts, opts...) 
+} +``` + +Uses Uber FX dependency injection with modules for persistence factory, dynamic config, serialization, and per-service modules (history, matching, frontend, worker). + +### 2.3 CDS Factory Architecture (`cds/export/cds/factory.go`) + +**FactoryProvider** (lines 51-65): Implements `client.AbstractDataStoreFactory` +- `NumberOfShards`, `OrderedDatastoreConfigs` (shards → datastores) +- `HistoryDatastoreConfigs` (weighted distribution) +- `WALFollowerProviders` for WAL followers +- `Clock`, `DynamicConfig`, `ChasmRegistry` + +**Factory**: Manages three WAL pools: +- **MS WAL** (MutableState): Records mutable state mutations +- **HE WAL** (HistoryEvent): Records history events +- **LP WAL** (LargePayload): Records oversized payloads + +Plus store providers: `MultiDBStoreProvider` for ordinal datastores, separate history store provider with tiered storage, optional Walker integration. + +### 2.4 Astra Cassandra Integration (`cds/storage/cassandra/astra/`) + +**Session creation** (`gocql.go`): Wraps gocql with Astra-specific config (TLS, connection pooling, retry policies) via Datastax `gocql-astra`. + +**Query instrumentation** (`gocql_metrics.go:48-100`): `queryMetricsObserver` instruments every query with a 150-entry LRU statement cache. + +**Cassandra Metrics:** +| Metric | Description | +|---|---| +| `CassandraConns` | Connection count | +| `CassandraQueryTotalLatency` | Query latency | +| `CassandraBatchTotalLatency` | Batch latency | +| `CassandraQuery` | Query count | +| `CassandraBytesTx` / `CassandraBytesRx` | Network bytes | +| `CassandraLargeResponse` / `CassandraLargeRequest` | Large payload detection | +| `CassandraRetries` | Retry histogram | +| `CassandraErrors` | Error counters | + +Tags: `OperationType` (INSERT/UPDATE/DELETE/SELECT), `TableName`, `CasTag` (CAS operation) + +### 2.5 Write-Ahead Logs (`cds/export/wal/`, `cds/stream/`) + +WALs provide durability guarantees before data reaches Cassandra. 
+ +**WAL Client Interface** (`cds/export/wal/crud.go`): +```go +WriteMS(), WriteHE(), WriteLP() // Write operations per pool +ReadMS(), ReadHE(), ReadLP() // Read operations per pool +``` + +**Configuration** (`cds/config/configs.go:46-140`): +- Rate limiting: `WALReadsRate`, `WALReadsBurst` +- Timeouts: `WALDialTimeout`, `WALReadTimeout`, `WALWriteTimeout` +- Ledger rotation: `WALLedgerRotationBytesThreshold`, `WALLedgerRotationAgeThreshold` +- Retention: `WALLedgerLifetime` +- Parallelism: `WALMaxParallelReads` +- Feature flags: `WALReadV2Enabled`, `WALV2EncodingEnabled` + +**WAL Metrics** (`cds/metrics/metrics.go:34-56`): +| Metric | Description | +|---|---| +| `wal_latency` | Operation latency | +| `wal_stream_dial_attempt/success/error` | Connection establishment | +| `wal_stream_dns_latency` | DNS resolution | +| `wal_stream_connect_latency` | TCP connect | +| `wal_stream_handshake_latency` | TLS handshake | +| `wal_stream_send/receive_latency` | I/O latency | +| `wal_health_check_failed_count` | Connection health | +| `wal_write_timeout_count` | Timeout tracking | +| `wal_reader_page_latency` | Page read latency | +| `wal_entries_per_read` | Batch size histogram | +| `wal_compression_count` | Compression events | + +**Flush Metrics** (lines 13-27): +| Metric | Description | +|---|---| +| `flush_latency` | Time to flush to persistence | +| `flush_error` | Flush failures | +| `flush_snapshot_aborts` | Snapshot abort count | +| `flush_persistence_behindness_bytes/count/time` | Persistence lag | +| `flush_time_since_last_persist` | Staleness | +| `flush_reason_count` | Flush trigger reasons (by namespace) | + +**Recovery Metrics** (lines 57-70): +| Metric | Description | +|---|---| +| `recovery_total_latency` | Full recovery duration | +| `recovery_open_reader_latency` | Snapshot reader open | +| `recovery_rate_limiter_latency` | Rate limiting delay | +| `recovery_first_read_latency/bytes` | Initial WAL read | +| `recovery_takeover_latency` | Takeover phase | +| 
`recovery_wal_update_latency` | WAL update during recovery | + +**Ledger Metrics** (lines 77-82): +| Metric | Description | +|---|---| +| `ledger_rotation_count` | Rotations | +| `logs_per_ledger` | Logs per ledger histogram | +| `segments_per_shard` | Segments per shard histogram | +| `segment_too_old_count` | GC candidates | +| `active_segment_too_old_count` | Rotation delay | + +### 2.6 Execution Store Wrapper (`cds/export/cds/execution_store.go`) + +Wraps the Cassandra execution store to: +- Convert mutable state mutations to WAL records (`NewMSWALRecord()`) +- Convert history events to WAL records (`NewHEWALRecord()`) +- Calculate storage metering +- Manage snapshot trimming +- Implement history event caching + +Implements `persistence.ExecutionStore` and `persistence.ShardStore`. + +### 2.7 How Activity State Flows Through CDS + +**CHASM activities**: Activity mutable state → MS WAL write → Cassandra persistence. No HE WAL involvement (no history events). State transitions are persisted as mutable state mutations via the execution store wrapper. + +**Legacy workflow activities**: ActivityInfo mutable state → MS WAL write → Cassandra. History events (Scheduled, Started, Completed, etc.) → HE WAL write → Cassandra. Both paths go through the execution store wrapper's WAL record conversion. + +### 2.8 OpenSearch/Elasticsearch Visibility (`visibility/`) + +**Factory:** `visibility/factory.go` -- `VisibilityStoreFactory` creates visibility stores configured per cloud cell. 
+ +**Batch processor metrics** (`visibility/common/metrics_defs.go`): +| Metric | Description | +|---|---| +| `visibility_batch_processor_request_add_latency` | Enqueue time | +| `visibility_batch_processor_request_latency` | Total request latency | +| `visibility_batch_processor_request_errors` | Failed requests | +| `visibility_batch_processor_commit_latency` | Batch commit time | +| `visibility_batch_processor_batch_size` | Items per batch histogram | +| `visibility_batch_processor_batch_requests` | Requests per batch histogram | +| `visibility_batch_processor_queued_requests` | Queue depth histogram | +| `visibility_batch_processor_corrupted_data` | Data integrity failures | +| `visibility_batch_processor_duplicate_request` | Deduplication events | + +### 2.9 Tiered Storage (`cds/persistence/tieredstorage/`) + +Long-term history archival to cloud object stores: +- S3 (AWS): `s3_store.go` +- GCS (Google Cloud): `gcs_store.go` +- Azure Blob: `azure_client.go` + +Interface: `Upload()`, `Read()`, `Delete()`, `List()`, `PluginName()` + +Metrics: `ReadWorkflowHistory`, `UploadWorkflowHistory`, `DeleteWorkflowHistory`, `ListTieredStorageObjects` + +### 2.10 Persistence Store Metrics (`cds/persistence/metrics/defs.go`) + +**Store layer** (lines 70-85): +| Metric | Description | +|---|---| +| `store_requests` | Request count by operation | +| `store_latency` | Operation latency | +| `store_errors` | Errors: shard_exists, shard_ownership_lost, condition_failed, timeout, unavailable | + +**Manager layer** (lines 89-102): +| Metric | Description | +|---|---| +| `saas_persistence_requests` | High-level request count | +| `saas_persistence_latency` | High-level latency | +| `saas_persistence_errors` | Error tracking | + +Tags: `operation` (CreateShard, UpdateShard, GetWorkflowExecution, etc.), `component`, `cass_cluster` + +### 2.11 Cloud Metrics Infrastructure + +**Handler chain** (`cloudmetricshandler/delegating_recorders.go`): +1. 
`allowlistedRecorder`: Filters through allowlist +2. `multiRecorder`: Sends to multiple backends + +**Chronicle integration** (`cloudmetricshandler/chronicle_recorder.go`): +- Enabled by `TEMPORAL_ENABLE_CLOUDMETRICSHANDLER` +- Config: `/etc/temporal/cloudmetricshandler` +- Kubernetes enrichment: pod name, namespace, labels +- Backends: S3 writer, HTTP writer (to Chronicle service) +- Batch config: 50K queue, 25K batch, 100ms flush + +**Action metering** (`actionmetering/metrics.go`): +- `billable_action_count` with tags: namespace, action_type, workflow_type, workflow_task_queue +- Activity type/task queue currently placeholder `"_unknown_"` with TODOs for standalone activity support + +### 2.12 Additional Cloud Features + +- **Authorization**: SaaS Auth0 JWT + Temporal JWT, TLS client certs +- **Quotas/Flow Control** (`quotas/`, `flowcontrol/`): Request-level and task-queue quotas +- **Multi-region replication** (`cds/service/history/replication/`): Custom replication filters +- **Metering V3**: S3/GCS/Azure bucket metering +- **SMS (etcd)**: Secondary Metadata Store for namespace/cluster metadata +- **Dynamic config**: 150+ hot-reloadable properties (`cds/config/configs.go`) +END_DOCUMENT-------------------------------------------------------------------------------------- + +START_DOCUMENT------------------------------------------------------------------------------ +# Standalone Activity COGS and margins + +@Dan Davison March 17, 2026 + +We want to ensure that we are billing in a way that meets our target margins for new product features in cloud, such as new CHASM execution types. To do this, we need to know certain things about COGS (cost of goods sold) for these features. This document outlines how to estimate COGS for Standalone Activity relative to Workflow and the implications of this for margins. 
+ +# Motivation: avoiding cannibalization + +We have rules (see [temporalio/action](https://github.com/temporalio/action)) specifying how customer operations map to billable Actions. For example, suppose a customer executes a Workflow that executes a single Activity, which succeeds on first attempt without heartbeating. This incurs 2 Actions (StartWorkflow and ScheduleActivity). We’ll call this a “Single Activity Workflow” (SAW). + +We haven’t yet decided how we will bill for Standalone Activity (SAA). But suppose that we decide that executing a single SAA (no retries, no heartbeating) is 1 Action (StartStandaloneActivity). + +If we want SAA margins to match SAW margins, then we want the COGS of SAA (no retries, no heartbeating) to be ≤ 1/2 that of SAW (because we get half as much revenue for the SAA). If it is not, then there would be some degree of cannibalization (customers switch their single-activity workloads to SAA, but our margins there are worse). We’d hope it would be offset by increased volume, but we’d still prefer SAA margins to match SAW. + +### What about retries and heartbeating? + +SAW (no retries and no heartbeating) is 2 Actions. If the activity retries once it becomes 3 Actions (ScheduleActivity now happens twice); if it heartbeats once during each attempt it becomes 5 Actions. + +Let’s assume (as we currently intend) that we apply the same billing rules to Standalone Activity retries and heartbeating. Then, as long as SAA is not worse than Workflow Activity with respect to COGS of retries and heartbeating, our margins from those customer operations will be at least as good under SAA as when they are done in the context of a pre-CHASM workflow. CHASM has been designed for efficiency; we have reason to be optimistic that it’s not *worse* than the legacy workflow activity implementation. 
+ +# Problem statement + +The above suggests that we should focus on estimating the ratio of COGS for Standalone Activity (SAA) relative to Single-activity Workflow (SAW) in the no retries, no heartbeating case: + +$$ +R = \frac{C_{SAA}}{C_{SAW}}. +$$ + +We expect $R < 1$ because SAA achieves execution of an activity with fewer RPCs, persistence operations, etc, than SAW. We are hoping that it is less than 1/2 since then our SAA margins are as good or better than our workflow margins, assuming we bill 1 Action for SAA. + +# Estimating the COGS ratio + +We’ll assume that the COGS for a SAA or SAW execution results solely from invoices from third parties relating to cloud compute resources. COGS for an execution type (SAA or SAW) is the sum of price ($p$) times quantity consumed ($q$) over all resources: + +$$ +C = \sum_{i} p_i q_i. +$$ + +We want the COGS ratio $R$. We can write that as a weighted average of per-resource usage ratios: + +$$ +R = \frac{C_{SAA}}{C_{SAW}} = \sum_i f_i r_i. +$$ + +This allows us to calculate $R$ as a function of two things that we can estimate: + +- $f_i = p_i q_{i}(SAW) / \sum_j p_j q_{j}(SAW)$ is the fraction of SAW COGS attributable to resource $i$ (“spend share”). We’ll use our current cloud spend for this. +- $r_i = q_i(SAA) / q_i(SAW)$ is the per-resource usage ratio. We will estimate these by comparing the implementations or by running experiments in cloud cells. + +The resources ($i$) potentially include: +1. Data egress +2. CPU usage +3. Memory usage +4. Persistence operations against our WALs +5. Persistence operations against Astra (to be replaced by Walker) +6. Persistence operations against OpenSearch (visibility) +7. Metrics/logs processing and storage costs, Clickhouse + +*At-rest data storage is excluded: we bill customers separately for storage on a GB/h basis, so it does not need to be subsidized by Actions. 
(Tangentially, it’s worth noting that we expect SAA storage to cost users half what they’d pay for SAW since SAW stores the input and output payloads in both workflow scheduled/complete events and activity scheduled/complete events.)* + +# Per-resource usage ratios + +To proceed, we need to estimate the SAW vs SAA usage ratio ($r_i$) for each resource. + +The following table summarizes the two implementations. It describes the simplest possible happy-path scenario: an activity that succeeds on first attempt without heartbeating, via sync matches. + +| # | Single-activity Workflow | Standalone Activity | +| --- | --- | --- | +| 1 | RPC: `StartWorkflowExecution` => HEWAL, MSWAL; Vis&; Cassandra& | RPC: `StartActivityExecution` => MSWAL; Vis&; Cassandra& | +| 2 | Task => RPC: `AddWorkflowTask` | | +| 3 | RPC: `RecordWorkflowTaskStarted` => HEWAL, MSWAL; Cassandra& | | +| 4 | RPC: `RespondWorkflowTaskCompleted` => HEWAL, MSWAL; Cassandra& | | +| 5 | Task => RPC: `AddActivityTask` | Task => RPC: `AddActivityTask` | +| 6 | RPC: `RecordActivityTaskStarted` => HEWAL, MSWAL; Cassandra& | RPC: `RecordActivityTaskStarted` => MSWAL; Cassandra& | +| 7 | RPC: `RespondActivityTaskCompleted` => HEWAL, MSWAL; Cassandra& | RPC: `RespondActivityTaskCompleted` => MSWAL; Vis&; Cassandra& | +| 8 | Task => RPC: `AddWorkflowTask` | | +| 9 | RPC: `RecordWorkflowTaskStarted` => HEWAL, MSWAL; Cassandra& | | +| 10 | RPC: `RespondWorkflowTaskCompleted` => HEWAL, MSWAL; Vis&; Cassandra& | | +- `&` indicates a write that’s not on the sync response path +- `AddWorkflowTask` and `AddActivityTask` involve inter-service RPCs but no persistence writes in the happy path (“sync match”). +- The table does not show worker poll requests +- An additional `Vis&` is incurred in both cases when the execution is deleted. 
+ +Comparing the implementations in the table gives + +$$ +r_{\text{WAL}} = \frac{3}{14} = 0.21,~~~~ +r_{\text{Cass}} = \frac{3}{7} = 0.43,~~~~ +r_{\text{Vis}} = \frac{3}{3} = 1.0.~~~~ +$$ + +These ratios count writes only. Cassandra reads are not expected to differ much between SAW and SAA since they use similar caching mechanics with the result that a high proportion of both SAW and SAA executions incur ~1 read (on execution creation). + +In addition, we can estimate data transfer costs by comparing the implementations. These are likely dominated by egress to customer infra (ingress is free on AWS and GCP; data transfers to Astra, OpenSearch, and Grafana are in-VPC or via PrivateLink). Let the activity input and output payload sizes be $S_I$ and $S_O$. Payload egress for SAW is $2S_I + 2S_O$ (input payload sent to workflow and activity workers; output payload sent to workflow worker and client). For SAA this is $S_I + S_O$ since there is no workflow worker detour. This gives + +$$ +r_\text{data\_transfer} = 0.5. +$$ + +# COGS ratio estimate + +Using approximate/preliminary cloud spend share numbers (thanks @Stephen Chan ) we have: + +| **Resource** | **Spend share $f_i$ (preliminary)** | **Usage ratio $r_i$** | **Notes** | +| --- | --- | --- | --- | +| **Astra writes** | 40% | $\frac{3}{7}$ = 0.43 | SAW does 2 additional writes for each WFT | +| **Visibility** (OpenSearch) | 20% | $\frac{3}{3}$ = 1.00 | Equal — both SAA and SAW produce exactly ~~2~~ 3 visibility updates | +| **WAL writes** | 10% | $\frac{3}{14}$ = 0.21 | Half of Astra ratio: SAA writes only to MSWAL, whereas SAW writes to both HEWAL and MSWAL | +| **EC2 compute** | 10% | ? | Would need cloud cell experiment | +| **Data transfer** | 10% | $\frac{1}{2}$ = 0.50 | SAW sends payloads via workflow worker round-trip; SAA does not | +| **Overheads** (incl. Clickhouse) | 10% | ? 
| | + +This gives the following estimate of the COGS ratio: + +$$ +\begin{align*} +R &= +\underbrace{0.4 \times 0.43}_{\text{Astra}:~0.17} + +\underbrace{0.2 \times 1.0}_{\text{Vis}:~0.20} + +\underbrace{0.1 \times 0.21}_{\text{WAL}:~0.02} + +\underbrace{0.1 \times 0.50}_{\text{Tx}:~0.05} + +0.1 \cdot r_\text{compute} + 0.1 \cdot r_\text{overhead} \\\\ +&= +0.44 + 0.1(r_\text{compute} + r_\text{overhead}). +\end{align*} +$$ + +# Sensitivity analysis + +Before thinking about the implications of this for billing and margins, the next steps are: + +1. Refine the cloud spend estimates (Cloud Capacity team; does not involve load experiments) +2. Decide whether we want to do load experiments to estimate $r_\text{compute}$ +3. Decide how we will address $r_\text{overhead}$ + +For (2) and (3) we can do some initial sensitivity analysis: + +SAW does 10 RPCs vs SAA’s 4 (with 7 vs 3 of them doing persistence writes in the sync-match case). If services are CPU-bound then this suggests that $0.4 < r_\text{compute} < 1.0$ might be reasonable. + +The other overheads include (per @Stephen Chan ) Clickhouse, observability cells, and Envoy proxies. Since these costs should also scale with RPC count, let’s assume the same bounds: $0.4 < r_\text{overhead} < 1.0$. This gives: + +$$ +0.52 \leq R \leq 0.64. +$$ + +![image.png](.task/sensitivity.png) + +For example, if SAW margins were 70%, SAA margins would be 62% - 69%. This margin reduction would affect at maximum the ~3% of workflows that are SAW. + +- COGS ratio to margins conversion formula + + $\text{margin}_{\text{SAA}} = 1 - 2R(1 - \text{margin}_{\text{SAW}})$. + + +# Discussion + +- **Visibility limits SAA margins**. Visibility is expensive (20%), but SAA and SAW perform the same number of visibility writes, so it combines a large weight with the worst possible ratio. 
+- **(Unfavorable) Over-provisioning would push $R$ up.** The usage ratios above for persistence are derived from write counts, which only translate to cost savings if capacity tracks usage. But e.g. Astra is bought in fixed hardware units (“Astra Classic”). If any resource component is over-provisioned then SAA and SAW would pay the same cost per execution and $r_i \to 1.0$, making SAA margins less attractive relative to workflow. +- **Cloud spend share**. We could attempt to separate fixed costs and renormalize (see [Next steps](https://www.notion.so/Next-steps-3268fc567738805e82ddd9c1e1d4c9d1?pvs=21)). This would be favorable to SAA margins if it decreases the visibility share, but unfavorable if it decreases Astra share. + + We’re estimating $f_i$ from cloud spend, so we’re assuming that the spend distribution for single-activity workflows would be similar to the spend distribution for the real mix of customer workflows. I suspect this is a reasonable modeling assumption since in both cases the application is performing the same state transitions in response to workflow and activity task processing. + +- **(Mixed) Effect of migration to Walker**. Walker replaces Astra with storage that is under our own control, making right-sizing easier. This may mean that the 3/7 write ratio is more fully realized under Walker, moving SAA COGS away from SAW. However, Walker will be cheaper than Astra, so persistence’s share of spend shrinks. Since persistence is where SAA has its largest advantage, this would bring SAA COGS closer to SAW. + + These two effects act in opposite directions and the net result will depend on their relative magnitudes. This suggests that we should monitor COGS calculations as the Walker migration proceeds. + +- **(Future) A visibility backend migration would improve SAA margins.** There has been [movement](https://www.notion.so/Visibility-CDS-2a98fc567738807e9ee0f318edc4c16f?pvs=21) toward replacing OpenSearch. 
As discussed above, any reduction in visibility spend share would make SAA COGS more attractive relative to workflow. + +# Conclusion + +- [We are planning to bill SAA at 1/2 the price of SAW](https://www.notion.so/PRD-Standalone-Activities-for-durable-job-processing-1ee8fc567738806d8b6fe8e2eeae0fc4?pvs=21). Although there are various assumptions involved, at this point it looks like SAA COGS will be more than 1/2 SAW COGS: the estimated range above is $0.52 \leq R \leq 0.64$. This implies that some degree of cannibalization is likely. The extent of cannibalization would be bounded by the proportion of current workloads that are SAW, which is 3% per @Phil Prasek. It may be offset by volume growth attributable to SAA. + +# Next steps + +- **Refine cloud spend share estimates.** + + The cloud spend share weights used in this analysis are supposed to be marginal costs. We could attempt to separate marginal vs fixed costs and renormalize our spend share percentages. This would be favorable to SAA margins if it decreases the visibility share, but unfavorable if it decreases Astra share. + +- **Investigate any impact of over-provisioning.** + + SAA margins may be less favorable than the calculations suggest if some resources are over-provisioned. See discussion [above](https://www.notion.so/Standalone-Activity-COGS-and-margins-3268fc567738803cb63fd9397ffd351c?pvs=21). + +- **Decide whether to do cloud cell experiments**. + + Unlike the other resource categories, we lack any obvious theoretical basis for estimating $r_\text{compute}$ and $r_\text{overhead}$. Estimating $r_\text{compute}$ via cloud cell experiments would require perhaps one engineer-week. If this were to show a value close to 0.4 then it would suggest that the upper bound on $R$ is 0.58, as opposed to the current 0.64. This would however still be subject to all the assumptions discussed above. We could also attempt to tighten our estimated bounds on $r_\text{overhead}$ via experiment. 
+ + If we decide to do this, the $r_\text{compute}$ experiment would be something like the following: choose a reference activity (e.g. sleeps for 10s, no heartbeating, never fails) and run SAA and SAW workloads on a cloud cell at a fixed start rate (e.g. 10/s) for a sustained period (e.g. 1hr). Fixing start rate rather than concurrency naturally controls for end-to-end latency differences between SAA and SAW. $r_\text{cpu}$ and $r_\text{memory}$ can then be estimated from metrics as the ratio of mean utilization above the idle baseline. The analysis will need to decide how to combine them, e.g. based on which is more often limiting; alternatively, using the larger of the two would yield a conservative calculation. +END_DOCUMENT------------------------------------------------------------------------------ + +START_DOCUMENT------------------------------------------------------------------------------ +# Test plan for SAA COGS measurement + +@Dan Davison March 19, 2026 + +The [SAA COGS proposal](.task/saa-cogs.md) made an initial estimate of the SAA/SAW COGS ratio based on estimating persistence, visibility, and data transfer usage ratios directly from the implementation. But for compute and overheads we have no analytical estimate. We plan to run an experiment to: + +1. Estimate the missing $r_\text{compute}$. +2. Validate the analytical $r_i$ against observed metrics + +For comparison, the Fairness COGS experiment docs: + +- [Test plan](https://www.notion.so/temporalio/Test-plan-for-COGS-measurement-28c8fc56773880169cdcc4087a98ceaf) +- [Fairness COGS Impact](https://www.notion.so/temporalio/Fairness-COGS-Impact-2c58fc567738808f806cfbf09b771b2c) +- [Pricing Council doc](https://www.notion.so/temporalio/WIP-Pricing-Council-Fairness-COGS-Impact-2cc8fc56773880dcb3efe435623edd9a) + + + + +# Proposed SAA experiment + + +## Workloads + +Two workloads, run sequentially on the same cell: + +1. **SAW**: execute workflow with one activity (no heartbeat, no retry). +2. 
**SAA**: execute standalone activity (no heartbeat, no retry). + +## Parameters + +**Start rate.** I think that we should fix start rate rather than concurrency, since this naturally controls for end-to-end latency differences between SAA and SAW (i.e. a cell running SAW will see higher load because the concurrency will be higher because the SAW end-to-end latency is higher). The fairness experiment used 4k tasks/s. Is starting 4k executions/s reasonable for us? + +**Activity.** Immediate successful return; no heartbeat, no retry. We could compare with a 1s sleep to see if results differ? + +**Sync match.** Do one run such that sync match should be 100%, and another tuned such that sync match is lower? Verify sync match from metrics (`syncmatch_latency`, `asyncmatch_latency`) + +**Duration and repetitions.** Steady-state load; we need long enough for stable CPU averages. The +fairness experiment used 6h per scenario but this was maybe because of their more sophisticated +sinusoidal load design? 1h more than enough for the SAA experiment? ≥2 runs per workload to check +variance/reproducibility. + +## Infrastructure + +- Anything special about test cell sizing? +- Workers should run outside the cell (how did fairness experiment do this?) + +## Metrics + +Initial dashboard content https://grafana.tmprl-internal.cloud/d/saacogs/saa-cogs: + + +- **CPU per service** (frontend, history, matching). `node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate` — a k8s recording rule over cAdvisor container metrics (defined in saas-components prometheus rules). +- **Memory per service**. `container_memory_working_set_bytes` — also k8s/cAdvisor (defined in saas-components alert rules). +- **RPC rate by method**, one panel per service (frontend, history, matching). `service_requests` counter ([temporal:common/metrics/metric_defs.go:615](https://github.com/temporalio/temporal/blob/main/common/metrics/metric_defs.go)), tagged with `operation` (the RPC method name). 
Recorded by a gRPC server-side interceptor ([telemetry.go:177](https://github.com/temporalio/temporal/blob/main/common/rpc/interceptor/telemetry.go)), so it captures inter-service RPCs (e.g. history→matching `AddActivityTask`). +- **Astra writes by table**. `cassandra_query` counter with `verb!="select"`, plus `cassandra_batch` counter, both broken down by `table`. Tags include `operation`, `table`, `verb`, `cas` ([saas-temporal:cds/metrics/metrics.go:233,238](https://github.com/temporalio/saas-temporal/blob/main/cds/metrics/metrics.go)). +- **Astra reads by table**. `cassandra_query` with `verb="select"`, broken down by `table`. +- **WAL operation rate by type**. `wal_latency_count` ([saas-temporal:cds/metrics/metrics.go:35](https://github.com/temporalio/saas-temporal/blob/main/cds/metrics/metrics.go)) broken down by `walType` label (values: `MUTABLE_STATE_WAL`, `HISTORY_EVENT_WAL`, `LARGE_PAYLOAD_WAL` — see [saas-temporal:cds/common/tag/tag.go:11-24](https://github.com/temporalio/saas-temporal/blob/main/cds/common/tag/tag.go)). Note: this metric covers both reads and writes; there is no separate write-only WAL metric. This is arguably more relevant to COGS since WAL reads also cost something. +- **Visibility persistence rate by operation**. `visibility_persistence_requests` counter ([temporal:common/metrics/metric_defs.go:1398](https://github.com/temporalio/temporal/blob/main/common/metrics/metric_defs.go)), tagged with `operation` (values include `RecordWorkflowExecutionStarted`, `RecordWorkflowExecutionClosed`, `UpsertWorkflowExecution`, `DeleteWorkflowExecution` — see [visiblity_manager_metrics.go](https://github.com/temporalio/temporal/blob/main/common/persistence/visibility/visiblity_manager_metrics.go)). +- **Sync vs async match rate**. `syncmatch_latency_count` and `asyncmatch_latency_count` ([temporal:common/metrics/metric_defs.go:1119-1120](https://github.com/temporalio/temporal/blob/main/common/metrics/metric_defs.go)). 
+ + +## Load generator (omes) + +- Add a new scenario that starts standalone activities directly from the load generator, not from within a workflow. +- Build the omes Go worker Docker image and deploy it as a pod on k8s, configured to poll the test cell. Do we have implementation we can borrow from the fairness experiment? + + + + +
+Appendix: Comparison with fairness experiment (see commits by David Reiss) + +| | Fairness | SAA | +|---|---|---| +| **Treatments** | Same workload, two matcher modes | Two execution types (SAW vs SAA) | +| **Quantity computed** | $\Delta C / C$ | Ratio $r_i = q_i(\text{SAA}) / q_i(\text{SAW})$ | +| **Load shape** | Sinusoidal backlog (exercises matcher) | Steady-state at fixed start rate (our model assumes sync match) | +| **What is measured** | CPU per service, Astra operation rates | CPU per service, memory per service, Astra operation rates by table and verb, WAL write rates, visibility write rates, RPC handling rates per service per method | +| **Predictions to validate** | None — purely empirical | $r_\text{Cass} = 3/7$, $r_\text{WAL} = 3/14$, $r_\text{Vis} = 3/3$, per-method RPC rates matching proposal table | + +Fixed start rate (not fixed task throughput) because SAA and SAW generate different numbers of tasks per execution. + +**Question**: what is the incremental COGS of enabling the fairness matcher vs the classic matcher? + +**COGS components**: (1) Astra queries (~35% of total COGS), (2) EC2 compute (~9%, split across frontend+matching and history). Ignored: data transfer, Astra storage, non-AWS costs (Clickhouse <3%). + +**Setup**: dedicated test cell `s-oss-dnr-faircogs3` (64 partitions). Load generator: Omes Ebb and Flow — sinusoidal activity task backlog. 5 scenarios (classic, fairness with 0/1k/100k keys, priority), each 6 hours. Measured via [dedicated Grafana dashboard](https://grafana.tmprl-internal.cloud/d/df6pldpkiy1vka/faircogs). + +**Results**: Astra showed no significant increase. CPU increased up to 23% (frontend) and 36% (history) in the worst case (1k fairness keys). COGS impact: $(0.035 \times 0.23) + (0.057 \times 0.36) = 2.8\%$. Pricing council recommendation: price fairness on value to customer, not COGS. + + + + + +
+ +
+Appendix: possible experimental outcomes + +- **Analytical predictions confirmed, $R$ in predicted range.** Observed $r_\text{Cass}$, $r_\text{WAL}$, $r_\text{Vis}$, and per-method RPC rates match the analytical derivations. $r_\text{compute}$ lands in $[0.4, 1.0]$, giving $R$ in roughly $0.52$–$0.64$. We present $R$ with a tighter confidence interval than the proposal (because $r_\text{compute}$ is now estimated, not bounded). +- **$r_\text{compute}$ is low, pushing $R$ toward 0.5.** If $r_\text{compute} \approx 0.4$ and analytical predictions hold, $R \approx 0.52$. Cannibalization is near-zero. +- **Observed $r_i$ diverge from analytical predictions.** Some assumption is wrong (e.g. sync match doesn't hold at test load, or there are unaccounted persistence writes). We recompute $R$ using observed values and identify which assumption failed and whether it reflects production conditions or a test artifact. +- **$R$ is higher than predicted.** $R > 0.64$ would mean worse cannibalization than estimated. Options: accept the margin reduction (bounded by ~3% SAW share), adjust billing, or identify engineering work to reduce SAA COGS. + +
+ +END_DOCUMENT------------------------------------------------------------------------------ + + +Your task is to help me design and build the omes-based tooling that we will use to perform the experiments outlined above to learn about COGS of SAA and SAW. We are in the omes repo; study it carefully. Our work will broadly break into the following phases that we must design holistically: + +(1) Add any missing omes functionality that will be needed in order to be able to use omes to generate the SAA and SAW load for the experiments. +(2) Run the experiments against the cloud cell that Stephen has prepared: its name is s-saa-cogs. + +I am not familiar with performing operations against cloud cells, so you will need to research and help me during this. But we have several good resources: study the contents of the 'oncall' and 'runbooks' repos, and also use the /agent-slack skill. You also have Notion and Temporal Docs MCP. Use the more modern 'ct' rather than its alias 'omni'. + +Initial grafana dashboard JSON is at .task/saacogs.json. + +Important: I'd like an early aim to be to get an end-to-end proof-of-principle of this working. Therefore let's not make the omes component sophisticated initially; just the bare minimum to run an SAW and SAA workload. But I am a bit intimidated by doing anything with the cloud cell since I don't know how. So I guess one early aim is to be able to point our metrics dashboard at s-saa-cogs, and see idle state, then run one of our omes commands, and see activity increase in the dashboard. Please maintain a file of useful shell commands with terse comments where necessary. I will run them and show you the output. Don't do operations against cloud or observability yourself unless I explicitly ask you to. 
\ No newline at end of file diff --git a/AGENTS.md b/AGENTS.md new file mode 120000 index 00000000..81744092 --- /dev/null +++ b/AGENTS.md @@ -0,0 +1 @@ +.task/AGENTS.md \ No newline at end of file diff --git a/CLAUDE.md b/CLAUDE.md new file mode 120000 index 00000000..81744092 --- /dev/null +++ b/CLAUDE.md @@ -0,0 +1 @@ +.task/AGENTS.md \ No newline at end of file From 0e752410cfd58ff18c9d076d5de60cbb83d0343d Mon Sep 17 00:00:00 2001 From: Dan Davison Date: Sat, 21 Mar 2026 09:57:14 -0400 Subject: [PATCH 02/40] neo(task): Project description (from /Users/dan/worktrees/omes/saa-cogs/omes/.task/ --- .task/commands.sh | 68 ++++++++++++++++++++ .task/research.md | 159 ++++++++++++++++++++++++++++++++++++++++++++++ .task/task.md | 2 +- 3 files changed, 228 insertions(+), 1 deletion(-) create mode 100644 .task/commands.sh create mode 100644 .task/research.md diff --git a/.task/commands.sh b/.task/commands.sh new file mode 100644 index 00000000..8829d79c --- /dev/null +++ b/.task/commands.sh @@ -0,0 +1,68 @@ +#!/bin/bash +# SAA COGS experiment: useful commands +# Cell: s-saa-cogs + +CELL="s-saa-cogs" +# Namespace TBD — confirm with Stephen. Likely one of: +# ${CELL}-marathon.e2e +# or a custom namespace +NS="${CELL}-marathon.e2e" +HOST="${NS}.tmprl-test.cloud:7233" + +# ── Local testing ──────────────────────────────────────────────────────────── +# Start local dev server (with standalone activity support) +temporal server start-dev --headless --log-level warn + +# SAW: single-activity workflow (baseline) +go run ./cmd run-scenario-with-worker \ + --scenario saa_cogs_saw --language go \ + --iterations 10 --max-concurrent 5 --run-id saw-local-1 + +# SAA: standalone activity (no workflow) +go run ./cmd run-scenario-with-worker \ + --scenario saa_cogs_saa --language go \ + --iterations 10 --max-concurrent 5 --run-id saa-local-1 + +# Rate-limited run (e.g. 
10 executions/s for 5 minutes) +go run ./cmd run-scenario-with-worker \ + --scenario saa_cogs_saw --language go \ + --duration 5m --max-iterations-per-second 10 --max-concurrent 100 --run-id saw-rate-1 + +# ── Cloud cell operations ──────────────────────────────────────────────────── +# Check cell pods +ct kubectl --context $CELL get pods -n temporal + +# Check namespace +omni admintools --context $CELL -- temporal operator namespace describe $NS + +# ── Running against cloud cell ─────────────────────────────────────────────── +# Requires TLS certs or API key. Two options: + +# Option A: mTLS +# TLS_CERT=path/to/cert.pem +# TLS_KEY=path/to/key.pem +# go run ./cmd run-scenario \ +# --scenario saa_cogs_saw \ +# --server-address $HOST --namespace $NS \ +# --tls --tls-cert-path $TLS_CERT --tls-key-path $TLS_KEY \ +# --do-not-register-search-attributes \ +# --iterations 1 --run-id saw-cloud-1 + +# Option B: API key +# go run ./cmd run-scenario \ +# --scenario saa_cogs_saw \ +# --server-address $HOST --namespace $NS \ +# --tls --auth-header "Bearer $API_KEY" \ +# --do-not-register-search-attributes \ +# --iterations 1 --run-id saw-cloud-1 + +# Worker (separate terminal, same auth flags) +# go run ./cmd run-worker \ +# --scenario saa_cogs_saw --language go \ +# --server-address $HOST --namespace $NS \ +# --tls --tls-cert-path $TLS_CERT --tls-key-path $TLS_KEY \ +# --run-id saw-cloud-1 + +# ── Grafana ────────────────────────────────────────────────────────────────── +# Dashboard: https://grafana.tmprl-internal.cloud/d/saacogs/saa-cogs +# Set cluster variable to: s-saa-cogs diff --git a/.task/research.md b/.task/research.md new file mode 100644 index 00000000..cab82543 --- /dev/null +++ b/.task/research.md @@ -0,0 +1,159 @@ +# SAA COGS Experiment: Research & Design + +## 1. Current State of Omes + +### Architecture +Omes is a load generation framework for Temporal. Scenarios are Go files in `scenarios/` that +register via `init()` → `loadgen.MustRegisterScenario()`. 
The scenario name comes from the +filename. Execution flows: + +1. `run-scenario` command: dials Temporal, runs scenario executor +2. `run-worker` command: starts a worker (Go/Python/etc) polling a task queue +3. `run-scenario-with-worker`: runs both together (local development) + +### Executor Types +- `GenericExecutor`: takes a `func(ctx, *Run) error` — most flexible +- `KitchenSinkExecutor`: wraps `GenericExecutor`, starts kitchen-sink workflows with configurable action sequences +- `FuzzExecutor`: random action generation + +### Existing Standalone Activity Support +Branch `standalone-activity` (commit `efbbb7f`) adds SAA to the `throughput_stress` scenario as +an *optional extra activity within a workflow*. The implementation: + +1. Proto: `StandaloneActivity` message in `kitchen_sink.proto` +2. Helper: `StandaloneActivity()` in `loadgen/kitchensink/helpers.go` creates an action +3. Worker: `ExecuteStandaloneActivity()` in `workers/go/kitchensink/kitchen_sink.go` — called as a + *workflow activity* that internally calls `StartActivityExecution` + `PollActivityExecution` +4. Scenario: enabled via `--option enable-standalone-activity=true` + +**Critical observation**: This existing support executes SAA *from within a workflow activity*. +That is useful for testing SAA functionality but **not** for the COGS experiment. For COGS, we need +to run SAA directly from the load generator (no workflow involved) so that the only server-side +work is the standalone activity execution itself. + +## 2. 
What We Need for the COGS Experiment + +### Two New Scenarios + +**`saa_cogs_saw`** — Single Activity Workflow (the baseline): +- Each iteration: start a workflow that executes one activity (payload: 256B in, 256B out), then completes +- This is very close to `workflow_with_single_noop_activity` but with a payload activity + +**`saa_cogs_saa`** — Standalone Activity: +- Each iteration: call `StartActivityExecution` directly from the load generator, then + `PollActivityExecution` to wait for the result +- No workflow involved +- Same activity (payload: 256B in, 256B out) and task queue +- **Requires a `GenericExecutor`** since `KitchenSinkExecutor` always starts workflows + +Both scenarios must use the same worker (the Go worker with `payload` activity registered). + +### Key Design Decisions + +1. **Activity type**: `payload` with 256B input, 256B output (matching the COGS analysis) +2. **No heartbeat, no retry** (matching the COGS analysis; retry max_attempts=1) +3. **Fixed start rate** (not fixed concurrency) — controls for latency differences +4. **Same task queue** for both scenarios — ensures same worker setup +5. **Sync match preferred** — the COGS analysis assumes sync match; verify via metrics + +### SAA Load Generator Implementation + +The SAA scenario needs to call gRPC APIs directly. Looking at the existing +`ExecuteStandaloneActivity` in the worker code (`workers/go/kitchensink/kitchen_sink.go:46-120`), +we have a working reference. The scenario version should: + +1. Use `client.WorkflowService()` to get the gRPC client +2. Call `StartActivityExecution` with the activity config +3. Call `PollActivityExecution` to wait for completion +4. This is a `GenericExecutor` with a custom `Execute` function + +## 3. Cloud Cell Operations + +### Connecting to a Cloud Cell + +From `bench-go.mdx`, the namespace format for test cells is `{cellId}-marathon.e2e` and the host +is `{cellId}-marathon.e2e.tmprl-test.cloud:7233`. 
For our cell `s-saa-cogs`: +- Namespace: `s-saa-cogs-marathon.e2e` (to be confirmed — Stephen may have set up differently) +- Host: `s-saa-cogs-marathon.e2e.tmprl-test.cloud:7233` + +Omes connects via: +``` +--server-address --namespace --tls --tls-cert-path --tls-key-path +``` + +Or with API key auth: +``` +--server-address --namespace --tls --auth-header "Bearer " +``` + +### Running omes against a cloud cell + +Two options: +1. **Local**: Run `go run ./cmd run-scenario` and `go run ./cmd run-worker` locally, connecting to + the cloud cell via TLS. Simplest for proof-of-concept. Higher latency (network round trip to + cloud) but the load generator itself isn't on the critical path for COGS measurement. +2. **K8s pod**: Deploy omes worker as a pod on the cell's k8s cluster. Lower latency, more + realistic. The bench-go runbook shows this is the standard approach. Uses `omni scaffold` with + `--benchgo-enabled` or manual deployment. + +For initial proof-of-concept: run locally. For the actual experiment: deploy to k8s. + +### Grafana Dashboard + +The dashboard at `https://grafana.tmprl-internal.cloud/d/saacogs/saa-cogs` uses a `$cluster` +variable. Set `cluster=s-saa-cogs` to point at our cell. + +### Cell Setup Verification + +Use `ct` / `omni` to verify cell state: +```sh +# Check cell status +ct kubectl --context s-saa-cogs get pods -n temporal + +# Check namespace exists +omni admintools --context s-saa-cogs -- temporal operator namespace describe s-saa-cogs-marathon.e2e +``` + +### Search Attributes + +Cloud cells cannot register search attributes via the SDK — they must be registered via the +control plane. The `--do-not-register-search-attributes` flag exists for this. We should use it, +and register `OmesExecutionID` separately if needed. For the simple COGS scenarios, we may not +even need search attributes. + +## 4. Implementation Plan + +### Phase 1: Minimal Scenarios (omes code changes) + +1. 
Create `scenarios/saa_cogs_saw.go` — SAW scenario using `KitchenSinkExecutor` +2. Create `scenarios/saa_cogs_saa.go` — SAA scenario using `GenericExecutor` with direct gRPC calls +3. Both share config: payload size, start rate, duration + +### Phase 2: Local Proof-of-Concept + +1. Test both scenarios against local Temporal server +2. Run `go run ./cmd run-scenario-with-worker` for SAW +3. For SAA: run worker separately, then scenario (since SAA doesn't use workflows but the + worker still needs to poll for activity tasks) + +### Phase 3: Cloud Cell Connection + +1. Obtain credentials for s-saa-cogs cell +2. Verify dashboard shows idle state +3. Run a single SAW iteration and observe metrics +4. Run a single SAA iteration and observe metrics + +### Phase 4: Full Experiment + +1. Deploy omes worker to cloud cell k8s +2. Run SAW at target start rate for target duration +3. Wait for cool-down, collect metrics +4. Run SAA at same start rate for same duration +5. Collect and compare metrics + +## 5. Open Questions + +- What namespace(s) are configured on s-saa-cogs? +- How do we obtain TLS certs or API keys for the cell? (Check oncall or runbooks repos or search slack) +- Does the cell have CHASM standalone activities enabled? (Dynamic config flag) +- Worker deployment: should we use the existing bench-go infrastructure or deploy omes directly? diff --git a/.task/task.md b/.task/task.md index 3e9d1e26..0e9dc39d 100644 --- a/.task/task.md +++ b/.task/task.md @@ -720,4 +720,4 @@ I am not familiar with performing operations against cloud cells, so you will ne Initial grafana dashboard JSON is at .task/saacogs.json. -Important: I'd like an early aim to be to get an end-to-end proof-of-principle of this working. Therefore let's not make the omes component sophisticated initially; just the bare minimum to run an SAW and SAA workload. But I am a bit intimidated by doing anything with the cloud cell since I don't know how. 
So I guess one early aim is to be able to point our metrics dashboard at s-saa-cogs, and see idle state, then run one of our omes commands, and see activity increase iin the dashboard. Please maintain a file of useful shell commands with terse comments where necessary. I will run them and show you the outut. Don't do operations against cloud or observability yourself unless I explicitly ask you to. \ No newline at end of file +Important: I'd like an early aim to be to get an end-to-end proof-of-principle of this working. Therefore let's not make the omes component sophisticated initially; just the bare minimum to run an SAW and SAA workload. But I am a bit intimidated by doing anything with the cloud cell since I don't know how. So I guess one early aim is to be able to point our metrics dashboard at s-saa-cogs, and see idle state, then run one of our omes commands, and see activity increase in the dashboard. Please maintain a file of useful shell commands with terse comments where necessary. I will run them and show you the outut. Don't do operations against cloud or observability yourself unless I explicitly ask you to. \ No newline at end of file From 8889ed6a878107dfbcc7d497f8ec427070630ab3 Mon Sep 17 00:00:00 2001 From: Dan Davison Date: Sat, 21 Mar 2026 10:03:51 -0400 Subject: [PATCH 03/40] Specify SDK version --- .task/task.md | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/.task/task.md b/.task/task.md index 0e9dc39d..b9c71259 100644 --- a/.task/task.md +++ b/.task/task.md @@ -720,4 +720,6 @@ I am not familiar with performing operations against cloud cells, so you will ne Initial grafana dashboard JSON is at .task/saacogs.json. -Important: I'd like an early aim to be to get an end-to-end proof-of-principle of this working. Therefore let's not make the omes component sophisticated initially; just the bare minimum to run an SAW and SAA workload. But I am a bit intimidated by doing anything with the cloud cell since I don't know how. 
So I guess one early aim is to be able to point our metrics dashboard at s-saa-cogs, and see idle state, then run one of our omes commands, and see activity increase in the dashboard. Please maintain a file of useful shell commands with terse comments where necessary. I will run them and show you the outut. Don't do operations against cloud or observability yourself unless I explicitly ask you to. \ No newline at end of file +Important: I'd like an early aim to be to get an end-to-end proof-of-principle of this working. Therefore let's not make the omes component sophisticated initially; just the bare minimum to run an SAW and SAA workload. But I am a bit intimidated by doing anything with the cloud cell since I don't know how. So I guess one early aim is to be able to point our metrics dashboard at s-saa-cogs, and see idle state, then run one of our omes commands, and see activity increase in the dashboard. Please maintain a file of useful shell commands with terse comments where necessary. I will run them and show you the output. Don't do operations against cloud or observability yourself unless I explicitly ask you to. + +In the omes work, we must use the latest version of the SDK with Standalone Activity support, such that our code is consistent with what Temporal Docs (use MCP) and the samples-go repo show. 
\ No newline at end of file From 93c14ba789dfefd63f194a96e40908acd2c3d1fa Mon Sep 17 00:00:00 2001 From: Dan Davison Date: Sat, 21 Mar 2026 10:15:11 -0400 Subject: [PATCH 04/40] neo: Project description (from /Users/dan/worktrees/omes/saa-cogs/omes/.task/ --- commands.sh | 51 ++++++++++++++++++++++++++++++++++ scenarios/saa_cogs_saa.go | 38 +++++++++++++++++++++++++ scenarios/saa_cogs_saw.go | 58 +++++++++++++++++++++++++++++++++++++++ 3 files changed, 147 insertions(+) create mode 100644 commands.sh create mode 100644 scenarios/saa_cogs_saa.go create mode 100644 scenarios/saa_cogs_saw.go diff --git a/commands.sh b/commands.sh new file mode 100644 index 00000000..7951ce07 --- /dev/null +++ b/commands.sh @@ -0,0 +1,51 @@ +#!/usr/bin/env bash +# SAA COGS experiment — useful commands +# See .task/plan.md for context. + +## --- Local testing (against embedded dev server) --- + +# SAW: 5 iterations +go run ./cmd run-scenario-with-worker --scenario saa_cogs_saw --language go --iterations 5 + +# SAA: 5 iterations +go run ./cmd run-scenario-with-worker --scenario saa_cogs_saa --language go --iterations 5 + +# SAW: sustained 60s at 10 starts/s +go run ./cmd run-scenario-with-worker --scenario saa_cogs_saw --language go \ + --duration 60s --max-iterations-per-second 10 --max-concurrent 100 + +# SAA: sustained 60s at 10 starts/s +go run ./cmd run-scenario-with-worker --scenario saa_cogs_saa --language go \ + --duration 60s --max-iterations-per-second 10 --max-concurrent 100 + +## --- Cloud cell: s-saa-cogs --- + +CELL=s-saa-cogs +NS=${CELL}-marathon.e2e +HOST=${NS}.tmprl-test.cloud:7233 + +# Verify cell is alive +ct kubectl --context $CELL get pods -n temporal + +# Check namespace +omni admintools --context $CELL -- temporal operator namespace describe $NS + +# Run worker against cloud cell (in one terminal) +go run ./cmd run-worker --language go --run-id saa-cogs-test \ + --server-address $HOST --namespace $NS --tls \ + --tls-cert-path /tmp/saa-cogs-cert.pem --tls-key-path 
/tmp/saa-cogs-key.pem + +# Run SAW scenario against cloud cell (in another terminal) +go run ./cmd run-scenario --scenario saa_cogs_saw --run-id saa-cogs-test \ + --server-address $HOST --namespace $NS --tls \ + --tls-cert-path /tmp/saa-cogs-cert.pem --tls-key-path /tmp/saa-cogs-key.pem \ + --iterations 5 --do-not-register-search-attributes + +# Run SAA scenario against cloud cell +go run ./cmd run-scenario --scenario saa_cogs_saa --run-id saa-cogs-test \ + --server-address $HOST --namespace $NS --tls \ + --tls-cert-path /tmp/saa-cogs-cert.pem --tls-key-path /tmp/saa-cogs-key.pem \ + --iterations 5 --do-not-register-search-attributes + +# Grafana dashboard +# https://grafana.tmprl-internal.cloud/d/saacogs/saa-cogs?var-cluster=s-saa-cogs diff --git a/scenarios/saa_cogs_saa.go b/scenarios/saa_cogs_saa.go new file mode 100644 index 00000000..1ec05a7b --- /dev/null +++ b/scenarios/saa_cogs_saa.go @@ -0,0 +1,38 @@ +package scenarios + +import ( + "context" + "fmt" + "time" + + "go.temporal.io/sdk/client" + "go.temporal.io/sdk/temporal" + + "github.com/temporalio/omes/loadgen" +) + +func init() { + loadgen.MustRegisterScenario(loadgen.Scenario{ + Description: "SAA for COGS: standalone activity with payload, no workflow.", + ExecutorFn: func() loadgen.Executor { + return &loadgen.GenericExecutor{ + Execute: executeSAA, + } + }, + }) +} + +func executeSAA(ctx context.Context, run *loadgen.Run) error { + inputData := make([]byte, 256) + handle, err := run.Client.ExecuteActivity(ctx, client.StartActivityOptions{ + ID: fmt.Sprintf("a-%s-%s-%d", run.RunID, run.ExecutionID, run.Iteration), + TaskQueue: run.TaskQueue(), + ScheduleToCloseTimeout: 60 * time.Second, + RetryPolicy: &temporal.RetryPolicy{MaximumAttempts: 1}, + }, "payload", inputData, int32(256)) + if err != nil { + return fmt.Errorf("failed to start standalone activity: %w", err) + } + var result []byte + return handle.Get(ctx, &result) +} diff --git a/scenarios/saa_cogs_saw.go b/scenarios/saa_cogs_saw.go new file 
mode 100644 index 00000000..ef4e92d8 --- /dev/null +++ b/scenarios/saa_cogs_saw.go @@ -0,0 +1,58 @@ +package scenarios + +import ( + "time" + + "go.temporal.io/api/common/v1" + "google.golang.org/protobuf/types/known/durationpb" + + "github.com/temporalio/omes/loadgen" + "github.com/temporalio/omes/loadgen/kitchensink" +) + +func init() { + loadgen.MustRegisterScenario(loadgen.Scenario{ + Description: "SAW baseline for COGS: single workflow executing one payload activity.", + ExecutorFn: func() loadgen.Executor { + return loadgen.KitchenSinkExecutor{ + TestInput: &kitchensink.TestInput{ + WorkflowInput: &kitchensink.WorkflowInput{ + InitialActions: []*kitchensink.ActionSet{ + saaCogsSAWActionSet(), + }, + }, + }, + } + }, + }) +} + +func saaCogsSAWActionSet() *kitchensink.ActionSet { + return &kitchensink.ActionSet{ + Actions: []*kitchensink.Action{ + { + Variant: &kitchensink.Action_ExecActivity{ + ExecActivity: &kitchensink.ExecuteActivityAction{ + ActivityType: &kitchensink.ExecuteActivityAction_Payload{ + Payload: &kitchensink.ExecuteActivityAction_PayloadActivity{ + BytesToReceive: 256, + BytesToReturn: 256, + }, + }, + ScheduleToCloseTimeout: durationpb.New(60 * time.Second), + RetryPolicy: &common.RetryPolicy{ + MaximumAttempts: 1, + }, + }, + }, + }, + { + Variant: &kitchensink.Action_ReturnResult{ + ReturnResult: &kitchensink.ReturnResultAction{ + ReturnThis: &common.Payload{}, + }, + }, + }, + }, + } +} From e75708189dbfe061d404f43ea45b8d1e6dc1b8b5 Mon Sep 17 00:00:00 2001 From: Dan Davison Date: Sat, 21 Mar 2026 10:15:11 -0400 Subject: [PATCH 05/40] neo(task): Project description (from /Users/dan/worktrees/omes/saa-cogs/omes/.task/ --- .task/commands.sh | 68 ----------------------- .task/plan.md | 135 ++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 135 insertions(+), 68 deletions(-) delete mode 100644 .task/commands.sh create mode 100644 .task/plan.md diff --git a/.task/commands.sh b/.task/commands.sh deleted file mode 100644 index 
8829d79c..00000000 --- a/.task/commands.sh +++ /dev/null @@ -1,68 +0,0 @@ -#!/bin/bash -# SAA COGS experiment: useful commands -# Cell: s-saa-cogs - -CELL="s-saa-cogs" -# Namespace TBD — confirm with Stephen. Likely one of: -# ${CELL}-marathon.e2e -# or a custom namespace -NS="${CELL}-marathon.e2e" -HOST="${NS}.tmprl-test.cloud:7233" - -# ── Local testing ──────────────────────────────────────────────────────────── -# Start local dev server (with standalone activity support) -temporal server start-dev --headless --log-level warn - -# SAW: single-activity workflow (baseline) -go run ./cmd run-scenario-with-worker \ - --scenario saa_cogs_saw --language go \ - --iterations 10 --max-concurrent 5 --run-id saw-local-1 - -# SAA: standalone activity (no workflow) -go run ./cmd run-scenario-with-worker \ - --scenario saa_cogs_saa --language go \ - --iterations 10 --max-concurrent 5 --run-id saa-local-1 - -# Rate-limited run (e.g. 10 executions/s for 5 minutes) -go run ./cmd run-scenario-with-worker \ - --scenario saa_cogs_saw --language go \ - --duration 5m --max-iterations-per-second 10 --max-concurrent 100 --run-id saw-rate-1 - -# ── Cloud cell operations ──────────────────────────────────────────────────── -# Check cell pods -ct kubectl --context $CELL get pods -n temporal - -# Check namespace -omni admintools --context $CELL -- temporal operator namespace describe $NS - -# ── Running against cloud cell ─────────────────────────────────────────────── -# Requires TLS certs or API key. 
Two options: - -# Option A: mTLS -# TLS_CERT=path/to/cert.pem -# TLS_KEY=path/to/key.pem -# go run ./cmd run-scenario \ -# --scenario saa_cogs_saw \ -# --server-address $HOST --namespace $NS \ -# --tls --tls-cert-path $TLS_CERT --tls-key-path $TLS_KEY \ -# --do-not-register-search-attributes \ -# --iterations 1 --run-id saw-cloud-1 - -# Option B: API key -# go run ./cmd run-scenario \ -# --scenario saa_cogs_saw \ -# --server-address $HOST --namespace $NS \ -# --tls --auth-header "Bearer $API_KEY" \ -# --do-not-register-search-attributes \ -# --iterations 1 --run-id saw-cloud-1 - -# Worker (separate terminal, same auth flags) -# go run ./cmd run-worker \ -# --scenario saa_cogs_saw --language go \ -# --server-address $HOST --namespace $NS \ -# --tls --tls-cert-path $TLS_CERT --tls-key-path $TLS_KEY \ -# --run-id saw-cloud-1 - -# ── Grafana ────────────────────────────────────────────────────────────────── -# Dashboard: https://grafana.tmprl-internal.cloud/d/saacogs/saa-cogs -# Set cluster variable to: s-saa-cogs diff --git a/.task/plan.md b/.task/plan.md new file mode 100644 index 00000000..7b5775a6 --- /dev/null +++ b/.task/plan.md @@ -0,0 +1,135 @@ +# Implementation Plan: SAA COGS Load Generation + +## Goal + +Create two omes scenarios to generate SAW and SAA workloads against cloud cell `s-saa-cogs`, then +observe metrics on the Grafana dashboard. + +## Design + +### Scenarios + +**`saa_cogs_saw`** — Single Activity Workflow baseline. Uses `KitchenSinkExecutor` with a single +payload activity (256B in, 256B out), no retry, no heartbeat. Very close to +`workflow_with_single_noop_activity` but with payload instead of noop. + +**`saa_cogs_saa`** — Standalone Activity. Uses `GenericExecutor`. Each iteration calls +`client.ExecuteActivity()` (the SDK's standalone activity API) with the same payload activity, then +`handle.Get()` to wait for the result. No workflow involved. 
+ +Both use the same task queue (derived from run-id) and the same Go worker (which already registers +the `payload` activity). + +### Why GenericExecutor for SAA + +`KitchenSinkExecutor` always starts a kitchen-sink workflow. The SAA scenario must call +`client.ExecuteActivity` directly — no workflow. `GenericExecutor` gives us the `Execute` function +hook, plus all the concurrency/rate-limiting/duration infrastructure. + +### SDK version + +The current `go.temporal.io/sdk v1.40.0` already includes `client.ExecuteActivity` (added in +v1.40.0, commit `215920a6`). No upgrade needed. + +### Activity configuration + +Both scenarios use the `payload` activity type (already registered in the Go worker as `"payload"`). +Arguments: `inputData []byte` (256 bytes), `bytesToReturn int32` (256). No heartbeat. Retry policy +`MaximumAttempts: 1` (no retries). `ScheduleToCloseTimeout: 60s`. + +## Implementation + +### Step 1: Create `scenarios/saa_cogs_saw.go` + +```go +package scenarios + +func init() { + loadgen.MustRegisterScenario(loadgen.Scenario{ + Description: "SAW baseline for COGS: single workflow executing one payload activity.", + ExecutorFn: func() loadgen.Executor { + return loadgen.KitchenSinkExecutor{ + TestInput: &kitchensink.TestInput{ + WorkflowInput: &kitchensink.WorkflowInput{ + InitialActions: []*kitchensink.ActionSet{ + payloadActivityActionSet(), + }, + }, + }, + } + }, + }) +} +``` + +Where `payloadActivityActionSet()` creates a `PayloadActivity(256, 256, ...)` plus +`ReturnResultAction`, with `MaximumAttempts: 1`, `ScheduleToCloseTimeout: 60s`. 
+ +### Step 2: Create `scenarios/saa_cogs_saa.go` + +```go +package scenarios + +func init() { + loadgen.MustRegisterScenario(loadgen.Scenario{ + Description: "SAA for COGS: standalone activity with payload, no workflow.", + ExecutorFn: func() loadgen.Executor { + return &loadgen.GenericExecutor{ + Execute: executeSAA, + } + }, + }) +} + +func executeSAA(ctx context.Context, run *loadgen.Run) error { + inputData := make([]byte, 256) + handle, err := run.Client.ExecuteActivity(ctx, client.StartActivityOptions{ + ID: fmt.Sprintf("a-%s-%s-%d", run.RunID, run.ExecutionID, run.Iteration), + TaskQueue: run.TaskQueue(), + ScheduleToCloseTimeout: 60 * time.Second, + RetryPolicy: &temporal.RetryPolicy{MaximumAttempts: 1}, + }, "payload", inputData, int32(256)) + if err != nil { + return err + } + var result []byte + return handle.Get(ctx, &result) +} +``` + +This calls `"payload"` by name (string) so the SDK dispatches it to the worker which has it +registered as `activity.RegisterOptions{Name: "payload"}`. + +### Step 3: Create `commands.sh` — useful shell commands + +A file with terse comments documenting how to run the scenarios locally and against the cloud cell. + +### Step 4: Test locally + +Run against local dev server using `run-scenario-with-worker`: + +```sh +# SAW +go run ./cmd run-scenario-with-worker \ + --scenario saa_cogs_saw --language go \ + --iterations 5 + +# SAA +go run ./cmd run-scenario-with-worker \ + --scenario saa_cogs_saa --language go \ + --iterations 5 +``` + +### Step 5: Connect to cloud cell + +Use `omni admintools` to verify cell state, then obtain credentials. Run scenarios against +`s-saa-cogs-marathon.e2e.tmprl-test.cloud:7233` with TLS. + +## Verification + +1. **Build**: `go build ./...` succeeds. +2. **Lint/vet**: `go vet ./...` succeeds. +3. **Local test — SAW**: `go run ./cmd run-scenario-with-worker --scenario saa_cogs_saw --language go --iterations 5` completes successfully. +4. 
**Local test — SAA**: `go run ./cmd run-scenario-with-worker --scenario saa_cogs_saa --language go --iterations 5` completes successfully. +5. **List scenarios**: `go run ./cmd list-scenarios` includes both `saa_cogs_saw` and `saa_cogs_saa`. +6. **Cloud cell proof-of-concept**: Point dashboard at `s-saa-cogs`, run one scenario, observe metrics increase. From dcad9c7d563120d23bb665dede681a9a7b8c2eac Mon Sep 17 00:00:00 2001 From: Dan Davison Date: Sat, 21 Mar 2026 10:19:17 -0400 Subject: [PATCH 06/40] Plan for SAA COGS load generation scenarios --- .task/plan.md | 128 ++++++++++++++------------------------ commands.sh | 51 --------------- scenarios/saa_cogs_saa.go | 38 ----------- scenarios/saa_cogs_saw.go | 58 ----------------- 4 files changed, 45 insertions(+), 230 deletions(-) delete mode 100644 commands.sh delete mode 100644 scenarios/saa_cogs_saa.go delete mode 100644 scenarios/saa_cogs_saw.go diff --git a/.task/plan.md b/.task/plan.md index 7b5775a6..300decd9 100644 --- a/.task/plan.md +++ b/.task/plan.md @@ -20,11 +20,20 @@ payload activity (256B in, 256B out), no retry, no heartbeat. Very close to Both use the same task queue (derived from run-id) and the same Go worker (which already registers the `payload` activity). -### Why GenericExecutor for SAA +### Why different executor types -`KitchenSinkExecutor` always starts a kitchen-sink workflow. The SAA scenario must call -`client.ExecuteActivity` directly — no workflow. `GenericExecutor` gives us the `Execute` function -hook, plus all the concurrency/rate-limiting/duration infrastructure. +`KitchenSinkExecutor` always starts a kitchen-sink workflow — this is inherently what SAW needs. +SAA must call `client.ExecuteActivity` directly (no workflow). `GenericExecutor` gives us the +`Execute` function hook for this. + +Both executor types share the same iteration-driving machinery: `KitchenSinkExecutor` wraps +`GenericExecutor`, so concurrency control, rate limiting, and duration handling are identical. 
+The only difference between the scenarios is what each iteration *does*, which is exactly the +variable under test. + +The activity configuration (256B payload, no retry, 60s timeout) is specified independently in each +scenario. These are simple literal values; sharing them via an abstraction would add indirection +without meaningful deduplication. ### SDK version @@ -37,99 +46,52 @@ Both scenarios use the `payload` activity type (already registered in the Go wor Arguments: `inputData []byte` (256 bytes), `bytesToReturn int32` (256). No heartbeat. Retry policy `MaximumAttempts: 1` (no retries). `ScheduleToCloseTimeout: 60s`. -## Implementation +## Implementation steps ### Step 1: Create `scenarios/saa_cogs_saw.go` -```go -package scenarios - -func init() { - loadgen.MustRegisterScenario(loadgen.Scenario{ - Description: "SAW baseline for COGS: single workflow executing one payload activity.", - ExecutorFn: func() loadgen.Executor { - return loadgen.KitchenSinkExecutor{ - TestInput: &kitchensink.TestInput{ - WorkflowInput: &kitchensink.WorkflowInput{ - InitialActions: []*kitchensink.ActionSet{ - payloadActivityActionSet(), - }, - }, - }, - } - }, - }) -} -``` - -Where `payloadActivityActionSet()` creates a `PayloadActivity(256, 256, ...)` plus -`ReturnResultAction`, with `MaximumAttempts: 1`, `ScheduleToCloseTimeout: 60s`. +`KitchenSinkExecutor` with a single `ActionSet` containing a `PayloadActivity(256, 256)` action +(with `MaximumAttempts: 1`, `ScheduleToCloseTimeout: 60s`) followed by a `ReturnResultAction`. 
### Step 2: Create `scenarios/saa_cogs_saa.go` -```go -package scenarios - -func init() { - loadgen.MustRegisterScenario(loadgen.Scenario{ - Description: "SAA for COGS: standalone activity with payload, no workflow.", - ExecutorFn: func() loadgen.Executor { - return &loadgen.GenericExecutor{ - Execute: executeSAA, - } - }, - }) -} - -func executeSAA(ctx context.Context, run *loadgen.Run) error { - inputData := make([]byte, 256) - handle, err := run.Client.ExecuteActivity(ctx, client.StartActivityOptions{ - ID: fmt.Sprintf("a-%s-%s-%d", run.RunID, run.ExecutionID, run.Iteration), - TaskQueue: run.TaskQueue(), - ScheduleToCloseTimeout: 60 * time.Second, - RetryPolicy: &temporal.RetryPolicy{MaximumAttempts: 1}, - }, "payload", inputData, int32(256)) - if err != nil { - return err - } - var result []byte - return handle.Get(ctx, &result) -} -``` - -This calls `"payload"` by name (string) so the SDK dispatches it to the worker which has it -registered as `activity.RegisterOptions{Name: "payload"}`. - -### Step 3: Create `commands.sh` — useful shell commands - -A file with terse comments documenting how to run the scenarios locally and against the cloud cell. +`GenericExecutor` whose `Execute` function: +1. Calls `run.Client.ExecuteActivity()` with `StartActivityOptions` (ID derived from + run/execution/iteration, task queue from `run.TaskQueue()`, same timeout and retry policy as SAW). +2. Passes activity type `"payload"` by name with `[]byte` (256 zeros) and `int32(256)` as args. +3. Calls `handle.Get()` to wait for the result. 
-### Step 4: Test locally +### Step 3: Create `commands.sh` -Run against local dev server using `run-scenario-with-worker`: +Useful shell commands with terse comments for: +- Local testing with `--embedded-server` +- Cloud cell verification via `ct` +- Running scenarios against `s-saa-cogs` -```sh -# SAW -go run ./cmd run-scenario-with-worker \ - --scenario saa_cogs_saw --language go \ - --iterations 5 +### Step 4: Test locally -# SAA -go run ./cmd run-scenario-with-worker \ - --scenario saa_cogs_saa --language go \ - --iterations 5 -``` +- `go build ./...` and `go vet ./...` +- `go run ./cmd list-scenarios` shows both new scenarios +- SAW: `go run ./cmd run-scenario-with-worker --scenario saa_cogs_saw --language go --iterations 5 --embedded-server` +- SAA: same command with `saa_cogs_saa` — will get "Standalone activity is disabled" from the dev + server (v1.30.1 doesn't have the feature flag), confirming the code path reaches + `StartActivityExecution`. Will succeed on the cloud cell. ### Step 5: Connect to cloud cell -Use `omni admintools` to verify cell state, then obtain credentials. Run scenarios against -`s-saa-cogs-marathon.e2e.tmprl-test.cloud:7233` with TLS. +1. Verify cell: `ct kubectl --context s-saa-cogs get pods -n temporal` +2. Check namespace: `ct admintools --context s-saa-cogs -- temporal operator namespace describe s-saa-cogs-marathon.e2e` +3. Obtain operator TLS certs (from k8s secrets via `ct`, or ask Stephen) +4. Point Grafana dashboard at `s-saa-cogs`, observe idle state +5. Run worker + SAW scenario against the cell, observe activity in dashboard +6. Run worker + SAA scenario, observe activity ## Verification 1. **Build**: `go build ./...` succeeds. -2. **Lint/vet**: `go vet ./...` succeeds. -3. **Local test — SAW**: `go run ./cmd run-scenario-with-worker --scenario saa_cogs_saw --language go --iterations 5` completes successfully. -4. 
**Local test — SAA**: `go run ./cmd run-scenario-with-worker --scenario saa_cogs_saa --language go --iterations 5` completes successfully. -5. **List scenarios**: `go run ./cmd list-scenarios` includes both `saa_cogs_saw` and `saa_cogs_saa`. -6. **Cloud cell proof-of-concept**: Point dashboard at `s-saa-cogs`, run one scenario, observe metrics increase. +2. **Lint/vet**: `go vet ./...` clean on our files. +3. **List scenarios**: `go run ./cmd list-scenarios` includes both `saa_cogs_saw` and `saa_cogs_saa`. +4. **Local test — SAW**: `run-scenario-with-worker --embedded-server --iterations 5` completes. +5. **Local test — SAA**: Same command hits `StartActivityExecution` on the server (expected to fail + on dev server with "disabled" error; succeeds on cloud cell with CHASM enabled). +6. **Cloud cell proof-of-concept**: Dashboard shows idle → run scenario → dashboard shows activity. diff --git a/commands.sh b/commands.sh deleted file mode 100644 index 7951ce07..00000000 --- a/commands.sh +++ /dev/null @@ -1,51 +0,0 @@ -#!/usr/bin/env bash -# SAA COGS experiment — useful commands -# See .task/plan.md for context. 
- -## --- Local testing (against embedded dev server) --- - -# SAW: 5 iterations -go run ./cmd run-scenario-with-worker --scenario saa_cogs_saw --language go --iterations 5 - -# SAA: 5 iterations -go run ./cmd run-scenario-with-worker --scenario saa_cogs_saa --language go --iterations 5 - -# SAW: sustained 60s at 10 starts/s -go run ./cmd run-scenario-with-worker --scenario saa_cogs_saw --language go \ - --duration 60s --max-iterations-per-second 10 --max-concurrent 100 - -# SAA: sustained 60s at 10 starts/s -go run ./cmd run-scenario-with-worker --scenario saa_cogs_saa --language go \ - --duration 60s --max-iterations-per-second 10 --max-concurrent 100 - -## --- Cloud cell: s-saa-cogs --- - -CELL=s-saa-cogs -NS=${CELL}-marathon.e2e -HOST=${NS}.tmprl-test.cloud:7233 - -# Verify cell is alive -ct kubectl --context $CELL get pods -n temporal - -# Check namespace -omni admintools --context $CELL -- temporal operator namespace describe $NS - -# Run worker against cloud cell (in one terminal) -go run ./cmd run-worker --language go --run-id saa-cogs-test \ - --server-address $HOST --namespace $NS --tls \ - --tls-cert-path /tmp/saa-cogs-cert.pem --tls-key-path /tmp/saa-cogs-key.pem - -# Run SAW scenario against cloud cell (in another terminal) -go run ./cmd run-scenario --scenario saa_cogs_saw --run-id saa-cogs-test \ - --server-address $HOST --namespace $NS --tls \ - --tls-cert-path /tmp/saa-cogs-cert.pem --tls-key-path /tmp/saa-cogs-key.pem \ - --iterations 5 --do-not-register-search-attributes - -# Run SAA scenario against cloud cell -go run ./cmd run-scenario --scenario saa_cogs_saa --run-id saa-cogs-test \ - --server-address $HOST --namespace $NS --tls \ - --tls-cert-path /tmp/saa-cogs-cert.pem --tls-key-path /tmp/saa-cogs-key.pem \ - --iterations 5 --do-not-register-search-attributes - -# Grafana dashboard -# https://grafana.tmprl-internal.cloud/d/saacogs/saa-cogs?var-cluster=s-saa-cogs diff --git a/scenarios/saa_cogs_saa.go b/scenarios/saa_cogs_saa.go deleted file 
mode 100644 index 1ec05a7b..00000000 --- a/scenarios/saa_cogs_saa.go +++ /dev/null @@ -1,38 +0,0 @@ -package scenarios - -import ( - "context" - "fmt" - "time" - - "go.temporal.io/sdk/client" - "go.temporal.io/sdk/temporal" - - "github.com/temporalio/omes/loadgen" -) - -func init() { - loadgen.MustRegisterScenario(loadgen.Scenario{ - Description: "SAA for COGS: standalone activity with payload, no workflow.", - ExecutorFn: func() loadgen.Executor { - return &loadgen.GenericExecutor{ - Execute: executeSAA, - } - }, - }) -} - -func executeSAA(ctx context.Context, run *loadgen.Run) error { - inputData := make([]byte, 256) - handle, err := run.Client.ExecuteActivity(ctx, client.StartActivityOptions{ - ID: fmt.Sprintf("a-%s-%s-%d", run.RunID, run.ExecutionID, run.Iteration), - TaskQueue: run.TaskQueue(), - ScheduleToCloseTimeout: 60 * time.Second, - RetryPolicy: &temporal.RetryPolicy{MaximumAttempts: 1}, - }, "payload", inputData, int32(256)) - if err != nil { - return fmt.Errorf("failed to start standalone activity: %w", err) - } - var result []byte - return handle.Get(ctx, &result) -} diff --git a/scenarios/saa_cogs_saw.go b/scenarios/saa_cogs_saw.go deleted file mode 100644 index ef4e92d8..00000000 --- a/scenarios/saa_cogs_saw.go +++ /dev/null @@ -1,58 +0,0 @@ -package scenarios - -import ( - "time" - - "go.temporal.io/api/common/v1" - "google.golang.org/protobuf/types/known/durationpb" - - "github.com/temporalio/omes/loadgen" - "github.com/temporalio/omes/loadgen/kitchensink" -) - -func init() { - loadgen.MustRegisterScenario(loadgen.Scenario{ - Description: "SAW baseline for COGS: single workflow executing one payload activity.", - ExecutorFn: func() loadgen.Executor { - return loadgen.KitchenSinkExecutor{ - TestInput: &kitchensink.TestInput{ - WorkflowInput: &kitchensink.WorkflowInput{ - InitialActions: []*kitchensink.ActionSet{ - saaCogsSAWActionSet(), - }, - }, - }, - } - }, - }) -} - -func saaCogsSAWActionSet() *kitchensink.ActionSet { - return 
&kitchensink.ActionSet{ - Actions: []*kitchensink.Action{ - { - Variant: &kitchensink.Action_ExecActivity{ - ExecActivity: &kitchensink.ExecuteActivityAction{ - ActivityType: &kitchensink.ExecuteActivityAction_Payload{ - Payload: &kitchensink.ExecuteActivityAction_PayloadActivity{ - BytesToReceive: 256, - BytesToReturn: 256, - }, - }, - ScheduleToCloseTimeout: durationpb.New(60 * time.Second), - RetryPolicy: &common.RetryPolicy{ - MaximumAttempts: 1, - }, - }, - }, - }, - { - Variant: &kitchensink.Action_ReturnResult{ - ReturnResult: &kitchensink.ReturnResultAction{ - ReturnThis: &common.Payload{}, - }, - }, - }, - }, - } -} From 9719b5b0b9b91e8fb7ce119b8f81273605ede4aa Mon Sep 17 00:00:00 2001 From: Dan Davison Date: Sat, 21 Mar 2026 10:24:06 -0400 Subject: [PATCH 07/40] Update plan: symmetric GenericExecutor for both scenarios Both SAW and SAA use GenericExecutor with a simple Execute function. SAW gets a dedicated minimal workflow registered on the existing Go worker. Reuse existing "payload" activity registration. Drop "cogs" from names. --- .task/plan.md | 93 ++++++++++++++++++++++++++++++--------------------- 1 file changed, 54 insertions(+), 39 deletions(-) diff --git a/.task/plan.md b/.task/plan.md index 300decd9..08e81f3e 100644 --- a/.task/plan.md +++ b/.task/plan.md @@ -9,75 +9,90 @@ observe metrics on the Grafana dashboard. ### Scenarios -**`saa_cogs_saw`** — Single Activity Workflow baseline. Uses `KitchenSinkExecutor` with a single -payload activity (256B in, 256B out), no retry, no heartbeat. Very close to -`workflow_with_single_noop_activity` but with payload instead of noop. - -**`saa_cogs_saa`** — Standalone Activity. Uses `GenericExecutor`. Each iteration calls -`client.ExecuteActivity()` (the SDK's standalone activity API) with the same payload activity, then -`handle.Get()` to wait for the result. No workflow involved. +Both scenarios use `GenericExecutor` with a simple `Execute` function. 
This keeps the +implementations symmetric — the only difference is what each iteration does, which is exactly the +variable under test. -Both use the same task queue (derived from run-id) and the same Go worker (which already registers -the `payload` activity). +**`saw`** — Single Activity Workflow baseline. Each iteration calls `client.ExecuteWorkflow` with a +dedicated minimal workflow (`saw`) that executes one payload activity and returns. Then +`handle.Get()` to wait for the result. -### Why different executor types +**`saa`** — Standalone Activity. Each iteration calls `client.ExecuteActivity` with the same payload +activity. Then `handle.Get()` to wait for the result. No workflow involved. -`KitchenSinkExecutor` always starts a kitchen-sink workflow — this is inherently what SAW needs. -SAA must call `client.ExecuteActivity` directly (no workflow). `GenericExecutor` gives us the -`Execute` function hook for this. +Both use the same task queue (derived from run-id) and the same Go worker. -Both executor types share the same iteration-driving machinery: `KitchenSinkExecutor` wraps -`GenericExecutor`, so concurrency control, rate limiting, and duration handling are identical. -The only difference between the scenarios is what each iteration *does*, which is exactly the -variable under test. +### Worker code -The activity configuration (256B payload, no retry, 60s timeout) is specified independently in each -scenario. These are simple literal values; sharing them via an abstraction would add indirection -without meaningful deduplication. +A dedicated activity (`payload`) and a dedicated workflow (`saw`), both minimal: -### SDK version +- **`payload` activity**: Takes `[]byte` input and `int32` output size, returns `[]byte` of + requested size. (This activity already exists in the kitchen-sink worker as `"payload"` with + exactly this signature. We write our own to avoid depending on the kitchen-sink worker.) 
+- **`saw` workflow**: Executes the `payload` activity with the input it receives, returns the + result. No signals, queries, updates, or other machinery. -The current `go.temporal.io/sdk v1.40.0` already includes `client.ExecuteActivity` (added in -v1.40.0, commit `215920a6`). No upgrade needed. +Both are registered on the Go worker alongside the existing kitchen-sink registrations. ### Activity configuration -Both scenarios use the `payload` activity type (already registered in the Go worker as `"payload"`). -Arguments: `inputData []byte` (256 bytes), `bytesToReturn int32` (256). No heartbeat. Retry policy -`MaximumAttempts: 1` (no retries). `ScheduleToCloseTimeout: 60s`. +Both scenarios use: `inputData []byte` (256 bytes), `bytesToReturn int32` (256). No heartbeat. +Retry policy `MaximumAttempts: 1` (no retries). `ScheduleToCloseTimeout: 60s`. + +### SDK version + +`go.temporal.io/sdk v1.40.0` already includes `client.ExecuteActivity`. No upgrade needed. ## Implementation steps -### Step 1: Create `scenarios/saa_cogs_saw.go` +### Step 1: Add worker code -`KitchenSinkExecutor` with a single `ActionSet` containing a `PayloadActivity(256, 256)` action -(with `MaximumAttempts: 1`, `ScheduleToCloseTimeout: 60s`) followed by a `ReturnResultAction`. +In `workers/go/`, add a small file registering: +- Activity `"payload"` — takes `(ctx, []byte, int32)`, returns `([]byte, error)` +- Workflow `"saw"` — executes `"payload"` activity with its input, returns result + +These are registered on the worker alongside existing kitchen-sink registrations. + +**Wait — the existing worker already registers `"payload"` with the same signature.** We should +reuse that registration rather than duplicate it. The question is whether we also need a separate +worker binary or can share the existing one. The existing Go worker registers the kitchen-sink +workflow plus all activities including `"payload"`. For SAW we just need to also register our `saw` +workflow. 
For SAA we need no workflow at all — just the `"payload"` activity, which is already +registered. + +Decision: add `saw` workflow registration to the existing Go worker. No new worker binary needed. + +### Step 2: Create `scenarios/saw.go` + +`GenericExecutor` whose `Execute` function: +1. Calls `run.Client.ExecuteWorkflow()` starting workflow `"saw"` with the payload input. +2. Calls `handle.Get()` to wait for result. -### Step 2: Create `scenarios/saa_cogs_saa.go` +### Step 3: Create `scenarios/saa.go` `GenericExecutor` whose `Execute` function: 1. Calls `run.Client.ExecuteActivity()` with `StartActivityOptions` (ID derived from - run/execution/iteration, task queue from `run.TaskQueue()`, same timeout and retry policy as SAW). -2. Passes activity type `"payload"` by name with `[]byte` (256 zeros) and `int32(256)` as args. -3. Calls `handle.Get()` to wait for the result. + run/execution/iteration, task queue from `run.TaskQueue()`, same timeout and retry policy). +2. Passes activity type `"payload"` by name with `[]byte` (256 zeros) and `int32(256)`. +3. Calls `handle.Get()` to wait for result. 
-### Step 3: Create `commands.sh` +### Step 4: Create `commands.sh` Useful shell commands with terse comments for: - Local testing with `--embedded-server` - Cloud cell verification via `ct` - Running scenarios against `s-saa-cogs` -### Step 4: Test locally +### Step 5: Test locally - `go build ./...` and `go vet ./...` - `go run ./cmd list-scenarios` shows both new scenarios -- SAW: `go run ./cmd run-scenario-with-worker --scenario saa_cogs_saw --language go --iterations 5 --embedded-server` -- SAA: same command with `saa_cogs_saa` — will get "Standalone activity is disabled" from the dev +- SAW: `go run ./cmd run-scenario-with-worker --scenario saw --language go --iterations 5 --embedded-server` +- SAA: same command with `saa` — will get "Standalone activity is disabled" from the embedded dev server (v1.30.1 doesn't have the feature flag), confirming the code path reaches `StartActivityExecution`. Will succeed on the cloud cell. -### Step 5: Connect to cloud cell +### Step 6: Connect to cloud cell 1. Verify cell: `ct kubectl --context s-saa-cogs get pods -n temporal` 2. Check namespace: `ct admintools --context s-saa-cogs -- temporal operator namespace describe s-saa-cogs-marathon.e2e` @@ -90,7 +105,7 @@ Useful shell commands with terse comments for: 1. **Build**: `go build ./...` succeeds. 2. **Lint/vet**: `go vet ./...` clean on our files. -3. **List scenarios**: `go run ./cmd list-scenarios` includes both `saa_cogs_saw` and `saa_cogs_saa`. +3. **List scenarios**: `go run ./cmd list-scenarios` includes both `saw` and `saa`. 4. **Local test — SAW**: `run-scenario-with-worker --embedded-server --iterations 5` completes. 5. **Local test — SAA**: Same command hits `StartActivityExecution` on the server (expected to fail on dev server with "disabled" error; succeeds on cloud cell with CHASM enabled). 
From f414847d0d2e6d23fe6c3f350ce2434e9d1010a9 Mon Sep 17 00:00:00 2001 From: Dan Davison Date: Sat, 21 Mar 2026 10:33:11 -0400 Subject: [PATCH 08/40] Update plan: rename scenarios, reuse existing payload activity --- .task/plan.md | 62 +++++++++++++++++++-------------------------------- 1 file changed, 23 insertions(+), 39 deletions(-) diff --git a/.task/plan.md b/.task/plan.md index 08e81f3e..00587a64 100644 --- a/.task/plan.md +++ b/.task/plan.md @@ -1,4 +1,4 @@ -# Implementation Plan: SAA COGS Load Generation +# Implementation Plan: SAA Load Generation ## Goal @@ -13,31 +13,27 @@ Both scenarios use `GenericExecutor` with a simple `Execute` function. This keep implementations symmetric — the only difference is what each iteration does, which is exactly the variable under test. -**`saw`** — Single Activity Workflow baseline. Each iteration calls `client.ExecuteWorkflow` with a -dedicated minimal workflow (`saw`) that executes one payload activity and returns. Then -`handle.Get()` to wait for the result. +**`workflow_with_single_activity`** — Each iteration calls `client.ExecuteWorkflow` with a dedicated +minimal workflow that executes one `payload` activity and returns. Then `handle.Get()`. -**`saa`** — Standalone Activity. Each iteration calls `client.ExecuteActivity` with the same payload -activity. Then `handle.Get()` to wait for the result. No workflow involved. +**`standalone_activity`** — Each iteration calls `client.ExecuteActivity` with the same `payload` +activity. Then `handle.Get()`. No workflow involved. Both use the same task queue (derived from run-id) and the same Go worker. ### Worker code -A dedicated activity (`payload`) and a dedicated workflow (`saw`), both minimal: +Reuse the existing `payload` activity at [kitchen_sink.go:511-516](workers/go/kitchensink/kitchen_sink.go#L511-L516), +already registered as `"payload"` at [worker.go:105](workers/go/worker/worker.go#L105). 
-- **`payload` activity**: Takes `[]byte` input and `int32` output size, returns `[]byte` of - requested size. (This activity already exists in the kitchen-sink worker as `"payload"` with - exactly this signature. We write our own to avoid depending on the kitchen-sink worker.) -- **`saw` workflow**: Executes the `payload` activity with the input it receives, returns the - result. No signals, queries, updates, or other machinery. - -Both are registered on the Go worker alongside the existing kitchen-sink registrations. +Add one new workflow: a minimal function that executes the `payload` activity with its input and +returns the result. Register it on the existing Go worker at [worker.go:102](workers/go/worker/worker.go#L102) +alongside the existing registrations. No new worker binary needed. ### Activity configuration -Both scenarios use: `inputData []byte` (256 bytes), `bytesToReturn int32` (256). No heartbeat. -Retry policy `MaximumAttempts: 1` (no retries). `ScheduleToCloseTimeout: 60s`. +Both scenarios: `inputData []byte` (256 bytes), `bytesToReturn int32` (256). No heartbeat. +`MaximumAttempts: 1` (no retries). `ScheduleToCloseTimeout: 60s`. ### SDK version @@ -45,30 +41,18 @@ Retry policy `MaximumAttempts: 1` (no retries). `ScheduleToCloseTimeout: 60s`. ## Implementation steps -### Step 1: Add worker code - -In `workers/go/`, add a small file registering: -- Activity `"payload"` — takes `(ctx, []byte, int32)`, returns `([]byte, error)` -- Workflow `"saw"` — executes `"payload"` activity with its input, returns result - -These are registered on the worker alongside existing kitchen-sink registrations. - -**Wait — the existing worker already registers `"payload"` with the same signature.** We should -reuse that registration rather than duplicate it. The question is whether we also need a separate -worker binary or can share the existing one. The existing Go worker registers the kitchen-sink -workflow plus all activities including `"payload"`. 
For SAW we just need to also register our `saw` -workflow. For SAA we need no workflow at all — just the `"payload"` activity, which is already -registered. +### Step 1: Add workflow to worker -Decision: add `saw` workflow registration to the existing Go worker. No new worker binary needed. +Add a small file under `workers/go/` with the minimal workflow function. Register it in +[worker.go](workers/go/worker/worker.go) alongside existing registrations. -### Step 2: Create `scenarios/saw.go` +### Step 2: Create `scenarios/workflow_with_single_activity.go` `GenericExecutor` whose `Execute` function: -1. Calls `run.Client.ExecuteWorkflow()` starting workflow `"saw"` with the payload input. +1. Calls `run.Client.ExecuteWorkflow()` starting the new workflow with the payload input. 2. Calls `handle.Get()` to wait for result. -### Step 3: Create `scenarios/saa.go` +### Step 3: Create `scenarios/standalone_activity.go` `GenericExecutor` whose `Execute` function: 1. Calls `run.Client.ExecuteActivity()` with `StartActivityOptions` (ID derived from @@ -87,9 +71,9 @@ Useful shell commands with terse comments for: - `go build ./...` and `go vet ./...` - `go run ./cmd list-scenarios` shows both new scenarios -- SAW: `go run ./cmd run-scenario-with-worker --scenario saw --language go --iterations 5 --embedded-server` -- SAA: same command with `saa` — will get "Standalone activity is disabled" from the embedded dev - server (v1.30.1 doesn't have the feature flag), confirming the code path reaches +- SAW: `go run ./cmd run-scenario-with-worker --scenario workflow_with_single_activity --language go --iterations 5 --embedded-server` +- SAA: same command with `standalone_activity` — will get "Standalone activity is disabled" from the + embedded dev server (v1.30.1 doesn't have the feature flag), confirming the code path reaches `StartActivityExecution`. Will succeed on the cloud cell. 
### Step 6: Connect to cloud cell @@ -105,8 +89,8 @@ Useful shell commands with terse comments for: 1. **Build**: `go build ./...` succeeds. 2. **Lint/vet**: `go vet ./...` clean on our files. -3. **List scenarios**: `go run ./cmd list-scenarios` includes both `saw` and `saa`. +3. **List scenarios**: `go run ./cmd list-scenarios` includes both names. 4. **Local test — SAW**: `run-scenario-with-worker --embedded-server --iterations 5` completes. 5. **Local test — SAA**: Same command hits `StartActivityExecution` on the server (expected to fail on dev server with "disabled" error; succeeds on cloud cell with CHASM enabled). -6. **Cloud cell proof-of-concept**: Dashboard shows idle → run scenario → dashboard shows activity. +6. **Cloud cell proof-of-concept**: Dashboard shows idle -> run scenario -> dashboard shows activity. From 786be36a33fb04160871cd0b77bfbf3046ed07b6 Mon Sep 17 00:00:00 2001 From: Dan Davison Date: Sat, 21 Mar 2026 10:38:31 -0400 Subject: [PATCH 09/40] painting by numbers --- .task/plan.md | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/.task/plan.md b/.task/plan.md index 00587a64..acc2e53d 100644 --- a/.task/plan.md +++ b/.task/plan.md @@ -41,6 +41,20 @@ Both scenarios: `inputData []byte` (256 bytes), `bytesToReturn int32` (256). No ## Implementation steps +IMPORTANT: Rather than doing the implementation yourself, please "teach" the user to do the +implementation themselves. Take a "painting by numbers" approach: Decide on the first component they +should write, and insert a comment in the code indicating what they should do. Then pause and give +them clickable links to the comment, and to any existing prior art in the codebase they might want +to refer to. Don't output code directly to them. Work with them to complete the stage; review their +work carefully. Do not consider the stage complete until the work is done to an equal or greater +standard than you yourself would have achieved. 
When that stage is completed by them, or with +further assistance from you, move on to the next component to be implemented and repeat this +procedure. + +Regarding names: we will not use "cogs" anywhere in omes code itself. Conceptually, the omes code is +defining SAW and SAA workloads. What those are used for (to run an experiment) and why (COGS +investigation) is not the concern of the omes code. + ### Step 1: Add workflow to worker Add a small file under `workers/go/` with the minimal workflow function. Register it in From a29561e78ad36e7567cd8a988b30a54b93d568b0 Mon Sep 17 00:00:00 2001 From: Dan Davison Date: Sat, 21 Mar 2026 11:35:45 -0400 Subject: [PATCH 10/40] Document Payload activity --- workers/go/kitchensink/kitchen_sink.go | 1 + 1 file changed, 1 insertion(+) diff --git a/workers/go/kitchensink/kitchen_sink.go b/workers/go/kitchensink/kitchen_sink.go index 21551c18..fa977496 100644 --- a/workers/go/kitchensink/kitchen_sink.go +++ b/workers/go/kitchensink/kitchen_sink.go @@ -508,6 +508,7 @@ func Noop(_ context.Context) error { return nil } +// Payload is an activity that takes arbitrary bytes input and returns a bytes result of size `bytesToReturn`. func Payload(_ context.Context, inputData []byte, bytesToReturn int32) ([]byte, error) { output := make([]byte, bytesToReturn) //goland:noinspection GoDeprecation -- This is fine. We don't need crypto security. 
From cc4df14eab96c6e060eb7840218fa82d5e3c3bae Mon Sep 17 00:00:00 2001 From: Dan Davison Date: Sat, 21 Mar 2026 11:37:13 -0400 Subject: [PATCH 11/40] SingleActivityWorkflow: implement and register --- workers/go/singleactivityworkflow/workflow.go | 20 +++++++++++++++++++ workers/go/worker/worker.go | 2 ++ 2 files changed, 22 insertions(+) create mode 100644 workers/go/singleactivityworkflow/workflow.go diff --git a/workers/go/singleactivityworkflow/workflow.go b/workers/go/singleactivityworkflow/workflow.go new file mode 100644 index 00000000..95c394c7 --- /dev/null +++ b/workers/go/singleactivityworkflow/workflow.go @@ -0,0 +1,20 @@ +package singleactivityworkflow + +import ( + "time" + + "go.temporal.io/sdk/temporal" + "go.temporal.io/sdk/workflow" +) + +func SingleActivityWorkflow(ctx workflow.Context, input []byte, outputSize int32) ([]byte, error) { + var output []byte + err := workflow.ExecuteActivity(workflow.WithActivityOptions(ctx, workflow.ActivityOptions{ + StartToCloseTimeout: 5 * time.Second, + RetryPolicy: &temporal.RetryPolicy{MaximumAttempts: 1}, + }), "payload", input, outputSize).Get(ctx, &output) + if err != nil { + return nil, err + } + return output, nil +} diff --git a/workers/go/worker/worker.go b/workers/go/worker/worker.go index 40ffc757..aaab61cc 100644 --- a/workers/go/worker/worker.go +++ b/workers/go/worker/worker.go @@ -9,6 +9,7 @@ import ( "github.com/temporalio/omes/workers/go/ebbandflow" "github.com/temporalio/omes/workers/go/kitchensink" "github.com/temporalio/omes/workers/go/schedulerstress" + "github.com/temporalio/omes/workers/go/singleactivityworkflow" "go.temporal.io/sdk/activity" "go.temporal.io/sdk/client" "go.temporal.io/sdk/worker" @@ -112,6 +113,7 @@ func runWorkers(client client.Client, taskQueues []string, options clioptions.Wo w.RegisterActivity(&ebbFlowActivities) w.RegisterWorkflowWithOptions(schedulerstress.NoopScheduledWorkflow, workflow.RegisterOptions{Name: "NoopScheduledWorkflow"}) 
w.RegisterWorkflowWithOptions(schedulerstress.SleepScheduledWorkflow, workflow.RegisterOptions{Name: "SleepScheduledWorkflow"}) + w.RegisterWorkflow(singleactivityworkflow.SingleActivityWorkflow) w.RegisterNexusService(service) errCh <- w.Run(worker.InterruptCh()) }() From 83b4610d6ef2f820a5d1ca1c4e85f19b50628361 Mon Sep 17 00:00:00 2001 From: Dan Davison Date: Sat, 21 Mar 2026 12:06:37 -0400 Subject: [PATCH 12/40] Register scenario --- scenarios/standalone_activity.go | 50 ++++++++++++++++++++++ scenarios/workflow_with_single_activity.go | 32 ++++++++++++++ workers/go/worker/worker.go | 2 +- 3 files changed, 83 insertions(+), 1 deletion(-) create mode 100644 scenarios/standalone_activity.go create mode 100644 scenarios/workflow_with_single_activity.go diff --git a/scenarios/standalone_activity.go b/scenarios/standalone_activity.go new file mode 100644 index 00000000..d55547f0 --- /dev/null +++ b/scenarios/standalone_activity.go @@ -0,0 +1,50 @@ +package scenarios + +import ( + "context" + "fmt" + "time" + + "github.com/temporalio/omes/loadgen" + "go.temporal.io/sdk/client" + "go.temporal.io/sdk/temporal" +) + +func init() { + loadgen.MustRegisterScenario(loadgen.Scenario{ + Description: "Run a standalone activity. The activity takes in some bytes and returns some bytes. 
" + + "It never retries or heartbeats.", + ExecutorFn: func() loadgen.Executor { + return &loadgen.GenericExecutor{ + Execute: func(ctx context.Context, r *loadgen.Run) error { + payloadSize := r.ScenarioOptionInt("payload-size", 0) + handle, err := r.Client.ExecuteActivity( + ctx, + activityOptions(r), + "payload", + make([]byte, payloadSize), + int32(payloadSize), + ) + if err != nil { + return err + } + return handle.Get(ctx, nil) + }, + } + }, + }) +} + +func activityOptions(r *loadgen.Run) client.StartActivityOptions { + return client.StartActivityOptions{ + ID: fmt.Sprintf( + "a-%s-%s-%d", + r.RunID, + r.ExecutionID, + r.Iteration, + ), + TaskQueue: r.TaskQueue(), + StartToCloseTimeout: 5 * time.Second, + RetryPolicy: &temporal.RetryPolicy{MaximumAttempts: 1}, + } +} diff --git a/scenarios/workflow_with_single_activity.go b/scenarios/workflow_with_single_activity.go new file mode 100644 index 00000000..ef47a1de --- /dev/null +++ b/scenarios/workflow_with_single_activity.go @@ -0,0 +1,32 @@ +package scenarios + +import ( + "context" + + "github.com/temporalio/omes/loadgen" +) + +func init() { + loadgen.MustRegisterScenario(loadgen.Scenario{ + Description: "Run a single-activity workflow. It takes in some bytes, passes them to an " + + "activity, and returns the bytes returned by the activity. 
The activity never retries or heartbeats.", + ExecutorFn: func() loadgen.Executor { + return &loadgen.GenericExecutor{ + Execute: func(ctx context.Context, r *loadgen.Run) error { + payloadSize := r.ScenarioOptionInt("payload-size", 0) + handle, err := r.Client.ExecuteWorkflow( + ctx, + r.DefaultStartWorkflowOptions(), + "singleActivityWorkflow", + make([]byte, payloadSize), + int32(payloadSize), + ) + if err != nil { + return err + } + return handle.Get(ctx, nil) + }, + } + }, + }) +} diff --git a/workers/go/worker/worker.go b/workers/go/worker/worker.go index aaab61cc..fb1bc791 100644 --- a/workers/go/worker/worker.go +++ b/workers/go/worker/worker.go @@ -113,7 +113,7 @@ func runWorkers(client client.Client, taskQueues []string, options clioptions.Wo w.RegisterActivity(&ebbFlowActivities) w.RegisterWorkflowWithOptions(schedulerstress.NoopScheduledWorkflow, workflow.RegisterOptions{Name: "NoopScheduledWorkflow"}) w.RegisterWorkflowWithOptions(schedulerstress.SleepScheduledWorkflow, workflow.RegisterOptions{Name: "SleepScheduledWorkflow"}) - w.RegisterWorkflow(singleactivityworkflow.SingleActivityWorkflow) + w.RegisterWorkflowWithOptions(singleactivityworkflow.SingleActivityWorkflow, workflow.RegisterOptions{Name: "singleActivityWorkflow"}) w.RegisterNexusService(service) errCh <- w.Run(worker.InterruptCh()) }() From a77c193dde975d97b3d00e2c8151d3f9a560a8e3 Mon Sep 17 00:00:00 2001 From: Dan Davison Date: Sat, 21 Mar 2026 12:51:06 -0400 Subject: [PATCH 13/40] commands --- commands.sh | 41 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 41 insertions(+) create mode 100644 commands.sh diff --git a/commands.sh b/commands.sh new file mode 100644 index 00000000..b214633c --- /dev/null +++ b/commands.sh @@ -0,0 +1,41 @@ +# Shell commands for SAA/SAW load generation scenarios. 
+ +# --- Local testing (embedded dev server) --- + +go run ./cmd run-scenario-with-worker --scenario workflow_with_single_activity --language go --iterations 5 --embedded-server --option payload-size=1024 + +go run ./cmd run-scenario-with-worker --scenario standalone_activity --language go --iterations 5 --embedded-server --option payload-size=1024 + +# --- Cloud cell: s-saa-cogs --- + +# List all k8s namespaces on the cell +ct kubectl --context s-saa-cogs get namespaces + +# Verify cell is up +ct kubectl --context s-saa-cogs get pods -n temporal + +# Check namespace +ct admintools --context s-saa-cogs -- temporal operator namespace describe s-saa-cogs-marathon.e2e + +# Run worker (in one terminal) +go run ./workers/go --task-queue omes \ + --server-address TODO \ + --namespace s-saa-cogs-marathon.e2e \ + --tls-cert-path TODO \ + --tls-key-path TODO + +# Run SAW scenario +go run ./cmd run-scenario --scenario workflow_with_single_activity \ + --server-address TODO \ + --namespace s-saa-cogs-marathon.e2e \ + --tls-cert-path TODO \ + --tls-key-path TODO \ + --iterations 100 --max-concurrent 10 + +# Run SAA scenario +go run ./cmd run-scenario --scenario standalone_activity \ + --server-address TODO \ + --namespace s-saa-cogs-marathon.e2e \ + --tls-cert-path TODO \ + --tls-key-path TODO \ + --iterations 100 --max-concurrent 10 From b17844e4a0f8a7acf209b10b662a2d5c37fecdcb Mon Sep 17 00:00:00 2001 From: Dan Davison Date: Sat, 21 Mar 2026 14:02:45 -0400 Subject: [PATCH 14/40] Add cell info --- .task/task.md | 39 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 39 insertions(+) diff --git a/.task/task.md b/.task/task.md index b9c71259..b9b47cc3 100644 --- a/.task/task.md +++ b/.task/task.md @@ -716,6 +716,45 @@ Your task is to help me design and build the omes-based tooling that we will use (1) Add any missing omes functionality that will be needed in order to be able to use omes to generate the SAA and SAW load for the experiments. 
(2) Run the experiments against the cloud cell that Stephen has prepared: its name is s-saa-cogs. +Stephen linked to the 'scaffold' run that created the cell. I see it had the following input: + +{ + "CellConfig": { + "Identity": { + "Location": { + "CloudProvider": "aws", + "AccountID": "124355634071", + "Region": "us-west-2" + }, + "ID": "s-saa-cogs" + }, + "Template": "v5-aws-dev", + "ServerVersion": "v3.151.9_oss1.31.0_151.6", + "AgentVersion": "v3.151.9_oss1.31.0_151.6", + "WebVersion": "v2.47.0", + "GoCanaryVersion": "v1.35.0", + "ComponentVersion": "v2026-03-20.00", + "WalVersion": "v10.0.3", + "EnableMetering": false + }, + "FailurePolicy": 1 +} + +and output: + +{ + "Cell": { + "Identity": { + "Location": { + "CloudProvider": "aws", + "AccountID": "124355634071", + "Region": "us-west-2" + }, + "ID": "s-saa-cogs" + } + } +} + I am not familiar with performing operations against cloud cells, so you will need to resarch and help me during this. But we have several good resources: study the contents of the 'oncall' and 'runbooks' repos, and also use the /agent-slack skill. You also have Notion and Temporal Docs MCP. Use the more modern 'ct' rather than its alias 'omni'. Initial grafana dashboard JSON is at .task/saacogs.json. 
From 4a1458d6fdc0443339b469a41eff3120c6cbe833 Mon Sep 17 00:00:00 2001 From: Dan Davison Date: Sat, 21 Mar 2026 14:18:02 -0400 Subject: [PATCH 15/40] Annotate cell support URL as potentially unavailable for dev cells --- commands.sh | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/commands.sh b/commands.sh index b214633c..bdca80ab 100644 --- a/commands.sh +++ b/commands.sh @@ -8,14 +8,26 @@ go run ./cmd run-scenario-with-worker --scenario standalone_activity --language # --- Cloud cell: s-saa-cogs --- +# Cell support page: https://cloud.temporal.io/support/cells/s-saa-cogs +# (may not resolve for dev-template cells; use ct ocld / ct kubectl instead) +# K8s access: ct k9s --readonly --context s-saa-cogs + # List all k8s namespaces on the cell ct kubectl --context s-saa-cogs get namespaces # Verify cell is up ct kubectl --context s-saa-cogs get pods -n temporal -# Check namespace -ct admintools --context s-saa-cogs -- temporal operator namespace describe s-saa-cogs-marathon.e2e +# List Temporal namespaces on this cell +# Web: https://cloud.temporal.io/support/cells/s-saa-cogs (if cell is registered in UI) +ct ocld namespace db list --active-cluster s-saa-cogs + +# Grafana dashboards +# Overview: https://grafana.tmprl-internal.cloud/d/e613c827-243e-4759-a5ca-3e334201c124/temporal-cloud-overview +# By namespace: https://grafana.tmprl-internal.cloud/d/iyRCOBD4z/temporal-cloud-external-metrics-by-namespace +# Frontend: https://grafana.tmprl-internal.cloud/d/SxRYJXZMz/frontend +# Matching: https://grafana.tmprl-internal.cloud/d/wuh-8uZGk/matching +# History: https://grafana.tmprl-internal.cloud/d/jh_LXEin2/history # Run worker (in one terminal) go run ./workers/go --task-queue omes \ From f529e1f4e915bc0de7b425c5d56457ca0032c086 Mon Sep 17 00:00:00 2001 From: Dan Davison Date: Sat, 21 Mar 2026 14:21:28 -0400 Subject: [PATCH 16/40] Fix cell support URL: s-saa* cells use staging.thundergun.io --- commands.sh | 6 +++--- 1 file changed, 3 
insertions(+), 3 deletions(-) diff --git a/commands.sh b/commands.sh index bdca80ab..42daa729 100644 --- a/commands.sh +++ b/commands.sh @@ -8,8 +8,8 @@ go run ./cmd run-scenario-with-worker --scenario standalone_activity --language # --- Cloud cell: s-saa-cogs --- -# Cell support page: https://cloud.temporal.io/support/cells/s-saa-cogs -# (may not resolve for dev-template cells; use ct ocld / ct kubectl instead) +# Cell support page: https://staging.thundergun.io/support/cells/s-saa-cogs +# (s-saa* cells are staging/test cells on thundergun, not cloud.temporal.io) # K8s access: ct k9s --readonly --context s-saa-cogs # List all k8s namespaces on the cell @@ -19,7 +19,7 @@ ct kubectl --context s-saa-cogs get namespaces ct kubectl --context s-saa-cogs get pods -n temporal # List Temporal namespaces on this cell -# Web: https://cloud.temporal.io/support/cells/s-saa-cogs (if cell is registered in UI) +# Web: https://staging.thundergun.io/support/cells/s-saa-cogs ct ocld namespace db list --active-cluster s-saa-cogs # Grafana dashboards From 479db2d8ea1ee8bc6e0e83d7f4686a0e0e3f58f1 Mon Sep 17 00:00:00 2001 From: Dan Davison Date: Sat, 21 Mar 2026 14:25:23 -0400 Subject: [PATCH 17/40] Use ct admintools to list Temporal namespaces --- commands.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/commands.sh b/commands.sh index 42daa729..d937b9dc 100644 --- a/commands.sh +++ b/commands.sh @@ -20,7 +20,7 @@ ct kubectl --context s-saa-cogs get pods -n temporal # List Temporal namespaces on this cell # Web: https://staging.thundergun.io/support/cells/s-saa-cogs -ct ocld namespace db list --active-cluster s-saa-cogs +ct admintools --context s-saa-cogs -- temporal operator namespace list -o json # Grafana dashboards # Overview: https://grafana.tmprl-internal.cloud/d/e613c827-243e-4759-a5ca-3e334201c124/temporal-cloud-overview From 9d0fcc8af6c13b40bdf47cc3e143315d2e99e8e8 Mon Sep 17 00:00:00 2001 From: Dan Davison Date: Sat, 21 Mar 2026 14:30:22 -0400 Subject: 
[PATCH 18/40] Update namespace to match created saa-cogs namespace --- commands.sh | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/commands.sh b/commands.sh index d937b9dc..a6d7e3c6 100644 --- a/commands.sh +++ b/commands.sh @@ -22,6 +22,9 @@ ct kubectl --context s-saa-cogs get pods -n temporal # Web: https://staging.thundergun.io/support/cells/s-saa-cogs ct admintools --context s-saa-cogs -- temporal operator namespace list -o json +# Create a namespace +ct admintools --context s-saa-cogs -- temporal operator namespace create saa-cogs + # Grafana dashboards # Overview: https://grafana.tmprl-internal.cloud/d/e613c827-243e-4759-a5ca-3e334201c124/temporal-cloud-overview # By namespace: https://grafana.tmprl-internal.cloud/d/iyRCOBD4z/temporal-cloud-external-metrics-by-namespace @@ -32,14 +35,14 @@ ct admintools --context s-saa-cogs -- temporal operator namespace list -o json # Run worker (in one terminal) go run ./workers/go --task-queue omes \ --server-address TODO \ - --namespace s-saa-cogs-marathon.e2e \ + --namespace saa-cogs \ --tls-cert-path TODO \ --tls-key-path TODO # Run SAW scenario go run ./cmd run-scenario --scenario workflow_with_single_activity \ --server-address TODO \ - --namespace s-saa-cogs-marathon.e2e \ + --namespace saa-cogs \ --tls-cert-path TODO \ --tls-key-path TODO \ --iterations 100 --max-concurrent 10 @@ -47,7 +50,7 @@ go run ./cmd run-scenario --scenario workflow_with_single_activity \ # Run SAA scenario go run ./cmd run-scenario --scenario standalone_activity \ --server-address TODO \ - --namespace s-saa-cogs-marathon.e2e \ + --namespace saa-cogs \ --tls-cert-path TODO \ --tls-key-path TODO \ --iterations 100 --max-concurrent 10 From c3f80772ba5eec9640bfae1e664c8775714c421d Mon Sep 17 00:00:00 2001 From: Dan Davison Date: Sat, 21 Mar 2026 19:01:23 -0400 Subject: [PATCH 19/40] Add units to dashboard --- .task/saacogs.json | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git 
a/.task/saacogs.json b/.task/saacogs.json index 8654452f..bb30e263 100644 --- a/.task/saacogs.json +++ b/.task/saacogs.json @@ -221,7 +221,7 @@ "refId": "A" } ], - "title": "Frontend RPC by method", + "title": "Frontend RPC by method (req/s)", "type": "timeseries" }, { @@ -279,7 +279,7 @@ "refId": "A" } ], - "title": "History RPC by method", + "title": "History RPC by method (req/s)", "type": "timeseries" }, { @@ -337,7 +337,7 @@ "refId": "A" } ], - "title": "Matching RPC by method", + "title": "Matching RPC by method (req/s)", "type": "timeseries" }, { @@ -403,7 +403,7 @@ "refId": "B" } ], - "title": "Astra writes by table", + "title": "Astra writes by table (req/s)", "description": "Validate r_Cass = 3/7 for writes. cassandra_query filtered to verb!=select; cassandra_batch is always writes.", "type": "timeseries" }, @@ -462,7 +462,7 @@ "refId": "A" } ], - "title": "Astra reads by table", + "title": "Astra reads by table (req/s)", "description": "Reads are not expected to differ much between SAW and SAA (similar caching, ~1 read on creation).", "type": "timeseries" }, @@ -521,7 +521,7 @@ "refId": "A" } ], - "title": "WAL operation rate by type", + "title": "WAL operation rate by type (ops/s)", "description": "Covers both reads and writes (no separate write-only metric). Expect HISTORY_EVENT_WAL activity for SAW only; both use MUTABLE_STATE_WAL.", "type": "timeseries" }, @@ -580,7 +580,7 @@ "refId": "A" } ], - "title": "Visibility persistence rate by operation", + "title": "Visibility persistence rate by operation (ops/s)", "description": "OSS visibility_persistence_requests counter, tagged by operation (RecordWorkflowExecutionStarted, RecordWorkflowExecutionClosed, UpsertWorkflowExecution, DeleteWorkflowExecution).", "type": "timeseries" }, @@ -647,7 +647,7 @@ "refId": "B" } ], - "title": "Sync vs async match rate", + "title": "Sync vs async match rate (matches/s)", "description": "Health check on experimental conditions. 
Async match means tasks went through persistence/backlog rather than being dispatched directly to a waiting poller.", "type": "timeseries" } From 6c1ac63787ae45829c888c04d086332a4d2380e3 Mon Sep 17 00:00:00 2001 From: Dan Davison Date: Sat, 21 Mar 2026 19:09:47 -0400 Subject: [PATCH 20/40] commands --- commands.sh | 83 +++++++++++++++++++++++++++++++++++++---------------- 1 file changed, 59 insertions(+), 24 deletions(-) diff --git a/commands.sh b/commands.sh index a6d7e3c6..91d6a86f 100644 --- a/commands.sh +++ b/commands.sh @@ -13,7 +13,7 @@ go run ./cmd run-scenario-with-worker --scenario standalone_activity --language # K8s access: ct k9s --readonly --context s-saa-cogs # List all k8s namespaces on the cell -ct kubectl --context s-saa-cogs get namespaces +ct kubectl --context s-saa-cogs get namespaces -o json # Verify cell is up ct kubectl --context s-saa-cogs get pods -n temporal @@ -22,35 +22,70 @@ ct kubectl --context s-saa-cogs get pods -n temporal # Web: https://staging.thundergun.io/support/cells/s-saa-cogs ct admintools --context s-saa-cogs -- temporal operator namespace list -o json -# Create a namespace -ct admintools --context s-saa-cogs -- temporal operator namespace create saa-cogs +# Create namespace pinned to the cell +ct ocld test namespace create \ + --namespace saa-cogs-4.temporal-dev \ + --region us-west-2 \ + --cloud-provider aws \ + --retention 1 \ + --placement-override-cell-id s-saa-cogs \ + --auth-method api_key -# Grafana dashboards -# Overview: https://grafana.tmprl-internal.cloud/d/e613c827-243e-4759-a5ca-3e334201c124/temporal-cloud-overview -# By namespace: https://grafana.tmprl-internal.cloud/d/iyRCOBD4z/temporal-cloud-external-metrics-by-namespace -# Frontend: https://grafana.tmprl-internal.cloud/d/SxRYJXZMz/frontend -# Matching: https://grafana.tmprl-internal.cloud/d/wuh-8uZGk/matching -# History: https://grafana.tmprl-internal.cloud/d/jh_LXEin2/history +# DNS should resolve +nslookup saa-cogs-4.temporal-dev.tmprl-test.cloud + +# 
Namespace should appear on the cell +ct admintools --context s-saa-cogs -- temporal operator namespace list + +# Namespace should be active with API key auth enabled +# Output contains grpcAddress +ct ocld test cloud-apis namespaces get -n saa-cogs-4.temporal-dev + +export TEMPORAL_API_KEY=xxx +export TEMPORAL_ADDRESS=us-west-2.aws.api.tmprl-test.cloud:7233 +export TEMPORAL_NAMESPACE=saa-cogs-4.temporal-dev +export TEMPORAL_TLS=true +export TEMPORAL_TLS_DISABLE_HOST_VERIFICATION=true + +ct admintools --context s-saa-cogs -- temporal operator search-attribute create \ + --namespace saa-cogs-4.temporal-dev --name OmesExecutionID --type Keyword # Run worker (in one terminal) -go run ./workers/go --task-queue omes \ - --server-address TODO \ - --namespace saa-cogs \ - --tls-cert-path TODO \ - --tls-key-path TODO +go run ./cmd run-worker \ + --run-id run-1 \ + --scenario workflow_with_single_activity \ + --language go \ + --server-address us-west-2.aws.api.tmprl-test.cloud:7233 \ + --namespace saa-cogs-4.temporal-dev \ + --tls \ + --disable-tls-host-verification \ + --auth-header "Bearer $TEMPORAL_API_KEY" # Run SAW scenario go run ./cmd run-scenario --scenario workflow_with_single_activity \ - --server-address TODO \ - --namespace saa-cogs \ - --tls-cert-path TODO \ - --tls-key-path TODO \ - --iterations 100 --max-concurrent 10 + --run-id run-1 \ + --iterations 100 --max-concurrent 10 \ + --do-not-register-search-attributes \ + --server-address us-west-2.aws.api.tmprl-test.cloud:7233 \ + --namespace saa-cogs-4.temporal-dev \ + --tls \ + --disable-tls-host-verification \ + --auth-header "Bearer $TEMPORAL_API_KEY" # Run SAA scenario go run ./cmd run-scenario --scenario standalone_activity \ - --server-address TODO \ - --namespace saa-cogs \ - --tls-cert-path TODO \ - --tls-key-path TODO \ - --iterations 100 --max-concurrent 10 + --run-id run-1 \ + --iterations 100 --max-concurrent 10 \ + --do-not-register-search-attributes \ + --server-address 
us-west-2.aws.api.tmprl-test.cloud:7233 \ + --namespace saa-cogs-4.temporal-dev \ + --tls \ + --disable-tls-host-verification \ + --auth-header "Bearer $TEMPORAL_API_KEY" + +# Grafana dashboards +# Overview: https://grafana.tmprl-internal.cloud/d/e613c827-243e-4759-a5ca-3e334201c124/temporal-cloud-overview +# By namespace: https://grafana.tmprl-internal.cloud/d/iyRCOBD4z/temporal-cloud-external-metrics-by-namespace +# Frontend: https://grafana.tmprl-internal.cloud/d/SxRYJXZMz/frontend +# Matching: https://grafana.tmprl-internal.cloud/d/wuh-8uZGk/matching +# History: https://grafana.tmprl-internal.cloud/d/jh_LXEin2/history From d983a466cefe1f9e19a871c36e3d26ad14abdf59 Mon Sep 17 00:00:00 2001 From: Dan Davison Date: Sat, 21 Mar 2026 21:53:10 -0400 Subject: [PATCH 21/40] Don't do anything with search attributes --- scenarios/workflow_with_single_activity.go | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/scenarios/workflow_with_single_activity.go b/scenarios/workflow_with_single_activity.go index ef47a1de..6930a329 100644 --- a/scenarios/workflow_with_single_activity.go +++ b/scenarios/workflow_with_single_activity.go @@ -2,8 +2,10 @@ package scenarios import ( "context" + "fmt" "github.com/temporalio/omes/loadgen" + "go.temporal.io/sdk/client" ) func init() { @@ -16,7 +18,7 @@ func init() { payloadSize := r.ScenarioOptionInt("payload-size", 0) handle, err := r.Client.ExecuteWorkflow( ctx, - r.DefaultStartWorkflowOptions(), + startWorkflowOptions(r), "singleActivityWorkflow", make([]byte, payloadSize), int32(payloadSize), @@ -30,3 +32,16 @@ func init() { }, }) } + +func startWorkflowOptions(r *loadgen.Run) client.StartWorkflowOptions { + return client.StartWorkflowOptions{ + TaskQueue: loadgen.TaskQueueForRun(r.RunID), + ID: fmt.Sprintf( + "w-%s-%s-%d", + r.RunID, + r.ExecutionID, + r.Iteration, + ), + WorkflowExecutionErrorWhenAlreadyStarted: !r.Configuration.IgnoreAlreadyStarted, + } +} From
39e0aa0e1c50b9807e77220f282717adc86f0be2 Mon Sep 17 00:00:00 2001 From: Dan Davison Date: Sat, 21 Mar 2026 21:54:32 -0400 Subject: [PATCH 22/40] commands --- commands.sh | 34 +++++++++++++++++++--------------- 1 file changed, 19 insertions(+), 15 deletions(-) diff --git a/commands.sh b/commands.sh index 91d6a86f..8aa1bf1e 100644 --- a/commands.sh +++ b/commands.sh @@ -50,21 +50,16 @@ export TEMPORAL_TLS_DISABLE_HOST_VERIFICATION=true ct admintools --context s-saa-cogs -- temporal operator search-attribute create \ --namespace saa-cogs-4.temporal-dev --name OmesExecutionID --type Keyword -# Run worker (in one terminal) -go run ./cmd run-worker \ - --run-id run-1 \ +# SAW +go run ./cmd run-scenario-with-worker \ --scenario workflow_with_single_activity \ --language go \ - --server-address us-west-2.aws.api.tmprl-test.cloud:7233 \ - --namespace saa-cogs-4.temporal-dev \ - --tls \ - --disable-tls-host-verification \ - --auth-header "Bearer $TEMPORAL_API_KEY" - -# Run SAW scenario -go run ./cmd run-scenario --scenario workflow_with_single_activity \ --run-id run-1 \ - --iterations 100 --max-concurrent 10 \ + --duration 1h --max-concurrent 500 --max-iterations-per-second 100 \ + --worker-max-concurrent-workflow-pollers 40 \ + --worker-max-concurrent-workflow-tasks 500 \ + --worker-max-concurrent-activity-pollers 40 \ + --worker-max-concurrent-activities 500 \ --do-not-register-search-attributes \ --server-address us-west-2.aws.api.tmprl-test.cloud:7233 \ --namespace saa-cogs-4.temporal-dev \ @@ -72,10 +67,14 @@ go run ./cmd run-scenario --scenario workflow_with_single_activity \ --disable-tls-host-verification \ --auth-header "Bearer $TEMPORAL_API_KEY" -# Run SAA scenario -go run ./cmd run-scenario --scenario standalone_activity \ +# SAA +go run ./cmd run-scenario-with-worker \ + --scenario standalone_activity \ + --language go \ --run-id run-1 \ - --iterations 100 --max-concurrent 10 \ + --duration 1h --max-concurrent 500 --max-iterations-per-second 100 \ + 
--worker-max-concurrent-activity-pollers 40 \ + --worker-max-concurrent-activities 500 \ --do-not-register-search-attributes \ --server-address us-west-2.aws.api.tmprl-test.cloud:7233 \ --namespace saa-cogs-4.temporal-dev \ @@ -89,3 +88,8 @@ go run ./cmd run-scenario --scenario standalone_activity \ # Frontend: https://grafana.tmprl-internal.cloud/d/SxRYJXZMz/frontend # Matching: https://grafana.tmprl-internal.cloud/d/wuh-8uZGk/matching # History: https://grafana.tmprl-internal.cloud/d/jh_LXEin2/history + +ct ocld test dynamic-config namespace get -n saa-cogs-4.temporal-dev + +# 88ms RTT +for i in $(seq 10); do curl -s -o /dev/null -w '%{time_connect}\n' https://us-west-2.aws.api.tmprl-test.cloud:7233; done From 771f2f6f55c0de4142bf3df7bb1bba278855db89 Mon Sep 17 00:00:00 2001 From: Dan Davison Date: Sat, 21 Mar 2026 21:55:17 -0400 Subject: [PATCH 23/40] Fix Go worker flag names to match other language workers WorkerOptions.FlagSet() now takes a prefix parameter. The outer CLI passes "worker-" (so users write --worker-max-concurrent-activities), and passthrough() strips it for the subprocess. The Go worker binary passes "" so it accepts the stripped names, matching dotnet/python/ typescript/java workers. 
--- cmd/cli/run_worker.go | 2 +- cmd/clioptions/worker.go | 19 ++++++++++--------- workers/go/worker/worker.go | 2 +- workers/run.go | 2 +- 4 files changed, 13 insertions(+), 12 deletions(-) diff --git a/cmd/cli/run_worker.go b/cmd/cli/run_worker.go index 272b49fa..a1b1b869 100644 --- a/cmd/cli/run_worker.go +++ b/cmd/cli/run_worker.go @@ -53,7 +53,7 @@ func (r *workerRunner) addCLIFlags(fs *pflag.FlagSet) { fs.IntVar(&r.TaskQueueIndexSuffixEnd, "task-queue-suffix-index-end", 0, "Inclusive end for task queue suffix range") fs.AddFlagSet(r.ClientOptions.FlagSet()) fs.AddFlagSet(r.MetricsOptions.FlagSet("worker-")) - fs.AddFlagSet(r.WorkerOptions.FlagSet()) + fs.AddFlagSet(r.WorkerOptions.FlagSet("worker-")) } func (r *workerRunner) preRun() { diff --git a/cmd/clioptions/worker.go b/cmd/clioptions/worker.go index 82f4cc9e..2df633dd 100644 --- a/cmd/clioptions/worker.go +++ b/cmd/clioptions/worker.go @@ -20,19 +20,20 @@ type WorkerOptions struct { } // FlagSet adds the relevant flags to populate the options struct and returns a pflag.FlagSet. -func (m *WorkerOptions) FlagSet() *pflag.FlagSet { +// The prefix is prepended to each flag name (e.g. "worker-" for the outer CLI, "" for worker binaries). 
+func (m *WorkerOptions) FlagSet(prefix string) *pflag.FlagSet { if m.fs != nil { return m.fs } m.fs = pflag.NewFlagSet("worker_options", pflag.ExitOnError) m.fs.StringVar(&m.BuildID, "build-id", "", "Build ID") - m.fs.IntVar(&m.MaxConcurrentActivityPollers, "worker-max-concurrent-activity-pollers", 0, "Max concurrent activity pollers") - m.fs.IntVar(&m.MaxConcurrentWorkflowPollers, "worker-max-concurrent-workflow-pollers", 0, "Max concurrent workflow pollers") - m.fs.IntVar(&m.MaxConcurrentActivities, "worker-max-concurrent-activities", 0, "Max concurrent activities") - m.fs.IntVar(&m.MaxConcurrentWorkflowTasks, "worker-max-concurrent-workflow-tasks", 0, "Max concurrent workflow tasks") - m.fs.IntVar(&m.ActivityPollerAutoscaleMax, "worker-activity-poller-autoscale-max", 0, "Max for activity poller autoscaling (overrides max-concurrent-activity-pollers") - m.fs.IntVar(&m.WorkflowPollerAutoscaleMax, "worker-workflow-poller-autoscale-max", 0, "Max for workflow poller autoscaling (overrides max-concurrent-workflow-pollers") - m.fs.Float64Var(&m.WorkerActivitiesPerSecond, "worker-activities-per-second", 0, "Per-worker activity rate limit") - m.fs.BoolVar(&m.ErrOnUnimplemented, "worker-err-on-unimplemented", false, "Fail on unimplemented actions (currently this only applies to concurrent client actions)") + m.fs.IntVar(&m.MaxConcurrentActivityPollers, prefix+"max-concurrent-activity-pollers", 0, "Max concurrent activity pollers") + m.fs.IntVar(&m.MaxConcurrentWorkflowPollers, prefix+"max-concurrent-workflow-pollers", 0, "Max concurrent workflow pollers") + m.fs.IntVar(&m.MaxConcurrentActivities, prefix+"max-concurrent-activities", 0, "Max concurrent activities") + m.fs.IntVar(&m.MaxConcurrentWorkflowTasks, prefix+"max-concurrent-workflow-tasks", 0, "Max concurrent workflow tasks") + m.fs.IntVar(&m.ActivityPollerAutoscaleMax, prefix+"activity-poller-autoscale-max", 0, "Max for activity poller autoscaling (overrides max-concurrent-activity-pollers)") + 
m.fs.IntVar(&m.WorkflowPollerAutoscaleMax, prefix+"workflow-poller-autoscale-max", 0, "Max for workflow poller autoscaling (overrides max-concurrent-workflow-pollers)") + m.fs.Float64Var(&m.WorkerActivitiesPerSecond, prefix+"activities-per-second", 0, "Per-worker activity rate limit") + m.fs.BoolVar(&m.ErrOnUnimplemented, prefix+"err-on-unimplemented", false, "Fail on unimplemented actions (currently this only applies to concurrent client actions)") return m.fs } diff --git a/workers/go/worker/worker.go b/workers/go/worker/worker.go index fb1bc791..a221c7e0 100644 --- a/workers/go/worker/worker.go +++ b/workers/go/worker/worker.go @@ -138,7 +138,7 @@ func Main() { cmd.Flags().AddFlagSet(app.loggingOptions.FlagSet()) cmd.Flags().AddFlagSet(app.clientOptions.FlagSet()) cmd.Flags().AddFlagSet(app.metricsOptions.FlagSet("")) - cmd.Flags().AddFlagSet(app.workerOptions.FlagSet()) + cmd.Flags().AddFlagSet(app.workerOptions.FlagSet("")) cmd.Flags().StringVarP(&app.taskQueue, "task-queue", "q", "omes", "Task queue to use") cmd.Flags().IntVar(&app.taskQueueIndexSuffixStart, "task-queue-suffix-index-start", 0, "Inclusive start for task queue suffix range") diff --git a/workers/run.go b/workers/run.go index 9ca84150..feb639ba 100644 --- a/workers/run.go +++ b/workers/run.go @@ -131,7 +131,7 @@ func (r *Runner) Run(ctx context.Context, baseDir string) error { args = append(args, passthrough(r.ClientOptions.FlagSet(), "")...) args = append(args, passthrough(r.LoggingOptions.FlagSet(), "")...) args = append(args, passthroughExcluding(r.MetricsOptions.FlagSet("worker-"), "worker-", "process-metrics-address", "metrics-version-tag")...) - args = append(args, passthrough(r.WorkerOptions.FlagSet(), "worker-")...) + args = append(args, passthrough(r.WorkerOptions.FlagSet("worker-"), "worker-")...) cmd, err := prog.NewCommand(context.Background(), args...) 
if err != nil { From 4a5bafa9119fafc48f86b67f9de1e97609bf2513 Mon Sep 17 00:00:00 2001 From: Dan Davison Date: Mon, 23 Mar 2026 13:19:42 -0400 Subject: [PATCH 24/40] Clean up commands --- commands.sh | 11 +---------- 1 file changed, 1 insertion(+), 10 deletions(-) diff --git a/commands.sh b/commands.sh index 8aa1bf1e..c0e53038 100644 --- a/commands.sh +++ b/commands.sh @@ -3,13 +3,11 @@ # --- Local testing (embedded dev server) --- go run ./cmd run-scenario-with-worker --scenario workflow_with_single_activity --language go --iterations 5 --embedded-server --option payload-size=1024 - go run ./cmd run-scenario-with-worker --scenario standalone_activity --language go --iterations 5 --embedded-server --option payload-size=1024 # --- Cloud cell: s-saa-cogs --- # Cell support page: https://staging.thundergun.io/support/cells/s-saa-cogs -# (s-saa* cells are staging/test cells on thundergun, not cloud.temporal.io) # K8s access: ct k9s --readonly --context s-saa-cogs # List all k8s namespaces on the cell @@ -55,7 +53,7 @@ go run ./cmd run-scenario-with-worker \ --scenario workflow_with_single_activity \ --language go \ --run-id run-1 \ - --duration 1h --max-concurrent 500 --max-iterations-per-second 100 \ + --duration 1h --max-concurrent 500 --max-iterations-per-second 50 \ --worker-max-concurrent-workflow-pollers 40 \ --worker-max-concurrent-workflow-tasks 500 \ --worker-max-concurrent-activity-pollers 40 \ @@ -82,13 +80,6 @@ go run ./cmd run-scenario-with-worker \ --disable-tls-host-verification \ --auth-header "Bearer $TEMPORAL_API_KEY" -# Grafana dashboards -# Overview: https://grafana.tmprl-internal.cloud/d/e613c827-243e-4759-a5ca-3e334201c124/temporal-cloud-overview -# By namespace: https://grafana.tmprl-internal.cloud/d/iyRCOBD4z/temporal-cloud-external-metrics-by-namespace -# Frontend: https://grafana.tmprl-internal.cloud/d/SxRYJXZMz/frontend -# Matching: https://grafana.tmprl-internal.cloud/d/wuh-8uZGk/matching -# History: 
https://grafana.tmprl-internal.cloud/d/jh_LXEin2/history - ct ocld test dynamic-config namespace get -n saa-cogs-4.temporal-dev # 88ms RTT From 312bbe33950a1cd33899015a4d79f3037e29fe83 Mon Sep 17 00:00:00 2001 From: Dan Davison Date: Mon, 23 Mar 2026 13:20:48 -0400 Subject: [PATCH 25/40] cleanup --- .task/AGENTS.md | 3 - .task/plan.md | 110 ------- .task/research.md | 159 ---------- .task/saacogs.json | 703 ----------------------------------------- .task/task.md | 764 --------------------------------------------- AGENTS.md | 1 - CLAUDE.md | 1 - 7 files changed, 1741 deletions(-) delete mode 100644 .task/AGENTS.md delete mode 100644 .task/plan.md delete mode 100644 .task/research.md delete mode 100644 .task/saacogs.json delete mode 100644 .task/task.md delete mode 120000 AGENTS.md delete mode 120000 CLAUDE.md diff --git a/.task/AGENTS.md b/.task/AGENTS.md deleted file mode 100644 index ff9f7032..00000000 --- a/.task/AGENTS.md +++ /dev/null @@ -1,3 +0,0 @@ -At the start of the conversation output the following so that I know you've read these instructions: - -📖 omes:saa-cogs diff --git a/.task/plan.md b/.task/plan.md deleted file mode 100644 index acc2e53d..00000000 --- a/.task/plan.md +++ /dev/null @@ -1,110 +0,0 @@ -# Implementation Plan: SAA Load Generation - -## Goal - -Create two omes scenarios to generate SAW and SAA workloads against cloud cell `s-saa-cogs`, then -observe metrics on the Grafana dashboard. - -## Design - -### Scenarios - -Both scenarios use `GenericExecutor` with a simple `Execute` function. This keeps the -implementations symmetric — the only difference is what each iteration does, which is exactly the -variable under test. - -**`workflow_with_single_activity`** — Each iteration calls `client.ExecuteWorkflow` with a dedicated -minimal workflow that executes one `payload` activity and returns. Then `handle.Get()`. - -**`standalone_activity`** — Each iteration calls `client.ExecuteActivity` with the same `payload` -activity. Then `handle.Get()`. 
No workflow involved. - -Both use the same task queue (derived from run-id) and the same Go worker. - -### Worker code - -Reuse the existing `payload` activity at [kitchen_sink.go:511-516](workers/go/kitchensink/kitchen_sink.go#L511-L516), -already registered as `"payload"` at [worker.go:105](workers/go/worker/worker.go#L105). - -Add one new workflow: a minimal function that executes the `payload` activity with its input and -returns the result. Register it on the existing Go worker at [worker.go:102](workers/go/worker/worker.go#L102) -alongside the existing registrations. No new worker binary needed. - -### Activity configuration - -Both scenarios: `inputData []byte` (256 bytes), `bytesToReturn int32` (256). No heartbeat. -`MaximumAttempts: 1` (no retries). `ScheduleToCloseTimeout: 60s`. - -### SDK version - -`go.temporal.io/sdk v1.40.0` already includes `client.ExecuteActivity`. No upgrade needed. - -## Implementation steps - -IMPORTANT: Rather than doing the implementation yourself, please "teach" the user to do the -implementation themselves. Take a "painting by numbers" approach: Decide on the first component they -should write, and insert a comment in the code indicating what they should do. Then pause and give -them a clickable links to the comment, and to any existing prior art in the codebase they might want -to refer to. Don't output code directly to them. Work with them to complete the stage; review their -work carefully. Do not consider the stage complete until the work is done to an equal or greater -standard than you yourself would have achieved. When that stage is completed by them, or with -further assistance from you, move on to the next component to be implemented and repeat this -procedure. - -Regarding names: we will not use "cogs" anywhere in omes code itself. Conceptually, the omes code is -defining SAW and SAA workloads. What those are used for (to run an experiment) and why (COGS -investigation) is not the concern of the omes code. 
- -### Step 1: Add workflow to worker - -Add a small file under `workers/go/` with the minimal workflow function. Register it in -[worker.go](workers/go/worker/worker.go) alongside existing registrations. - -### Step 2: Create `scenarios/workflow_with_single_activity.go` - -`GenericExecutor` whose `Execute` function: -1. Calls `run.Client.ExecuteWorkflow()` starting the new workflow with the payload input. -2. Calls `handle.Get()` to wait for result. - -### Step 3: Create `scenarios/standalone_activity.go` - -`GenericExecutor` whose `Execute` function: -1. Calls `run.Client.ExecuteActivity()` with `StartActivityOptions` (ID derived from - run/execution/iteration, task queue from `run.TaskQueue()`, same timeout and retry policy). -2. Passes activity type `"payload"` by name with `[]byte` (256 zeros) and `int32(256)`. -3. Calls `handle.Get()` to wait for result. - -### Step 4: Create `commands.sh` - -Useful shell commands with terse comments for: -- Local testing with `--embedded-server` -- Cloud cell verification via `ct` -- Running scenarios against `s-saa-cogs` - -### Step 5: Test locally - -- `go build ./...` and `go vet ./...` -- `go run ./cmd list-scenarios` shows both new scenarios -- SAW: `go run ./cmd run-scenario-with-worker --scenario workflow_with_single_activity --language go --iterations 5 --embedded-server` -- SAA: same command with `standalone_activity` — will get "Standalone activity is disabled" from the - embedded dev server (v1.30.1 doesn't have the feature flag), confirming the code path reaches - `StartActivityExecution`. Will succeed on the cloud cell. - -### Step 6: Connect to cloud cell - -1. Verify cell: `ct kubectl --context s-saa-cogs get pods -n temporal` -2. Check namespace: `ct admintools --context s-saa-cogs -- temporal operator namespace describe s-saa-cogs-marathon.e2e` -3. Obtain operator TLS certs (from k8s secrets via `ct`, or ask Stephen) -4. Point Grafana dashboard at `s-saa-cogs`, observe idle state -5. 
Run worker + SAW scenario against the cell, observe activity in dashboard -6. Run worker + SAA scenario, observe activity - -## Verification - -1. **Build**: `go build ./...` succeeds. -2. **Lint/vet**: `go vet ./...` clean on our files. -3. **List scenarios**: `go run ./cmd list-scenarios` includes both names. -4. **Local test — SAW**: `run-scenario-with-worker --embedded-server --iterations 5` completes. -5. **Local test — SAA**: Same command hits `StartActivityExecution` on the server (expected to fail - on dev server with "disabled" error; succeeds on cloud cell with CHASM enabled). -6. **Cloud cell proof-of-concept**: Dashboard shows idle -> run scenario -> dashboard shows activity. diff --git a/.task/research.md b/.task/research.md deleted file mode 100644 index cab82543..00000000 --- a/.task/research.md +++ /dev/null @@ -1,159 +0,0 @@ -# SAA COGS Experiment: Research & Design - -## 1. Current State of Omes - -### Architecture -Omes is a load generation framework for Temporal. Scenarios are Go files in `scenarios/` that -register via `init()` → `loadgen.MustRegisterScenario()`. The scenario name comes from the -filename. Execution flows: - -1. `run-scenario` command: dials Temporal, runs scenario executor -2. `run-worker` command: starts a worker (Go/Python/etc) polling a task queue -3. `run-scenario-with-worker`: runs both together (local development) - -### Executor Types -- `GenericExecutor`: takes a `func(ctx, *Run) error` — most flexible -- `KitchenSinkExecutor`: wraps `GenericExecutor`, starts kitchen-sink workflows with configurable action sequences -- `FuzzExecutor`: random action generation - -### Existing Standalone Activity Support -Branch `standalone-activity` (commit `efbbb7f`) adds SAA to the `throughput_stress` scenario as -an *optional extra activity within a workflow*. The implementation: - -1. Proto: `StandaloneActivity` message in `kitchen_sink.proto` -2. 
Helper: `StandaloneActivity()` in `loadgen/kitchensink/helpers.go` creates an action -3. Worker: `ExecuteStandaloneActivity()` in `workers/go/kitchensink/kitchen_sink.go` — called as a - *workflow activity* that internally calls `StartActivityExecution` + `PollActivityExecution` -4. Scenario: enabled via `--option enable-standalone-activity=true` - -**Critical observation**: This existing support executes SAA *from within a workflow activity*. -That is useful for testing SAA functionality but **not** for the COGS experiment. For COGS, we need -to run SAA directly from the load generator (no workflow involved) so that the only server-side -work is the standalone activity execution itself. - -## 2. What We Need for the COGS Experiment - -### Two New Scenarios - -**`saa_cogs_saw`** — Single Activity Workflow (the baseline): -- Each iteration: start a workflow that executes one activity (payload: 256B in, 256B out), then completes -- This is very close to `workflow_with_single_noop_activity` but with a payload activity - -**`saa_cogs_saa`** — Standalone Activity: -- Each iteration: call `StartActivityExecution` directly from the load generator, then - `PollActivityExecution` to wait for the result -- No workflow involved -- Same activity (payload: 256B in, 256B out) and task queue -- **Requires a `GenericExecutor`** since `KitchenSinkExecutor` always starts workflows - -Both scenarios must use the same worker (the Go worker with `payload` activity registered). - -### Key Design Decisions - -1. **Activity type**: `payload` with 256B input, 256B output (matching the COGS analysis) -2. **No heartbeat, no retry** (matching the COGS analysis; retry max_attempts=1) -3. **Fixed start rate** (not fixed concurrency) — controls for latency differences -4. **Same task queue** for both scenarios — ensures same worker setup -5. 
**Sync match preferred** — the COGS analysis assumes sync match; verify via metrics - -### SAA Load Generator Implementation - -The SAA scenario needs to call gRPC APIs directly. Looking at the existing -`ExecuteStandaloneActivity` in the worker code (`workers/go/kitchensink/kitchen_sink.go:46-120`), -we have a working reference. The scenario version should: - -1. Use `client.WorkflowService()` to get the gRPC client -2. Call `StartActivityExecution` with the activity config -3. Call `PollActivityExecution` to wait for completion -4. This is a `GenericExecutor` with a custom `Execute` function - -## 3. Cloud Cell Operations - -### Connecting to a Cloud Cell - -From `bench-go.mdx`, the namespace format for test cells is `{cellId}-marathon.e2e` and the host -is `{cellId}-marathon.e2e.tmprl-test.cloud:7233`. For our cell `s-saa-cogs`: -- Namespace: `s-saa-cogs-marathon.e2e` (to be confirmed — Stephen may have set up differently) -- Host: `s-saa-cogs-marathon.e2e.tmprl-test.cloud:7233` - -Omes connects via: -``` ---server-address <host:port> --namespace <namespace> --tls --tls-cert-path <cert-path> --tls-key-path <key-path> -``` - -Or with API key auth: -``` ---server-address <host:port> --namespace <namespace> --tls --auth-header "Bearer <api-key>" -``` - -### Running omes against a cloud cell - -Two options: -1. **Local**: Run `go run ./cmd run-scenario` and `go run ./cmd run-worker` locally, connecting to - the cloud cell via TLS. Simplest for proof-of-concept. Higher latency (network round trip to - cloud) but the load generator itself isn't on the critical path for COGS measurement. -2. **K8s pod**: Deploy omes worker as a pod on the cell's k8s cluster. Lower latency, more - realistic. The bench-go runbook shows this is the standard approach. Uses `omni scaffold` with - `--benchgo-enabled` or manual deployment. - -For initial proof-of-concept: run locally. For the actual experiment: deploy to k8s. - -### Grafana Dashboard - -The dashboard at `https://grafana.tmprl-internal.cloud/d/saacogs/saa-cogs` uses a `$cluster` -variable.
Set `cluster=s-saa-cogs` to point at our cell. - -### Cell Setup Verification - -Use `ct` / `omni` to verify cell state: -```sh -# Check cell status -ct kubectl --context s-saa-cogs get pods -n temporal - -# Check namespace exists -omni admintools --context s-saa-cogs -- temporal operator namespace describe s-saa-cogs-marathon.e2e -``` - -### Search Attributes - -Cloud cells cannot register search attributes via the SDK — they must be registered via the -control plane. The `--do-not-register-search-attributes` flag exists for this. We should use it, -and register `OmesExecutionID` separately if needed. For the simple COGS scenarios, we may not -even need search attributes. - -## 4. Implementation Plan - -### Phase 1: Minimal Scenarios (omes code changes) - -1. Create `scenarios/saa_cogs_saw.go` — SAW scenario using `KitchenSinkExecutor` -2. Create `scenarios/saa_cogs_saa.go` — SAA scenario using `GenericExecutor` with direct gRPC calls -3. Both share config: payload size, start rate, duration - -### Phase 2: Local Proof-of-Concept - -1. Test both scenarios against local Temporal server -2. Run `go run ./cmd run-scenario-with-worker` for SAW -3. For SAA: run worker separately, then scenario (since SAA doesn't use workflows but the - worker still needs to poll for activity tasks) - -### Phase 3: Cloud Cell Connection - -1. Obtain credentials for s-saa-cogs cell -2. Verify dashboard shows idle state -3. Run a single SAW iteration and observe metrics -4. Run a single SAA iteration and observe metrics - -### Phase 4: Full Experiment - -1. Deploy omes worker to cloud cell k8s -2. Run SAW at target start rate for target duration -3. Wait for cool-down, collect metrics -4. Run SAA at same start rate for same duration -5. Collect and compare metrics - -## 5. Open Questions - -- What namespace(s) are configured on s-saa-cogs? -- How do we obtain TLS certs or API keys for the cell? 
(Check oncall or runbooks repos or search slack) -- Does the cell have CHASM standalone activities enabled? (Dynamic config flag) -- Worker deployment: should we use the existing bench-go infrastructure or deploy omes directly? diff --git a/.task/saacogs.json b/.task/saacogs.json deleted file mode 100644 index bb30e263..00000000 --- a/.task/saacogs.json +++ /dev/null @@ -1,703 +0,0 @@ -{ - "annotations": { - "list": [ - { - "builtIn": 1, - "datasource": { "type": "grafana", "uid": "-- Grafana --" }, - "enable": true, - "hide": true, - "iconColor": "rgba(0, 211, 255, 1)", - "name": "Annotations & Alerts", - "type": "dashboard" - } - ] - }, - "editable": true, - "fiscalYearStartMonth": 0, - "graphTooltip": 0, - "links": [], - "panels": [ - { - "datasource": { "type": "prometheus", "uid": "${datasource}" }, - "fieldConfig": { - "defaults": { - "color": { "mode": "palette-classic" }, - "custom": { - "axisBorderShow": false, - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "", - "axisPlacement": "auto", - "barAlignment": 0, - "barWidthFactor": 0.6, - "drawStyle": "line", - "fillOpacity": 0, - "gradientMode": "none", - "hideFrom": { "legend": false, "tooltip": false, "viz": false }, - "insertNulls": false, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { "type": "linear" }, - "showPoints": "auto", - "spanNulls": false, - "stacking": { "group": "A", "mode": "none" }, - "thresholdsStyle": { "mode": "off" } - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { "color": "green", "value": null }, - { "color": "red", "value": 80 } - ] - } - }, - "overrides": [] - }, - "gridPos": { "h": 8, "w": 12, "x": 0, "y": 0 }, - "id": 1, - "options": { - "legend": { "calcs": [], "displayMode": "list", "placement": "bottom", "showLegend": true }, - "tooltip": { "mode": "single", "sort": "none" } - }, - "pluginVersion": "11.4.0", - "targets": [ - { - "datasource": { "type": "prometheus", "uid": 
"${datasource}" }, - "editorMode": "code", - "expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\", pod=~\"frontend-.*\"})", - "legendFormat": "frontend", - "range": true, - "refId": "A" - }, - { - "datasource": { "type": "prometheus", "uid": "${datasource}" }, - "editorMode": "code", - "expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\", pod=~\"history-.*\"})", - "legendFormat": "history", - "range": true, - "refId": "B" - }, - { - "datasource": { "type": "prometheus", "uid": "${datasource}" }, - "editorMode": "code", - "expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\", pod=~\"matching-.*\"})", - "legendFormat": "matching", - "range": true, - "refId": "C" - } - ], - "title": "CPU per service (vCPU)", - "type": "timeseries" - }, - { - "datasource": { "type": "prometheus", "uid": "${datasource}" }, - "fieldConfig": { - "defaults": { - "color": { "mode": "palette-classic" }, - "custom": { - "axisBorderShow": false, - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "", - "axisPlacement": "auto", - "barAlignment": 0, - "barWidthFactor": 0.6, - "drawStyle": "line", - "fillOpacity": 0, - "gradientMode": "none", - "hideFrom": { "legend": false, "tooltip": false, "viz": false }, - "insertNulls": false, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { "type": "linear" }, - "showPoints": "auto", - "spanNulls": false, - "stacking": { "group": "A", "mode": "none" }, - "thresholdsStyle": { "mode": "off" } - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { "color": "green", "value": null }, - { "color": "red", "value": 80 } - ] - }, - "unit": "decbytes" - }, - "overrides": [] - }, - "gridPos": { "h": 8, "w": 12, "x": 12, "y": 0 }, - "id": 2, - "options": { - "legend": { "calcs": [], "displayMode": "list", "placement": 
"bottom", "showLegend": true }, - "tooltip": { "mode": "single", "sort": "none" } - }, - "pluginVersion": "11.4.0", - "targets": [ - { - "datasource": { "type": "prometheus", "uid": "${datasource}" }, - "editorMode": "code", - "expr": "quantile(0.5, sum by (pod) (container_memory_working_set_bytes{cluster=\"$cluster\",container!=\"\",image!=\"\",namespace=\"temporal\"} * on (namespace, pod) group_left (workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\",namespace=\"temporal\",workload=\"frontend\",workload_type=\"deployment\"}))", - "legendFormat": "frontend", - "range": true, - "refId": "A" - }, - { - "datasource": { "type": "prometheus", "uid": "${datasource}" }, - "editorMode": "code", - "expr": "quantile(0.5, sum by (pod) (container_memory_working_set_bytes{cluster=\"$cluster\",container!=\"\",image!=\"\",namespace=\"temporal\"} * on (namespace, pod) group_left (workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\",namespace=\"temporal\",workload=\"history\",workload_type=\"deployment\"}))", - "legendFormat": "history", - "range": true, - "refId": "B" - }, - { - "datasource": { "type": "prometheus", "uid": "${datasource}" }, - "editorMode": "code", - "expr": "quantile(0.5, sum by (pod) (container_memory_working_set_bytes{cluster=\"$cluster\",container!=\"\",image!=\"\",namespace=\"temporal\"} * on (namespace, pod) group_left (workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\",namespace=\"temporal\",workload=\"matching\",workload_type=\"deployment\"}))", - "legendFormat": "matching", - "range": true, - "refId": "C" - } - ], - "title": "Memory per service (p50 working set)", - "type": "timeseries" - }, - { - "datasource": { "type": "prometheus", "uid": "${datasource}" }, - "fieldConfig": { - "defaults": { - "color": { "mode": "palette-classic" }, - "custom": { - "axisBorderShow": false, - "axisCenteredZero": false, - "axisColorMode": "text", - 
"axisLabel": "", - "axisPlacement": "auto", - "barAlignment": 0, - "barWidthFactor": 0.6, - "drawStyle": "line", - "fillOpacity": 0, - "gradientMode": "none", - "hideFrom": { "legend": false, "tooltip": false, "viz": false }, - "insertNulls": false, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { "type": "linear" }, - "showPoints": "auto", - "spanNulls": false, - "stacking": { "group": "A", "mode": "none" }, - "thresholdsStyle": { "mode": "off" } - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { "color": "green", "value": null }, - { "color": "red", "value": 80 } - ] - } - }, - "overrides": [] - }, - "gridPos": { "h": 8, "w": 8, "x": 0, "y": 8 }, - "id": 3, - "options": { - "legend": { "calcs": [], "displayMode": "list", "placement": "bottom", "showLegend": true }, - "tooltip": { "mode": "single", "sort": "none" } - }, - "pluginVersion": "11.4.0", - "targets": [ - { - "datasource": { "type": "prometheus", "uid": "${datasource}" }, - "editorMode": "code", - "expr": "sum by (operation)(rate(service_requests{cluster=\"$cluster\",temporal_service_type=\"frontend\"}[$__rate_interval]))", - "legendFormat": "{{operation}}", - "range": true, - "refId": "A" - } - ], - "title": "Frontend RPC by method (req/s)", - "type": "timeseries" - }, - { - "datasource": { "type": "prometheus", "uid": "${datasource}" }, - "fieldConfig": { - "defaults": { - "color": { "mode": "palette-classic" }, - "custom": { - "axisBorderShow": false, - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "", - "axisPlacement": "auto", - "barAlignment": 0, - "barWidthFactor": 0.6, - "drawStyle": "line", - "fillOpacity": 0, - "gradientMode": "none", - "hideFrom": { "legend": false, "tooltip": false, "viz": false }, - "insertNulls": false, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { "type": "linear" }, - "showPoints": "auto", - "spanNulls": false, - "stacking": { 
"group": "A", "mode": "none" }, - "thresholdsStyle": { "mode": "off" } - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { "color": "green", "value": null }, - { "color": "red", "value": 80 } - ] - } - }, - "overrides": [] - }, - "gridPos": { "h": 8, "w": 8, "x": 8, "y": 8 }, - "id": 4, - "options": { - "legend": { "calcs": [], "displayMode": "list", "placement": "bottom", "showLegend": true }, - "tooltip": { "mode": "single", "sort": "none" } - }, - "pluginVersion": "11.4.0", - "targets": [ - { - "datasource": { "type": "prometheus", "uid": "${datasource}" }, - "editorMode": "code", - "expr": "sum by (operation)(rate(service_requests{cluster=\"$cluster\",temporal_service_type=\"history\"}[$__rate_interval]))", - "legendFormat": "{{operation}}", - "range": true, - "refId": "A" - } - ], - "title": "History RPC by method (req/s)", - "type": "timeseries" - }, - { - "datasource": { "type": "prometheus", "uid": "${datasource}" }, - "fieldConfig": { - "defaults": { - "color": { "mode": "palette-classic" }, - "custom": { - "axisBorderShow": false, - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "", - "axisPlacement": "auto", - "barAlignment": 0, - "barWidthFactor": 0.6, - "drawStyle": "line", - "fillOpacity": 0, - "gradientMode": "none", - "hideFrom": { "legend": false, "tooltip": false, "viz": false }, - "insertNulls": false, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { "type": "linear" }, - "showPoints": "auto", - "spanNulls": false, - "stacking": { "group": "A", "mode": "none" }, - "thresholdsStyle": { "mode": "off" } - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { "color": "green", "value": null }, - { "color": "red", "value": 80 } - ] - } - }, - "overrides": [] - }, - "gridPos": { "h": 8, "w": 8, "x": 16, "y": 8 }, - "id": 5, - "options": { - "legend": { "calcs": [], "displayMode": "list", "placement": "bottom", "showLegend": true }, - 
"tooltip": { "mode": "single", "sort": "none" } - }, - "pluginVersion": "11.4.0", - "targets": [ - { - "datasource": { "type": "prometheus", "uid": "${datasource}" }, - "editorMode": "code", - "expr": "sum by (operation)(rate(service_requests{cluster=\"$cluster\",temporal_service_type=\"matching\"}[$__rate_interval]))", - "legendFormat": "{{operation}}", - "range": true, - "refId": "A" - } - ], - "title": "Matching RPC by method (req/s)", - "type": "timeseries" - }, - { - "datasource": { "type": "prometheus", "uid": "${datasource}" }, - "fieldConfig": { - "defaults": { - "color": { "mode": "palette-classic" }, - "custom": { - "axisBorderShow": false, - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "", - "axisPlacement": "auto", - "barAlignment": 0, - "barWidthFactor": 0.6, - "drawStyle": "line", - "fillOpacity": 0, - "gradientMode": "none", - "hideFrom": { "legend": false, "tooltip": false, "viz": false }, - "insertNulls": false, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { "type": "linear" }, - "showPoints": "auto", - "spanNulls": false, - "stacking": { "group": "A", "mode": "none" }, - "thresholdsStyle": { "mode": "off" } - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { "color": "green", "value": null }, - { "color": "red", "value": 80 } - ] - } - }, - "overrides": [] - }, - "gridPos": { "h": 8, "w": 12, "x": 0, "y": 16 }, - "id": 6, - "options": { - "legend": { "calcs": [], "displayMode": "list", "placement": "bottom", "showLegend": true }, - "tooltip": { "mode": "single", "sort": "none" } - }, - "pluginVersion": "11.4.0", - "targets": [ - { - "datasource": { "type": "prometheus", "uid": "${datasource}" }, - "editorMode": "code", - "expr": "sum by (table)(rate(cassandra_query{cluster=\"$cluster\",verb!=\"select\"}[$__rate_interval]))", - "legendFormat": "query: {{table}}", - "range": true, - "refId": "A" - }, - { - "datasource": { "type": "prometheus", "uid": 
"${datasource}" }, - "editorMode": "code", - "expr": "sum by (table)(rate(cassandra_batch{cluster=\"$cluster\"}[$__rate_interval]))", - "legendFormat": "batch: {{table}}", - "range": true, - "refId": "B" - } - ], - "title": "Astra writes by table (req/s)", - "description": "Validate r_Cass = 3/7 for writes. cassandra_query filtered to verb!=select; cassandra_batch is always writes.", - "type": "timeseries" - }, - { - "datasource": { "type": "prometheus", "uid": "${datasource}" }, - "fieldConfig": { - "defaults": { - "color": { "mode": "palette-classic" }, - "custom": { - "axisBorderShow": false, - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "", - "axisPlacement": "auto", - "barAlignment": 0, - "barWidthFactor": 0.6, - "drawStyle": "line", - "fillOpacity": 0, - "gradientMode": "none", - "hideFrom": { "legend": false, "tooltip": false, "viz": false }, - "insertNulls": false, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { "type": "linear" }, - "showPoints": "auto", - "spanNulls": false, - "stacking": { "group": "A", "mode": "none" }, - "thresholdsStyle": { "mode": "off" } - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { "color": "green", "value": null }, - { "color": "red", "value": 80 } - ] - } - }, - "overrides": [] - }, - "gridPos": { "h": 8, "w": 12, "x": 12, "y": 16 }, - "id": 7, - "options": { - "legend": { "calcs": [], "displayMode": "list", "placement": "bottom", "showLegend": true }, - "tooltip": { "mode": "single", "sort": "none" } - }, - "pluginVersion": "11.4.0", - "targets": [ - { - "datasource": { "type": "prometheus", "uid": "${datasource}" }, - "editorMode": "code", - "expr": "sum by (table)(rate(cassandra_query{cluster=\"$cluster\",verb=\"select\"}[$__rate_interval]))", - "legendFormat": "{{table}}", - "range": true, - "refId": "A" - } - ], - "title": "Astra reads by table (req/s)", - "description": "Reads are not expected to differ much between SAW 
and SAA (similar caching, ~1 read on creation).", - "type": "timeseries" - }, - { - "datasource": { "type": "prometheus", "uid": "${datasource}" }, - "fieldConfig": { - "defaults": { - "color": { "mode": "palette-classic" }, - "custom": { - "axisBorderShow": false, - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "", - "axisPlacement": "auto", - "barAlignment": 0, - "barWidthFactor": 0.6, - "drawStyle": "line", - "fillOpacity": 0, - "gradientMode": "none", - "hideFrom": { "legend": false, "tooltip": false, "viz": false }, - "insertNulls": false, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { "type": "linear" }, - "showPoints": "auto", - "spanNulls": false, - "stacking": { "group": "A", "mode": "none" }, - "thresholdsStyle": { "mode": "off" } - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { "color": "green", "value": null }, - { "color": "red", "value": 80 } - ] - } - }, - "overrides": [] - }, - "gridPos": { "h": 8, "w": 12, "x": 0, "y": 24 }, - "id": 8, - "options": { - "legend": { "calcs": [], "displayMode": "list", "placement": "bottom", "showLegend": true }, - "tooltip": { "mode": "single", "sort": "none" } - }, - "pluginVersion": "11.4.0", - "targets": [ - { - "datasource": { "type": "prometheus", "uid": "${datasource}" }, - "editorMode": "code", - "expr": "sum by (walType)(rate(wal_latency_count{cluster=\"$cluster\"}[$__rate_interval]))", - "legendFormat": "{{walType}}", - "range": true, - "refId": "A" - } - ], - "title": "WAL operation rate by type (ops/s)", - "description": "Covers both reads and writes (no separate write-only metric). 
Expect HISTORY_EVENT_WAL activity for SAW only; both use MUTABLE_STATE_WAL.", - "type": "timeseries" - }, - { - "datasource": { "type": "prometheus", "uid": "${datasource}" }, - "fieldConfig": { - "defaults": { - "color": { "mode": "palette-classic" }, - "custom": { - "axisBorderShow": false, - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "", - "axisPlacement": "auto", - "barAlignment": 0, - "barWidthFactor": 0.6, - "drawStyle": "line", - "fillOpacity": 0, - "gradientMode": "none", - "hideFrom": { "legend": false, "tooltip": false, "viz": false }, - "insertNulls": false, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { "type": "linear" }, - "showPoints": "auto", - "spanNulls": false, - "stacking": { "group": "A", "mode": "none" }, - "thresholdsStyle": { "mode": "off" } - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { "color": "green", "value": null }, - { "color": "red", "value": 80 } - ] - } - }, - "overrides": [] - }, - "gridPos": { "h": 8, "w": 12, "x": 12, "y": 24 }, - "id": 9, - "options": { - "legend": { "calcs": [], "displayMode": "list", "placement": "bottom", "showLegend": true }, - "tooltip": { "mode": "single", "sort": "none" } - }, - "pluginVersion": "11.4.0", - "targets": [ - { - "datasource": { "type": "prometheus", "uid": "${datasource}" }, - "editorMode": "code", - "expr": "sum by (operation)(rate(visibility_persistence_requests{cluster=\"$cluster\"}[$__rate_interval]))", - "legendFormat": "{{operation}}", - "range": true, - "refId": "A" - } - ], - "title": "Visibility persistence rate by operation (ops/s)", - "description": "OSS visibility_persistence_requests counter, tagged by operation (RecordWorkflowExecutionStarted, RecordWorkflowExecutionClosed, UpsertWorkflowExecution, DeleteWorkflowExecution).", - "type": "timeseries" - }, - { - "datasource": { "type": "prometheus", "uid": "${datasource}" }, - "fieldConfig": { - "defaults": { - "color": { 
"mode": "palette-classic" }, - "custom": { - "axisBorderShow": false, - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "", - "axisPlacement": "auto", - "barAlignment": 0, - "barWidthFactor": 0.6, - "drawStyle": "line", - "fillOpacity": 0, - "gradientMode": "none", - "hideFrom": { "legend": false, "tooltip": false, "viz": false }, - "insertNulls": false, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { "type": "linear" }, - "showPoints": "auto", - "spanNulls": false, - "stacking": { "group": "A", "mode": "none" }, - "thresholdsStyle": { "mode": "off" } - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { "color": "green", "value": null }, - { "color": "red", "value": 80 } - ] - } - }, - "overrides": [] - }, - "gridPos": { "h": 8, "w": 24, "x": 0, "y": 32 }, - "id": 10, - "options": { - "legend": { "calcs": [], "displayMode": "list", "placement": "bottom", "showLegend": true }, - "tooltip": { "mode": "single", "sort": "none" } - }, - "pluginVersion": "11.4.0", - "targets": [ - { - "datasource": { "type": "prometheus", "uid": "${datasource}" }, - "editorMode": "code", - "expr": "sum(rate(syncmatch_latency_count{cluster=\"$cluster\"}[$__rate_interval]))", - "legendFormat": "sync match", - "range": true, - "refId": "A" - }, - { - "datasource": { "type": "prometheus", "uid": "${datasource}" }, - "editorMode": "code", - "expr": "sum(rate(asyncmatch_latency_count{cluster=\"$cluster\"}[$__rate_interval]))", - "legendFormat": "async match", - "range": true, - "refId": "B" - } - ], - "title": "Sync vs async match rate (matches/s)", - "description": "Health check on experimental conditions. 
Async match means tasks went through persistence/backlog rather than being dispatched directly to a waiting poller.", - "type": "timeseries" - } - ], - "preload": false, - "schemaVersion": 40, - "tags": [], - "templating": { - "list": [ - { - "current": { "text": "prod", "value": "prod" }, - "name": "env", - "options": [ - { "selected": true, "text": "prod", "value": "prod" }, - { "selected": false, "text": "dev", "value": "test" } - ], - "query": "prod : prod, dev : test", - "type": "custom" - }, - { - "current": { "text": "prod thanos", "value": "af7fe237-211e-413e-9723-41a73886bcbb" }, - "hide": 2, - "includeAll": false, - "name": "datasource", - "options": [], - "query": "prometheus", - "refresh": 1, - "regex": "${env:text}.*", - "type": "datasource" - }, - { - "current": {}, - "datasource": { "type": "prometheus", "uid": "${datasource}" }, - "definition": "label_values(restarts,cluster)", - "includeAll": false, - "label": "Cluster", - "name": "cluster", - "options": [], - "query": { "query": "label_values(restarts,cluster)", "refId": "StandardVariableQuery" }, - "refresh": 2, - "regex": "", - "type": "query" - } - ] - }, - "time": { "from": "now-3h", "to": "now" }, - "timepicker": {}, - "timezone": "utc", - "title": "SAA COGS", - "uid": "saacogs", - "version": 1, - "weekStart": "" -} diff --git a/.task/task.md b/.task/task.md deleted file mode 100644 index b9b47cc3..00000000 --- a/.task/task.md +++ /dev/null @@ -1,764 +0,0 @@ -For background context, please study the following documents carefully: - -START_DOCUMENT------------------------------------------------------------------------------ -# Temporal Activity Execution & saas-temporal Cloud Persistence: Implementation Overview - -## Part 1: Activity Execution Models in Temporal Server - -### 1.1 CHASM Standalone Activities (`chasm/lib/activity/`) - -CHASM standalone activities are first-class, independently-scheduled executions outside workflow context. They use **mutable state only** -- no history events. 
- -#### State Machine - -States defined in `chasm/lib/activity/proto/v1/activity_state.proto`: - -``` -UNSPECIFIED - → SCHEDULED - → STARTED - → COMPLETED (terminal) - → FAILED (terminal) - → CANCEL_REQUESTED → CANCELED (terminal) - → TIMED_OUT (terminal) - → TERMINATED (terminal) - → CANCEL_REQUESTED → CANCELED (terminal) - → TIMED_OUT (terminal) - → TERMINATED (terminal) - → SCHEDULED (retry path) -``` - -Lifecycle states (`activity.go:95-107`): -- `LifecycleStateRunning`: SCHEDULED, STARTED, CANCEL_REQUESTED -- `LifecycleStateCompleted`: COMPLETED -- `LifecycleStateFailed`: FAILED, TERMINATED, TIMED_OUT, CANCELED - -#### State Transitions (`statemachine.go`) - -| Transition | From | To | Trigger | -|---|---|---|---| -| TransitionScheduled (37-77) | UNSPECIFIED | SCHEDULED | Initial scheduling | -| TransitionRescheduled (87-127) | STARTED | SCHEDULED | Retry after failure | -| TransitionStarted (130-169) | SCHEDULED | STARTED | Worker accepts task | -| TransitionCompleted (177-202) | STARTED/CANCEL_REQUESTED | COMPLETED | Worker completes | -| TransitionFailed (210-237) | STARTED/CANCEL_REQUESTED | FAILED | Non-retryable failure | -| TransitionCancelRequested (278-295) | STARTED/SCHEDULED | CANCEL_REQUESTED | Cancel API called | -| TransitionCanceled (304-331) | CANCEL_REQUESTED | CANCELED | Worker acknowledges cancel | -| TransitionTerminated (246-275) | SCHEDULED/STARTED/CANCEL_REQUESTED | TERMINATED | Terminate API called | -| TransitionTimedOut (340-374) | SCHEDULED/STARTED/CANCEL_REQUESTED | TIMED_OUT | Timer task fires | - -#### Mutable State Structures - -**ActivityState** (proto): -- `activity_type`, `task_queue`, timeouts (`schedule_to_close`, `schedule_to_start`, `start_to_close`, `heartbeat`), `retry_policy`, `status`, `schedule_time`, `priority`, `cancel_state`, `terminate_state` - -**Activity Go Component** (`activity.go:52-68`): -- `ActivityState` (embedded proto) -- `Visibility: chasm.Field[*chasm.Visibility]` -- search attributes -- `LastAttempt: 
chasm.Field[*ActivityAttemptState]` -- attempt count, stamp, started_time, failure details, worker identity -- `LastHeartbeat: chasm.Field[*ActivityHeartbeatState]` -- heartbeat details and recorded_time -- `RequestData: chasm.Field[*ActivityRequestData]` -- input, header, user_metadata -- `Outcome: chasm.Field[*ActivityOutcome]` -- successful (output) or failed (failure) -- `Store: chasm.ParentPtr[ActivityStore]` -- parent workflow (nil for standalone) - -#### Task Flow - -1. **Scheduling** (`handler.go:51-104`): `StartActivityExecution()` → creates Activity → applies TransitionScheduled -2. **Dispatch** (`activity_tasks.go:21-79`): `activityDispatchTaskExecutor` pushes to matching service via `AddActivityTask()` -3. **Start** (`activity.go:173-191`): `HandleStarted()` applies TransitionStarted, schedules start-to-close and heartbeat timeout tasks -4. **Completion** (`activity.go:259-280`): `HandleCompleted()` applies TransitionCompleted -5. **Failure** (`activity.go:284-323`): `HandleFailed()` checks retryability → either `tryReschedule()` or TransitionFailed -6. 
**Heartbeat** (`activity.go:559-586`): Updates LastHeartbeat, reschedules heartbeat timeout task - -#### Timeout Tasks - -- **ScheduleToStartTimeoutTask** (`activity_tasks.go:81-116`): Non-retryable → TIMED_OUT -- **ScheduleToCloseTimeoutTask** (`activity_tasks.go:118-150`): Non-retryable → TIMED_OUT -- **StartToCloseTimeoutTask** (`activity_tasks.go:152-198`): Attempts retry via `tryReschedule()`; if not retryable → TIMED_OUT -- **HeartbeatTimeoutTask** (`activity_tasks.go:200-276`): Validates heartbeat recency; attempts retry; if not retryable → TIMED_OUT - -#### Retry Logic - -- `shouldRetry()` (`activity.go:504-514`): Checks TransitionRescheduled possible, attempt < max, enough time remaining -- `hasEnoughTimeForRetry()` (`activity.go:518-534`): Exponential backoff calculation against schedule-to-close deadline -- `tryReschedule()` (`activity.go:489-502`): Applies TransitionRescheduled (increments attempt, schedules dispatch with backoff) - -#### Cancellation - -- `RequestCancelActivityExecution` (`handler.go:273-296`): Applies TransitionCancelRequested - - If SCHEDULED: immediately applies TransitionCanceled (`activity.go:414-433`) - - If STARTED: stays CANCEL_REQUESTED; worker receives cancellation on next interaction - ---- - -### 1.2 Legacy Workflow Activities - -Activities executed as part of a workflow use **mutable state (ActivityInfo) plus history events**. 
- -#### History Events - -``` -EVENT_TYPE_ACTIVITY_TASK_SCHEDULED (10) -EVENT_TYPE_ACTIVITY_TASK_STARTED (11) -EVENT_TYPE_ACTIVITY_TASK_COMPLETED (12) -EVENT_TYPE_ACTIVITY_TASK_FAILED (13) -EVENT_TYPE_ACTIVITY_TASK_TIMED_OUT (14) -EVENT_TYPE_ACTIVITY_TASK_CANCEL_REQUESTED (15) -EVENT_TYPE_ACTIVITY_TASK_CANCELED (16) -``` - -#### ActivityInfo Mutable State (`persistence/v1/executions.proto:524-661`) - -Core: `activity_id`, `activity_type`, `task_queue`, `scheduled_time`, `started_time`, `started_event_id`, `scheduled_event_id` - -Timeouts: `schedule_to_close_timeout`, `schedule_to_start_timeout`, `start_to_close_timeout`, `heartbeat_timeout` - -Retry: `attempt`, `has_retry_policy`, `retry_initial_interval`, `retry_maximum_interval`, `retry_maximum_attempts`, `retry_backoff_coefficient`, `retry_expiration_time`, `retry_non_retryable_error_types`, `retry_last_failure` - -State flags: `cancel_requested`, `cancel_request_id`, `timer_task_status` (bit flags), `stamp`, `paused`, `pause_info` - -#### Pending Activity States (`activity.go:53-61`) - -- SCHEDULED: `StartedEventId == 0` -- STARTED: `StartedEventId != 0 && !CancelRequested` -- CANCEL_REQUESTED: `CancelRequested` -- PAUSED: `Paused && Scheduled` -- PAUSE_REQUESTED: `Paused && Started` - -#### Timer Task Status Flags - -```go -TimerTaskStatusCreatedScheduleToStart = 1 -TimerTaskStatusCreatedScheduleToClose = 2 -TimerTaskStatusCreatedStartToClose = 4 -TimerTaskStatusCreatedHeartbeat = 8 -``` - -#### Pause/Unpause/Reset (unique to legacy model) - -- **Pause** (`activity.go:254-284`): Sets `paused = true`, increments stamp if SCHEDULED -- **Unpause** (`activity.go:388-425`): Clears pause, regenerates retry task if SCHEDULED -- **Reset** (`activity.go:286-379`): Resets attempt to 1, optionally resets heartbeat/options - -#### API Handlers (`service/history/api/`) - -- `recordactivitytaskstarted/api.go`: Creates ActivityTaskStartedEvent -- `respondactivitytaskcompleted/api.go`: Creates ActivityTaskCompletedEvent -- 
`respondactivitytaskfailed/api.go`: Retry or ActivityTaskFailedEvent -- `respondactivitytaskcanceled/api.go`: Creates ActivityTaskCanceledEvent -- `recordactivitytaskheartbeat/api.go`: Updates heartbeat state, reschedules timeout - ---- - -### 1.3 Activity Metrics (Both Models) - -Defined in `common/metrics/metric_defs.go`. Both models emit the same metric names. - -**Counters:** -| Metric | Description | -|---|---| -| `activity_success` | Successful completions (excludes retries) | -| `activity_fail` | Final failures (retries exhausted) | -| `activity_task_fail` | Per-attempt failures (includes retries) | -| `activity_cancel` | Canceled activities | -| `activity_terminate` | Terminated activities (CHASM only) | -| `activity_timeout` | Terminal timeouts | -| `activity_task_timeout` | Per-timeout events (includes retries) | - -**Timers:** -| Metric | Description | -|---|---| -| `activity_start_to_close_latency` | StartedTime → completion/failure/timeout | -| `activity_schedule_to_close_latency` | ScheduleTime → completion/failure/timeout/cancel | - -**Tags:** `namespace`, `task_queue_family`, `operation`, `activity_type`, `versioning_behavior`, `workflow_type` (set to `__temporal_standalone_activity__` for CHASM). Timeout metrics additionally tagged with `timeout_type` (SCHEDULE_TO_START, SCHEDULE_TO_CLOSE, START_TO_CLOSE, HEARTBEAT). - -**Metric enrichment** (`activity.go:804-824`): `enrichMetricsHandler()` adds per-task-queue-family scoping via `metrics.GetPerTaskQueueFamilyScope()`. 
- ---- - -### 1.4 Key Differences - -| Aspect | CHASM Standalone | Legacy Workflow | -|---|---|---| -| Persistence | Mutable state only | Mutable state + history events | -| Parent context | Standalone execution | Part of workflow execution | -| State tracking | ActivityState + sub-components | ActivityInfo in workflow | -| Task dispatch | Direct to matching service | Via workflow task completion | -| Completion storage | Outcome field | History events | -| Cancellation | Explicit CANCEL_REQUESTED state | Boolean flag in ActivityInfo | -| Pause support | Not yet implemented | Full (pause, unpause, reset) | -| Search attributes | Visibility component (chasm) | Workflow search attributes | - ---- - -## Part 2: saas-temporal Cloud Integration - -### 2.1 Architecture Overview - -saas-temporal wraps the Temporal server to run in Temporal Cloud cells by replacing core persistence with Cloud Data Storage (CDS), backed by: -- **Datastax Astra Cassandra** for durable storage -- **Write-Ahead Logs (WALs)** for durability before Cassandra persistence -- **OpenSearch/Elasticsearch** for workflow visibility -- **Tiered Storage** (S3/GCS/Azure) for history archival - -### 2.2 Entry Point and Server Construction - -**Main:** `cmd/temporal-service/main.go` - -The `start` command: -1. Loads OSS Temporal configuration from YAML -2. Injects secrets (Astra, Elasticsearch credentials) -3. Sets up dynamic configuration -4. Optionally enables cloud metrics handler (Chronicle) -5. Configures authorization (SaaS Auth0 JWT + Temporal JWT) -6. Configures custom datastore with CDS -7. Creates server via `cds.NewServer()` - -**Server creation:** `cds/export/cds/server.go`: -```go -func NewServer(serviceFxOpts FxOptions, opts ...temporal.ServerOption) (temporal.Server, error) { - return newServerFx(TopLevelModule, serviceFxOpts, opts...) 
-} -``` - -Uses Uber FX dependency injection with modules for persistence factory, dynamic config, serialization, and per-service modules (history, matching, frontend, worker). - -### 2.3 CDS Factory Architecture (`cds/export/cds/factory.go`) - -**FactoryProvider** (lines 51-65): Implements `client.AbstractDataStoreFactory` -- `NumberOfShards`, `OrderedDatastoreConfigs` (shards → datastores) -- `HistoryDatastoreConfigs` (weighted distribution) -- `WALFollowerProviders` for WAL followers -- `Clock`, `DynamicConfig`, `ChasmRegistry` - -**Factory**: Manages three WAL pools: -- **MS WAL** (MutableState): Records mutable state mutations -- **HE WAL** (HistoryEvent): Records history events -- **LP WAL** (LargePayload): Records oversized payloads - -Plus store providers: `MultiDBStoreProvider` for ordinal datastores, separate history store provider with tiered storage, optional Walker integration. - -### 2.4 Astra Cassandra Integration (`cds/storage/cassandra/astra/`) - -**Session creation** (`gocql.go`): Wraps gocql with Astra-specific config (TLS, connection pooling, retry policies) via Datastax `gocql-astra`. - -**Query instrumentation** (`gocql_metrics.go:48-100`): `queryMetricsObserver` instruments every query with 150-entry LRU statement cache. - -**Cassandra Metrics:** -| Metric | Description | -|---|---| -| `CassandraConns` | Connection count | -| `CassandraQueryTotalLatency` | Query latency | -| `CassandraBatchTotalLatency` | Batch latency | -| `CassandraQuery` | Query count | -| `CassandraBytesTx` / `CassandraBytesRx` | Network bytes | -| `CassandraLargeResponse` / `CassandraLargeRequest` | Large payload detection | -| `CassandraRetries` | Retry histogram | -| `CassandraErrors` | Error counters | - -Tags: `OperationType` (INSERT/UPDATE/DELETE/SELECT), `TableName`, `CasTag` (CAS operation) - -### 2.5 Write-Ahead Logs (`cds/export/wal/`, `cds/stream/`) - -WALs provide durability guarantees before data reaches Cassandra.
- -**WAL Client Interface** (`cds/export/wal/crud.go`): -```go -WriteMS(), WriteHE(), WriteLP() // Write operations per pool -ReadMS(), ReadHE(), ReadLP() // Read operations per pool -``` - -**Configuration** (`cds/config/configs.go:46-140`): -- Rate limiting: `WALReadsRate`, `WALReadsBurst` -- Timeouts: `WALDialTimeout`, `WALReadTimeout`, `WALWriteTimeout` -- Ledger rotation: `WALLedgerRotationBytesThreshold`, `WALLedgerRotationAgeThreshold` -- Retention: `WALLedgerLifetime` -- Parallelism: `WALMaxParallelReads` -- Feature flags: `WALReadV2Enabled`, `WALV2EncodingEnabled` - -**WAL Metrics** (`cds/metrics/metrics.go:34-56`): -| Metric | Description | -|---|---| -| `wal_latency` | Operation latency | -| `wal_stream_dial_attempt/success/error` | Connection establishment | -| `wal_stream_dns_latency` | DNS resolution | -| `wal_stream_connect_latency` | TCP connect | -| `wal_stream_handshake_latency` | TLS handshake | -| `wal_stream_send/receive_latency` | I/O latency | -| `wal_health_check_failed_count` | Connection health | -| `wal_write_timeout_count` | Timeout tracking | -| `wal_reader_page_latency` | Page read latency | -| `wal_entries_per_read` | Batch size histogram | -| `wal_compression_count` | Compression events | - -**Flush Metrics** (lines 13-27): -| Metric | Description | -|---|---| -| `flush_latency` | Time to flush to persistence | -| `flush_error` | Flush failures | -| `flush_snapshot_aborts` | Snapshot abort count | -| `flush_persistence_behindness_bytes/count/time` | Persistence lag | -| `flush_time_since_last_persist` | Staleness | -| `flush_reason_count` | Flush trigger reasons (by namespace) | - -**Recovery Metrics** (lines 57-70): -| Metric | Description | -|---|---| -| `recovery_total_latency` | Full recovery duration | -| `recovery_open_reader_latency` | Snapshot reader open | -| `recovery_rate_limiter_latency` | Rate limiting delay | -| `recovery_first_read_latency/bytes` | Initial WAL read | -| `recovery_takeover_latency` | Takeover phase | -| 
`recovery_wal_update_latency` | WAL update during recovery | - -**Ledger Metrics** (lines 77-82): -| Metric | Description | -|---|---| -| `ledger_rotation_count` | Rotations | -| `logs_per_ledger` | Logs per ledger histogram | -| `segments_per_shard` | Segments per shard histogram | -| `segment_too_old_count` | GC candidates | -| `active_segment_too_old_count` | Rotation delay | - -### 2.6 Execution Store Wrapper (`cds/export/cds/execution_store.go`) - -Wraps the Cassandra execution store to: -- Convert mutable state mutations to WAL records (`NewMSWALRecord()`) -- Convert history events to WAL records (`NewHEWALRecord()`) -- Calculate storage metering -- Manage snapshot trimming -- Implement history event caching - -Implements `persistence.ExecutionStore` and `persistence.ShardStore`. - -### 2.7 How Activity State Flows Through CDS - -**CHASM activities**: Activity mutable state → MS WAL write → Cassandra persistence. No HE WAL involvement (no history events). State transitions are persisted as mutable state mutations via the execution store wrapper. - -**Legacy workflow activities**: ActivityInfo mutable state → MS WAL write → Cassandra. History events (Scheduled, Started, Completed, etc.) → HE WAL write → Cassandra. Both paths go through the execution store wrapper's WAL record conversion. - -### 2.8 OpenSearch/Elasticsearch Visibility (`visibility/`) - -**Factory:** `visibility/factory.go` -- `VisibilityStoreFactory` creates visibility stores configured per cloud cell. 
- -**Batch processor metrics** (`visibility/common/metrics_defs.go`): -| Metric | Description | -|---|---| -| `visibility_batch_processor_request_add_latency` | Enqueue time | -| `visibility_batch_processor_request_latency` | Total request latency | -| `visibility_batch_processor_request_errors` | Failed requests | -| `visibility_batch_processor_commit_latency` | Batch commit time | -| `visibility_batch_processor_batch_size` | Items per batch histogram | -| `visibility_batch_processor_batch_requests` | Requests per batch histogram | -| `visibility_batch_processor_queued_requests` | Queue depth histogram | -| `visibility_batch_processor_corrupted_data` | Data integrity failures | -| `visibility_batch_processor_duplicate_request` | Deduplication events | - -### 2.9 Tiered Storage (`cds/persistence/tieredstorage/`) - -Long-term history archival to cloud object stores: -- S3 (AWS): `s3_store.go` -- GCS (Google Cloud): `gcs_store.go` -- Azure Blob: `azure_client.go` - -Interface: `Upload()`, `Read()`, `Delete()`, `List()`, `PluginName()` - -Metrics: `ReadWorkflowHistory`, `UploadWorkflowHistory`, `DeleteWorkflowHistory`, `ListTieredStorageObjects` - -### 2.10 Persistence Store Metrics (`cds/persistence/metrics/defs.go`) - -**Store layer** (lines 70-85): -| Metric | Description | -|---|---| -| `store_requests` | Request count by operation | -| `store_latency` | Operation latency | -| `store_errors` | Errors: shard_exists, shard_ownership_lost, condition_failed, timeout, unavailable | - -**Manager layer** (lines 89-102): -| Metric | Description | -|---|---| -| `saas_persistence_requests` | High-level request count | -| `saas_persistence_latency` | High-level latency | -| `saas_persistence_errors` | Error tracking | - -Tags: `operation` (CreateShard, UpdateShard, GetWorkflowExecution, etc.), `component`, `cass_cluster` - -### 2.11 Cloud Metrics Infrastructure - -**Handler chain** (`cloudmetricshandler/delegating_recorders.go`): -1. 
`allowlistedRecorder`: Filters through allowlist -2. `multiRecorder`: Sends to multiple backends - -**Chronicle integration** (`cloudmetricshandler/chronicle_recorder.go`): -- Enabled by `TEMPORAL_ENABLE_CLOUDMETRICSHANDLER` -- Config: `/etc/temporal/cloudmetricshandler` -- Kubernetes enrichment: pod name, namespace, labels -- Backends: S3 writer, HTTP writer (to Chronicle service) -- Batch config: 50K queue, 25K batch, 100ms flush - -**Action metering** (`actionmetering/metrics.go`): -- `billable_action_count` with tags: namespace, action_type, workflow_type, workflow_task_queue -- Activity type/task queue currently placeholder `"_unknown_"` with TODOs for standalone activity support - -### 2.12 Additional Cloud Features - -- **Authorization**: SaaS Auth0 JWT + Temporal JWT, TLS client certs -- **Quotas/Flow Control** (`quotas/`, `flowcontrol/`): Request-level and task-queue quotas -- **Multi-region replication** (`cds/service/history/replication/`): Custom replication filters -- **Metering V3**: S3/GCS/Azure bucket metering -- **SMS (etcd)**: Secondary Metadata Store for namespace/cluster metadata -- **Dynamic config**: 150+ hot-reloadable properties (`cds/config/configs.go`) -END_DOCUMENT-------------------------------------------------------------------------------------- - -START_DOCUMENT------------------------------------------------------------------------------ -# Standalone Activity COGS and margins - -@Dan Davison March 17, 2026 - -We want to ensure that we are billing in a way that meets our target margins for new product features in cloud, such as new CHASM execution types. To do this, we need to know certain things about COGS (cost of goods sold) for these features. This document outlines how to estimate COGS for Standalone Activity relative to Workflow and the implications of this for margins. 
- -# Motivation: avoiding cannibalization - -We have rules (see [temporalio/action](https://github.com/temporalio/action)) specifying how customer operations map to billable Actions. For example, suppose a customer executes a Workflow that executes a single Activity, which succeeds on first attempt without heartbeating. This incurs 2 Actions (StartWorkflow and ScheduleActivity). We’ll call this a “Single Activity Workflow” (SAW). - -We haven’t yet decided how we will bill for Standalone Activity (SAA). But suppose that we decide that executing a single SAA (no retries, no heartbeating) is 1 Action (StartStandaloneActivity). - -If we want SAA margins to match SAW margins, then we want the COGS of SAA (no retries, no heartbeating) to be ≤ 1/2 that of SAW (because we get half as much revenue for the SAA). If it is not, then there would be some degree of cannibalization (customers switch their single-activity workloads to SAA, but our margins there are worse). We’d hope it would be offset by increased volume, but we’d still prefer SAA margins to match SAW. - -### What about retries and heartbeating? - -SAW (no retries and no heartbeating) is 2 Actions. If the activity retries once it becomes 3 Actions (ScheduleActivity now happens twice); if it heartbeats once during each attempt it becomes 5 Actions. - -Let’s assume (as we currently intend) that we apply the same billing rules to Standalone Activity retries and heartbeating. Then, as long as SAA is not worse than Workflow Activity with respect to COGS of retries and heartbeating, our margins from those customer operations will be at least as good under SAA as when they are done in the context of a pre-CHASM workflow. CHASM has been designed for efficiency; we have reason to be optimistic that it’s not *worse* than the legacy workflow activity implementation. 
- -# Problem statement - -The above suggests that we should focus on estimating the ratio of COGS for Standalone Activity (SAA) relative to Single-activity Workflow (SAW) in the no retries, no heartbeating case: - -$$ -R = \frac{C_{SAA}}{C_{SAW}}. -$$ - -We expect $R < 1$ because SAA achieves execution of an activity with fewer RPCs, persistence operations, etc, than SAW. We are hoping that it is less than 1/2 since then our SAA margins are as good or better than our workflow margins, assuming we bill 1 Action for SAA. - -# Estimating the COGS ratio - -We’ll assume that the COGS for a SAA or SAW execution results solely from invoices from third parties relating to cloud compute resources. COGS for an execution type (SAA or SAW) is the sum of price ($p$) times quantity consumed ($q$) over all resources: - -$$ -C = \sum_{i} p_i q_i. -$$ - -We want the COGS ratio $R$. We can write that as a weighted average of per-resource usage ratios: - -$$ -R = \frac{C_{SAA}}{C_{SAW}} = \sum_i f_i r_i. -$$ - -This allows us to calculate $R$ as a function of two things that we can estimate: - -- $f_i = p_i q_{i}(SAW) / \sum_j p_j q_{j}(SAW)$ is the fraction of SAW COGS attributable to resource $i$ (“spend share”). We’ll use our current cloud spend for this. -- $r_i = q_i(SAA) / q_i(SAW)$ is the per-resource usage ratio. We will estimate these by comparing the implementations or by running experiments in cloud cells. - -The resources ($i$) potentially include: -1. Data egress -2. CPU usage -3. Memory usage -4. Persistence operations against our WALs -5. Persistence operations against Astra (to be replaced by Walker) -6. Persistence operations against OpenSearch (visibility) -7. Metrics/logs processing and storage costs, Clickhouse - -*At-rest data storage is excluded: we bill customers separately for storage on a GB/h basis, so it does not need to be subsidized by Actions. 
(Tangentially, it’s worth noting that we expect SAA storage to cost users half what they’d pay for SAW since SAW stores the input and output payloads in both workflow scheduled/complete events and activity scheduled/complete events.)* - -# Per-resource usage ratios - -To proceed, we need to estimate the SAW vs SAA usage ratio ($r_i$) for each resource. - -The following table summarizes the two implementations. It describes the simplest possible happy-path scenario: an activity that succeeds on first attempt without heartbeating, via sync matches. - -| # | Single-activity Workflow | Standalone Activity | -| --- | --- | --- | -| 1 | RPC: `StartWorkflowExecution` => HEWAL, MSWAL; Vis&; Cassandra& | RPC: `StartActivityExecution` => MSWAL; Vis&; Cassandra& | -| 2 | Task => RPC: `AddWorkflowTask` | | -| 3 | RPC: `RecordWorkflowTaskStarted` => HEWAL, MSWAL; Cassandra& | | -| 4 | RPC: `RespondWorkflowTaskCompleted` => HEWAL, MSWAL; Cassandra& | | -| 5 | Task => RPC: `AddActivityTask` | Task => RPC: `AddActivityTask` | -| 6 | RPC: `RecordActivityTaskStarted` => HEWAL, MSWAL; Cassandra& | RPC: `RecordActivityTaskStarted` => MSWAL; Cassandra& | -| 7 | RPC: `RespondActivityTaskCompleted` => HEWAL, MSWAL; Cassandra& | RPC: `RespondActivityTaskCompleted` => MSWAL; Vis&; Cassandra& | -| 8 | Task => RPC: `AddWorkflowTask` | | -| 9 | RPC: `RecordWorkflowTaskStarted` => HEWAL, MSWAL; Cassandra& | | -| 10 | RPC: `RespondWorkflowTaskCompleted` => HEWAL, MSWAL; Vis&; Cassandra& | | -- `&` indicates a write that’s not on the sync response path -- `AddWorkflowTask` and `AddActivityTask` involve inter-service RPCs but no persistence writes in the happy path (“sync match”). -- The table does not show worker poll requests -- An additional `Vis&` is incurred in both cases when the execution is deleted. 
- -Comparing the implementations in the table gives - -$$ -r_{\text{WAL}} = \frac{3}{14} = 0.21,~~~~ -r_{\text{Cass}} = \frac{3}{7} = 0.43,~~~~ -r_{\text{Vis}} = \frac{3}{3} = 1.0.~~~~ -$$ - -These ratios count writes only. Cassandra reads are not expected to differ much between SAW and SAA since they use similar caching mechanics with the result that a high proportion of both SAW and SAA executions incur ~1 read (on execution creation). - -In addition, we can estimate data transfer costs by comparing the implementations. These are likely dominated by egress to customer infra (ingress is free on AWS and GCP; data transfers to Astra, OpenSearch, and Grafana are in-VPC or via PrivateLink). Let the activity input and output payload sizes be $S_I$ and $S_O$. Payload egress for SAW is $2S_I + 2S_O$ (input payload sent to workflow and activity workers; output payload sent to workflow worker and client). For SAA this is $S_I + S_O$ since there is no workflow worker detour. This gives - -$$ -r_\text{data\_transfer} = 0.5. -$$ - -# COGS ratio estimate - -Using approximate/preliminary cloud spend share numbers (thanks @Stephen Chan ) we have: - -| **Resource** | **Spend share $f_i$ (preliminary)** | **Usage ratio $r_i$** | **Notes** | -| --- | --- | --- | --- | -| **Astra writes** | 40% | $\frac{3}{7}$ = 0.43 | SAW does 2 additional writes for each WFT | -| **Visibility** (OpenSearch) | 20% | $\frac{3}{3}$ = 1.00 | Equal — both SAA and SAW produce exactly ~~2~~ 3 visibility updates | -| **WAL writes** | 10% | $\frac{3}{14}$ = 0.21 | Half of Astra ratio: SAA writes only to MSWAL, whereas SAW writes to both HEWAL and MSWAL | -| **EC2 compute** | 10% | ? | Would need cloud cell experiment | -| **Data transfer** | 10% | $\frac{1}{2}$ = 0.50 | SAW sends payloads via workflow worker round-trip; SAA does not | -| **Overheads** (incl. Clickhouse) | 10% | ?
| | - -This gives the following estimate of the COGS ratio: - -$$ -\begin{align*} -R &= -\underbrace{0.4 \times 0.43}_{\text{Astra}:~0.17} + -\underbrace{0.2 \times 1.0}_{\text{Vis}:~0.20} + -\underbrace{0.1 \times 0.21}_{\text{WAL}:~0.02} + -\underbrace{0.1 \times 0.50}_{\text{Tx}:~0.05} + -0.1 \cdot r_\text{compute} + 0.1 \cdot r_\text{overhead} \\\\ -&= -0.44 + 0.1(r_\text{compute} + r_\text{overhead}). -\end{align*} -$$ - -# Sensitivity analysis - -Before thinking about the implications of this for billing and margins, the next steps are: - -1. Refine the cloud spend estimates (Cloud Capacity team; does not involve load experiments) -2. Decide whether we want to do load experiments to estimate $r_\text{compute}$ -3. Decide how we will address $r_\text{overhead}$ - -For (2) and (3) we can do some initial sensitivity analysis: - -SAW does 10 RPCs vs SAA’s 4 (with 7 vs 3 of them doing persistence writes in the sync-match case). If services are CPU-bound then this suggests that $0.4 < r_\text{compute} < 1.0$ might be reasonable. - -The other overheads include (per @Stephen Chan ) Clickhouse, observability cells, and Envoy proxies. Since these costs should also scale with RPC count, let’s assume the same bounds: $0.4 < r_\text{overhead} < 1.0$. This gives: - -$$ -0.52 \leq R \leq 0.64. -$$ - -![image.png](.task/sensitivity.png) - -For example, if SAW margins were 70%, SAA margins would be 62% - 69%. This margin reduction would affect at maximum the ~3% of workflows that are SAW. - -- COGS ratio to margins conversion formula - - $\text{margin}_{\text{SAA}} = 1 - 2R(1 - \text{margin}_{\text{SAW}})$. - - -# Discussion - -- **Visibility limits SAA margins**. Visibility is expensive (20%), but SAA and SAW perform the same number of visibility writes, so it combines a large weight with the worst possible ratio. 
-- **(Unfavorable) Over-provisioning would push $R$ up.** The usage ratios above for persistence are derived from write counts, which only translate to cost savings if capacity tracks usage. But e.g. Astra is bought in fixed hardware units (“Astra Classic”). If any resource component is over-provisioned then SAA and SAW would pay the same cost per execution and $r_i \to 1.0$, making SAA margins less attractive relative to workflow. -- **Cloud spend share**. We could attempt to separate fixed costs and renormalize (see [Next steps](https://www.notion.so/Next-steps-3268fc567738805e82ddd9c1e1d4c9d1?pvs=21)). This would be favorable to SAA margins if it decreases the visibility share, but unfavorable if it decreases Astra share. - - We’re estimating $f_i$ from cloud spend, so we’re assuming that the spend distribution for single-activity workflows would be similar to the spend distribution for the real mix of customer workflows. I suspect this is a reasonable modeling assumption since in both cases the application is performing the same state transitions in response to workflow and activity task processing. - -- **(Mixed) Effect of migration to Walker**. Walker replaces Astra with storage that is under our own control, making right-sizing easier. This may mean that the 3/7 write ratio is more fully realized under Walker, moving SAA COGS away from SAW. However, Walker will be cheaper than Astra, so persistence’s share of spend shrinks. Since persistence is where SAA has its largest advantage, this would bring SAA COGS closer to SAW. - - These two effects act in opposite directions and the net result will depend on their relative magnitudes. This suggests that we should monitor COGS calculations as the Walker migration proceeds. - -- **(Future) A visibility backend migration would improve SAA margins.** There has been [movement](https://www.notion.so/Visibility-CDS-2a98fc567738807e9ee0f318edc4c16f?pvs=21) toward replacing OpenSearch.
As discussed above, any reduction in visibility spend share would make SAA COGS more attractive relative to workflow. - -# Conclusion - -- [We are planning to bill SAA at 1/2 the price of SAW](https://www.notion.so/PRD-Standalone-Activities-for-durable-job-processing-1ee8fc567738806d8b6fe8e2eeae0fc4?pvs=21). Although there are various assumptions involved, at this point it looks like SAA COGS will be more than 1/2 SAW COGS: the estimated range above is $0.52 \leq R \leq 0.64$. This implies that some degree of cannibalization is likely. The extent of cannibalization would be bounded by the proportion of current workloads that are SAW, which is 3% per @Phil Prasek. It may be offset by volume growth attributable to SAA. - -# Next steps - -- **Refine cloud spend share estimates.** - - The cloud spend share weights used in this analysis are supposed to be marginal costs. We could attempt to separate marginal vs fixed costs and renormalize our spend share percentages. This would be favorable to SAA margins if it decreases the visibility share, but unfavorable if it decreases Astra share. - -- **Investigate any impact of over-provisioning.** - - SAA margins may be less favorable than the calculations suggest if some resources are over-provisioned. See discussion [above](https://www.notion.so/Standalone-Activity-COGS-and-margins-3268fc567738803cb63fd9397ffd351c?pvs=21). - -- **Decide whether to do cloud cell experiments**. - - Unlike the other resource categories, we lack any obvious theoretical basis for estimating $r_\text{compute}$ and $r_\text{overhead}$. Estimating $r_\text{compute}$ via cloud cell experiments would require perhaps one engineer-week. If this were to show a value close to 0.4 then it would suggest that the upper bound on $R$ is 0.58 (i.e. $0.44 + 0.1 \times (0.4 + 1.0)$, taking $r_\text{overhead} = 1.0$), as opposed to the current 0.64. This would however still be subject to all the assumptions discussed above. We could also attempt to tighten our estimated bounds on $r_\text{overhead}$ via experiment.
- - If we decide to do this, the $r_\text{compute}$ experiment would be something like the following: choose a reference activity (e.g. sleeps for 10s, no heartbeating, never fails) and run SAA and SAW workloads on a cloud cell at a fixed start rate (e.g. 10/s) for a sustained period (e.g. 1hr). Fixing start rate rather than concurrency naturally controls for end-to-end latency differences between SAA and SAW. $r_\text{cpu}$ and $r_\text{memory}$ can then be estimated from metrics as the ratio of mean utilization above the idle baseline. The analysis will need to decide how to combine them, e.g. based on which is more often limiting; alternatively, using the larger of the two would yield a conservative calculation. -END_DOCUMENT------------------------------------------------------------------------------ - -START_DOCUMENT------------------------------------------------------------------------------ -# Test plan for SAA COGS measurement - -@Dan Davison March 19, 2026 - -The [SAA COGS proposal](.task/saa-cogs.md) made an initial estimate of the SAA/SAW COGS ratio based on estimating persistence, visibility, and data transfer usage ratios directly from the implementation. But for compute and overheads we have no analytical estimate. We plan to run an experiment to: - -1. Estimate the missing $r_\text{compute}$. -2. Validate the analytical $r_i$ against observed metrics - -For comparison, the Fairness COGS experiment docs: - -- [Test plan](https://www.notion.so/temporalio/Test-plan-for-COGS-measurement-28c8fc56773880169cdcc4087a98ceaf) -- [Fairness COGS Impact](https://www.notion.so/temporalio/Fairness-COGS-Impact-2c58fc567738808f806cfbf09b771b2c) -- [Pricing Council doc](https://www.notion.so/temporalio/WIP-Pricing-Council-Fairness-COGS-Impact-2cc8fc56773880dcb3efe435623edd9a) - - - - -# Proposed SAA experiment - - -## Workloads - -Two workloads, run sequentially on the same cell: - -1. **SAW**: execute workflow with one activity (no heartbeat, no retry). -2. 
**SAA**: execute standalone activity (no heartbeat, no retry). - -## Parameters - -**Start rate.** I think that we should fix start rate rather than concurrency, since this naturally controls for end-to-end latency differences between SAA and SAW (i.e. a cell running SAW will see higher load because the concurrency will be higher because the SAW end-to-end latency is higher). The fairness experiment used 4k tasks/s. Is starting 4k executions/s reasonable for us? - -**Activity.** Immediate successful return; no heartbeat, no retry. We could compare with a 1s sleep to see if results differ? - -**Sync match.** Do one run such that sync match should be 100%, and another tuned such that sync match is lower? Verify sync match from metrics (`syncmatch_latency`, `asyncmatch_latency`) - -**Duration and repetitions.** Steady-state load; we need long enough for stable CPU averages. The -fairness experiment used 6h per scenario but this was maybe because of their more sophisticated -sinusoidal load design? Is 1h more than enough for the SAA experiment? ≥2 runs per workload to check -variance/reproducibility. - -## Infrastructure - -- Anything special about test cell sizing? -- Workers should run outside the cell (how did the fairness experiment do this?) - -## Metrics - -Initial dashboard content https://grafana.tmprl-internal.cloud/d/saacogs/saa-cogs: - - -- **CPU per service** (frontend, history, matching). `node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate` — a k8s recording rule over cAdvisor container metrics (defined in saas-components prometheus rules). -- **Memory per service**. `container_memory_working_set_bytes` — also k8s/cAdvisor (defined in saas-components alert rules). -- **RPC rate by method**, one panel per service (frontend, history, matching). `service_requests` counter ([temporal:common/metrics/metric_defs.go:615](https://github.com/temporalio/temporal/blob/main/common/metrics/metric_defs.go)), tagged with `operation` (the RPC method name).
Recorded by a gRPC server-side interceptor ([telemetry.go:177](https://github.com/temporalio/temporal/blob/main/common/rpc/interceptor/telemetry.go)), so it captures inter-service RPCs (e.g. history→matching `AddActivityTask`). -- **Astra writes by table**. `cassandra_query` counter with `verb!="select"`, plus `cassandra_batch` counter, both broken down by `table`. Tags include `operation`, `table`, `verb`, `cas` ([saas-temporal:cds/metrics/metrics.go:233,238](https://github.com/temporalio/saas-temporal/blob/main/cds/metrics/metrics.go)). -- **Astra reads by table**. `cassandra_query` with `verb="select"`, broken down by `table`. -- **WAL operation rate by type**. `wal_latency_count` ([saas-temporal:cds/metrics/metrics.go:35](https://github.com/temporalio/saas-temporal/blob/main/cds/metrics/metrics.go)) broken down by `walType` label (values: `MUTABLE_STATE_WAL`, `HISTORY_EVENT_WAL`, `LARGE_PAYLOAD_WAL` — see [saas-temporal:cds/common/tag/tag.go:11-24](https://github.com/temporalio/saas-temporal/blob/main/cds/common/tag/tag.go)). Note: this metric covers both reads and writes; there is no separate write-only WAL metric. This is arguably more relevant to COGS since WAL reads also cost something. -- **Visibility persistence rate by operation**. `visibility_persistence_requests` counter ([temporal:common/metrics/metric_defs.go:1398](https://github.com/temporalio/temporal/blob/main/common/metrics/metric_defs.go)), tagged with `operation` (values include `RecordWorkflowExecutionStarted`, `RecordWorkflowExecutionClosed`, `UpsertWorkflowExecution`, `DeleteWorkflowExecution` — see [visiblity_manager_metrics.go](https://github.com/temporalio/temporal/blob/main/common/persistence/visibility/visiblity_manager_metrics.go)). -- **Sync vs async match rate**. `syncmatch_latency_count` and `asyncmatch_latency_count` ([temporal:common/metrics/metric_defs.go:1119-1120](https://github.com/temporalio/temporal/blob/main/common/metrics/metric_defs.go)). 
- - -## Load generator (omes) - -- Add a new scenario that starts standalone activities directly from the load generator, not from within a workflow. -- Build the omes Go worker Docker image and deploy it as a pod on k8s, configured to poll the test cell. Do we have implementation we can borrow from the fairness experiment? - - - - -
-Appendix: Comparison with fairness experiment (see commits by David Reiss) - -| | Fairness | SAA | -|---|---|---| -| **Treatments** | Same workload, two matcher modes | Two execution types (SAW vs SAA) | -| **Quantity computed** | $\Delta C / C$ | Ratio $r_i = q_i(\text{SAA}) / q_i(\text{SAW})$ | -| **Load shape** | Sinusoidal backlog (exercises matcher) | Steady-state at fixed start rate (our model assumes sync match) | -| **What is measured** | CPU per service, Astra operation rates | CPU per service, memory per service, Astra operation rates by table and verb, WAL write rates, visibility write rates, RPC handling rates per service per method | -| **Predictions to validate** | None — purely empirical | $r_\text{Cass} = 3/7$, $r_\text{WAL} = 3/14$, $r_\text{Vis} = 3/3$, per-method RPC rates matching proposal table | - -Fixed start rate (not fixed task throughput) because SAA and SAW generate different numbers of tasks per execution. - -**Question**: what is the incremental COGS of enabling the fairness matcher vs the classic matcher? - -**COGS components**: (1) Astra queries (~35% of total COGS), (2) EC2 compute (~9%, split across frontend+matching and history). Ignored: data transfer, Astra storage, non-AWS costs (Clickhouse <3%). - -**Setup**: dedicated test cell `s-oss-dnr-faircogs3` (64 partitions). Load generator: Omes Ebb and Flow — sinusoidal activity task backlog. 5 scenarios (classic, fairness with 0/1k/100k keys, priority), each 6 hours. Measured via [dedicated Grafana dashboard](https://grafana.tmprl-internal.cloud/d/df6pldpkiy1vka/faircogs). - -**Results**: Astra showed no significant increase. CPU increased up to 23% (frontend) and 36% (history) in the worst case (1k fairness keys). COGS impact: $(0.035 \times 0.23) + (0.057 \times 0.36) = 2.8\%$. Pricing council recommendation: price fairness on value to customer, not COGS. - - - - - -
- -
-Appendix: possible experimental outcomes - -- **Analytical predictions confirmed, $R$ in predicted range.** Observed $r_\text{Cass}$, $r_\text{WAL}$, $r_\text{Vis}$, and per-method RPC rates match the analytical derivations. $r_\text{compute}$ lands in $[0.4, 1.0]$, giving $R$ in roughly $0.52$–$0.64$. We present $R$ with a tighter confidence interval than the proposal (because $r_\text{compute}$ is now estimated, not bounded). -- **$r_\text{compute}$ is low, pushing $R$ toward 0.5.** If $r_\text{compute} \approx 0.4$ and analytical predictions hold, $R \approx 0.52$. Cannibalization is near-zero. -- **Observed $r_i$ diverge from analytical predictions.** Some assumption is wrong (e.g. sync match doesn't hold at test load, or there are unaccounted persistence writes). We recompute $R$ using observed values and identify which assumption failed and whether it reflects production conditions or a test artifact. -- **$R$ is higher than predicted.** $R > 0.64$ would mean worse cannibalization than estimated. Options: accept the margin reduction (bounded by ~3% SAW share), adjust billing, or identify engineering work to reduce SAA COGS. - -
- -END_DOCUMENT------------------------------------------------------------------------------ - - -Your task is to help me design and build the omes-based tooling that we will use to perform the experiments outlined above to learn about COGS of SAA and SAW. We are in the omes repo; study it carefully. Our work will broadly break into the following phases that we must design holistically: - -(1) Add any missing omes functionality that will be needed in order to be able to use omes to generate the SAA and SAW load for the experiments. -(2) Run the experiments against the cloud cell that Stephen has prepared: its name is s-saa-cogs. - -Stephen linked to the 'scaffold' run that created the cell. I see it had the following input: - -{ - "CellConfig": { - "Identity": { - "Location": { - "CloudProvider": "aws", - "AccountID": "124355634071", - "Region": "us-west-2" - }, - "ID": "s-saa-cogs" - }, - "Template": "v5-aws-dev", - "ServerVersion": "v3.151.9_oss1.31.0_151.6", - "AgentVersion": "v3.151.9_oss1.31.0_151.6", - "WebVersion": "v2.47.0", - "GoCanaryVersion": "v1.35.0", - "ComponentVersion": "v2026-03-20.00", - "WalVersion": "v10.0.3", - "EnableMetering": false - }, - "FailurePolicy": 1 -} - -and output: - -{ - "Cell": { - "Identity": { - "Location": { - "CloudProvider": "aws", - "AccountID": "124355634071", - "Region": "us-west-2" - }, - "ID": "s-saa-cogs" - } - } -} - -I am not familiar with performing operations against cloud cells, so you will need to research and help me during this. But we have several good resources: study the contents of the 'oncall' and 'runbooks' repos, and also use the /agent-slack skill. You also have Notion and Temporal Docs MCP. Use the more modern 'ct' rather than its alias 'omni'. - -Initial grafana dashboard JSON is at .task/saacogs.json. - -Important: I'd like an early aim to be to get an end-to-end proof-of-principle of this working. 
Therefore let's not make the omes component sophisticated initially; just the bare minimum to run an SAW and SAA workload. But I am a bit intimidated by doing anything with the cloud cell since I don't know how. So I guess one early aim is to be able to point our metrics dashboard at s-saa-cogs, and see idle state, then run one of our omes commands, and see activity increase in the dashboard. Please maintain a file of useful shell commands with terse comments where necessary. I will run them and show you the output. Don't do operations against cloud or observability yourself unless I explicitly ask you to. - -In the omes work, we must use the latest version of the SDK with Standalone Activity support, such that our code is consistent with what Temporal Docs (use MCP) and the samples-go repo show. \ No newline at end of file diff --git a/AGENTS.md b/AGENTS.md deleted file mode 120000 index 81744092..00000000 --- a/AGENTS.md +++ /dev/null @@ -1 +0,0 @@ -.task/AGENTS.md \ No newline at end of file diff --git a/CLAUDE.md b/CLAUDE.md deleted file mode 120000 index 81744092..00000000 --- a/CLAUDE.md +++ /dev/null @@ -1 +0,0 @@ -.task/AGENTS.md \ No newline at end of file From 7aadcec41d0fe43af51529d61fb6c2dfc773c8a7 Mon Sep 17 00:00:00 2001 From: Dan Davison Date: Mon, 23 Mar 2026 21:36:21 -0400 Subject: [PATCH 26/40] commands --- commands.sh | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/commands.sh b/commands.sh index c0e53038..272d707a 100644 --- a/commands.sh +++ b/commands.sh @@ -48,6 +48,16 @@ export TEMPORAL_TLS_DISABLE_HOST_VERIFICATION=true ct admintools --context s-saa-cogs -- temporal operator search-attribute create \ --namespace saa-cogs-4.temporal-dev --name OmesExecutionID --type Keyword +# Dynamic config +# https://staging.thundergun.io/support/namespaces/saa-cogs-4.temporal-devo +{ + "activity.enableStandalone": true, + "history.enableChasm": true +} + +# scale canary to 0 +ct kubectl --context s-saa-cogs patch 
deployment/temporal-go-canary -n temporal -p '{"spec":{"replicas":0}}' + # SAW go run ./cmd run-scenario-with-worker \ --scenario workflow_with_single_activity \ From 139bdea8e6d97dbf33131897be8f6b24e0874bf1 Mon Sep 17 00:00:00 2001 From: Dan Davison Date: Mon, 23 Mar 2026 21:36:29 -0400 Subject: [PATCH 27/40] Revert "cleanup" This reverts commit 312bbe33950a1cd33899015a4d79f3037e29fe83. --- .task/AGENTS.md | 3 + .task/plan.md | 110 +++++++ .task/research.md | 159 ++++++++++ .task/saacogs.json | 703 +++++++++++++++++++++++++++++++++++++++++ .task/task.md | 764 +++++++++++++++++++++++++++++++++++++++++++++ AGENTS.md | 1 + CLAUDE.md | 1 + 7 files changed, 1741 insertions(+) create mode 100644 .task/AGENTS.md create mode 100644 .task/plan.md create mode 100644 .task/research.md create mode 100644 .task/saacogs.json create mode 100644 .task/task.md create mode 120000 AGENTS.md create mode 120000 CLAUDE.md diff --git a/.task/AGENTS.md b/.task/AGENTS.md new file mode 100644 index 00000000..ff9f7032 --- /dev/null +++ b/.task/AGENTS.md @@ -0,0 +1,3 @@ +At the start of the conversation output the following so that I know you've read these instructions: + +📖 omes:saa-cogs diff --git a/.task/plan.md b/.task/plan.md new file mode 100644 index 00000000..acc2e53d --- /dev/null +++ b/.task/plan.md @@ -0,0 +1,110 @@ +# Implementation Plan: SAA Load Generation + +## Goal + +Create two omes scenarios to generate SAW and SAA workloads against cloud cell `s-saa-cogs`, then +observe metrics on the Grafana dashboard. + +## Design + +### Scenarios + +Both scenarios use `GenericExecutor` with a simple `Execute` function. This keeps the +implementations symmetric — the only difference is what each iteration does, which is exactly the +variable under test. + +**`workflow_with_single_activity`** — Each iteration calls `client.ExecuteWorkflow` with a dedicated +minimal workflow that executes one `payload` activity and returns. Then `handle.Get()`. 
+ +**`standalone_activity`** — Each iteration calls `client.ExecuteActivity` with the same `payload` +activity. Then `handle.Get()`. No workflow involved. + +Both use the same task queue (derived from run-id) and the same Go worker. + +### Worker code + +Reuse the existing `payload` activity at [kitchen_sink.go:511-516](workers/go/kitchensink/kitchen_sink.go#L511-L516), +already registered as `"payload"` at [worker.go:105](workers/go/worker/worker.go#L105). + +Add one new workflow: a minimal function that executes the `payload` activity with its input and +returns the result. Register it on the existing Go worker at [worker.go:102](workers/go/worker/worker.go#L102) +alongside the existing registrations. No new worker binary needed. + +### Activity configuration + +Both scenarios: `inputData []byte` (256 bytes), `bytesToReturn int32` (256). No heartbeat. +`MaximumAttempts: 1` (no retries). `ScheduleToCloseTimeout: 60s`. + +### SDK version + +`go.temporal.io/sdk v1.40.0` already includes `client.ExecuteActivity`. No upgrade needed. + +## Implementation steps + +IMPORTANT: Rather than doing the implementation yourself, please "teach" the user to do the +implementation themselves. Take a "painting by numbers" approach: Decide on the first component they +should write, and insert a comment in the code indicating what they should do. Then pause and give +them clickable links to the comment, and to any existing prior art in the codebase they might want +to refer to. Don't output code directly to them. Work with them to complete the stage; review their +work carefully. Do not consider the stage complete until the work is done to an equal or greater +standard than you yourself would have achieved. When that stage is completed by them, or with +further assistance from you, move on to the next component to be implemented and repeat this +procedure. + +Regarding names: we will not use "cogs" anywhere in omes code itself. 
Conceptually, the omes code is +defining SAW and SAA workloads. What those are used for (to run an experiment) and why (COGS +investigation) is not the concern of the omes code. + +### Step 1: Add workflow to worker + +Add a small file under `workers/go/` with the minimal workflow function. Register it in +[worker.go](workers/go/worker/worker.go) alongside existing registrations. + +### Step 2: Create `scenarios/workflow_with_single_activity.go` + +`GenericExecutor` whose `Execute` function: +1. Calls `run.Client.ExecuteWorkflow()` starting the new workflow with the payload input. +2. Calls `handle.Get()` to wait for result. + +### Step 3: Create `scenarios/standalone_activity.go` + +`GenericExecutor` whose `Execute` function: +1. Calls `run.Client.ExecuteActivity()` with `StartActivityOptions` (ID derived from + run/execution/iteration, task queue from `run.TaskQueue()`, same timeout and retry policy). +2. Passes activity type `"payload"` by name with `[]byte` (256 zeros) and `int32(256)`. +3. Calls `handle.Get()` to wait for result. + +### Step 4: Create `commands.sh` + +Useful shell commands with terse comments for: +- Local testing with `--embedded-server` +- Cloud cell verification via `ct` +- Running scenarios against `s-saa-cogs` + +### Step 5: Test locally + +- `go build ./...` and `go vet ./...` +- `go run ./cmd list-scenarios` shows both new scenarios +- SAW: `go run ./cmd run-scenario-with-worker --scenario workflow_with_single_activity --language go --iterations 5 --embedded-server` +- SAA: same command with `standalone_activity` — will get "Standalone activity is disabled" from the + embedded dev server (v1.30.1 doesn't have the feature flag), confirming the code path reaches + `StartActivityExecution`. Will succeed on the cloud cell. + +### Step 6: Connect to cloud cell + +1. Verify cell: `ct kubectl --context s-saa-cogs get pods -n temporal` +2. 
Check namespace: `ct admintools --context s-saa-cogs -- temporal operator namespace describe s-saa-cogs-marathon.e2e` +3. Obtain operator TLS certs (from k8s secrets via `ct`, or ask Stephen) +4. Point Grafana dashboard at `s-saa-cogs`, observe idle state +5. Run worker + SAW scenario against the cell, observe activity in dashboard +6. Run worker + SAA scenario, observe activity + +## Verification + +1. **Build**: `go build ./...` succeeds. +2. **Lint/vet**: `go vet ./...` clean on our files. +3. **List scenarios**: `go run ./cmd list-scenarios` includes both names. +4. **Local test — SAW**: `run-scenario-with-worker --embedded-server --iterations 5` completes. +5. **Local test — SAA**: Same command hits `StartActivityExecution` on the server (expected to fail + on dev server with "disabled" error; succeeds on cloud cell with CHASM enabled). +6. **Cloud cell proof-of-concept**: Dashboard shows idle -> run scenario -> dashboard shows activity. diff --git a/.task/research.md b/.task/research.md new file mode 100644 index 00000000..cab82543 --- /dev/null +++ b/.task/research.md @@ -0,0 +1,159 @@ +# SAA COGS Experiment: Research & Design + +## 1. Current State of Omes + +### Architecture +Omes is a load generation framework for Temporal. Scenarios are Go files in `scenarios/` that +register via `init()` → `loadgen.MustRegisterScenario()`. The scenario name comes from the +filename. Execution flows: + +1. `run-scenario` command: dials Temporal, runs scenario executor +2. `run-worker` command: starts a worker (Go/Python/etc) polling a task queue +3. 
`run-scenario-with-worker`: runs both together (local development) + +### Executor Types +- `GenericExecutor`: takes a `func(ctx, *Run) error` — most flexible +- `KitchenSinkExecutor`: wraps `GenericExecutor`, starts kitchen-sink workflows with configurable action sequences +- `FuzzExecutor`: random action generation + +### Existing Standalone Activity Support +Branch `standalone-activity` (commit `efbbb7f`) adds SAA to the `throughput_stress` scenario as +an *optional extra activity within a workflow*. The implementation: + +1. Proto: `StandaloneActivity` message in `kitchen_sink.proto` +2. Helper: `StandaloneActivity()` in `loadgen/kitchensink/helpers.go` creates an action +3. Worker: `ExecuteStandaloneActivity()` in `workers/go/kitchensink/kitchen_sink.go` — called as a + *workflow activity* that internally calls `StartActivityExecution` + `PollActivityExecution` +4. Scenario: enabled via `--option enable-standalone-activity=true` + +**Critical observation**: This existing support executes SAA *from within a workflow activity*. +That is useful for testing SAA functionality but **not** for the COGS experiment. For COGS, we need +to run SAA directly from the load generator (no workflow involved) so that the only server-side +work is the standalone activity execution itself. + +## 2. 
What We Need for the COGS Experiment + +### Two New Scenarios + +**`saa_cogs_saw`** — Single Activity Workflow (the baseline): +- Each iteration: start a workflow that executes one activity (payload: 256B in, 256B out), then completes +- This is very close to `workflow_with_single_noop_activity` but with a payload activity + +**`saa_cogs_saa`** — Standalone Activity: +- Each iteration: call `StartActivityExecution` directly from the load generator, then + `PollActivityExecution` to wait for the result +- No workflow involved +- Same activity (payload: 256B in, 256B out) and task queue +- **Requires a `GenericExecutor`** since `KitchenSinkExecutor` always starts workflows + +Both scenarios must use the same worker (the Go worker with `payload` activity registered). + +### Key Design Decisions + +1. **Activity type**: `payload` with 256B input, 256B output (matching the COGS analysis) +2. **No heartbeat, no retry** (matching the COGS analysis; retry max_attempts=1) +3. **Fixed start rate** (not fixed concurrency) — controls for latency differences +4. **Same task queue** for both scenarios — ensures same worker setup +5. **Sync match preferred** — the COGS analysis assumes sync match; verify via metrics + +### SAA Load Generator Implementation + +The SAA scenario needs to call gRPC APIs directly. Looking at the existing +`ExecuteStandaloneActivity` in the worker code (`workers/go/kitchensink/kitchen_sink.go:46-120`), +we have a working reference. The scenario version should: + +1. Use `client.WorkflowService()` to get the gRPC client +2. Call `StartActivityExecution` with the activity config +3. Call `PollActivityExecution` to wait for completion +4. This is a `GenericExecutor` with a custom `Execute` function + +## 3. Cloud Cell Operations + +### Connecting to a Cloud Cell + +From `bench-go.mdx`, the namespace format for test cells is `{cellId}-marathon.e2e` and the host +is `{cellId}-marathon.e2e.tmprl-test.cloud:7233`. 
For our cell `s-saa-cogs`: +- Namespace: `s-saa-cogs-marathon.e2e` (to be confirmed — Stephen may have set up differently) +- Host: `s-saa-cogs-marathon.e2e.tmprl-test.cloud:7233` + +Omes connects via: +``` +--server-address --namespace --tls --tls-cert-path --tls-key-path +``` + +Or with API key auth: +``` +--server-address --namespace --tls --auth-header "Bearer " +``` + +### Running omes against a cloud cell + +Two options: +1. **Local**: Run `go run ./cmd run-scenario` and `go run ./cmd run-worker` locally, connecting to + the cloud cell via TLS. Simplest for proof-of-concept. Higher latency (network round trip to + cloud) but the load generator itself isn't on the critical path for COGS measurement. +2. **K8s pod**: Deploy omes worker as a pod on the cell's k8s cluster. Lower latency, more + realistic. The bench-go runbook shows this is the standard approach. Uses `omni scaffold` with + `--benchgo-enabled` or manual deployment. + +For initial proof-of-concept: run locally. For the actual experiment: deploy to k8s. + +### Grafana Dashboard + +The dashboard at `https://grafana.tmprl-internal.cloud/d/saacogs/saa-cogs` uses a `$cluster` +variable. Set `cluster=s-saa-cogs` to point at our cell. + +### Cell Setup Verification + +Use `ct` / `omni` to verify cell state: +```sh +# Check cell status +ct kubectl --context s-saa-cogs get pods -n temporal + +# Check namespace exists +omni admintools --context s-saa-cogs -- temporal operator namespace describe s-saa-cogs-marathon.e2e +``` + +### Search Attributes + +Cloud cells cannot register search attributes via the SDK — they must be registered via the +control plane. The `--do-not-register-search-attributes` flag exists for this. We should use it, +and register `OmesExecutionID` separately if needed. For the simple COGS scenarios, we may not +even need search attributes. + +## 4. Implementation Plan + +### Phase 1: Minimal Scenarios (omes code changes) + +1. 
Create `scenarios/saa_cogs_saw.go` — SAW scenario using `KitchenSinkExecutor` +2. Create `scenarios/saa_cogs_saa.go` — SAA scenario using `GenericExecutor` with direct gRPC calls +3. Both share config: payload size, start rate, duration + +### Phase 2: Local Proof-of-Concept + +1. Test both scenarios against local Temporal server +2. Run `go run ./cmd run-scenario-with-worker` for SAW +3. For SAA: run worker separately, then scenario (since SAA doesn't use workflows but the + worker still needs to poll for activity tasks) + +### Phase 3: Cloud Cell Connection + +1. Obtain credentials for s-saa-cogs cell +2. Verify dashboard shows idle state +3. Run a single SAW iteration and observe metrics +4. Run a single SAA iteration and observe metrics + +### Phase 4: Full Experiment + +1. Deploy omes worker to cloud cell k8s +2. Run SAW at target start rate for target duration +3. Wait for cool-down, collect metrics +4. Run SAA at same start rate for same duration +5. Collect and compare metrics + +## 5. Open Questions + +- What namespace(s) are configured on s-saa-cogs? +- How do we obtain TLS certs or API keys for the cell? (Check oncall or runbooks repos or search slack) +- Does the cell have CHASM standalone activities enabled? (Dynamic config flag) +- Worker deployment: should we use the existing bench-go infrastructure or deploy omes directly? 
diff --git a/.task/saacogs.json b/.task/saacogs.json new file mode 100644 index 00000000..bb30e263 --- /dev/null +++ b/.task/saacogs.json @@ -0,0 +1,703 @@ +{ + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": { "type": "grafana", "uid": "-- Grafana --" }, + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + } + ] + }, + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 0, + "links": [], + "panels": [ + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "fieldConfig": { + "defaults": { + "color": { "mode": "palette-classic" }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { "legend": false, "tooltip": false, "viz": false }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { "type": "linear" }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { "group": "A", "mode": "none" }, + "thresholdsStyle": { "mode": "off" } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { "color": "green", "value": null }, + { "color": "red", "value": 80 } + ] + } + }, + "overrides": [] + }, + "gridPos": { "h": 8, "w": 12, "x": 0, "y": 0 }, + "id": 1, + "options": { + "legend": { "calcs": [], "displayMode": "list", "placement": "bottom", "showLegend": true }, + "tooltip": { "mode": "single", "sort": "none" } + }, + "pluginVersion": "11.4.0", + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "editorMode": "code", + "expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\", pod=~\"frontend-.*\"})", + "legendFormat": "frontend", + "range": true, + "refId": 
"A" + }, + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "editorMode": "code", + "expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\", pod=~\"history-.*\"})", + "legendFormat": "history", + "range": true, + "refId": "B" + }, + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "editorMode": "code", + "expr": "sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{cluster=\"$cluster\", pod=~\"matching-.*\"})", + "legendFormat": "matching", + "range": true, + "refId": "C" + } + ], + "title": "CPU per service (vCPU)", + "type": "timeseries" + }, + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "fieldConfig": { + "defaults": { + "color": { "mode": "palette-classic" }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { "legend": false, "tooltip": false, "viz": false }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { "type": "linear" }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { "group": "A", "mode": "none" }, + "thresholdsStyle": { "mode": "off" } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { "color": "green", "value": null }, + { "color": "red", "value": 80 } + ] + }, + "unit": "decbytes" + }, + "overrides": [] + }, + "gridPos": { "h": 8, "w": 12, "x": 12, "y": 0 }, + "id": 2, + "options": { + "legend": { "calcs": [], "displayMode": "list", "placement": "bottom", "showLegend": true }, + "tooltip": { "mode": "single", "sort": "none" } + }, + "pluginVersion": "11.4.0", + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "editorMode": "code", + "expr": 
"quantile(0.5, sum by (pod) (container_memory_working_set_bytes{cluster=\"$cluster\",container!=\"\",image!=\"\",namespace=\"temporal\"} * on (namespace, pod) group_left (workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\",namespace=\"temporal\",workload=\"frontend\",workload_type=\"deployment\"}))", + "legendFormat": "frontend", + "range": true, + "refId": "A" + }, + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "editorMode": "code", + "expr": "quantile(0.5, sum by (pod) (container_memory_working_set_bytes{cluster=\"$cluster\",container!=\"\",image!=\"\",namespace=\"temporal\"} * on (namespace, pod) group_left (workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\",namespace=\"temporal\",workload=\"history\",workload_type=\"deployment\"}))", + "legendFormat": "history", + "range": true, + "refId": "B" + }, + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "editorMode": "code", + "expr": "quantile(0.5, sum by (pod) (container_memory_working_set_bytes{cluster=\"$cluster\",container!=\"\",image!=\"\",namespace=\"temporal\"} * on (namespace, pod) group_left (workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"$cluster\",namespace=\"temporal\",workload=\"matching\",workload_type=\"deployment\"}))", + "legendFormat": "matching", + "range": true, + "refId": "C" + } + ], + "title": "Memory per service (p50 working set)", + "type": "timeseries" + }, + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "fieldConfig": { + "defaults": { + "color": { "mode": "palette-classic" }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { "legend": false, "tooltip": false, "viz": false }, + 
"insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { "type": "linear" }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { "group": "A", "mode": "none" }, + "thresholdsStyle": { "mode": "off" } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { "color": "green", "value": null }, + { "color": "red", "value": 80 } + ] + } + }, + "overrides": [] + }, + "gridPos": { "h": 8, "w": 8, "x": 0, "y": 8 }, + "id": 3, + "options": { + "legend": { "calcs": [], "displayMode": "list", "placement": "bottom", "showLegend": true }, + "tooltip": { "mode": "single", "sort": "none" } + }, + "pluginVersion": "11.4.0", + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "editorMode": "code", + "expr": "sum by (operation)(rate(service_requests{cluster=\"$cluster\",temporal_service_type=\"frontend\"}[$__rate_interval]))", + "legendFormat": "{{operation}}", + "range": true, + "refId": "A" + } + ], + "title": "Frontend RPC by method (req/s)", + "type": "timeseries" + }, + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "fieldConfig": { + "defaults": { + "color": { "mode": "palette-classic" }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { "legend": false, "tooltip": false, "viz": false }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { "type": "linear" }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { "group": "A", "mode": "none" }, + "thresholdsStyle": { "mode": "off" } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { "color": "green", "value": null }, + { "color": "red", "value": 80 } + ] + } + 
}, + "overrides": [] + }, + "gridPos": { "h": 8, "w": 8, "x": 8, "y": 8 }, + "id": 4, + "options": { + "legend": { "calcs": [], "displayMode": "list", "placement": "bottom", "showLegend": true }, + "tooltip": { "mode": "single", "sort": "none" } + }, + "pluginVersion": "11.4.0", + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "editorMode": "code", + "expr": "sum by (operation)(rate(service_requests{cluster=\"$cluster\",temporal_service_type=\"history\"}[$__rate_interval]))", + "legendFormat": "{{operation}}", + "range": true, + "refId": "A" + } + ], + "title": "History RPC by method (req/s)", + "type": "timeseries" + }, + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "fieldConfig": { + "defaults": { + "color": { "mode": "palette-classic" }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { "legend": false, "tooltip": false, "viz": false }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { "type": "linear" }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { "group": "A", "mode": "none" }, + "thresholdsStyle": { "mode": "off" } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { "color": "green", "value": null }, + { "color": "red", "value": 80 } + ] + } + }, + "overrides": [] + }, + "gridPos": { "h": 8, "w": 8, "x": 16, "y": 8 }, + "id": 5, + "options": { + "legend": { "calcs": [], "displayMode": "list", "placement": "bottom", "showLegend": true }, + "tooltip": { "mode": "single", "sort": "none" } + }, + "pluginVersion": "11.4.0", + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "editorMode": "code", + "expr": "sum by 
(operation)(rate(service_requests{cluster=\"$cluster\",temporal_service_type=\"matching\"}[$__rate_interval]))", + "legendFormat": "{{operation}}", + "range": true, + "refId": "A" + } + ], + "title": "Matching RPC by method (req/s)", + "type": "timeseries" + }, + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "fieldConfig": { + "defaults": { + "color": { "mode": "palette-classic" }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { "legend": false, "tooltip": false, "viz": false }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { "type": "linear" }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { "group": "A", "mode": "none" }, + "thresholdsStyle": { "mode": "off" } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { "color": "green", "value": null }, + { "color": "red", "value": 80 } + ] + } + }, + "overrides": [] + }, + "gridPos": { "h": 8, "w": 12, "x": 0, "y": 16 }, + "id": 6, + "options": { + "legend": { "calcs": [], "displayMode": "list", "placement": "bottom", "showLegend": true }, + "tooltip": { "mode": "single", "sort": "none" } + }, + "pluginVersion": "11.4.0", + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "editorMode": "code", + "expr": "sum by (table)(rate(cassandra_query{cluster=\"$cluster\",verb!=\"select\"}[$__rate_interval]))", + "legendFormat": "query: {{table}}", + "range": true, + "refId": "A" + }, + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "editorMode": "code", + "expr": "sum by (table)(rate(cassandra_batch{cluster=\"$cluster\"}[$__rate_interval]))", + "legendFormat": "batch: {{table}}", + "range": true, + "refId": "B" + } + 
], + "title": "Astra writes by table (req/s)", + "description": "Validate r_Cass = 3/7 for writes. cassandra_query filtered to verb!=select; cassandra_batch is always writes.", + "type": "timeseries" + }, + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "fieldConfig": { + "defaults": { + "color": { "mode": "palette-classic" }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { "legend": false, "tooltip": false, "viz": false }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { "type": "linear" }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { "group": "A", "mode": "none" }, + "thresholdsStyle": { "mode": "off" } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { "color": "green", "value": null }, + { "color": "red", "value": 80 } + ] + } + }, + "overrides": [] + }, + "gridPos": { "h": 8, "w": 12, "x": 12, "y": 16 }, + "id": 7, + "options": { + "legend": { "calcs": [], "displayMode": "list", "placement": "bottom", "showLegend": true }, + "tooltip": { "mode": "single", "sort": "none" } + }, + "pluginVersion": "11.4.0", + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "editorMode": "code", + "expr": "sum by (table)(rate(cassandra_query{cluster=\"$cluster\",verb=\"select\"}[$__rate_interval]))", + "legendFormat": "{{table}}", + "range": true, + "refId": "A" + } + ], + "title": "Astra reads by table (req/s)", + "description": "Reads are not expected to differ much between SAW and SAA (similar caching, ~1 read on creation).", + "type": "timeseries" + }, + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "fieldConfig": { + "defaults": { + "color": { "mode": 
"palette-classic" }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { "legend": false, "tooltip": false, "viz": false }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { "type": "linear" }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { "group": "A", "mode": "none" }, + "thresholdsStyle": { "mode": "off" } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { "color": "green", "value": null }, + { "color": "red", "value": 80 } + ] + } + }, + "overrides": [] + }, + "gridPos": { "h": 8, "w": 12, "x": 0, "y": 24 }, + "id": 8, + "options": { + "legend": { "calcs": [], "displayMode": "list", "placement": "bottom", "showLegend": true }, + "tooltip": { "mode": "single", "sort": "none" } + }, + "pluginVersion": "11.4.0", + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "editorMode": "code", + "expr": "sum by (walType)(rate(wal_latency_count{cluster=\"$cluster\"}[$__rate_interval]))", + "legendFormat": "{{walType}}", + "range": true, + "refId": "A" + } + ], + "title": "WAL operation rate by type (ops/s)", + "description": "Covers both reads and writes (no separate write-only metric). 
Expect HISTORY_EVENT_WAL activity for SAW only; both use MUTABLE_STATE_WAL.", + "type": "timeseries" + }, + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "fieldConfig": { + "defaults": { + "color": { "mode": "palette-classic" }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { "legend": false, "tooltip": false, "viz": false }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { "type": "linear" }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { "group": "A", "mode": "none" }, + "thresholdsStyle": { "mode": "off" } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { "color": "green", "value": null }, + { "color": "red", "value": 80 } + ] + } + }, + "overrides": [] + }, + "gridPos": { "h": 8, "w": 12, "x": 12, "y": 24 }, + "id": 9, + "options": { + "legend": { "calcs": [], "displayMode": "list", "placement": "bottom", "showLegend": true }, + "tooltip": { "mode": "single", "sort": "none" } + }, + "pluginVersion": "11.4.0", + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "editorMode": "code", + "expr": "sum by (operation)(rate(visibility_persistence_requests{cluster=\"$cluster\"}[$__rate_interval]))", + "legendFormat": "{{operation}}", + "range": true, + "refId": "A" + } + ], + "title": "Visibility persistence rate by operation (ops/s)", + "description": "OSS visibility_persistence_requests counter, tagged by operation (RecordWorkflowExecutionStarted, RecordWorkflowExecutionClosed, UpsertWorkflowExecution, DeleteWorkflowExecution).", + "type": "timeseries" + }, + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "fieldConfig": { + "defaults": { + "color": { 
"mode": "palette-classic" }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { "legend": false, "tooltip": false, "viz": false }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { "type": "linear" }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { "group": "A", "mode": "none" }, + "thresholdsStyle": { "mode": "off" } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { "color": "green", "value": null }, + { "color": "red", "value": 80 } + ] + } + }, + "overrides": [] + }, + "gridPos": { "h": 8, "w": 24, "x": 0, "y": 32 }, + "id": 10, + "options": { + "legend": { "calcs": [], "displayMode": "list", "placement": "bottom", "showLegend": true }, + "tooltip": { "mode": "single", "sort": "none" } + }, + "pluginVersion": "11.4.0", + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "editorMode": "code", + "expr": "sum(rate(syncmatch_latency_count{cluster=\"$cluster\"}[$__rate_interval]))", + "legendFormat": "sync match", + "range": true, + "refId": "A" + }, + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "editorMode": "code", + "expr": "sum(rate(asyncmatch_latency_count{cluster=\"$cluster\"}[$__rate_interval]))", + "legendFormat": "async match", + "range": true, + "refId": "B" + } + ], + "title": "Sync vs async match rate (matches/s)", + "description": "Health check on experimental conditions. 
Async match means tasks went through persistence/backlog rather than being dispatched directly to a waiting poller.", + "type": "timeseries" + } + ], + "preload": false, + "schemaVersion": 40, + "tags": [], + "templating": { + "list": [ + { + "current": { "text": "prod", "value": "prod" }, + "name": "env", + "options": [ + { "selected": true, "text": "prod", "value": "prod" }, + { "selected": false, "text": "dev", "value": "test" } + ], + "query": "prod : prod, dev : test", + "type": "custom" + }, + { + "current": { "text": "prod thanos", "value": "af7fe237-211e-413e-9723-41a73886bcbb" }, + "hide": 2, + "includeAll": false, + "name": "datasource", + "options": [], + "query": "prometheus", + "refresh": 1, + "regex": "${env:text}.*", + "type": "datasource" + }, + { + "current": {}, + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "definition": "label_values(restarts,cluster)", + "includeAll": false, + "label": "Cluster", + "name": "cluster", + "options": [], + "query": { "query": "label_values(restarts,cluster)", "refId": "StandardVariableQuery" }, + "refresh": 2, + "regex": "", + "type": "query" + } + ] + }, + "time": { "from": "now-3h", "to": "now" }, + "timepicker": {}, + "timezone": "utc", + "title": "SAA COGS", + "uid": "saacogs", + "version": 1, + "weekStart": "" +} diff --git a/.task/task.md b/.task/task.md new file mode 100644 index 00000000..b9b47cc3 --- /dev/null +++ b/.task/task.md @@ -0,0 +1,764 @@ +For background context, please study the following documents carefully: + +START_DOCUMENT------------------------------------------------------------------------------ +# Temporal Activity Execution & saas-temporal Cloud Persistence: Implementation Overview + +## Part 1: Activity Execution Models in Temporal Server + +### 1.1 CHASM Standalone Activities (`chasm/lib/activity/`) + +CHASM standalone activities are first-class, independently-scheduled executions outside workflow context. They use **mutable state only** -- no history events. 
+ +#### State Machine + +States defined in `chasm/lib/activity/proto/v1/activity_state.proto`: + +``` +UNSPECIFIED + → SCHEDULED + → STARTED + → COMPLETED (terminal) + → FAILED (terminal) + → CANCEL_REQUESTED → CANCELED (terminal) + → TIMED_OUT (terminal) + → TERMINATED (terminal) + → CANCEL_REQUESTED → CANCELED (terminal) + → TIMED_OUT (terminal) + → TERMINATED (terminal) + → SCHEDULED (retry path) +``` + +Lifecycle states (`activity.go:95-107`): +- `LifecycleStateRunning`: SCHEDULED, STARTED, CANCEL_REQUESTED +- `LifecycleStateCompleted`: COMPLETED +- `LifecycleStateFailed`: FAILED, TERMINATED, TIMED_OUT, CANCELED + +#### State Transitions (`statemachine.go`) + +| Transition | From | To | Trigger | +|---|---|---|---| +| TransitionScheduled (37-77) | UNSPECIFIED | SCHEDULED | Initial scheduling | +| TransitionRescheduled (87-127) | STARTED | SCHEDULED | Retry after failure | +| TransitionStarted (130-169) | SCHEDULED | STARTED | Worker accepts task | +| TransitionCompleted (177-202) | STARTED/CANCEL_REQUESTED | COMPLETED | Worker completes | +| TransitionFailed (210-237) | STARTED/CANCEL_REQUESTED | FAILED | Non-retryable failure | +| TransitionCancelRequested (278-295) | STARTED/SCHEDULED | CANCEL_REQUESTED | Cancel API called | +| TransitionCanceled (304-331) | CANCEL_REQUESTED | CANCELED | Worker acknowledges cancel | +| TransitionTerminated (246-275) | SCHEDULED/STARTED/CANCEL_REQUESTED | TERMINATED | Terminate API called | +| TransitionTimedOut (340-374) | SCHEDULED/STARTED/CANCEL_REQUESTED | TIMED_OUT | Timer task fires | + +#### Mutable State Structures + +**ActivityState** (proto): +- `activity_type`, `task_queue`, timeouts (`schedule_to_close`, `schedule_to_start`, `start_to_close`, `heartbeat`), `retry_policy`, `status`, `schedule_time`, `priority`, `cancel_state`, `terminate_state` + +**Activity Go Component** (`activity.go:52-68`): +- `ActivityState` (embedded proto) +- `Visibility: chasm.Field[*chasm.Visibility]` -- search attributes +- `LastAttempt: 
chasm.Field[*ActivityAttemptState]` -- attempt count, stamp, started_time, failure details, worker identity +- `LastHeartbeat: chasm.Field[*ActivityHeartbeatState]` -- heartbeat details and recorded_time +- `RequestData: chasm.Field[*ActivityRequestData]` -- input, header, user_metadata +- `Outcome: chasm.Field[*ActivityOutcome]` -- successful (output) or failed (failure) +- `Store: chasm.ParentPtr[ActivityStore]` -- parent workflow (nil for standalone) + +#### Task Flow + +1. **Scheduling** (`handler.go:51-104`): `StartActivityExecution()` → creates Activity → applies TransitionScheduled +2. **Dispatch** (`activity_tasks.go:21-79`): `activityDispatchTaskExecutor` pushes to matching service via `AddActivityTask()` +3. **Start** (`activity.go:173-191`): `HandleStarted()` applies TransitionStarted, schedules start-to-close and heartbeat timeout tasks +4. **Completion** (`activity.go:259-280`): `HandleCompleted()` applies TransitionCompleted +5. **Failure** (`activity.go:284-323`): `HandleFailed()` checks retryability → either `tryReschedule()` or TransitionFailed +6. 
**Heartbeat** (`activity.go:559-586`): Updates LastHeartbeat, reschedules heartbeat timeout task + +#### Timeout Tasks + +- **ScheduleToStartTimeoutTask** (`activity_tasks.go:81-116`): Non-retryable → TIMED_OUT +- **ScheduleToCloseTimeoutTask** (`activity_tasks.go:118-150`): Non-retryable → TIMED_OUT +- **StartToCloseTimeoutTask** (`activity_tasks.go:152-198`): Attempts retry via `tryReschedule()`; if not retryable → TIMED_OUT +- **HeartbeatTimeoutTask** (`activity_tasks.go:200-276`): Validates heartbeat recency; attempts retry; if not retryable → TIMED_OUT + +#### Retry Logic + +- `shouldRetry()` (`activity.go:504-514`): Checks TransitionRescheduled possible, attempt < max, enough time remaining +- `hasEnoughTimeForRetry()` (`activity.go:518-534`): Exponential backoff calculation against schedule-to-close deadline +- `tryReschedule()` (`activity.go:489-502`): Applies TransitionRescheduled (increments attempt, schedules dispatch with backoff) + +#### Cancellation + +- `RequestCancelActivityExecution` (`handler.go:273-296`): Applies TransitionCancelRequested + - If SCHEDULED: immediately applies TransitionCanceled (`activity.go:414-433`) + - If STARTED: stays CANCEL_REQUESTED; worker receives cancellation on next interaction + +--- + +### 1.2 Legacy Workflow Activities + +Activities executed as part of a workflow use **mutable state (ActivityInfo) plus history events**. 
+ +#### History Events + +``` +EVENT_TYPE_ACTIVITY_TASK_SCHEDULED (10) +EVENT_TYPE_ACTIVITY_TASK_STARTED (11) +EVENT_TYPE_ACTIVITY_TASK_COMPLETED (12) +EVENT_TYPE_ACTIVITY_TASK_FAILED (13) +EVENT_TYPE_ACTIVITY_TASK_TIMED_OUT (14) +EVENT_TYPE_ACTIVITY_TASK_CANCEL_REQUESTED (15) +EVENT_TYPE_ACTIVITY_TASK_CANCELED (16) +``` + +#### ActivityInfo Mutable State (`persistence/v1/executions.proto:524-661`) + +Core: `activity_id`, `activity_type`, `task_queue`, `scheduled_time`, `started_time`, `started_event_id`, `scheduled_event_id` + +Timeouts: `schedule_to_close_timeout`, `schedule_to_start_timeout`, `start_to_close_timeout`, `heartbeat_timeout` + +Retry: `attempt`, `has_retry_policy`, `retry_initial_interval`, `retry_maximum_interval`, `retry_maximum_attempts`, `retry_backoff_coefficient`, `retry_expiration_time`, `retry_non_retryable_error_types`, `retry_last_failure` + +State flags: `cancel_requested`, `cancel_request_id`, `timer_task_status` (bit flags), `stamp`, `paused`, `pause_info` + +#### Pending Activity States (`activity.go:53-61`) + +- SCHEDULED: `StartedEventId == 0` +- STARTED: `StartedEventId != 0 && !CancelRequested` +- CANCEL_REQUESTED: `CancelRequested` +- PAUSED: `Paused && Scheduled` +- PAUSE_REQUESTED: `Paused && Started` + +#### Timer Task Status Flags + +```go +TimerTaskStatusCreatedScheduleToStart = 1 +TimerTaskStatusCreatedScheduleToClose = 2 +TimerTaskStatusCreatedStartToClose = 4 +TimerTaskStatusCreatedHeartbeat = 8 +``` + +#### Pause/Unpause/Reset (unique to legacy model) + +- **Pause** (`activity.go:254-284`): Sets `paused = true`, increments stamp if SCHEDULED +- **Unpause** (`activity.go:388-425`): Clears pause, regenerates retry task if SCHEDULED +- **Reset** (`activity.go:286-379`): Resets attempt to 1, optionally resets heartbeat/options + +#### API Handlers (`service/history/api/`) + +- `recordactivitytaskstarted/api.go`: Creates ActivityTaskStartedEvent +- `respondactivitytaskcompleted/api.go`: Creates ActivityTaskCompletedEvent +- 
`respondactivitytaskfailed/api.go`: Retry or ActivityTaskFailedEvent +- `respondactivitytaskcanceled/api.go`: Creates ActivityTaskCanceledEvent +- `recordactivitytaskheartbeat/api.go`: Updates heartbeat state, reschedules timeout + +--- + +### 1.3 Activity Metrics (Both Models) + +Defined in `common/metrics/metric_defs.go`. Both models emit the same metric names. + +**Counters:** +| Metric | Description | +|---|---| +| `activity_success` | Successful completions (excludes retries) | +| `activity_fail` | Final failures (retries exhausted) | +| `activity_task_fail` | Per-attempt failures (includes retries) | +| `activity_cancel` | Canceled activities | +| `activity_terminate` | Terminated activities (CHASM only) | +| `activity_timeout` | Terminal timeouts | +| `activity_task_timeout` | Per-timeout events (includes retries) | + +**Timers:** +| Metric | Description | +|---|---| +| `activity_start_to_close_latency` | StartedTime → completion/failure/timeout | +| `activity_schedule_to_close_latency` | ScheduleTime → completion/failure/timeout/cancel | + +**Tags:** `namespace`, `task_queue_family`, `operation`, `activity_type`, `versioning_behavior`, `workflow_type` (set to `__temporal_standalone_activity__` for CHASM). Timeout metrics additionally tagged with `timeout_type` (SCHEDULE_TO_START, SCHEDULE_TO_CLOSE, START_TO_CLOSE, HEARTBEAT). + +**Metric enrichment** (`activity.go:804-824`): `enrichMetricsHandler()` adds per-task-queue-family scoping via `metrics.GetPerTaskQueueFamilyScope()`. 
+ +--- + +### 1.4 Key Differences + +| Aspect | CHASM Standalone | Legacy Workflow | +|---|---|---| +| Persistence | Mutable state only | Mutable state + history events | +| Parent context | Standalone execution | Part of workflow execution | +| State tracking | ActivityState + sub-components | ActivityInfo in workflow | +| Task dispatch | Direct to matching service | Via workflow task completion | +| Completion storage | Outcome field | History events | +| Cancellation | Explicit CANCEL_REQUESTED state | Boolean flag in ActivityInfo | +| Pause support | Not yet implemented | Full (pause, unpause, reset) | +| Search attributes | Visibility component (chasm) | Workflow search attributes | + +--- + +## Part 2: saas-temporal Cloud Integration + +### 2.1 Architecture Overview + +saas-temporal wraps the Temporal server to run in Temporal Cloud cells by replacing core persistence with Cloud Data Storage (CDS), backed by: +- **Datastax Astra Cassandra** for durable storage +- **Write-Ahead Logs (WALs)** for durability before Cassandra persistence +- **OpenSearch/Elasticsearch** for workflow visibility +- **Tiered Storage** (S3/GCS/Azure) for history archival + +### 2.2 Entry Point and Server Construction + +**Main:** `cmd/temporal-service/main.go` + +The `start` command: +1. Loads OSS Temporal configuration from YAML +2. Injects secrets (Astra, Elasticsearch credentials) +3. Sets up dynamic configuration +4. Optionally enables cloud metrics handler (Chronicle) +5. Configures authorization (SaaS Auth0 JWT + Temporal JWT) +6. Configures custom datastore with CDS +7. Creates server via `cds.NewServer()` + +**Server creation:** `cds/export/cds/server.go`: +```go +func NewServer(serviceFxOpts FxOptions, opts ...temporal.ServerOption) (temporal.Server, error) { + return newServerFx(TopLevelModule, serviceFxOpts, opts...) 
+} +``` + +Uses Uber FX dependency injection with modules for persistence factory, dynamic config, serialization, and per-service modules (history, matching, frontend, worker). + +### 2.3 CDS Factory Architecture (`cds/export/cds/factory.go`) + +**FactoryProvider** (lines 51-65): Implements `client.AbstractDataStoreFactory` +- `NumberOfShards`, `OrderedDatastoreConfigs` (shards → datastores) +- `HistoryDatastoreConfigs` (weighted distribution) +- `WALFollowerProviders` for WAL followers +- `Clock`, `DynamicConfig`, `ChasmRegistry` + +**Factory**: Manages three WAL pools: +- **MS WAL** (MutableState): Records mutable state mutations +- **HE WAL** (HistoryEvent): Records history events +- **LP WAL** (LargePayload): Records oversized payloads + +Plus store providers: `MultiDBStoreProvider` for ordinal datastores, separate history store provider with tiered storage, optional Walker integration. + +### 2.4 Astra Cassandra Integration (`cds/storage/cassandra/astra/`) + +**Session creation** (`gocql.go`): Wraps gocql with Astra-specific config (TLS, connection pooling, retry policies) via Datastax `gocql-astra`. + +**Query instrumentation** (`gocql_metrics.go:48-100`): `queryMetricsObserver` instruments every query with 150-entry LRU statement cache. + +**Cassandra Metrics:** +| Metric | Description | +|---|---| +| `CassandraConns` | Connection count | +| `CassandraQueryTotalLatency` | Query latency | +| `CassandraBatchTotalLatency` | Batch latency | +| `CassandraQuery` | Query count | +| `CassandraBytesTx` / `CassandraBytesRx` | Network bytes | +| `CassandraLargeResponse` / `CassandraLargeRequest` | Large payload detection | +| `CassandraRetries` | Retry histogram | +| `CassandraErrors` | Error counters | + +Tags: `OperationType` (INSERT/UPDATE/DELETE/SELECT), `TableName`, `CasTag` (CAS operation) + +### 2.5 Write-Ahead Logs (`cds/export/wal/`, `cds/stream/`) + +WALs provide durability guarantees before data reaches Cassandra.
+ +**WAL Client Interface** (`cds/export/wal/crud.go`): +```go +WriteMS(), WriteHE(), WriteLP() // Write operations per pool +ReadMS(), ReadHE(), ReadLP() // Read operations per pool +``` + +**Configuration** (`cds/config/configs.go:46-140`): +- Rate limiting: `WALReadsRate`, `WALReadsBurst` +- Timeouts: `WALDialTimeout`, `WALReadTimeout`, `WALWriteTimeout` +- Ledger rotation: `WALLedgerRotationBytesThreshold`, `WALLedgerRotationAgeThreshold` +- Retention: `WALLedgerLifetime` +- Parallelism: `WALMaxParallelReads` +- Feature flags: `WALReadV2Enabled`, `WALV2EncodingEnabled` + +**WAL Metrics** (`cds/metrics/metrics.go:34-56`): +| Metric | Description | +|---|---| +| `wal_latency` | Operation latency | +| `wal_stream_dial_attempt/success/error` | Connection establishment | +| `wal_stream_dns_latency` | DNS resolution | +| `wal_stream_connect_latency` | TCP connect | +| `wal_stream_handshake_latency` | TLS handshake | +| `wal_stream_send/receive_latency` | I/O latency | +| `wal_health_check_failed_count` | Connection health | +| `wal_write_timeout_count` | Timeout tracking | +| `wal_reader_page_latency` | Page read latency | +| `wal_entries_per_read` | Batch size histogram | +| `wal_compression_count` | Compression events | + +**Flush Metrics** (lines 13-27): +| Metric | Description | +|---|---| +| `flush_latency` | Time to flush to persistence | +| `flush_error` | Flush failures | +| `flush_snapshot_aborts` | Snapshot abort count | +| `flush_persistence_behindness_bytes/count/time` | Persistence lag | +| `flush_time_since_last_persist` | Staleness | +| `flush_reason_count` | Flush trigger reasons (by namespace) | + +**Recovery Metrics** (lines 57-70): +| Metric | Description | +|---|---| +| `recovery_total_latency` | Full recovery duration | +| `recovery_open_reader_latency` | Snapshot reader open | +| `recovery_rate_limiter_latency` | Rate limiting delay | +| `recovery_first_read_latency/bytes` | Initial WAL read | +| `recovery_takeover_latency` | Takeover phase | +| 
`recovery_wal_update_latency` | WAL update during recovery | + +**Ledger Metrics** (lines 77-82): +| Metric | Description | +|---|---| +| `ledger_rotation_count` | Rotations | +| `logs_per_ledger` | Logs per ledger histogram | +| `segments_per_shard` | Segments per shard histogram | +| `segment_too_old_count` | GC candidates | +| `active_segment_too_old_count` | Rotation delay | + +### 2.6 Execution Store Wrapper (`cds/export/cds/execution_store.go`) + +Wraps the Cassandra execution store to: +- Convert mutable state mutations to WAL records (`NewMSWALRecord()`) +- Convert history events to WAL records (`NewHEWALRecord()`) +- Calculate storage metering +- Manage snapshot trimming +- Implement history event caching + +Implements `persistence.ExecutionStore` and `persistence.ShardStore`. + +### 2.7 How Activity State Flows Through CDS + +**CHASM activities**: Activity mutable state → MS WAL write → Cassandra persistence. No HE WAL involvement (no history events). State transitions are persisted as mutable state mutations via the execution store wrapper. + +**Legacy workflow activities**: ActivityInfo mutable state → MS WAL write → Cassandra. History events (Scheduled, Started, Completed, etc.) → HE WAL write → Cassandra. Both paths go through the execution store wrapper's WAL record conversion. + +### 2.8 OpenSearch/Elasticsearch Visibility (`visibility/`) + +**Factory:** `visibility/factory.go` -- `VisibilityStoreFactory` creates visibility stores configured per cloud cell. 
+ +**Batch processor metrics** (`visibility/common/metrics_defs.go`): +| Metric | Description | +|---|---| +| `visibility_batch_processor_request_add_latency` | Enqueue time | +| `visibility_batch_processor_request_latency` | Total request latency | +| `visibility_batch_processor_request_errors` | Failed requests | +| `visibility_batch_processor_commit_latency` | Batch commit time | +| `visibility_batch_processor_batch_size` | Items per batch histogram | +| `visibility_batch_processor_batch_requests` | Requests per batch histogram | +| `visibility_batch_processor_queued_requests` | Queue depth histogram | +| `visibility_batch_processor_corrupted_data` | Data integrity failures | +| `visibility_batch_processor_duplicate_request` | Deduplication events | + +### 2.9 Tiered Storage (`cds/persistence/tieredstorage/`) + +Long-term history archival to cloud object stores: +- S3 (AWS): `s3_store.go` +- GCS (Google Cloud): `gcs_store.go` +- Azure Blob: `azure_client.go` + +Interface: `Upload()`, `Read()`, `Delete()`, `List()`, `PluginName()` + +Metrics: `ReadWorkflowHistory`, `UploadWorkflowHistory`, `DeleteWorkflowHistory`, `ListTieredStorageObjects` + +### 2.10 Persistence Store Metrics (`cds/persistence/metrics/defs.go`) + +**Store layer** (lines 70-85): +| Metric | Description | +|---|---| +| `store_requests` | Request count by operation | +| `store_latency` | Operation latency | +| `store_errors` | Errors: shard_exists, shard_ownership_lost, condition_failed, timeout, unavailable | + +**Manager layer** (lines 89-102): +| Metric | Description | +|---|---| +| `saas_persistence_requests` | High-level request count | +| `saas_persistence_latency` | High-level latency | +| `saas_persistence_errors` | Error tracking | + +Tags: `operation` (CreateShard, UpdateShard, GetWorkflowExecution, etc.), `component`, `cass_cluster` + +### 2.11 Cloud Metrics Infrastructure + +**Handler chain** (`cloudmetricshandler/delegating_recorders.go`): +1. 
`allowlistedRecorder`: Filters through allowlist +2. `multiRecorder`: Sends to multiple backends + +**Chronicle integration** (`cloudmetricshandler/chronicle_recorder.go`): +- Enabled by `TEMPORAL_ENABLE_CLOUDMETRICSHANDLER` +- Config: `/etc/temporal/cloudmetricshandler` +- Kubernetes enrichment: pod name, namespace, labels +- Backends: S3 writer, HTTP writer (to Chronicle service) +- Batch config: 50K queue, 25K batch, 100ms flush + +**Action metering** (`actionmetering/metrics.go`): +- `billable_action_count` with tags: namespace, action_type, workflow_type, workflow_task_queue +- Activity type/task queue currently placeholder `"_unknown_"` with TODOs for standalone activity support + +### 2.12 Additional Cloud Features + +- **Authorization**: SaaS Auth0 JWT + Temporal JWT, TLS client certs +- **Quotas/Flow Control** (`quotas/`, `flowcontrol/`): Request-level and task-queue quotas +- **Multi-region replication** (`cds/service/history/replication/`): Custom replication filters +- **Metering V3**: S3/GCS/Azure bucket metering +- **SMS (etcd)**: Secondary Metadata Store for namespace/cluster metadata +- **Dynamic config**: 150+ hot-reloadable properties (`cds/config/configs.go`) +END_DOCUMENT-------------------------------------------------------------------------------------- + +START_DOCUMENT------------------------------------------------------------------------------ +# Standalone Activity COGS and margins + +@Dan Davison March 17, 2026 + +We want to ensure that we are billing in a way that meets our target margins for new product features in cloud, such as new CHASM execution types. To do this, we need to know certain things about COGS (cost of goods sold) for these features. This document outlines how to estimate COGS for Standalone Activity relative to Workflow and the implications of this for margins. 
+ +# Motivation: avoiding cannibalization + +We have rules (see [temporalio/action](https://github.com/temporalio/action)) specifying how customer operations map to billable Actions. For example, suppose a customer executes a Workflow that executes a single Activity, which succeeds on first attempt without heartbeating. This incurs 2 Actions (StartWorkflow and ScheduleActivity). We’ll call this a “Single Activity Workflow” (SAW). + +We haven’t yet decided how we will bill for Standalone Activity (SAA). But suppose that we decide that executing a single SAA (no retries, no heartbeating) is 1 Action (StartStandaloneActivity). + +If we want SAA margins to match SAW margins, then we want the COGS of SAA (no retries, no heartbeating) to be ≤ 1/2 that of SAW (because we get half as much revenue for the SAA). If it is not, then there would be some degree of cannibalization (customers switch their single-activity workloads to SAA, but our margins there are worse). We’d hope it would be offset by increased volume, but we’d still prefer SAA margins to match SAW. + +### What about retries and heartbeating? + +SAW (no retries and no heartbeating) is 2 Actions. If the activity retries once it becomes 3 Actions (ScheduleActivity now happens twice); if it heartbeats once during each attempt it becomes 5 Actions. + +Let’s assume (as we currently intend) that we apply the same billing rules to Standalone Activity retries and heartbeating. Then, as long as SAA is not worse than Workflow Activity with respect to COGS of retries and heartbeating, our margins from those customer operations will be at least as good under SAA as when they are done in the context of a pre-CHASM workflow. CHASM has been designed for efficiency; we have reason to be optimistic that it’s not *worse* than the legacy workflow activity implementation. 
+ +# Problem statement + +The above suggests that we should focus on estimating the ratio of COGS for Standalone Activity (SAA) relative to Single-activity Workflow (SAW) in the no retries, no heartbeating case: + +$$ +R = \frac{C_{SAA}}{C_{SAW}}. +$$ + +We expect $R < 1$ because SAA achieves execution of an activity with fewer RPCs, persistence operations, etc., than SAW. We are hoping that it is less than 1/2 since then our SAA margins are as good as or better than our workflow margins, assuming we bill 1 Action for SAA. + +# Estimating the COGS ratio + +We’ll assume that the COGS for a SAA or SAW execution results solely from invoices from third parties relating to cloud compute resources. COGS for an execution type (SAA or SAW) is the sum of price ($p$) times quantity consumed ($q$) over all resources: + +$$ +C = \sum_{i} p_i q_i. +$$ + +We want the COGS ratio $R$. We can write that as a weighted average of per-resource usage ratios: + +$$ +R = \frac{C_{SAA}}{C_{SAW}} = \sum_i f_i r_i. +$$ + +This allows us to calculate $R$ as a function of two things that we can estimate: + +- $f_i = p_i q_{i}(SAW) / \sum_j p_j q_{j}(SAW)$ is the fraction of SAW COGS attributable to resource $i$ (“spend share”). We’ll use our current cloud spend for this. +- $r_i = q_i(SAA) / q_i(SAW)$ is the per-resource usage ratio. We will estimate these by comparing the implementations or by running experiments in cloud cells. + +The resources ($i$) potentially include: +1. Data egress +2. CPU usage +3. Memory usage +4. Persistence operations against our WALs +5. Persistence operations against Astra (to be replaced by Walker) +6. Persistence operations against OpenSearch (visibility) +7. Metrics/logs processing and storage costs, Clickhouse + +*At-rest data storage is excluded: we bill customers separately for storage on a GB/h basis, so it does not need to be subsidized by Actions.
(Tangentially, it’s worth noting that we expect SAA storage to cost users half what they’d pay for SAW since SAW stores the input and output payloads in both workflow scheduled/complete events and activity scheduled/complete events.)* + +# Per-resource usage ratios + +To proceed, we need to estimate the SAW vs SAA usage ratio ($r_i$) for each resource. + +The following table summarizes the two implementations. It describes the simplest possible happy-path scenario: an activity that succeeds on first attempt without heartbeating, via sync matches. + +| # | Single-activity Workflow | Standalone Activity | +| --- | --- | --- | +| 1 | RPC: `StartWorkflowExecution` => HEWAL, MSWAL; Vis&; Cassandra& | RPC: `StartActivityExecution` => MSWAL; Vis&; Cassandra& | +| 2 | Task => RPC: `AddWorkflowTask` | | +| 3 | RPC: `RecordWorkflowTaskStarted` => HEWAL, MSWAL; Cassandra& | | +| 4 | RPC: `RespondWorkflowTaskCompleted` => HEWAL, MSWAL; Cassandra& | | +| 5 | Task => RPC: `AddActivityTask` | Task => RPC: `AddActivityTask` | +| 6 | RPC: `RecordActivityTaskStarted` => HEWAL, MSWAL; Cassandra& | RPC: `RecordActivityTaskStarted` => MSWAL; Cassandra& | +| 7 | RPC: `RespondActivityTaskCompleted` => HEWAL, MSWAL; Cassandra& | RPC: `RespondActivityTaskCompleted` => MSWAL; Vis&; Cassandra& | +| 8 | Task => RPC: `AddWorkflowTask` | | +| 9 | RPC: `RecordWorkflowTaskStarted` => HEWAL, MSWAL; Cassandra& | | +| 10 | RPC: `RespondWorkflowTaskCompleted` => HEWAL, MSWAL; Vis&; Cassandra& | | +- `&` indicates a write that’s not on the sync response path +- `AddWorkflowTask` and `AddActivityTask` involve inter-service RPCs but no persistence writes in the happy path (“sync match”). +- The table does not show worker poll requests +- An additional `Vis&` is incurred in both cases when the execution is deleted. 
+ +Comparing the implementations in the table gives + +$$ +r_{\text{WAL}} = \frac{3}{14} = 0.21,~~~~ +r_{\text{Cass}} = \frac{3}{7} = 0.43,~~~~ +r_{\text{Vis}} = \frac{3}{3} = 1.0.~~~~ +$$ + +These ratios count writes only. Cassandra reads are not expected to differ much between SAW and SAA since they use similar caching mechanics with the result that a high proportion of both SAW and SAA executions incur ~1 read (on execution creation). + +In addition, we can estimate data transfer costs by comparing the implementations. These are likely dominated by egress to customer infra (ingress is free on AWS and GCP; data transfers to Astra, OpenSearch, and Grafana are in-VPC or via PrivateLink). Let the activity input and output payload sizes be $S_I$ and $S_O$. Payload egress for SAW is $2S_I + 2S_O$ (input payload sent to workflow and activity workers; output payload sent to workflow worker and client). For SAA this is $S_I + S_O$ since there is no workflow worker detour. This gives + +$$ +r_\text{data\_transfer} = 0.5. +$$ + +# COGS ratio estimate + +Using approximate/preliminary cloud spend share numbers (thanks @Stephen Chan ) we have: + +| **Resource** | **Spend share $f_i$ (preliminary)** | **Usage ratio $r_i$** | **Notes** | +| --- | --- | --- | --- | +| **Astra writes** | 40% | $\frac{3}{7}$ = 0.43 | SAW does 2 additional writes for each WFT | +| **Visibility** (OpenSearch) | 20% | $\frac{3}{3}$ = 1.00 | Equal — both SAA and SAW produce exactly ~~2~~ 3 visibility updates | +| **WAL writes** | 10% | $\frac{3}{14}$ = 0.21 | Half of Astra ratio: SAA writes only to MSWAL, whereas SAW writes to both HEWAL and MSWAL | +| **EC2 compute** | 10% | ? | Would need cloud cell experiment | +| **Data transfer** | 10% | $\frac{1}{2}$ = 0.50 | SAW sends payloads via workflow worker round-trip; SAA does not | +| **Overheads** (incl. Clickhouse) | 10% | ?
| | + +This gives the following estimate of the COGS ratio: + +$$ +\begin{align*} +R &= +\underbrace{0.4 \times 0.43}_{\text{Astra}:~0.17} + +\underbrace{0.2 \times 1.0}_{\text{Vis}:~0.20} + +\underbrace{0.1 \times 0.21}_{\text{WAL}:~0.02} + +\underbrace{0.1 \times 0.50}_{\text{Tx}:~0.05} + +0.1 \cdot r_\text{compute} + 0.1 \cdot r_\text{overhead} \\\\ +&= +0.44 + 0.1(r_\text{compute} + r_\text{overhead}). +\end{align*} +$$ + +# Sensitivity analysis + +Before thinking about the implications of this for billing and margins, the next steps are: + +1. Refine the cloud spend estimates (Cloud Capacity team; does not involve load experiments) +2. Decide whether we want to do load experiments to estimate $r_\text{compute}$ +3. Decide how we will address $r_\text{overhead}$ + +For (2) and (3) we can do some initial sensitivity analysis: + +SAW does 10 RPCs vs SAA’s 4 (with 7 vs 3 of them doing persistence writes in the sync-match case). If services are CPU-bound then this suggests that $0.4 < r_\text{compute} < 1.0$ might be reasonable. + +The other overheads include (per @Stephen Chan ) Clickhouse, observability cells, and Envoy proxies. Since these costs should also scale with RPC count, let’s assume the same bounds: $0.4 < r_\text{overhead} < 1.0$. This gives: + +$$ +0.52 \leq R \leq 0.64. +$$ + +![image.png](.task/sensitivity.png) + +For example, if SAW margins were 70%, SAA margins would be 62% - 69%. This margin reduction would affect at maximum the ~3% of workflows that are SAW. + +- COGS ratio to margins conversion formula + + $\text{margin}_{\text{SAA}} = 1 - 2R(1 - \text{margin}_{\text{SAW}})$. + + +# Discussion + +- **Visibility limits SAA margins**. Visibility is expensive (20%), but SAA and SAW perform the same number of visibility writes, so it combines a large weight with the worst possible ratio. 
+- **(Unfavorable) Over-provisioning would push $R$ up.** The usage ratios above for persistence are derived from write counts, which only translate to cost savings if capacity tracks usage. But e.g. Astra is bought in fixed hardware units (“Astra Classic”). If any resource component is over-provisioned then SAA and SAW would pay the same cost per execution and $r_i \to 1.0$, making SAA margins less attractive relative to workflow. +- **Cloud spend share**. We could attempt to separate fixed costs and renormalize (see [Next steps](https://www.notion.so/Next-steps-3268fc567738805e82ddd9c1e1d4c9d1?pvs=21)). This would be favorable to SAA margins if it decreases the visibility share, but unfavorable if it decreases Astra share. + + We’re estimating $f_i$ from cloud spend, so we’re assuming that the spend distribution for single-activity workflows would be similar to the spend distribution for the real mix of customer workflows. I suspect this is a reasonable modeling assumption since in both cases the application is performing the same state transitions in response to workflow and activity task processing. + +- **(Mixed) Effect of migration to Walker**. Walker replaces Astra with storage that is under our own control, making right-sizing easier. This may mean that the 3/7 write ratio is more fully realized under Walker, moving SAA COGS away from SAW. However, Walker will be cheaper than Astra, so persistence’s share of spend shrinks. Since persistence is where SAA has its largest advantage, this would bring SAA COGS closer to SAW. + + These two effects act in opposite directions and the net result will depend on their relative magnitudes. This suggests that we should monitor COGS calculations as the Walker migration proceeds. + +- **(Future) A visibility backend migration would improve SAA margins.** There has been [movement](https://www.notion.so/Visibility-CDS-2a98fc567738807e9ee0f318edc4c16f?pvs=21) toward replacing OpenSearch. 
As discussed above, any reduction in visibility spend share would make SAA COGS more attractive relative to workflow. + +# Conclusion + +- [We are planning to bill SAA at 1/2 the price of SAW](https://www.notion.so/PRD-Standalone-Activities-for-durable-job-processing-1ee8fc567738806d8b6fe8e2eeae0fc4?pvs=21). Although there are various assumptions involved, at this point it looks like SAA COGS will be more than 1/2 SAW COGS: the estimated range above is $0.52 \leq R \leq 0.64$. This implies that some degree of cannibalization is likely. The extent of cannibalization would be bounded by the proportion of current workloads that are SAW, which is 3% per @Phil Prasek. It may be offset by volume growth attributable to SAA. + +# Next steps + +- **Refine cloud spend share estimates.** + + The cloud spend share weights used in this analysis are supposed to be marginal costs. We could attempt to separate marginal vs fixed costs and renormalize our spend share percentages. This would be favorable to SAA margins if it decreases the visibility share, but unfavorable if it decreases Astra share. + +- **Investigate any impact of over-provisioning.** + + SAA margins may be less favorable than the calculations suggest if some resources are over-provisioned. See discussion [above](https://www.notion.so/Standalone-Activity-COGS-and-margins-3268fc567738803cb63fd9397ffd351c?pvs=21). + +- **Decide whether to do cloud cell experiments**. + + Unlike the other resource categories, we lack any obvious theoretical basis for estimating $r_\text{compute}$ and $r_\text{overhead}$. Estimating $r_\text{compute}$ via cloud cell experiments would require perhaps one engineer-week. If this were to show a value close to 0.4 then it would suggest that the upper bound on $R$ is 0.58 (i.e. $0.44 + 0.1 \times (0.4 + 1.0)$), as opposed to the current 0.64. This would however still be subject to all the assumptions discussed above. We could also attempt to tighten our estimated bounds on $r_\text{overhead}$ via experiment. 
+ + If we decide to do this, the $r_\text{compute}$ experiment would be something like the following: choose a reference activity (e.g. sleeps for 10s, no heartbeating, never fails) and run SAA and SAW workloads on a cloud cell at a fixed start rate (e.g. 10/s) for a sustained period (e.g. 1hr). Fixing start rate rather than concurrency naturally controls for end-to-end latency differences between SAA and SAW. $r_\text{cpu}$ and $r_\text{memory}$ can then be estimated from metrics as the ratio of mean utilization above the idle baseline. The analysis will need to decide how to combine them, e.g. based on which is more often limiting; alternatively, using the larger of the two would yield a conservative calculation. +END_DOCUMENT------------------------------------------------------------------------------ + +START_DOCUMENT------------------------------------------------------------------------------ +# Test plan for SAA COGS measurement + +@Dan Davison March 19, 2026 + +The [SAA COGS proposal](.task/saa-cogs.md) made an initial estimate of the SAA/SAW COGS ratio based on estimating persistence, visibility, and data transfer usage ratios directly from the implementation. But for compute and overheads we have no analytical estimate. We plan to run an experiment to: + +1. Estimate the missing $r_\text{compute}$. +2. Validate the analytical $r_i$ against observed metrics + +For comparison, the Fairness COGS experiment docs: + +- [Test plan](https://www.notion.so/temporalio/Test-plan-for-COGS-measurement-28c8fc56773880169cdcc4087a98ceaf) +- [Fairness COGS Impact](https://www.notion.so/temporalio/Fairness-COGS-Impact-2c58fc567738808f806cfbf09b771b2c) +- [Pricing Council doc](https://www.notion.so/temporalio/WIP-Pricing-Council-Fairness-COGS-Impact-2cc8fc56773880dcb3efe435623edd9a) + + + + +# Proposed SAA experiment + + +## Workloads + +Two workloads, run sequentially on the same cell: + +1. **SAW**: execute workflow with one activity (no heartbeat, no retry). +2. 
**SAA**: execute standalone activity (no heartbeat, no retry). + +## Parameters + +**Start rate.** I think that we should fix start rate rather than concurrency, since this naturally controls for end-to-end latency differences between SAA and SAW (i.e. a cell running SAW will see higher load because the concurrency will be higher because the SAW end-to-end latency is higher). The fairness experiment used 4k tasks/s. Is starting 4k executions/s reasonable for us? + +**Activity.** Immediate successful return; no heartbeat, no retry. We could compare with a 1s sleep to see if results differ? + +**Sync match.** Do one run such that sync match should be 100%, and another tuned such that sync match is lower? Verify sync match from metrics (`syncmatch_latency`, `asyncmatch_latency`) + +**Duration and repetitions.** Steady-state load; we need long enough for stable CPU averages. The +fairness experiment used 6h per scenario but this was maybe because of their more sophisticated +sinusoidal load design? 1h more than enough for the SAA experiment? ≥2 runs per workload to check +variance/reproducibility. + +## Infrastructure + +- Anything special about test cell sizing? +- Workers should run outside the cell (how did fairness experiment do this?) + +## Metrics + +Initial dashboard content https://grafana.tmprl-internal.cloud/d/saacogs/saa-cogs: + + +- **CPU per service** (frontend, history, matching). `node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate` — a k8s recording rule over cAdvisor container metrics (defined in saas-components prometheus rules). +- **Memory per service**. `container_memory_working_set_bytes` — also k8s/cAdvisor (defined in saas-components alert rules). +- **RPC rate by method**, one panel per service (frontend, history, matching). `service_requests` counter ([temporal:common/metrics/metric_defs.go:615](https://github.com/temporalio/temporal/blob/main/common/metrics/metric_defs.go)), tagged with `operation` (the RPC method name). 
Recorded by a gRPC server-side interceptor ([telemetry.go:177](https://github.com/temporalio/temporal/blob/main/common/rpc/interceptor/telemetry.go)), so it captures inter-service RPCs (e.g. history→matching `AddActivityTask`). +- **Astra writes by table**. `cassandra_query` counter with `verb!="select"`, plus `cassandra_batch` counter, both broken down by `table`. Tags include `operation`, `table`, `verb`, `cas` ([saas-temporal:cds/metrics/metrics.go:233,238](https://github.com/temporalio/saas-temporal/blob/main/cds/metrics/metrics.go)). +- **Astra reads by table**. `cassandra_query` with `verb="select"`, broken down by `table`. +- **WAL operation rate by type**. `wal_latency_count` ([saas-temporal:cds/metrics/metrics.go:35](https://github.com/temporalio/saas-temporal/blob/main/cds/metrics/metrics.go)) broken down by `walType` label (values: `MUTABLE_STATE_WAL`, `HISTORY_EVENT_WAL`, `LARGE_PAYLOAD_WAL` — see [saas-temporal:cds/common/tag/tag.go:11-24](https://github.com/temporalio/saas-temporal/blob/main/cds/common/tag/tag.go)). Note: this metric covers both reads and writes; there is no separate write-only WAL metric. This is arguably more relevant to COGS since WAL reads also cost something. +- **Visibility persistence rate by operation**. `visibility_persistence_requests` counter ([temporal:common/metrics/metric_defs.go:1398](https://github.com/temporalio/temporal/blob/main/common/metrics/metric_defs.go)), tagged with `operation` (values include `RecordWorkflowExecutionStarted`, `RecordWorkflowExecutionClosed`, `UpsertWorkflowExecution`, `DeleteWorkflowExecution` — see [visiblity_manager_metrics.go](https://github.com/temporalio/temporal/blob/main/common/persistence/visibility/visiblity_manager_metrics.go)). +- **Sync vs async match rate**. `syncmatch_latency_count` and `asyncmatch_latency_count` ([temporal:common/metrics/metric_defs.go:1119-1120](https://github.com/temporalio/temporal/blob/main/common/metrics/metric_defs.go)). 
+ + +## Load generator (omes) + +- Add a new scenario that starts standalone activities directly from the load generator, not from within a workflow. +- Build the omes Go worker Docker image and deploy it as a pod on k8s, configured to poll the test cell. Do we have implementation we can borrow from the fairness experiment? + + + + +
+Appendix: Comparison with fairness experiment (see commits by David Reiss) + +| | Fairness | SAA | +|---|---|---| +| **Treatments** | Same workload, two matcher modes | Two execution types (SAW vs SAA) | +| **Quantity computed** | $\Delta C / C$ | Ratio $r_i = q_i(\text{SAA}) / q_i(\text{SAW})$ | +| **Load shape** | Sinusoidal backlog (exercises matcher) | Steady-state at fixed start rate (our model assumes sync match) | +| **What is measured** | CPU per service, Astra operation rates | CPU per service, memory per service, Astra operation rates by table and verb, WAL write rates, visibility write rates, RPC handling rates per service per method | +| **Predictions to validate** | None — purely empirical | $r_\text{Cass} = 3/7$, $r_\text{WAL} = 3/14$, $r_\text{Vis} = 3/3$, per-method RPC rates matching proposal table | + +Fixed start rate (not fixed task throughput) because SAA and SAW generate different numbers of tasks per execution. + +**Question**: what is the incremental COGS of enabling the fairness matcher vs the classic matcher? + +**COGS components**: (1) Astra queries (~35% of total COGS), (2) EC2 compute (~9%, split across frontend+matching and history). Ignored: data transfer, Astra storage, non-AWS costs (Clickhouse <3%). + +**Setup**: dedicated test cell `s-oss-dnr-faircogs3` (64 partitions). Load generator: Omes Ebb and Flow — sinusoidal activity task backlog. 5 scenarios (classic, fairness with 0/1k/100k keys, priority), each 6 hours. Measured via [dedicated Grafana dashboard](https://grafana.tmprl-internal.cloud/d/df6pldpkiy1vka/faircogs). + +**Results**: Astra showed no significant increase. CPU increased up to 23% (frontend) and 36% (history) in the worst case (1k fairness keys). COGS impact: $(0.035 \times 0.23) + (0.057 \times 0.36) = 2.8\%$. Pricing council recommendation: price fairness on value to customer, not COGS. + + + + + +
+ +
+Appendix: possible experimental outcomes + +- **Analytical predictions confirmed, $R$ in predicted range.** Observed $r_\text{Cass}$, $r_\text{WAL}$, $r_\text{Vis}$, and per-method RPC rates match the analytical derivations. $r_\text{compute}$ lands in $[0.4, 1.0]$, giving $R$ in roughly $0.52$–$0.64$. We present $R$ with a tighter confidence interval than the proposal (because $r_\text{compute}$ is now estimated, not bounded). +- **$r_\text{compute}$ is low, pushing $R$ toward 0.5.** If $r_\text{compute} \approx 0.4$, $r_\text{overhead}$ is similarly close to its lower bound of 0.4, and analytical predictions hold, $R \approx 0.52$. Cannibalization is near-zero. +- **Observed $r_i$ diverge from analytical predictions.** Some assumption is wrong (e.g. sync match doesn't hold at test load, or there are unaccounted persistence writes). We recompute $R$ using observed values and identify which assumption failed and whether it reflects production conditions or a test artifact. +- **$R$ is higher than predicted.** $R > 0.64$ would mean worse cannibalization than estimated. Options: accept the margin reduction (bounded by ~3% SAW share), adjust billing, or identify engineering work to reduce SAA COGS. + +
+ +END_DOCUMENT------------------------------------------------------------------------------ + + +Your task is to help me design and build the omes-based tooling that we will use to perform the experiments outlined above to learn about COGS of SAA and SAW. We are in the omes repo; study it carefully. Our work will broadly break into the following phases that we must design holistically: + +(1) Add any missing omes functionality that will be needed in order to be able to use omes to generate the SAA and SAW load for the experiments. +(2) Run the experiments against the cloud cell that Stephen has prepared: its name is s-saa-cogs. + +Stephen linked to the 'scaffold' run that created the cell. I see it had the following input: + +{ + "CellConfig": { + "Identity": { + "Location": { + "CloudProvider": "aws", + "AccountID": "124355634071", + "Region": "us-west-2" + }, + "ID": "s-saa-cogs" + }, + "Template": "v5-aws-dev", + "ServerVersion": "v3.151.9_oss1.31.0_151.6", + "AgentVersion": "v3.151.9_oss1.31.0_151.6", + "WebVersion": "v2.47.0", + "GoCanaryVersion": "v1.35.0", + "ComponentVersion": "v2026-03-20.00", + "WalVersion": "v10.0.3", + "EnableMetering": false + }, + "FailurePolicy": 1 +} + +and output: + +{ + "Cell": { + "Identity": { + "Location": { + "CloudProvider": "aws", + "AccountID": "124355634071", + "Region": "us-west-2" + }, + "ID": "s-saa-cogs" + } + } +} + +I am not familiar with performing operations against cloud cells, so you will need to research and help me during this. But we have several good resources: study the contents of the 'oncall' and 'runbooks' repos, and also use the /agent-slack skill. You also have Notion and Temporal Docs MCP. Use the more modern 'ct' rather than its alias 'omni'. + +Initial grafana dashboard JSON is at .task/saacogs.json. + +Important: I'd like an early aim to be to get an end-to-end proof-of-principle of this working. 
Therefore let's not make the omes component sophisticated initially; just the bare minimum to run an SAW and SAA workload. But I am a bit intimidated by doing anything with the cloud cell since I don't know how. So I guess one early aim is to be able to point our metrics dashboard at s-saa-cogs, and see idle state, then run one of our omes commands, and see activity increase in the dashboard. Please maintain a file of useful shell commands with terse comments where necessary. I will run them and show you the output. Don't do operations against cloud or observability yourself unless I explicitly ask you to. + +In the omes work, we must use the latest version of the SDK with Standalone Activity support, such that our code is consistent with what Temporal Docs (use MCP) and the samples-go repo show. \ No newline at end of file diff --git a/AGENTS.md b/AGENTS.md new file mode 120000 index 00000000..81744092 --- /dev/null +++ b/AGENTS.md @@ -0,0 +1 @@ +.task/AGENTS.md \ No newline at end of file diff --git a/CLAUDE.md b/CLAUDE.md new file mode 120000 index 00000000..81744092 --- /dev/null +++ b/CLAUDE.md @@ -0,0 +1 @@ +.task/AGENTS.md \ No newline at end of file From 3a35b44711c5a927f6566129859b9370654c3830 Mon Sep 17 00:00:00 2001 From: Dan Davison Date: Tue, 24 Mar 2026 16:34:23 -0400 Subject: [PATCH 28/40] Add some other dashboards --- .task/faircogs.json | 799 ++ .task/history.json | 27751 ++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 28550 insertions(+) create mode 100644 .task/faircogs.json create mode 100644 .task/history.json diff --git a/.task/faircogs.json b/.task/faircogs.json new file mode 100644 index 00000000..e11dd1da --- /dev/null +++ b/.task/faircogs.json @@ -0,0 +1,799 @@ +{ + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": { + "type": "grafana", + "uid": "-- Grafana --" + }, + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + } + ] + }, + 
"editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 0, + "id": 971, + "links": [], + "panels": [ + { + "datasource": { + "type": "prometheus", + "uid": "ef667c0e-d08c-4b40-9761-479514828632" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 0 + }, + "id": 8, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "11.4.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "ef667c0e-d08c-4b40-9761-479514828632" + }, + "editorMode": "code", + "expr": "sum(rate(service_requests{cluster=\"s-oss-dnr-faircogs3\",temporal_service_type=\"matching\"}[1m]))", + "legendFormat": "Matching RPS", + "range": true, + "refId": "A" + } + ], + "title": "Matching RPS", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "ef667c0e-d08c-4b40-9761-479514828632" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + 
"axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [ + { + "__systemRef": "hideSeriesFrom", + "matcher": { + "id": "byNames", + "options": { + "mode": "exclude", + "names": ["Value"], + "prefix": "All except:", + "readOnly": true + } + }, + "properties": [ + { + "id": "custom.hideFrom", + "value": { + "legend": false, + "tooltip": false, + "viz": true + } + } + ] + } + ] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 8 + }, + "id": 7, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "11.4.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "ef667c0e-d08c-4b40-9761-479514828632" + }, + "editorMode": "code", + "expr": "sum(rate(service_requests{cluster=\"s-oss-dnr-faircogs3\",temporal_service_type=\"history\"}[1m]))", + "legendFormat": "History RPS", + "range": true, + "refId": "A" + } + ], + "title": "History RPS", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "ef667c0e-d08c-4b40-9761-479514828632" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": 
false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "decbytes" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 16 + }, + "id": 4, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "11.4.0", + "targets": [ + { + "adhocFilters": [], + "datasource": { + "type": "prometheus", + "uid": "ef667c0e-d08c-4b40-9761-479514828632" + }, + "editorMode": "code", + "expr": "quantile(0.5, sum by (pod) (container_memory_working_set_bytes{cluster=\"s-oss-dnr-faircogs3\",container!=\"\",image!=\"\",namespace=\"temporal\"} * on (namespace, pod) group_left (workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"s-oss-dnr-faircogs3\",namespace=\"temporal\",workload=\"matching\",workload_type=\"deployment\"}))", + "format": "time_series", + "hide": false, + "interval": "", + "intervalFactor": 2, + "legendFormat": "p50", + "range": true, + "refId": "D", + "step": 10 + }, + { + "datasource": { + "type": "prometheus", + "uid": "ef667c0e-d08c-4b40-9761-479514828632" + }, + "editorMode": "code", + "expr": "quantile(0.5, sum by (pod) 
(container_memory_working_set_bytes{cluster=\"s-oss-dnr-faircogs3\",container!=\"\",image!=\"\",namespace=\"temporal\"} * on (namespace, pod) group_left (workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"s-oss-dnr-faircogs3\",namespace=\"temporal\",workload=\"frontend\",workload_type=\"deployment\"}))", + "hide": false, + "instant": false, + "legendFormat": "__auto", + "range": true, + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "ef667c0e-d08c-4b40-9761-479514828632" + }, + "editorMode": "code", + "expr": "quantile(0.5, sum by (pod) (container_memory_working_set_bytes{cluster=\"s-oss-dnr-faircogs3\",container!=\"\",image!=\"\",namespace=\"temporal\"} * on (namespace, pod) group_left (workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{cluster=\"s-oss-dnr-faircogs3\",namespace=\"temporal\",workload=\"history\",workload_type=\"deployment\"}))", + "hide": false, + "instant": false, + "legendFormat": "__auto", + "range": true, + "refId": "B" + } + ], + "title": "p50 mem usage", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "ef667c0e-d08c-4b40-9761-479514828632" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + 
"value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 24 + }, + "id": 2, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "11.4.0", + "targets": [ + { + "adhocFilters": [], + "datasource": { + "type": "prometheus", + "uid": "ef667c0e-d08c-4b40-9761-479514828632" + }, + "editorMode": "code", + "expr": "sum(rate(cassandra_query{cluster=\"s-oss-dnr-faircogs3\"} [$__rate_interval])) + sum(rate(cassandra_batch{cluster=\"s-oss-dnr-faircogs3\"} [$__rate_interval]))", + "interval": "", + "key": "Q-348416b5-2a03-42f3-bdc8-5fbd4a2f6bcf-0", + "legendFormat": "rps", + "range": true, + "refId": "A" + }, + { + "adhocFilters": [], + "datasource": { + "type": "prometheus", + "uid": "ef667c0e-d08c-4b40-9761-479514828632" + }, + "editorMode": "code", + "expr": "sum(rate(cassandra_query{cluster=\"s-oss-dnr-faircogs3\",table=\"tasks\"} [$__rate_interval])) + sum(rate(cassandra_batch{cluster=\"s-oss-dnr-faircogs3\",table=\"tasks\"} [$__rate_interval]))", + "hide": false, + "instant": false, + "interval": "", + "key": "Q-ca13377f-be45-41e0-af3b-4bc8861ee6fb-1", + "legendFormat": "tasks rps", + "range": true, + "refId": "B" + }, + { + "datasource": { + "type": "prometheus", + "uid": "ef667c0e-d08c-4b40-9761-479514828632" + }, + "editorMode": "code", + "exemplar": false, + "expr": "(\n sum(\n rate(\n cassandra_query{\n cluster=\"s-oss-dnr-faircogs3\",\n table=\"tasks_v2\"\n }[$__rate_interval]\n )\n )\n+\n sum(\n rate(\n cassandra_batch{\n cluster=\"s-oss-dnr-faircogs3\",\n table=\"tasks_v2\"\n }[$__rate_interval]\n )\n )\n) OR on() vector(0)", + "hide": false, + "instant": false, + "legendFormat": "tasks_v2 rps", + "range": true, + "refId": "C" + }, + { + "datasource": { + "name": "Expression", + "type": "__expr__", + "uid": "__expr__" + }, + 
"expression": "$B + $C", + "hide": false, + "refId": "tasks + tasks_v2 RPS", + "type": "math" + } + ], + "title": "Astra RPS", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "ef667c0e-d08c-4b40-9761-479514828632" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 32 + }, + "id": 6, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "11.4.0", + "targets": [ + { + "adhocFilters": [], + "datasource": { + "type": "prometheus", + "uid": "ef667c0e-d08c-4b40-9761-479514828632" + }, + "editorMode": "code", + "expr": "sum(rate(cassandra_query{cluster=\"s-oss-dnr-faircogs3\",table=\"tasks_v2\"} [$__rate_interval])) + sum(rate(cassandra_batch{cluster=\"s-oss-dnr-faircogs3\",table=\"tasks_v2\"} [$__rate_interval]))", + "hide": true, + "interval": "", + "key": "Q-348416b5-2a03-42f3-bdc8-5fbd4a2f6bcf-0", + "legendFormat": "rps", + "range": true, + "refId": "A" + }, + { + "adhocFilters": [], + "datasource": { + "type": 
"prometheus", + "uid": "ef667c0e-d08c-4b40-9761-479514828632" + }, + "editorMode": "code", + "expr": "sum by (temporal_namespace, task_priority)(\n rate(\n approximate_backlog_count{\n cluster=\"s-oss-dnr-faircogs3\",\n temporal_service_type=\"matching\",\n task_type=\"Activity\",\n temporal_namespace=~\"faircogs.*\"\n }[$__rate_interval]\n )\n)\n", + "hide": true, + "instant": false, + "interval": "", + "key": "Q-ca13377f-be45-41e0-af3b-4bc8861ee6fb-1", + "legendFormat": "backlog count rate", + "range": true, + "refId": "B" + }, + { + "datasource": { + "name": "Expression", + "type": "__expr__", + "uid": "__expr__" + }, + "expression": "$A / $B", + "hide": false, + "refId": "tasks_v2_divided_by_backlog_size", + "type": "math" + } + ], + "title": "tasks_v2 RPS vs backlog increase rate", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "ef667c0e-d08c-4b40-9761-479514828632" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [ + { + "__systemRef": "hideSeriesFrom", + "matcher": { + "id": "byNames", + "options": { + "mode": "exclude", + "names": ["matching avg CPU"], + "prefix": "All except:", + "readOnly": true + } + }, + 
"properties": [ + { + "id": "custom.hideFrom", + "value": { + "legend": false, + "tooltip": false, + "viz": true + } + } + ] + } + ] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 40 + }, + "id": 1, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "11.4.0", + "targets": [ + { + "adhocFilters": [], + "datasource": { + "type": "prometheus", + "uid": "ef667c0e-d08c-4b40-9761-479514828632" + }, + "editorMode": "code", + "expr": "avg(sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{cluster=\"s-oss-dnr-faircogs3\", pod=~\"matching-.*\"}) by (node))", + "hide": false, + "instant": false, + "interval": "", + "legendFormat": "matching avg CPU", + "range": true, + "refId": "F" + }, + { + "datasource": { + "type": "prometheus", + "uid": "ef667c0e-d08c-4b40-9761-479514828632" + }, + "editorMode": "code", + "expr": "avg(sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{cluster=\"s-oss-dnr-faircogs3\", pod=~\"frontend-.*\"}) by (node))", + "hide": false, + "instant": false, + "legendFormat": "frontend avg CPU", + "range": true, + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "ef667c0e-d08c-4b40-9761-479514828632" + }, + "editorMode": "code", + "expr": "avg(sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate{cluster=\"s-oss-dnr-faircogs3\", pod=~\"history-.*\"}) by (node))", + "hide": false, + "instant": false, + "legendFormat": "history avg CPU", + "range": true, + "refId": "B" + } + ], + "title": "Avg CPU usage", + "type": "timeseries" + } + ], + "preload": false, + "schemaVersion": 40, + "tags": [], + "templating": { + "list": [] + }, + "time": { + "from": "now-30m", + "to": "now" + }, + "timepicker": {}, + "timezone": "utc", + "title": "faircogs", + "uid": "df6pldpkiy1vka", + "version": 31, + "weekStart": "" +} 
diff --git a/.task/history.json b/.task/history.json new file mode 100644 index 00000000..b4c89407 --- /dev/null +++ b/.task/history.json @@ -0,0 +1,27751 @@ +{ + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": { + "type": "loki", + "uid": "e008932a-e9dc-4b7a-819f-68b662f3dc51" + }, + "enable": true, + "expr": "{cluster=\"newton\",k8s_app=\"cell-worker\"} |= \"deploying temporal service\" | pattern `