From e198e4d1078c52500a6a62697fb955a5ce5083ff Mon Sep 17 00:00:00 2001 From: sufjanfana Date: Mon, 6 Apr 2026 05:29:00 -0700 Subject: [PATCH] Add agent team tracing with deferred turns, dedup, and token rewrite MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Team-aware Turn lifecycle: defer emission while team is active - Agent grouping spans with shutdown_response sentinel - check_team_dedup protocol for lead/teammate dedup - New hooks: TeammateIdle, TaskCompleted (11 total) - parse_transcript replaces per-line jq loops (O(N) → O(1)) - Uppercase OpenInference span kinds, rename subagent.* → agent.* - Performance: batched state reads, del_states, get_or_set_state Co-Authored-By: Claude Opus 4.6 (1M context) --- README.md | 12 +- .../.claude-plugin/plugin.json | 24 +- plugins/claude-code-tracing/hooks/common.sh | 267 ++++++++++++++++-- .../claude-code-tracing/hooks/notification.sh | 26 +- .../hooks/permission_request.sh | 26 +- .../hooks/post_tool_use.sh | 190 ++++++++++--- .../claude-code-tracing/hooks/session_end.sh | 4 + .../hooks/session_start.sh | 4 + plugins/claude-code-tracing/hooks/stop.sh | 66 +++-- .../hooks/subagent_stop.sh | 115 +++++--- .../hooks/task_completed.sh | 56 ++++ .../hooks/teammate_idle.sh | 18 ++ .../hooks/user_prompt_submit.sh | 40 +-- 13 files changed, 668 insertions(+), 180 deletions(-) create mode 100755 plugins/claude-code-tracing/hooks/task_completed.sh create mode 100755 plugins/claude-code-tracing/hooks/teammate_idle.sh diff --git a/README.md b/README.md index 5c18c6a..012569d 100644 --- a/README.md +++ b/README.md @@ -118,11 +118,12 @@ await client.close(); The plugin is fully compatible with the Agent SDK with some caveats: -- **TypeScript SDK** — All 9 hooks are supported. Full feature parity with the CLI. -- **Python SDK** — `SessionStart`, `SessionEnd`, `Notification`, and `PermissionRequest` hooks are not fired by the Python SDK. The plugin handles this gracefully: +- **TypeScript SDK** — All 11 hooks are supported. Full feature parity with the CLI. +- **Python SDK** — `SessionStart`, `SessionEnd`, `Notification`, `PermissionRequest`, `TeammateIdle`, and `TaskCompleted` hooks are not fired by the Python SDK. The plugin handles this gracefully: - Session initialization happens lazily on the first `UserPromptSubmit` if `SessionStart` didn't fire - Stale state files are garbage-collected periodically by the `Stop` hook - Notification and permission request spans are simply not created (non-critical, informational only) + - TeammateIdle and TaskCompleted are agent-team hooks not supported by the Python SDK; team state caching and task completion spans are skipped --- @@ -132,8 +133,8 @@ Trace your Claude Code sessions to [Arize AX](https://arize.com) or [Phoenix](ht ## Features -- **9 Hooks** — Most comprehensive tracing coverage available - - SessionStart, UserPromptSubmit, PreToolUse, PostToolUse, Stop, SubagentStop, Notification, PermissionRequest, SessionEnd +- **11 Hooks** — Most comprehensive tracing coverage available + - SessionStart, UserPromptSubmit, PreToolUse, PostToolUse, Stop, SubagentStop, Notification, PermissionRequest, TeammateIdle, TaskCompleted, SessionEnd - **Dual Target Support** — Send traces to Arize AX (cloud) or Phoenix (self-hosted) - **OpenInference Format** — Standard span format compatible with any OpenInference tool - **Guided Setup Skill** — `/setup-claude-code-tracing` walks you through configuration @@ -225,6 +226,7 @@ For on-prem Arize instances, also set the OTLP endpoint: | `ARIZE_TRACE_ENABLED` | No | `true` | Enable/disable tracing | | `ARIZE_DRY_RUN` | No | `false` | Print spans instead of sending | | `ARIZE_VERBOSE` | No | `false` | Enable verbose logging | +| `ARIZE_USER_ID` | No | - | User identifier for trace attribution (adds `user.id` to all spans) | | `ARIZE_LOG_FILE` | No | `/tmp/arize-claude-code.log` | Log file path (set empty to disable) | ## Usage @@ -264,6 +266,8 @@ ARIZE_VERBOSE=true claude | `SubagentStop` | Subagent completes | Agent type, model, token counts, output | CLI, TS, Python | | `Notification` | System notification | Title, message, notification type | CLI, TS | | `PermissionRequest` | Permission requested | Permission type, tool name | CLI, TS | +| `TeammateIdle` | Teammate goes idle | Team name, teammate name (state-only, no span) | CLI, TS | +| `TaskCompleted` | Task marked complete | Task ID, subject, description, team context | CLI, TS | | `SessionEnd` | Session closes | Trace count, tool count | CLI, TS | **SDK Support key:** CLI = Claude Code CLI, TS = TypeScript Agent SDK, Python = Python Agent SDK diff --git a/plugins/claude-code-tracing/.claude-plugin/plugin.json b/plugins/claude-code-tracing/.claude-plugin/plugin.json index e05cb32..20685d5 100644 --- a/plugins/claude-code-tracing/.claude-plugin/plugin.json +++ b/plugins/claude-code-tracing/.claude-plugin/plugin.json @@ -1,6 +1,6 @@ { "name": "claude-code-tracing", - "description": "Automatic tracing of Claude Code sessions to Arize AX or Phoenix with OpenInference spans. Supports 9 hooks for comprehensive observability.", + "description": "Automatic tracing of Claude Code sessions to Arize AX or Phoenix with OpenInference spans. Supports 11 hooks for comprehensive observability.", "version": "1.0.0", "author": { "name": "Arize AI", @@ -8,7 +8,7 @@ "url": "https://arize.com" }, "repository": "https://github.com/Arize-ai/arize-claude-code-plugin", - "license": "Apache-2.0", + "license": "MIT", "keywords": [ "tracing", "observability", @@ -98,6 +98,26 @@ ] } ], + "TeammateIdle": [ + { + "hooks": [ + { + "type": "command", + "command": "bash \"${CLAUDE_PLUGIN_ROOT}/hooks/teammate_idle.sh\"" + } + ] + } + ], + "TaskCompleted": [ + { + "hooks": [ + { + "type": "command", + "command": "bash \"${CLAUDE_PLUGIN_ROOT}/hooks/task_completed.sh\"" + } + ] + } + ], "SessionEnd": [ { "hooks": [ diff --git a/plugins/claude-code-tracing/hooks/common.sh b/plugins/claude-code-tracing/hooks/common.sh index 9895744..acd94aa 100755 --- a/plugins/claude-code-tracing/hooks/common.sh +++ b/plugins/claude-code-tracing/hooks/common.sh @@ -36,19 +36,42 @@ generate_uuid() { od -x /dev/urandom | head -1 | awk '{print $2$3"-"$4"-4"substr($5,2)"-a"substr($6,2)"-"$7$8$9}' } +generate_trace_id() { generate_uuid | tr -d '-'; } +generate_span_id() { generate_uuid | tr -d '-' | cut -c1-16; } + get_timestamp_ms() { - python3 -c "import time; print(int(time.time() * 1000))" 2>/dev/null || \ - date +%s%3N 2>/dev/null || date +%s000 + # GNU date (Linux) supports %N; macOS outputs literal "3N" so verify digits-only + local _ts; _ts=$(date +%s%3N 2>/dev/null) + if [[ "$_ts" =~ ^[0-9]{13,}$ ]]; then echo "$_ts"; return; fi + # macOS/BSD: perl is ~5x lighter than python3 for this + perl -MTime::HiRes -e 'printf "%d\n", Time::HiRes::time()*1000' 2>/dev/null || \ + python3 -c "import time; print(int(time.time() * 1000))" 2>/dev/null || \ + echo "$(date +%s)000" +} + +get_file_birth_ms() { + local _fb_path="$1" + [[ -z "$_fb_path" || ! -f "$_fb_path" ]] && return 0 + if stat -f %B "$_fb_path" &>/dev/null; then + echo $(( $(stat -f %B "$_fb_path") * 1000 )) + elif stat -c %W "$_fb_path" &>/dev/null; then + local _fb_val; _fb_val=$(stat -c %W "$_fb_path") + [[ "$_fb_val" =~ ^[0-9]+$ && "$_fb_val" -gt 0 ]] && echo $(( _fb_val * 1000 )) + fi } # --- State (per-session JSON file with mkdir-based locking) --- +_STATE_INITIALIZED="" + init_state() { + [[ -n "$_STATE_INITIALIZED" ]] && return 0 mkdir -p "$STATE_DIR" if [[ ! -f "$STATE_FILE" ]]; then echo '{}' > "$STATE_FILE" else jq empty "$STATE_FILE" 2>/dev/null || echo '{}' > "$STATE_FILE" fi + _STATE_INITIALIZED=1 } _LOCK_DIR="${STATE_DIR}/.lock_${_CLAUDE_PID:-$$}" @@ -72,7 +95,7 @@ _unlock_state() { } get_state() { - jq -r ".[\"$1\"] // empty" "$STATE_FILE" 2>/dev/null || echo "" + jq -r --arg k "$1" '.[$k] // empty' "$STATE_FILE" 2>/dev/null || echo "" } set_state() { @@ -85,14 +108,124 @@ set_state() { del_state() { _lock_state local tmp="${STATE_FILE}.tmp.$$" - jq "del(.[\"$1\"])" "$STATE_FILE" > "$tmp" 2>/dev/null && mv "$tmp" "$STATE_FILE" || rm -f "$tmp" + jq --arg k "$1" 'del(.[$k])' "$STATE_FILE" > "$tmp" 2>/dev/null && mv "$tmp" "$STATE_FILE" || rm -f "$tmp" + _unlock_state +} + +del_states() { + [[ $# -eq 0 ]] && return 0 + _lock_state + local tmp="${STATE_FILE}.tmp.$$" + jq '[($ARGS.positional[])] as $keys | with_entries(select(.key as $k | $keys | index($k) | not))' \ + --args "$@" < "$STATE_FILE" > "$tmp" 2>/dev/null && mv "$tmp" "$STATE_FILE" || rm -f "$tmp" _unlock_state } +get_or_set_state() { + _lock_state + local val + val=$(jq -r --arg k "$1" '.[$k] // empty' "$STATE_FILE" 2>/dev/null || echo "") + if [[ -z "$val" ]]; then + val="$2" + local tmp="${STATE_FILE}.tmp.$$" + jq --arg k "$1" --arg v "$val" '. + {($k): $v}' "$STATE_FILE" > "$tmp" 2>/dev/null && mv "$tmp" "$STATE_FILE" || rm -f "$tmp" + fi + _unlock_state + echo "$val" +} + +# Returns 1 (skip) if this event is a duplicate in a team scenario. +# Convention: returns 0 = proceed, 1 = skip. Callers: `check_team_dedup ... || exit 0` +# The teammate's first event sets a marker and proceeds. Repeat teammate events +# for the same item_id are skipped. The lead's event finds the marker and skips. +# In single-agent mode (no agent_id, no markers), always returns 0. +check_team_dedup() { + local item_id="$1" prefix="$2" agent_id="$3" + [[ -z "$item_id" ]] && return 0 + if [[ -n "$agent_id" ]]; then + [[ -n "$(get_state "${prefix}_${item_id}")" ]] && return 1 + set_state "${prefix}_${item_id}" "1"; return 0 + elif [[ -n "$(get_state "${prefix}_${item_id}")" ]]; then + del_state "${prefix}_${item_id}"; return 1 + fi + return 0 +} + +# Remove per-team ephemeral keys (teammate_, active_agent_, seen_tool_, tc_seen_). +# agent_* keys are intentionally kept: SubagentStop needs agent_*_span_id / +# agent_*_start_time to emit grouping spans, and agent_*_shutdown_complete is +# the sentinel that tells SubagentStop to skip span creation when post_tool_use +# already emitted the grouping span via the shutdown_response path. +clean_team_state() { + _lock_state + local tmp="${STATE_FILE}.tmp.$$" + jq 'with_entries(select(.key | test("^(teammate_|active_agent_|seen_tool_|tc_seen_)") | not))' \ + "$STATE_FILE" > "$tmp" 2>/dev/null && mv "$tmp" "$STATE_FILE" || rm -f "$tmp" + _unlock_state +} + +# Send Turn span with full transcript data, then clean up. +# Called by UserPromptSubmit (new prompt) and SessionEnd (session closes). +# $1 = transcript path (optional) +close_active_turn() { + local _cat_transcript="${1:-}" + local _cat_tid _cat_sid + _cat_tid=$(get_state "current_trace_id"); _cat_sid=$(get_state "current_trace_span_id") + [[ -z "$_cat_tid" || -z "$_cat_sid" ]] && return 0 + + local _cat_start _cat_prompt _cat_count _cat_proj _cat_uid _cat_sess _cat_tn _cat_sline + _cat_start=$(get_state "current_trace_start_time") + _cat_prompt=$(get_state "current_trace_prompt") + _cat_count=$(get_state "trace_count") + _cat_proj=$(get_state "project_name") + _cat_uid=$(get_state "user_id") + _cat_sess=$(get_state "session_id") + _cat_tn=$(get_state "turn_team_name") + [[ -z "$_cat_tn" ]] && _cat_tn=$(get_state "team_name") + _cat_sline=$(get_state "trace_start_line") + + local _cat_output="" _cat_model="" _cat_in=0 _cat_out=0 + local _cat_end + _cat_end=$(get_state "deferred_turn_end_time") + [[ -z "$_cat_end" ]] && _cat_end=$(get_timestamp_ms) + [[ -z "$_cat_transcript" ]] && _cat_transcript=$(get_state "deferred_transcript") + if [[ -n "$_cat_transcript" && -f "$_cat_transcript" ]]; then + parse_transcript "$_cat_transcript" "${_cat_sline:-0}" + _cat_output=$(printf '%s' "$_pt_all_text" | head -c "10000") + _cat_model="$_pt_model"; _cat_in=$_pt_in_tokens; _cat_out=$_pt_out_tokens + fi + [[ -z "$_cat_output" ]] && _cat_output="(No response)" + local _cat_tot=$((_cat_in + _cat_out)) + + local _cat_msgs; _cat_msgs=$(jq -nc --arg o "$_cat_output" \ + '[{"message.role":"assistant","message.content":$o}]') + local _cat_attrs; _cat_attrs=$(jq -nc \ + --arg sid "$_cat_sess" --arg num "$_cat_count" --arg proj "$_cat_proj" \ + --arg in "$_cat_prompt" --arg out "$_cat_output" --arg model "$_cat_model" \ + --argjson in_tok "$_cat_in" --argjson out_tok "$_cat_out" --argjson total "$_cat_tot" \ + --argjson msgs "$_cat_msgs" --arg tn "$_cat_tn" --arg uid "$_cat_uid" \ + '{"session.id":$sid,"trace.number":$num,"project.name":$proj, + "openinference.span.kind":"AGENT","llm.model_name":$model, + "llm.token_count.prompt":$in_tok,"llm.token_count.completion":$out_tok, + "llm.token_count.total":$total,"input.value":$in,"output.value":$out, + "llm.output_messages":$msgs} + + (if $tn != "" then {"team.name":$tn} else {} end) + + (if $uid != "" then {"user.id":$uid} else {} end)') + + send_span "$(build_span "Turn $_cat_count" "AGENT" "$_cat_sid" "$_cat_tid" "" \ + "$_cat_start" "$_cat_end" "$_cat_attrs")" || true + set_state "last_trace_id" "$_cat_tid" + set_state "last_trace_span_id" "$_cat_sid" + del_states current_trace_id current_trace_span_id current_trace_start_time \ + current_trace_prompt turn_team_name \ + deferred_transcript deferred_turn_end_time + log "Turn $_cat_count closed" +} + inc_state() { _lock_state local val - val=$(jq -r ".[\"$1\"] // \"0\"" "$STATE_FILE" 2>/dev/null) + val=$(jq -r --arg k "$1" '.[$k] // "0"' "$STATE_FILE" 2>/dev/null) local tmp="${STATE_FILE}.tmp.$$" jq --arg k "$1" --arg v "$((${val:-0} + 1))" '. + {($k): $v}' "$STATE_FILE" > "$tmp" 2>/dev/null && mv "$tmp" "$STATE_FILE" || rm -f "$tmp" _unlock_state @@ -117,11 +250,11 @@ send_to_phoenix() { name: .name, context: { trace_id: .traceId, span_id: .spanId }, parent_id: .parentSpanId, - span_kind: "CHAIN", + span_kind: ((.attributes[] | select(.key == "openinference.span.kind") | .value.stringValue) // "CHAIN"), start_time: ((.startTimeUnixNano | tonumber) / 1e9 | strftime("%Y-%m-%dT%H:%M:%SZ")), end_time: ((.endTimeUnixNano | tonumber) / 1e9 | strftime("%Y-%m-%dT%H:%M:%SZ")), status_code: "OK", - attributes: (reduce .attributes[] as $a ({}; . + {($a.key): ($a.value.stringValue // $a.value.intValue // "")})) + attributes: (reduce .attributes[] as $a ({}; . + {($a.key): ($a.value.stringValue // $a.value.intValue // $a.value.doubleValue // "")})) }] }') @@ -181,7 +314,9 @@ send_to_arize() { # --- Main send function --- send_span() { local span_json="$1" - local target=$(get_target) + local span_name="${2:-}" + local target + target=$(get_target) if [[ "$ARIZE_DRY_RUN" == "true" ]]; then log_always "DRY RUN:" @@ -197,17 +332,23 @@ send_span() { *) error "No target. Set PHOENIX_ENDPOINT or ARIZE_API_KEY + ARIZE_SPACE_ID"; return 1 ;; esac - local span_name - span_name=$(echo "$span_json" | jq -r '.resourceSpans[0].scopeSpans[0].spans[0].name // "unknown"' 2>/dev/null) + [[ -z "$span_name" ]] && span_name=$(echo "$span_json" | jq -r '.resourceSpans[0].scopeSpans[0].spans[0].name // "unknown"' 2>/dev/null) log "Sent span: $span_name ($target)" } # --- Build OTLP span --- build_span() { - local name="$1" kind="$2" span_id="$3" trace_id="$4" - local parent="${5:-}" start="$6" end="${7:-$start}" attrs + # $_kind is intentionally unused in the OTLP output below. OTLP SpanKind (SERVER/CLIENT/etc.) + # is for RPC roles; OpenInference types (LLM/TOOL/CHAIN) are carried via the + # openinference.span.kind attribute instead. kind=1 (INTERNAL) is correct for all spans. + local name="$1" _kind="$2" span_id="$3" trace_id="$4" + local parent="${5:-}" start="$6" + local end="${7:-$start}" attrs attrs="${8:-"{}"}" + local name_json + name_json="\"$(printf '%s' "$name" | sed 's/\\/\\\\/g;s/"/\\"/g;s/ /\\t/g')\"" + local parent_json="" [[ -n "$parent" ]] && parent_json="\"parentSpanId\": \"$parent\"," @@ -216,7 +357,7 @@ build_span() { {"key":"service.name","value":{"stringValue":"claude-code"}} ]},"scopeSpans":[{"scope":{"name":"arize-claude-plugin"},"spans":[{ "traceId":"$trace_id","spanId":"$span_id",$parent_json - "name":"$name","kind":1, + "name":$name_json,"kind":1, "startTimeUnixNano":"${start}000000","endTimeUnixNano":"${end}000000", "attributes":$(echo "$attrs" | jq -c '[to_entries[]|{"key":.key,"value":(if (.value|type)=="number" then (if ((.value|floor) == .value) then {"intValue":.value} else {"doubleValue":.value} end) else {"stringValue":(.value|tostring)} end)}]'), "status":{"code":1} @@ -230,8 +371,8 @@ EOF # Call after reading stdin in each hook. Falls back to PID-based key if no session_id. resolve_session() { local input="${1:-'{}'}" - local sid - sid=$(echo "$input" | jq -r '.session_id // empty' 2>/dev/null || echo "") + local sid="${2:-}" + [[ -z "$sid" ]] && sid=$(echo "$input" | jq -r '.session_id // empty' 2>/dev/null || echo "") if [[ -n "$sid" ]]; then _SESSION_KEY="$sid" @@ -244,6 +385,7 @@ resolve_session() { STATE_FILE="${STATE_DIR}/state_${_SESSION_KEY}.json" _LOCK_DIR="${STATE_DIR}/.lock_${_SESSION_KEY}" + _STATE_INITIALIZED="" # new state file needs validation init_state } @@ -302,9 +444,102 @@ gc_stale_state_files() { done } +# --- Transcript Parsing --- +# Sets caller-scope variables: _pt_model, _pt_in_tokens, _pt_out_tokens, _pt_all_text +parse_transcript() { + local _pt_file="$1" _pt_skip="${2:-0}" + _pt_model="" _pt_in_tokens=0 _pt_out_tokens=0 _pt_all_text="" + + [[ ! -f "$_pt_file" ]] && return 0 + + # Token dedup: streaming chunks share a requestId. Only the final entry per + # requestId carries cumulative counts, so the reduce commits on rid change. + # Branches: no requestId -> sum directly; new requestId -> finalize prev group; same -> replace (last is cumulative) + local _pt_raw + _pt_raw=$(tail -n +"$((_pt_skip + 1))" "$_pt_file" | jq -rsc ' + [.[] | select(.type == "assistant")] | + { + model: (map(.message.model // empty) | map(select(. != "")) | last // ""), + text: [.[] | .message.content | + if type == "array" then [.[] | select(.type == "text") | .text] | join("\n") + else (. // "") + end | select(. != "" and . != "null") + ] | join("\n") | gsub("^\\n+|\\n+$"; ""), + in_tokens: (reduce .[] as $e ( + {last_rid: null, sum: 0, current: 0}; + if ($e.requestId == null or $e.requestId == "") + then {last_rid: null, sum: (.sum + .current + (($e.message.usage.input_tokens // 0) + ($e.message.usage.cache_read_input_tokens // 0) + ($e.message.usage.cache_creation_input_tokens // 0))), current: 0} + elif ($e.requestId != .last_rid) + then {last_rid: $e.requestId, sum: (.sum + .current), current: (($e.message.usage.input_tokens // 0) + ($e.message.usage.cache_read_input_tokens // 0) + ($e.message.usage.cache_creation_input_tokens // 0))} + else {last_rid: .last_rid, sum: .sum, current: (($e.message.usage.input_tokens // 0) + ($e.message.usage.cache_read_input_tokens // 0) + ($e.message.usage.cache_creation_input_tokens // 0))} + end) | (.sum + .current)), + out_tokens: (reduce .[] as $e ( + {last_rid: null, sum: 0, current: 0}; + if ($e.requestId == null or $e.requestId == "") + then {last_rid: null, sum: (.sum + .current + ($e.message.usage.output_tokens // 0)), current: 0} + elif ($e.requestId != .last_rid) + then {last_rid: $e.requestId, sum: (.sum + .current), current: ($e.message.usage.output_tokens // 0)} + else {last_rid: .last_rid, sum: .sum, current: ($e.message.usage.output_tokens // 0)} + end) | (.sum + .current)) + } | (([.model // "", (.in_tokens // 0 | tostring), (.out_tokens // 0 | tostring)] | join("\u001f")), (.text // "")) + ' 2>/dev/null) || return 0 + + # Metadata on first line (unit-separator-delimited), text on remaining lines + { IFS=$'\x1f' read -r _pt_model _pt_in_tokens _pt_out_tokens; _pt_all_text=$(cat); } <<< "$_pt_raw" +} + +# --- Trace Context Resolution --- +# Resolve current trace ID, falling back to last completed turn's trace +resolve_trace_id() { + local tid + tid=$(get_state "current_trace_id") + [[ -z "$tid" ]] && tid=$(get_state "last_trace_id") + echo "$tid" +} + +# Resolve parent span ID with agent grouping override +# Args: $1 = agent_id (optional), $2 = agent_type (optional, fallback) +resolve_parent_span_id() { + local agent_id="${1:-}" + local agent_type="${2:-}" + local parent_span_id + parent_span_id=$(get_state "current_trace_span_id") + [[ -z "$parent_span_id" ]] && parent_span_id=$(get_state "last_trace_span_id") + # Direct lookup by agent_id + if [[ -n "$agent_id" ]]; then + local agent_pid + agent_pid=$(get_state "agent_${agent_id}_span_id") + [[ -n "$agent_pid" ]] && parent_span_id="$agent_pid" + elif [[ -n "$agent_type" ]]; then + # Fallback: reverse-map agent_type → agent_id → span_id + local mapped_id + mapped_id=$(get_state "active_agent_${agent_type}") + if [[ -n "$mapped_id" ]]; then + local agent_pid + agent_pid=$(get_state "agent_${mapped_id}_span_id") + [[ -n "$agent_pid" ]] && parent_span_id="$agent_pid" + fi + fi + echo "$parent_span_id" +} + +# --- Role Resolution --- +# Sets caller-scope: _role ("lead", "teammate", "subagent", or "") +# Args: $1 = agent_id, $2 = team_name +determine_role() { + _role="" + if [[ -n "$1" ]]; then + _role="subagent" + [[ -n "$2" ]] && _role="teammate" + elif [[ -n "$2" ]]; then + _role="lead" + fi + return 0 +} + # --- Init --- check_requirements() { [[ "$ARIZE_TRACE_ENABLED" != "true" ]] && exit 0 - command -v jq &>/dev/null || { error "jq required. Install: brew install jq"; exit 1; } + command -v jq &>/dev/null || { error "jq required. See: https://jqlang.github.io/jq/download/"; exit 1; } init_state } diff --git a/plugins/claude-code-tracing/hooks/notification.sh b/plugins/claude-code-tracing/hooks/notification.sh index 6daafab..0eb5323 100755 --- a/plugins/claude-code-tracing/hooks/notification.sh +++ b/plugins/claude-code-tracing/hooks/notification.sh @@ -8,27 +8,41 @@ input=$(cat 2>/dev/null || echo '{}') resolve_session "$input" -trace_id=$(get_state "current_trace_id") +trace_id=$(resolve_trace_id) [[ -z "$trace_id" ]] && exit 0 session_id=$(get_state "session_id") message=$(echo "$input" | jq -r '.message // empty' 2>/dev/null || echo "") title=$(echo "$input" | jq -r '.title // empty' 2>/dev/null || echo "") -notif_type=$(echo "$input" | jq -r '.notification_type // "info"' 2>/dev/null || echo "info") +notif_type=$(echo "$input" | jq -r '.notification_type // "info"' 2>/dev/null) +[[ "$notif_type" == "idle_prompt" ]] && exit 0 -span_id=$(generate_uuid | tr -d '-' | cut -c1-16) +agent_type=$(echo "$input" | jq -r '.agent_type // empty' 2>/dev/null || echo "") +agent_id=$(echo "$input" | jq -r '.agent_id // empty' 2>/dev/null || echo "") + +span_id=$(generate_span_id) ts=$(get_timestamp_ms) -parent=$(get_state "current_trace_span_id") +parent=$(resolve_parent_span_id "$agent_id" "$agent_type") + +team_name=$(get_state "team_name") +determine_role "$agent_id" "$team_name" user_id=$(get_state "user_id") -attrs=$(jq -n \ +attrs=$(jq -nc \ --arg sid "$session_id" \ --arg msg "$message" \ --arg title "$title" \ --arg type "$notif_type" \ + --arg aid "$agent_id" --arg aname "$agent_type" --arg role "$_role" \ + --arg tn "$team_name" \ --arg uid "$user_id" \ - '{"session.id":$sid,"openinference.span.kind":"chain","notification.message":$msg,"notification.title":$title,"notification.type":$type,"input.value":$msg} + (if $uid != "" then {"user.id":$uid} else {} end)') + '{"session.id":$sid,"openinference.span.kind":"CHAIN","notification.message":$msg,"notification.title":$title,"notification.type":$type,"input.value":$msg} + + (if $aid != "" then {"agent.id":$aid,"agent.name":$aname,"agent.role":$role} + elif $role == "lead" then {"agent.role":"lead"} + else {} end) + + (if $tn != "" then {"team.name":$tn} else {} end) + + (if $uid != "" then {"user.id":$uid} else {} end)') span=$(build_span "Notification: $notif_type" "CHAIN" "$span_id" "$trace_id" "$parent" "$ts" "$ts" "$attrs") send_span "$span" || true diff --git a/plugins/claude-code-tracing/hooks/permission_request.sh b/plugins/claude-code-tracing/hooks/permission_request.sh index 355a577..dcce448 100755 --- a/plugins/claude-code-tracing/hooks/permission_request.sh +++ b/plugins/claude-code-tracing/hooks/permission_request.sh @@ -8,26 +8,38 @@ input=$(cat 2>/dev/null || echo '{}') resolve_session "$input" -_log_to_file "DEBUG permission_request input: $(echo "$input" | jq -c .)" +log "permission_request input: $(echo "$input" | jq -c .)" -trace_id=$(get_state "current_trace_id") +trace_id=$(resolve_trace_id) [[ -z "$trace_id" ]] && exit 0 permission=$(echo "$input" | jq -r '.permission // empty' 2>/dev/null || echo "") tool=$(echo "$input" | jq -r '.tool_name // empty' 2>/dev/null || echo "") tool_input=$(echo "$input" | jq -c '.tool_input // empty' 2>/dev/null || echo "") -span_id=$(generate_uuid | tr -d '-' | cut -c1-16) +agent_type=$(echo "$input" | jq -r '.agent_type // empty' 2>/dev/null || echo "") +agent_id=$(echo "$input" | jq -r '.agent_id // empty' 2>/dev/null || echo "") + +span_id=$(generate_span_id) ts=$(get_timestamp_ms) -parent=$(get_state "current_trace_span_id") +parent=$(resolve_parent_span_id "$agent_id" "$agent_type") session_id=$(get_state "session_id") user_id=$(get_state "user_id") -attrs=$(jq -n --arg sid "$session_id" --arg perm "$permission" --arg tool "$tool" --arg tinput "$tool_input" --arg uid "$user_id" \ - '{"session.id":$sid,"openinference.span.kind":"chain","permission.type":$perm,"permission.tool":$tool,"input.value":$tinput} + (if $uid != "" then {"user.id":$uid} else {} end)') +team_name=$(get_state "team_name") +determine_role "$agent_id" "$team_name" + +attrs=$(jq -nc --arg sid "$session_id" --arg perm "$permission" --arg tool "$tool" --arg tinput "$tool_input" \ + --arg aid "$agent_id" --arg aname "$agent_type" --arg role "$_role" \ + --arg tn "$team_name" --arg uid "$user_id" \ + '{"session.id":$sid,"openinference.span.kind":"CHAIN","permission.type":$perm,"permission.tool":$tool,"input.value":$tinput} + + (if $aid != "" then {"agent.id":$aid,"agent.name":$aname,"agent.role":$role} + elif $role == "lead" then {"agent.role":"lead"} + else {} end) + + (if $tn != "" then {"team.name":$tn} else {} end) + + (if $uid != "" then {"user.id":$uid} else {} end)') span=$(build_span "Permission Request" "CHAIN" "$span_id" "$trace_id" "$parent" "$ts" "$ts" "$attrs") send_span "$span" || true - diff --git a/plugins/claude-code-tracing/hooks/post_tool_use.sh b/plugins/claude-code-tracing/hooks/post_tool_use.sh index 781ef1c..c545567 100755 --- a/plugins/claude-code-tracing/hooks/post_tool_use.sh +++ b/plugins/claude-code-tracing/hooks/post_tool_use.sh @@ -6,31 +6,62 @@ check_requirements input=$(cat 2>/dev/null || echo '{}') [[ -z "$input" ]] && input='{}' -resolve_session "$input" +IFS=$'\x1f' read -r _input_sid tool_name tool_id agent_id agent_type < <( + echo "$input" | jq -r '[.session_id // "", .tool_name // "unknown", .tool_use_id // "", .agent_id // "", .agent_type // ""] | join("\u001f")' 2>/dev/null +) -session_id=$(get_state "session_id") -[[ -z "$session_id" ]] && exit 0 - -trace_id=$(get_state "current_trace_id") -parent_span_id=$(get_state "current_trace_span_id") -inc_state "tool_count" +resolve_session "$input" "$_input_sid" -tool_name=$(echo "$input" | jq -r '.tool_name // "unknown"' 2>/dev/null || echo "unknown") -tool_id=$(echo "$input" | jq -r '.tool_use_id // empty' 2>/dev/null || echo "") tool_input_raw=$(echo "$input" | jq -c '.tool_input // {}' 2>/dev/null || echo '{}') tool_input=$(echo "$tool_input_raw" | head -c 5000) -tool_response=$(echo "$input" | jq -r '.tool_response // empty' 2>/dev/null | head -c 5000) || true +raw_response=$(echo "$input" | jq -r '.tool_response // empty' 2>/dev/null || echo "") +tool_response=$(echo "$raw_response" | head -c 5000) + +IFS=$'\x1f' read -r session_id _s_trace_id _s_last_trace_id \ + _s_trace_span_id _s_last_trace_span_id \ + _s_team_name user_id _s_tool_start < <( + jq -r --arg tk "tool_${tool_id}_start" '[ + (.session_id // ""), (.current_trace_id // ""), (.last_trace_id // ""), + (.current_trace_span_id // ""), (.last_trace_span_id // ""), + (.team_name // ""), (.user_id // ""), (.[$tk] // "") + ] | join("\u001f")' "$STATE_FILE" 2>/dev/null +) + +[[ -z "$session_id" ]] && exit 0 + +trace_id="$_s_trace_id" +[[ -z "$trace_id" ]] && trace_id="$_s_last_trace_id" +[[ -z "$trace_id" ]] && exit 0 + +parent_span_id="$_s_trace_span_id" +[[ -z "$parent_span_id" ]] && parent_span_id="$_s_last_trace_span_id" + +# team_name_check: frozen at read time for dedup/role checks. +# team_name: may be mutated below by TeamCreate/TeamDelete handling. +team_name_check="$_s_team_name" +team_name="$_s_team_name" + +start_time="$_s_tool_start" +[[ -z "$start_time" ]] && start_time=$(get_timestamp_ms) + +inc_state "tool_count" # Track whether content was truncated -tool_input_truncated="false" -tool_response_truncated="false" -[[ ${#tool_input_raw} -gt 5000 ]] && tool_input_truncated="true" -raw_response=$(echo "$input" | jq -r '.tool_response // empty' 2>/dev/null || echo "") -[[ ${#raw_response} -gt 5000 ]] && tool_response_truncated="true" truncated="false" -[[ "$tool_input_truncated" == "true" || "$tool_response_truncated" == "true" ]] && truncated="true" +[[ ${#tool_input_raw} -gt 5000 || ${#raw_response} -gt 5000 ]] && truncated="true" + +IFS=$'\x1f' read -r _f_command _f_file_path _f_url _f_query _f_pattern _f_path \ + _f_description _f_to _f_message_str _f_name _f_task_subject _f_is_shutdown < <( + echo "$tool_input_raw" | jq -r '[ + (.command // ""), (.file_path // .pattern // ""), (.url // ""), + (.query // ""), (.pattern // ""), (.path // ""), + (.description // ""), (.to // ""), + ((.message // "") | if type == "string" then . else "" end), + (.name // ""), (.task_subject // ""), + (if (.message | type) == "object" then (.message.type // "") else "" end) + ] | join("\u001f")' 2>/dev/null +) -# Extract tool-specific metadata for structured attributes tool_description="" tool_command="" tool_file_path="" @@ -39,53 +70,136 @@ tool_query="" case "$tool_name" in Bash) - tool_command=$(echo "$tool_input_raw" | jq -r '.command // empty' 2>/dev/null || echo "") + tool_command="$_f_command" tool_description=$(echo "$tool_command" | head -c 200) ;; Read|Write|Edit|Glob) - tool_file_path=$(echo "$tool_input_raw" | jq -r '.file_path // .pattern // empty' 2>/dev/null || echo "") + tool_file_path="$_f_file_path" tool_description=$(echo "$tool_file_path" | head -c 200) ;; - WebSearch) - tool_query=$(echo "$tool_input_raw" | jq -r '.query // empty' 2>/dev/null || echo "") + WebSearch|ToolSearch) + tool_query="$_f_query" tool_description=$(echo "$tool_query" | head -c 200) ;; WebFetch) - tool_url=$(echo "$tool_input_raw" | jq -r '.url // empty' 2>/dev/null || echo "") + tool_url="$_f_url" tool_description=$(echo "$tool_url" | head -c 200) ;; Grep) - tool_query=$(echo "$tool_input_raw" | jq -r '.pattern // empty' 2>/dev/null || echo "") - tool_file_path=$(echo "$tool_input_raw" | jq -r '.path // empty' 2>/dev/null || echo "") + tool_query="$_f_pattern" + tool_file_path="$_f_path" tool_description="grep: $(echo "$tool_query" | head -c 100)" ;; + Agent) + tool_description=$(echo "$_f_description" | head -c 200) + ;; + SendMessage) + tool_description=$(echo "$_f_to" | head -c 200) + sm_msg="$_f_message_str" + [[ -n "$sm_msg" ]] && tool_input=$(printf 'to: %s\n%s' "$tool_description" "$sm_msg" | head -c 5000) + ;; + TaskCreate|TaskUpdate) + tool_description=$(echo "$_f_task_subject" | head -c 200) + ;; *) tool_description=$(echo "$tool_input" | head -c 200) + [[ "$tool_description" == "{}" ]] && tool_description="" ;; esac -start_time=$(get_state "tool_${tool_id}_start") -[[ -z "$start_time" ]] && start_time=$(get_timestamp_ms) +is_shutdown="$_f_is_shutdown" + end_time=$(get_timestamp_ms) del_state "tool_${tool_id}_start" -span_id=$(generate_uuid | tr -d '-' | cut -c1-16) +span_id=$(generate_span_id) -# Build base attributes -user_id=$(get_state "user_id") +# Dedup: keep teammate's span, skip lead's duplicate +check_team_dedup "$tool_id" "seen_tool" "$agent_id" "$team_name_check" || exit 0 -attrs=$(jq -n \ +# Agent grouping: lazy-init an AGENT span for this agent's work period. +# Works for both subagents and teammates — keyed on agent_id (unique per instance). +if [[ -n "$agent_id" && -n "$agent_type" && "$agent_type" != "unknown" && "$agent_type" != "null" && "$is_shutdown" != "shutdown_response" ]]; then + agent_parent=$(get_or_set_state "agent_${agent_id}_span_id" "$(generate_span_id)") + if [[ "$(get_state "agent_${agent_id}_start_time")" == "" ]]; then + set_state "agent_${agent_id}_start_time" "$start_time" + set_state "active_agent_${agent_type}" "$agent_id" + fi + parent_span_id="$agent_parent" +elif [[ "$is_shutdown" == "shutdown_response" && -n "$agent_id" ]]; then + shutdown_parent=$(get_state "agent_${agent_id}_span_id") + [[ -z "$shutdown_parent" ]] && shutdown_parent=$(generate_span_id) + shutdown_turn_parent="$parent_span_id" + parent_span_id="$shutdown_parent" +fi + +if [[ "$tool_name" == "TeamCreate" ]]; then + team_from_response=$(echo "$raw_response" | jq -r '.team_name // empty' 2>/dev/null || echo "") + if [[ -n "$team_from_response" ]]; then + set_state "team_name" "$team_from_response" + team_name="$team_from_response" + fi +fi + +# Race fix: preemptively mark agents active on the lead side so Stop +# sees active_agent_* > 0 before the worker's first tool fires. +if [[ -z "$agent_id" && -n "$team_name_check" ]]; then + if [[ "$tool_name" == "Agent" ]]; then + [[ -n "$_f_name" ]] && set_state "active_agent_${_f_name}" "spawning" + elif [[ "$tool_name" == "SendMessage" ]]; then + [[ -n "$_f_to" ]] && set_state "active_agent_${_f_to}" "waking" + fi +fi + +if [[ "$tool_name" == "TeamDelete" ]]; then + # Only clean state if deletion succeeded — failed deletes keep the team active + if [[ "$raw_response" != *"active member"* ]]; then + set_state "turn_team_name" "$team_name" + clean_team_state + del_state "team_name" + team_name="" + else + log "TeamDelete blocked (active members) — keeping team state" + fi +fi + +determine_role "$agent_id" "$team_name_check" + +attrs=$(jq -nc \ --arg sid "$session_id" --arg tool "$tool_name" \ --arg in "$tool_input" --arg out "$tool_response" \ --arg desc "$tool_description" --arg trunc "$truncated" \ + --arg cmd "$tool_command" --arg fpath "$tool_file_path" \ + --arg url "$tool_url" --arg query "$tool_query" \ + --arg aid "$agent_id" --arg aname "$agent_type" --arg role "$_role" \ + --arg tn "$team_name" \ --arg uid "$user_id" \ - '{"session.id":$sid,"openinference.span.kind":"tool","tool.name":$tool,"input.value":$in,"output.value":$out,"tool.description":$desc,"tool.truncated":$trunc} + (if $uid != "" then {"user.id":$uid} else {} end)') - -# Add tool-specific structured attributes -[[ -n "$tool_command" ]] && attrs=$(echo "$attrs" | jq --arg v "$tool_command" '. + {"tool.command":$v}') -[[ -n "$tool_file_path" ]] && attrs=$(echo "$attrs" | jq --arg v "$tool_file_path" '. + {"tool.file_path":$v}') -[[ -n "$tool_url" ]] && attrs=$(echo "$attrs" | jq --arg v "$tool_url" '. + {"tool.url":$v}') -[[ -n "$tool_query" ]] && attrs=$(echo "$attrs" | jq --arg v "$tool_query" '. + {"tool.query":$v}') + '{"session.id":$sid,"openinference.span.kind":"TOOL","tool.name":$tool,"input.value":$in,"output.value":$out,"tool.description":$desc,"tool.truncated":$trunc} + + (if $cmd != "" then {"tool.command":$cmd} else {} end) + + (if $fpath != "" then {"tool.file_path":$fpath} else {} end) + + (if $url != "" then {"tool.url":$url} else {} end) + + (if $query != "" then {"tool.query":$query} else {} end) + + (if $aid != "" then {"agent.id":$aid,"agent.name":$aname,"agent.role":$role} + elif $role == "lead" then {"agent.role":"lead"} + else {} end) + + (if $tn != "" then {"team.name":$tn} else {} end) + + (if $uid != "" then {"user.id":$uid} else {} end)') span=$(build_span "$tool_name" "TOOL" "$span_id" "$trace_id" "$parent_span_id" "$start_time" "$end_time" "$attrs") -send_span "$span" || true +send_span "$span" "$tool_name" || true + +if [[ -n "${shutdown_parent:-}" ]]; then + _sd_start=$(get_state "agent_${agent_id}_start_time") + [[ -z "$_sd_start" ]] && _sd_start="$start_time" + _sd_attrs=$(jq -nc --arg sid "$session_id" --arg aid "$agent_id" --arg at "$agent_type" \ + --arg tn "$team_name" --arg uid "$user_id" \ + '{"session.id":$sid,"openinference.span.kind":"AGENT","agent.id":$aid,"agent.name":$at,"agent.role":"teammate"} + + (if $tn != "" then {"team.name":$tn} else {} end) + + (if $uid != "" then {"user.id":$uid} else {} end)') + send_span "$(build_span "Teammate: $agent_type" "AGENT" "$shutdown_parent" "$trace_id" \ + "$shutdown_turn_parent" "$_sd_start" "$end_time" "$_sd_attrs")" "Teammate: $agent_type" || true + del_states "agent_${agent_id}_span_id" "agent_${agent_id}_start_time" + # Sentinel: tells SubagentStop that the grouping span was already emitted here, + # so it must not create a second AGENT span for the same lifecycle. + set_state "agent_${agent_id}_shutdown_complete" "1" +fi diff --git a/plugins/claude-code-tracing/hooks/session_end.sh b/plugins/claude-code-tracing/hooks/session_end.sh index fb57609..275c1d3 100755 --- a/plugins/claude-code-tracing/hooks/session_end.sh +++ b/plugins/claude-code-tracing/hooks/session_end.sh @@ -12,6 +12,10 @@ resolve_session "$input" session_id=$(get_state "session_id") [[ -z "$session_id" ]] && exit 0 +# Close any active turn (team in-progress or orphaned) +transcript=$(echo "$input" | jq -r '.transcript_path // empty' 2>/dev/null || echo "") +close_active_turn "$transcript" +del_state "team_name" trace_count=$(get_state "trace_count") tool_count=$(get_state "tool_count") diff --git a/plugins/claude-code-tracing/hooks/session_start.sh b/plugins/claude-code-tracing/hooks/session_start.sh index 1c7e558..1d426fe 100755 --- a/plugins/claude-code-tracing/hooks/session_start.sh +++ b/plugins/claude-code-tracing/hooks/session_start.sh @@ -8,6 +8,10 @@ input=$(cat 2>/dev/null || echo '{}') [[ -z "$input" ]] && input='{}' resolve_session "$input" +# Reset state for fresh session +_lock_state +echo '{}' > "$STATE_FILE" +_unlock_state ensure_session_initialized "$input" log "Session started: $(get_state 'session_id')" diff --git a/plugins/claude-code-tracing/hooks/stop.sh b/plugins/claude-code-tracing/hooks/stop.sh index 18ae1b7..43adef2 100755 --- a/plugins/claude-code-tracing/hooks/stop.sh +++ b/plugins/claude-code-tracing/hooks/stop.sh @@ -18,42 +18,36 @@ user_prompt=$(get_state "current_trace_prompt") project_name=$(get_state "project_name") trace_count=$(get_state "trace_count") -# Parse transcript for AI response and tokens transcript=$(echo "$input" | jq -r '.transcript_path // empty' 2>/dev/null || echo "") + +_tn=$(get_state 'team_name') +if [[ -n "$_tn" ]]; then + # Team still alive — defer Turn span until TeamDelete clears team_name. + # Flush happens via close_active_turn in user_prompt_submit / session_end. + [[ -n "$transcript" ]] && set_state "deferred_transcript" "$transcript" + set_state "deferred_turn_end_time" "$(get_timestamp_ms)" + log "Turn $trace_count: team active, deferring" + exit 0 +fi + +_turn_tn=$(get_state 'turn_team_name') +[[ -n "$_turn_tn" ]] && _tn="$_turn_tn" + +# Parse transcript for AI response and tokens output="" model="" in_tokens=0 out_tokens=0 if [[ -f "$transcript" ]]; then start_line=$(get_state "trace_start_line") - skip_lines=$((${start_line:-0})) - - # Use tail to skip already-processed lines instead of iterating from line 0 - while IFS= read -r line; do - [[ -z "$line" ]] && continue - - [[ $(echo "$line" | jq -r '.type' 2>/dev/null) == "assistant" ]] || continue - - # Extract text - text=$(echo "$line" | jq -r '.message.content | if type=="array" then [.[]|select(.type=="text")|.text]|join("\n") else . end' 2>/dev/null) - [[ -n "$text" && "$text" != "null" ]] && output="${output:+$output -}$text" - - # Extract model and tokens (safe: validate numeric before arithmetic) - model=$(echo "$line" | jq -r '.message.model // empty' 2>/dev/null) - val=$(echo "$line" | jq -r '.message.usage.input_tokens // 0' 2>/dev/null) - [[ "$val" =~ ^[0-9]+$ ]] && in_tokens=$((in_tokens + val)) - val=$(echo "$line" | jq -r '.message.usage.output_tokens // 0' 2>/dev/null) - [[ "$val" =~ ^[0-9]+$ ]] && out_tokens=$((out_tokens + val)) - val=$(echo "$line" | jq -r '.message.usage.cache_read_input_tokens // 0' 2>/dev/null) - [[ "$val" =~ ^[0-9]+$ ]] && in_tokens=$((in_tokens + val)) - val=$(echo "$line" | jq -r '.message.usage.cache_creation_input_tokens // 0' 2>/dev/null) - [[ "$val" =~ ^[0-9]+$ ]] && in_tokens=$((in_tokens + val)) - done < <(tail -n +"$((skip_lines + 1))" "$transcript") + parse_transcript "$transcript" "${start_line:-0}" + output="$_pt_all_text" + model="$_pt_model" + in_tokens=$_pt_in_tokens + out_tokens=$_pt_out_tokens fi -output=$(printf '%s' "$output" | head -c 5000) +output=$(printf '%s' "$output" | head -c "10000") [[ -z "$output" ]] && output="(No response)" -# Compute total token count total_tokens=$((in_tokens + out_tokens)) output_messages=$(jq -nc --arg out "$output" '[{"message.role":"assistant","message.content":$out}]') @@ -63,18 +57,22 @@ user_id=$(get_state "user_id") attrs=$(jq -nc \ --arg sid "$session_id" --arg num "$trace_count" --arg proj "$project_name" \ --arg in "$user_prompt" --arg out "$output" --arg model "$model" \ - --arg uid "$user_id" \ + --arg uid "$user_id" --arg tn "$_tn" \ --argjson in_tok "$in_tokens" --argjson out_tok "$out_tokens" --argjson total_tok "$total_tokens" \ --argjson out_msgs "$output_messages" \ - '{"session.id":$sid,"trace.number":$num,"project.name":$proj,"openinference.span.kind":"LLM","llm.model_name":$model,"llm.token_count.prompt":$in_tok,"llm.token_count.completion":$out_tok,"llm.token_count.total":$total_tok,"input.value":$in,"output.value":$out,"llm.output_messages":$out_msgs} + (if $uid != "" then {"user.id":$uid} else {} end)') + '{"session.id":$sid,"trace.number":$num,"project.name":$proj,"openinference.span.kind":"AGENT","llm.model_name":$model,"llm.token_count.prompt":$in_tok,"llm.token_count.completion":$out_tok,"llm.token_count.total":$total_tok,"input.value":$in,"output.value":$out,"llm.output_messages":$out_msgs} + + (if $uid != "" then {"user.id":$uid} else {} end) + + (if $tn != "" then {"team.name":$tn} else {} end)') -span=$(build_span "Turn $trace_count" "LLM" "$trace_span_id" "$trace_id" "" "$trace_start_time" "$(get_timestamp_ms)" "$attrs") +span=$(build_span "Turn $trace_count" "AGENT" "$trace_span_id" "$trace_id" "" "$trace_start_time" "$(get_timestamp_ms)" "$attrs") send_span "$span" || true -del_state "current_trace_id" -del_state "current_trace_span_id" -del_state "current_trace_start_time" -del_state "current_trace_prompt" +# Preserve for late-arriving teammate hooks (SubagentStop, TaskCompleted) +set_state "last_trace_id" "$trace_id" +set_state "last_trace_span_id" "$trace_span_id" + +del_states current_trace_id current_trace_span_id current_trace_start_time current_trace_prompt turn_team_name \ + deferred_transcript deferred_turn_end_time log "Turn $trace_count sent" # Opportunistic GC for environments without SessionEnd (e.g., Python Agent SDK) diff --git a/plugins/claude-code-tracing/hooks/subagent_stop.sh b/plugins/claude-code-tracing/hooks/subagent_stop.sh index dd3e925..0feff3c 100755 --- a/plugins/claude-code-tracing/hooks/subagent_stop.sh +++ b/plugins/claude-code-tracing/hooks/subagent_stop.sh @@ -8,7 +8,7 @@ input=$(cat 2>/dev/null || echo '{}') resolve_session "$input" -trace_id=$(get_state "current_trace_id") +trace_id=$(resolve_trace_id) [[ -z "$trace_id" ]] && exit 0 session_id=$(get_state "session_id") @@ -21,59 +21,72 @@ if [[ -z "$agent_type" || "$agent_type" == "unknown" || "$agent_type" == "null" exit 0 fi -span_id=$(generate_uuid | tr -d '-' | cut -c1-16) +# Guard: if post_tool_use already emitted the grouping AGENT span via the +# shutdown_response path, skip creating a duplicate here. +if [[ -n "$agent_id" && -n "$(get_state "agent_${agent_id}_shutdown_complete")" ]]; then + del_state "agent_${agent_id}_shutdown_complete" + log "Skipping SubagentStop span — grouping span already emitted by shutdown_response (agent_id=$agent_id)" + exit 0 +fi + end_time=$(get_timestamp_ms) -parent=$(get_state "current_trace_span_id") +parent=$(resolve_parent_span_id) -# Try to parse subagent transcript for output transcript_path=$(echo "$input" | jq -r '.agent_transcript_path // empty' 2>/dev/null || echo "") subagent_output="" -start_time="" model="" in_tokens=0 out_tokens=0 -if [[ -n "$transcript_path" && -f "$transcript_path" ]]; then - # Use file birth time (creation time) for start estimate - if stat -f %B "$transcript_path" &>/dev/null; then - # macOS: %B = birth time - file_time_s=$(stat -f %B "$transcript_path") - start_time=$((file_time_s * 1000)) - elif stat -c %W "$transcript_path" &>/dev/null; then - # Linux: %W = birth time (may be 0 if unsupported) - file_time_s=$(stat -c %W "$transcript_path") - if [[ "$file_time_s" =~ ^[0-9]+$ && "$file_time_s" -gt 0 ]]; then - start_time=$((file_time_s * 1000)) +# Check for pre-created grouping span (set by post_tool_use when agent used tools) +span_id=$(get_state "agent_${agent_id}_span_id") +team_name=$(get_state "team_name") + +if [[ -z "$span_id" ]]; then + # No tools fired during this work period — create grouping span now + span_id=$(generate_span_id) + set_state "agent_${agent_id}_span_id" "$span_id" + + # Estimate start_time + if [[ -n "$team_name" ]]; then + # Teammate: prefer previous turn's end time + prev_stop=$(get_state "teammate_${agent_type}_last_stop_time") + if [[ -n "$prev_stop" ]]; then + set_state "agent_${agent_id}_start_time" "$prev_stop" + elif [[ -n "$transcript_path" && -f "$transcript_path" ]]; then + file_birth=$(get_file_birth_ms "$transcript_path") + set_state "agent_${agent_id}_start_time" "${file_birth:-$end_time}" + else + set_state "agent_${agent_id}_start_time" "$end_time" fi + else + # Subagent: estimate from transcript file birth time + _sa_start=$(get_file_birth_ms "$transcript_path") + set_state "agent_${agent_id}_start_time" "${_sa_start:-$end_time}" fi - - # Parse subagent transcript for output and token usage - while IFS= read -r line; do - [[ -z "$line" ]] && continue - [[ $(echo "$line" | jq -r '.type' 2>/dev/null) == "assistant" ]] || continue - - # Extract last assistant message as output - text=$(echo "$line" | jq -r '.message.content | if type=="array" then [.[]|select(.type=="text")|.text]|join("\n") else . end' 2>/dev/null) - [[ -n "$text" && "$text" != "null" ]] && subagent_output="$text" - - # Accumulate token counts - model=$(echo "$line" | jq -r '.message.model // empty' 2>/dev/null) - val=$(echo "$line" | jq -r '.message.usage.input_tokens // 0' 2>/dev/null) - [[ "$val" =~ ^[0-9]+$ ]] && in_tokens=$((in_tokens + val)) - val=$(echo "$line" | jq -r '.message.usage.output_tokens // 0' 2>/dev/null) - [[ "$val" =~ ^[0-9]+$ ]] && out_tokens=$((out_tokens + val)) - val=$(echo "$line" | jq -r '.message.usage.cache_read_input_tokens // 0' 2>/dev/null) - [[ "$val" =~ ^[0-9]+$ ]] && in_tokens=$((in_tokens + val)) - val=$(echo "$line" | jq -r '.message.usage.cache_creation_input_tokens // 0' 2>/dev/null) - [[ "$val" =~ ^[0-9]+$ ]] && in_tokens=$((in_tokens + val)) - done < "$transcript_path" - - subagent_output=$(echo "$subagent_output" | head -c 5000) fi -# Fall back to current time if no start time found +start_time=$(get_state "agent_${agent_id}_start_time") [[ -z "$start_time" ]] && start_time="$end_time" +if [[ -n "$transcript_path" && -f "$transcript_path" ]]; then + _pt_offset=0 + [[ -n "$team_name" ]] && _pt_offset=$(get_state "teammate_${agent_type}_transcript_offset") && _pt_offset=${_pt_offset:-0} + parse_transcript "$transcript_path" "$_pt_offset" + subagent_output=$(echo "$_pt_all_text" | head -c "10000") + model="$_pt_model" + in_tokens=$_pt_in_tokens + out_tokens=$_pt_out_tokens + # Teammates: store transcript offset for next work period + [[ -n "$team_name" ]] && set_state "teammate_${agent_type}_transcript_offset" "$(wc -l < "$transcript_path" | tr -d ' ')" +fi + total_tokens=$((in_tokens + out_tokens)) +if [[ -n "$team_name" ]]; then + _role="teammate"; span_name="Teammate: $agent_type" +else + _role="subagent"; span_name="Subagent: $agent_type" +fi + user_id=$(get_state "user_id") attrs=$(jq -nc \ @@ -82,9 +95,27 @@ attrs=$(jq -nc \ --arg agent_type "$agent_type" \ --arg output "$subagent_output" \ --arg model "$model" \ + --arg role "$_role" \ + --arg tn "$team_name" \ --arg uid "$user_id" \ --argjson in_tok "$in_tokens" --argjson out_tok "$out_tokens" --argjson total_tok "$total_tokens" \ - '{"session.id":$sid,"openinference.span.kind":"chain","subagent.id":$agent_id,"subagent.type":$agent_type,"llm.model_name":$model,"llm.token_count.prompt":$in_tok,"llm.token_count.completion":$out_tok,"llm.token_count.total":$total_tok} + (if $output != "" then {"output.value":$output} else {} end) + (if $uid != "" then {"user.id":$uid} else {} end)') + '{"session.id":$sid,"openinference.span.kind":"AGENT","agent.id":$agent_id,"agent.name":$agent_type,"agent.role":$role,"llm.model_name":$model,"llm.token_count.prompt":$in_tok,"llm.token_count.completion":$out_tok,"llm.token_count.total":$total_tok} + + (if $output != "" then {"output.value":$output} else {} end) + + (if $tn != "" then {"team.name":$tn} else {} end) + + (if $uid != "" then {"user.id":$uid} else {} end)') -span=$(build_span "Subagent: $agent_type" "CHAIN" "$span_id" "$trace_id" "$parent" "$start_time" "$end_time" "$attrs") +span=$(build_span "$span_name" "AGENT" "$span_id" "$trace_id" "$parent" "$start_time" "$end_time" "$attrs") send_span "$span" || true + +del_states "agent_${agent_id}_span_id" "agent_${agent_id}_start_time" + +# Clean up reverse mapping if still pointing to this agent +_current_active=$(get_state "active_agent_${agent_type}") +if [[ "$_current_active" == "$agent_id" || "$_current_active" == "spawning" || "$_current_active" == "waking" ]]; then + del_state "active_agent_${agent_type}" +fi + +if [[ -n "$team_name" ]]; then + # Teammate: persist turn-end time for next work period's start estimate + set_state "teammate_${agent_type}_last_stop_time" "$end_time" +fi diff --git a/plugins/claude-code-tracing/hooks/task_completed.sh b/plugins/claude-code-tracing/hooks/task_completed.sh new file mode 100755 index 0000000..4bb84cd --- /dev/null +++ b/plugins/claude-code-tracing/hooks/task_completed.sh @@ -0,0 +1,56 @@ +#!/bin/bash +# TaskCompleted - Create span for task completion with team context +source "$(dirname "$0")/common.sh" +check_requirements + +input=$(cat 2>/dev/null || echo '{}') +[[ -z "$input" ]] && input='{}' + +resolve_session "$input" + +trace_id=$(resolve_trace_id) +[[ -z "$trace_id" ]] && exit 0 + +session_id=$(get_state "session_id") +[[ -z "$session_id" ]] && exit 0 + +team_name=$(echo "$input" | jq -r '.team_name // empty' 2>/dev/null || echo "") +teammate_name=$(echo "$input" | jq -r '.teammate_name // empty' 2>/dev/null || echo "") +agent_id=$(echo "$input" | jq -r '.agent_id // empty' 2>/dev/null || echo "") +agent_type=$(echo "$input" | jq -r '.agent_type // empty' 2>/dev/null || echo "") +task_id=$(echo "$input" | jq -r '.task_id // empty' 2>/dev/null || echo "") +task_subject=$(echo "$input" | jq -r '.task_subject // empty' 2>/dev/null || echo "") +task_description=$(echo "$input" | jq -r '.task_description // empty' 2>/dev/null | head -c 5000) + +[[ -n "$team_name" ]] && set_state "team_name" "$team_name" + +# Dedup: keep teammate's span, skip lead's duplicate +check_team_dedup "$task_id" "tc_seen" "$agent_id" || exit 0 + +span_id=$(generate_span_id) +end_time=$(get_timestamp_ms) +# Nest under agent grouping span if available +parent=$(resolve_parent_span_id "$agent_id" "${agent_type:-$teammate_name}") + +input_val="${task_description:-$task_subject}" +user_id=$(get_state "user_id") +attrs=$(jq -nc \ + --arg sid "$session_id" \ + --arg tn "$team_name" --arg tmate "$teammate_name" \ + --arg tid "$task_id" --arg tsubj "$task_subject" \ + --arg tdesc "$task_description" --arg inval "$input_val" \ + --arg uid "$user_id" \ + '{"session.id":$sid,"openinference.span.kind":"CHAIN"} + + (if $tn != "" then {"team.name":$tn} else {} end) + + (if $tmate != "" then {"agent.name":$tmate,"agent.role":"teammate"} else {} end) + + (if $tid != "" then {"task.id":$tid} else {} end) + + (if $tsubj != "" then {"task.subject":$tsubj} else {} end) + + (if $tdesc != "" then {"task.description":$tdesc} else {} end) + + (if $inval != "" then {"input.value":$inval} else {} end) + + (if $uid != "" then {"user.id":$uid} else {} end)') + +span_name="Task Completed" +[[ -n "$task_subject" ]] && span_name="Task Completed: $task_subject" + +span=$(build_span "$span_name" "CHAIN" "$span_id" "$trace_id" "$parent" "$end_time" "$end_time" "$attrs") +send_span "$span" || true diff --git a/plugins/claude-code-tracing/hooks/teammate_idle.sh b/plugins/claude-code-tracing/hooks/teammate_idle.sh new file mode 100755 index 0000000..7fef8d2 --- /dev/null +++ b/plugins/claude-code-tracing/hooks/teammate_idle.sh @@ -0,0 +1,18 @@ +#!/bin/bash +# TeammateIdle - Cache team_name in session state +# No span emitted (SubagentStop already covers teammate turn-end events) +source "$(dirname "$0")/common.sh" +check_requirements + +input=$(cat 2>/dev/null || echo '{}') +[[ -z "$input" ]] && input='{}' + +resolve_session "$input" + +team_name=$(echo "$input" | jq -r '.team_name // empty' 2>/dev/null || echo "") +teammate_name=$(echo "$input" | jq -r '.teammate_name // empty' 2>/dev/null || echo "") + +if [[ -n "$team_name" ]]; then + set_state "team_name" "$team_name" + log "Cached team_name=$team_name (teammate=$teammate_name)" +fi diff --git a/plugins/claude-code-tracing/hooks/user_prompt_submit.sh b/plugins/claude-code-tracing/hooks/user_prompt_submit.sh index 0f86b55..55f785a 100755 --- a/plugins/claude-code-tracing/hooks/user_prompt_submit.sh +++ b/plugins/claude-code-tracing/hooks/user_prompt_submit.sh @@ -6,7 +6,6 @@ check_requirements input=$(cat 2>/dev/null || echo '{}') [[ -z "$input" ]] && input='{}' -# Resolve state file from session_id in input JSON resolve_session "$input" # Lazy init: if SessionStart never fired (e.g., Python Agent SDK), initialize now @@ -14,43 +13,22 @@ ensure_session_initialized "$input" session_id=$(get_state "session_id") -# --- Fail-safe: close any prior Turn span that Stop never emitted --- -prev_trace_id=$(get_state "current_trace_id") -prev_span_id=$(get_state "current_trace_span_id") -if [[ -n "$prev_trace_id" && -n "$prev_span_id" ]]; then - prev_start=$(get_state "current_trace_start_time") - prev_prompt=$(get_state "current_trace_prompt") - prev_count=$(get_state "trace_count") - project_name=$(get_state "project_name") - end_time=$(get_timestamp_ms) - - user_id=$(get_state "user_id") - - attrs=$(jq -nc \ - --arg sid "$session_id" --arg num "$prev_count" --arg proj "$project_name" \ - --arg in "$prev_prompt" --arg uid "$user_id" \ - '{"session.id":$sid,"trace.number":$num,"project.name":$proj,"openinference.span.kind":"LLM","input.value":$in,"output.value":"(Turn closed by fail-safe: Stop hook did not fire)"} + (if $uid != "" then {"user.id":$uid} else {} end)') - - span=$(build_span "Turn $prev_count" "LLM" "$prev_span_id" "$prev_trace_id" "" "$prev_start" "$end_time" "$attrs") - send_span "$span" || true - - del_state "current_trace_id" - del_state "current_trace_span_id" - del_state "current_trace_start_time" - del_state "current_trace_prompt" - log "Fail-safe: closed orphaned Turn $prev_count" -fi +transcript=$(echo "$input" | jq -r '.transcript_path // empty' 2>/dev/null || echo "") + +# Close any active turn from a prior prompt (team in-progress or orphaned) +close_active_turn "$transcript" + +del_states last_trace_id last_trace_span_id inc_state "trace_count" # Generate trace IDs now, create span at Stop (so it has output) -set_state "current_trace_id" "$(generate_uuid | tr -d '-')" -set_state "current_trace_span_id" "$(generate_uuid | tr -d '-' | cut -c1-16)" +set_state "current_trace_id" "$(generate_trace_id)" +set_state "current_trace_span_id" "$(generate_span_id)" set_state "current_trace_start_time" "$(get_timestamp_ms)" -set_state "current_trace_prompt" "$(echo "$input" | jq -r '.prompt // empty' 2>/dev/null | head -c 1000)" +set_state "current_trace_prompt" "$(echo "$input" | jq -r '.prompt // empty' 2>/dev/null | head -c 10000)" # Track transcript position for parsing AI response later -transcript=$(echo "$input" | jq -r '.transcript_path // empty' 2>/dev/null || echo "") if [[ -n "$transcript" && -f "$transcript" ]]; then set_state "trace_start_line" "$(wc -l < "$transcript" | tr -d ' ')" else