From 9b7b0a71aa0ebe4e52726e2b1fa0c05c16624d97 Mon Sep 17 00:00:00 2001 From: gxt Date: Sun, 19 Apr 2026 12:42:12 +0530 Subject: [PATCH 1/5] Show token cost breakdown in check output --- .env.example | 23 - .evalview/badge.json | 6 - .evalview/healing/2026-03-25T11-12-54.json | 72 -- .evalview/healing/2026-03-25T11-13-40.json | 72 -- .evalview/healing/2026-03-25T11-23-25.json | 45 - .evalview/healing/2026-03-28T22-06-19.json | 47 - .evalview/latest-check.html | 1002 -------------------- .evalview/latest-redesign-v2.html | 681 ------------- .evalview/latest-redesign.html | 956 ------------------- .evalview/latest-v3.html | 681 ------------- .evalview/latest-v4.html | 708 -------------- .evalview/latest-v5.html | 689 -------------- .evalview/latest-v6.html | 616 ------------ .evalview/latest-v6b.html | 616 ------------ .evalview/latest-v6c.html | 603 ------------ evalview/commands/check_display.py | 118 +++ tests/test_check_cmd.py | 224 +++++ 17 files changed, 342 insertions(+), 6817 deletions(-) delete mode 100644 .env.example delete mode 100644 .evalview/badge.json delete mode 100644 .evalview/healing/2026-03-25T11-12-54.json delete mode 100644 .evalview/healing/2026-03-25T11-13-40.json delete mode 100644 .evalview/healing/2026-03-25T11-23-25.json delete mode 100644 .evalview/healing/2026-03-28T22-06-19.json delete mode 100644 .evalview/latest-check.html delete mode 100644 .evalview/latest-redesign-v2.html delete mode 100644 .evalview/latest-redesign.html delete mode 100644 .evalview/latest-v3.html delete mode 100644 .evalview/latest-v4.html delete mode 100644 .evalview/latest-v5.html delete mode 100644 .evalview/latest-v6.html delete mode 100644 .evalview/latest-v6b.html delete mode 100644 .evalview/latest-v6c.html diff --git a/.env.example b/.env.example deleted file mode 100644 index 0752640..0000000 --- a/.env.example +++ /dev/null @@ -1,23 +0,0 @@ -# EvalView Environment Configuration -# Copy this file to .env and fill in your values: -# cp .env.example .env - -# OpenAI API Key (required for LLM-as-judge evaluation) -# Get yours at: https://platform.openai.com/api-keys -OPENAI_API_KEY=your-openai-api-key-here - -# Optional: Anthropic API Key (for Claude-based evaluation) -# ANTHROPIC_API_KEY=your-anthropic-api-key-here - -# Optional: Custom model for LLM-as-judge (default: gpt-4o-mini) -# EVALVIEW_JUDGE_MODEL=gpt-4o - -# Optional: Enable debug logging -# DEBUG=1 - -# ============================================ -# Telemetry -# ============================================ - -# Disable anonymous usage analytics (enabled by default) -# EVALVIEW_TELEMETRY_DISABLED=1 diff --git a/.evalview/badge.json b/.evalview/badge.json deleted file mode 100644 index 07ad528..0000000 --- a/.evalview/badge.json +++ /dev/null @@ -1,6 +0,0 @@ -{ - "label": "evalview", - "message": "3 regressions", - "color": "red", - "schemaVersion": 1 -} diff --git a/.evalview/healing/2026-03-25T11-12-54.json b/.evalview/healing/2026-03-25T11-12-54.json deleted file mode 100644 index 3fdc9b9..0000000 --- a/.evalview/healing/2026-03-25T11-12-54.json +++ /dev/null @@ -1,72 +0,0 @@ -{ - "results": [ - { - "test_name": "Service Outage Login Issue", - "original_status": "tools_changed", - "diagnosis": { - "action": "flag_review", - "trigger": "structural_change", - "reason": "tool change detected — review needed", - "root_cause_category": null, - "details": { - "tool_changes": [ - "added: ? -> escalate_to_human" - ] - } - }, - "attempted": false, - "healed": false, - "proposed": false, - "final_status": "tools_changed", - "original_score": 98.0, - "baseline_score": 96.5, - "retry_score": null, - "retry_status": null, - "baseline_model": "gpt-4o-mini", - "actual_model": "gpt-4o-mini", - "variant_saved": null, - "variant_path": null - }, - { - "test_name": "Refund for Damaged Blender", - "original_status": "tools_changed", - "diagnosis": { - "action": "flag_review", - "trigger": "param_change", - "reason": "parameter changes detected — review needed", - "root_cause_category": null, - "details": { - "param_diffs": 3 - } - }, - "attempted": false, - "healed": false, - "proposed": false, - "final_status": "tools_changed", - "original_score": 86.7, - "baseline_score": 89.55, - "retry_score": null, - "retry_status": null, - "baseline_model": "gpt-4o-mini", - "actual_model": "gpt-4o-mini", - "variant_saved": null, - "variant_path": null - } - ], - "total_healed": 0, - "total_proposed": 0, - "total_review": 2, - "total_blocked": 0, - "attempted_count": 0, - "unresolved_count": 2, - "failed_count": 2, - "policy_version": "v1", - "thresholds": { - "min_variant_score": 70.0, - "max_cost_multiplier": 2.0, - "max_latency_multiplier": 3.0, - "max_auto_variants": 3.0 - }, - "model_update": null, - "audit_path": null -} \ No newline at end of file diff --git a/.evalview/healing/2026-03-25T11-13-40.json b/.evalview/healing/2026-03-25T11-13-40.json deleted file mode 100644 index 9a400ee..0000000 --- a/.evalview/healing/2026-03-25T11-13-40.json +++ /dev/null @@ -1,72 +0,0 @@ -{ - "results": [ - { - "test_name": "Service Outage Login Issue", - "original_status": "tools_changed", - "diagnosis": { - "action": "flag_review", - "trigger": "structural_change", - "reason": "tool change detected — review needed", - "root_cause_category": null, - "details": { - "tool_changes": [ - "added: ? -> escalate_to_human" - ] - } - }, - "attempted": false, - "healed": false, - "proposed": false, - "final_status": "tools_changed", - "original_score": 98.5, - "baseline_score": 96.5, - "retry_score": null, - "retry_status": null, - "baseline_model": "gpt-4o-mini", - "actual_model": "gpt-4o-mini", - "variant_saved": null, - "variant_path": null - }, - { - "test_name": "Refund for Damaged Blender", - "original_status": "regression", - "diagnosis": { - "action": "flag_review", - "trigger": "param_change", - "reason": "parameter changes detected — review needed", - "root_cause_category": null, - "details": { - "param_diffs": 3 - } - }, - "attempted": false, - "healed": false, - "proposed": false, - "final_status": "regression", - "original_score": 83.6, - "baseline_score": 89.55, - "retry_score": null, - "retry_status": null, - "baseline_model": "gpt-4o-mini", - "actual_model": "gpt-4o-mini", - "variant_saved": null, - "variant_path": null - } - ], - "total_healed": 0, - "total_proposed": 0, - "total_review": 2, - "total_blocked": 0, - "attempted_count": 0, - "unresolved_count": 2, - "failed_count": 2, - "policy_version": "v1", - "thresholds": { - "min_variant_score": 70.0, - "max_cost_multiplier": 2.0, - "max_latency_multiplier": 3.0, - "max_auto_variants": 3.0 - }, - "model_update": null, - "audit_path": null -} \ No newline at end of file diff --git a/.evalview/healing/2026-03-25T11-23-25.json b/.evalview/healing/2026-03-25T11-23-25.json deleted file mode 100644 index 915780b..0000000 --- a/.evalview/healing/2026-03-25T11-23-25.json +++ /dev/null @@ -1,45 +0,0 @@ -{ - "results": [ - { - "test_name": "Refund for Damaged Blender", - "original_status": "regression", - "diagnosis": { - "action": "flag_review", - "trigger": "param_change", - "reason": "parameter changes detected — review needed", - "root_cause_category": null, - "details": { - "param_diffs": 3 - } - }, - "attempted": false, - "healed": false, - "proposed": false, - "final_status": "regression", - "original_score": 82.35, - "baseline_score": 89.55, - "retry_score": null, - "retry_status": null, - "baseline_model": "gpt-4o-mini", - "actual_model": "gpt-4o-mini", - "variant_saved": null, - "variant_path": null - } - ], - "total_healed": 0, - "total_proposed": 0, - "total_review": 1, - "total_blocked": 0, - "attempted_count": 0, - "unresolved_count": 1, - "failed_count": 1, - "policy_version": "v1", - "thresholds": { - "min_variant_score": 70.0, - "max_cost_multiplier": 2.0, - "max_latency_multiplier": 3.0, - "max_auto_variants": 3.0 - }, - "model_update": null, - "audit_path": null -} \ No newline at end of file diff --git a/.evalview/healing/2026-03-28T22-06-19.json b/.evalview/healing/2026-03-28T22-06-19.json deleted file mode 100644 index b189b8a..0000000 --- a/.evalview/healing/2026-03-28T22-06-19.json +++ /dev/null @@ -1,47 +0,0 @@ -{ - "results": [ - { - "test_name": "Office Sign-In Outage Check", - "original_status": "output_changed", - "diagnosis": { - "action": "propose_variant", - "trigger": "nondeterminism", - "reason": "saved candidate variant auto_heal_a3c6 (score 86.0)", - "root_cause_category": null, - "details": { - "retry_status": "regression", - "named_variant_count_before": 0, - "max_auto_variants": 3 - } - }, - "attempted": true, - "healed": false, - "proposed": true, - "final_status": "regression", - "original_score": 92.0, - "baseline_score": 93.5, - "retry_score": 86.0, - "retry_status": "regression", - "baseline_model": "gpt-4o-mini", - "actual_model": "gpt-4o-mini", - "variant_saved": "auto_heal_a3c6", - "variant_path": ".evalview/golden/Office_Sign-In_Outage_Check.variant_auto_heal_a3c6.golden.json" - } - ], - "total_healed": 0, - "total_proposed": 1, - "total_review": 0, - "total_blocked": 0, - "attempted_count": 1, - "unresolved_count": 1, - "failed_count": 1, - "policy_version": "v1", - "thresholds": { - "min_variant_score": 70.0, - "max_cost_multiplier": 2.0, - "max_latency_multiplier": 3.0, - "max_auto_variants": 3.0 - }, - "model_update": null, - "audit_path": null -} \ No newline at end of file diff --git a/.evalview/latest-check.html b/.evalview/latest-check.html deleted file mode 100644 index 47d4b9d..0000000 --- a/.evalview/latest-check.html +++ /dev/null @@ -1,1002 +0,0 @@ - - - - - -EvalView Check Report - - - - - - - - - -
- -
- ✗ Final Outcome Failing - ✗ 3 Failed3 Tests -
-
- -
-
- - - - - -
- - -
- - - -
-
- - 0.0% passed - (0/3) -
-
- Avg 24.2/100 -
-
- $0 - total -
-
- 281135ms - avg -
-
- Unknown - -
-
- - -
-
-
Health Gauge
-
-
- - - - - 0% -
-
- -
3 failed
-
3 regressions
- -
-
-
- -
-
Score Trends
-
-
- -
- - - - - -
-
Behavior Summary
- - - - - - - - - - - - - - -
BehaviorTotalPassedChangedRegressionsHealed
untagged30030
-
- - - -
- -
-
Baseline Snapshot
-
2026-04-06 13:26
-
Model: Not recorded in snapshot
-
- - -
-
EvalView Judge (openai/gpt-5.4-mini)
-
$0.000305
-
2,489 tokens across 6 judge calls
-
-
-
Token Breakdown
-
in 2,301 / out 188
-
$0.1/M input, $0.4/M output
-
- -
- - - - -
-
Score per Test
-
-
- - - -
-
Execution Cost per Query
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
TestModelTrace CostLatencyScore
implementUnknown$0360030ms20.0
bug-fixUnknown$0324936ms32.5
refactorUnknown$0158438ms20.0
Total$0avg $0.000000 per query
-
- Trace cost comes from the agent execution trace only. Mock or non-metered tools will show $0 even when EvalView used a separate judge or local model during evaluation. - This check also used 6 EvalView judge calls (2489 tokens). -
-
- -
- - -
- -
-
- - implement -
- - 20.0/100 - - ⚡ 360030ms - - 🧠 Unknown -
- -
-
-
- Model: Unknown - - - Baseline: 2026-04-06 13:26 - Baseline model: Not recorded in snapshot -
- -
-
Score Breakdown
-
-
Tools 0.0% × 30%
-
Output 0.0/100 × 50%
-
Sequence Correct × 20%
-
= 20.0/100
-
-
The response does not implement group_by_key(), does not run the __main__ block, and simply repeats the timeout error. It fails to address the user’s request entirely.
-
- -
- QueryImplement the group_by_key() function in stub.py. The docstring and type signature are already there — the body is just pass. Implement it, then run the __main__ block to verify it prints the correctly grouped output. - -
- -
-
Why it failed
-
  • Score 20.0 below minimum 70.0
  • Output quality: 0.0/100
  • Hallucination detected (70% confidence)
  • Tool accuracy: 0.0%
-
-
sequenceDiagram - participant User - participant Agent - participant T0 as error - User->>Agent: Implement the group_by_key f... - Agent-xT0: error - T0-->Agent: OpenCode timed out after 360... - Agent-->>User: OpenCode timed out after 360...
- - -
-
Conversation Turns
-
- -
Turn 1 · error · ⚡ 360030.9ms · 💰 $0
-
Implement the group_by_key() function in stub.py. The docstring and type signature are already there — the body is just pass. Implement it, then run the __main__ block to verify it prints the correctly grouped output. -
-
OpenCode timed out after 360s
- -
-
- -
- 🔮 Hallucination detected · 70% · [Warning] - Tool 'error' failed/returned error, but agent did not acknowledge the failure · openai/gpt-5.4-mini - 🛡 Safe - - -
-
Unsupported claims: [Warning] - Tool 'error' failed/returned error, but agent did not acknowledge the failure -(Confidence 70% below threshold 98% - not blocking)
- - -
-
- -
-
- - bug-fix -
- - 32.5/100 - - ⚡ 324936ms - - 🧠 Unknown -
- -
-
-
- Model: Unknown - - - Baseline: 2026-04-06 13:26 - Baseline model: Not recorded in snapshot -
- -
-
Score Breakdown
-
-
Tools 33.3% × 30%
-
Output 5.0/100 × 50%
-
Sequence Correct × 20%
-
= 32.5/100
-
-
The response is relevant and acknowledges the bug, but it does not actually fix the code or run the file to confirm the output. It only states an intention to inspect the file, so it fails to satisfy the requested task and provides no concrete correction.
-
- -
- QueryThere is a bug in buggy.py. The find_max function uses range(1, len(numbers) - 1) which means it never checks the last element. Fix the bug so the function correctly returns the maximum value including the last element. After fixing, run the file to confirm it prints 9. - -
- -
-
Why it failed
-
  • Score 32.5 below minimum 70.0
  • Output quality: 5.0/100
  • Tool accuracy: 33.3%
-
-
sequenceDiagram - participant User - participant Agent - participant T0 as read_file - User->>Agent: There is a bug in buggy.py. - Agent->>T0: filePath=/Users/hidaibar-mor... - T0-->Agent: path/Users/hidaibar-mor/Down... - Agent-->>User: Ill fix the bug in the find_...
- - -
-
Conversation Turns
-
- -
Turn 1 · read_file · ⚡ 324936.4ms · 💰 $0
-
There is a bug in buggy.py. The find_max function uses range(1, len(numbers) - 1) which means it never checks the last element. Fix the bug so the function correctly returns the maximum value including the last element. After fixing, run the file to confirm it prints 9. -
-
I'll fix the bug in the find_max function in buggy.py. Let me first examine the file to understand the current implementation.
- -
-
- -
- 🔮 No hallucination · No verifiable factual claims found in output. · openai/gpt-5.4-mini - 🛡 Safe - - -
- - - -
-
- -
-
- - refactor -
- - 20.0/100 - - ⚡ 158438ms - - 🧠 Unknown -
- -
-
-
- Model: Unknown - - - Baseline: 2026-04-06 13:26 - Baseline model: Not recorded in snapshot -
- -
-
Score Breakdown
-
-
Tools 0.0% × 30%
-
Output 0.0/100 × 50%
-
Sequence Correct × 20%
-
= 20.0/100
-
-
The response does not answer the query, provide a refactoring, or confirm execution. It only contains a context-size error message and no code, so it fails completeness and relevance entirely.
-
- -
- QueryRefactor the function p() in messy.py. It has poor naming, deeply nested conditionals, and uses type() instead of isinstance(). Rewrite it to be clean and readable while preserving the exact same behaviour: filter out None values, strip whitespace from non-empty strings, and pass through all other types unchanged. Then run the file to confirm it still prints {'name': 'Alice', 'age': 30}. - -
- -
-
Why it failed
-
  • Score 20.0 below minimum 70.0
  • Output quality: 0.0/100
  • Tool accuracy: 0.0%
-
-
◎ Direct response — no tools invoked
- - -
- 🔮 No hallucination · No verifiable factual claims found in output. · openai/gpt-5.4-mini - 🛡 Safe - - -
- - - -
- Response"Context size has been exceeded." -
-
-
- -
- - - -
- -
-
- ⬇ Regression - - - - implement - 96.0 → 20.0 - -76.0 - lexical 2.9% - semantic 0.4% - 70% confidence - -
-
- -
-
Baselineglobread_fileedit_filebash
-
Currenterror
-
- - -
-
Why This Changed
-
- tool_reordered -
-
Tool sequence changed: 'glob' at step 1 was replaced by 'error'
-
Suggested fix: The agent is calling the right tools but in a different order. If the new order is valid, run `evalview snapshot --variant <name>` to accept it as an alternative.
-
- -
-
Baseline Output
Output matches expected behavior: - -- `"fruit"` → `[apple, banana]` -- `"veggie"` → `[carrot]` - -The implementation iterates over each item, skips any missing the key, and appends to the appropriate grou...
-
Current Output
OpenCode timed out after 360s
--- golden
+++ actual
@@ -1,6 +1 @@
-Output matches expected behavior: -
- -
-- `"fruit"` → `[apple, banana]` -
-- `"veggie"` → `[carrot]` -
- -
-The implementation iterates over each item, skips any missing the key, and appends to the appropriate group in the result dict.
+OpenCode timed out after 360s
-
- - -
-
- - Trajectory Comparison -
- -
- - -
-
- -
-
- ⬇ Regression - - - - bug-fix - 100.0 → 32.5 - -67.5 - lexical 33.7% - semantic 35.2% - 99% confidence - -
-
- -
-
Baselineread_fileedit_filebash
-
Currentread_file
-
- - -
-
Why This Changed
-
- tool_removed -
-
Tools 'edit_file', 'bash' were expected but not called
-
Suggested fix: Check if your agent's prompt or logic still triggers 'edit_file'. If the tool was intentionally removed, run `evalview snapshot` to update the baseline.
-
- -
-
Baseline Output
Fixed. The bug was `range(1, len(numbers) - 1)` which excluded the last index. Changed to `range(1, len(numbers))` so all elements are checked. Output confirms `9`.
-
Current Output
I'll fix the bug in the find_max function in buggy.py. Let me first examine the file to understand the current implementation.
--- golden
+++ actual
@@ -1 +1 @@
-Fixed. The bug was `range(1, len(numbers) - 1)` which excluded the last index. Changed to `range(1, len(numbers))` so all elements are checked. Output confirms `9`.
+I'll fix the bug in the find_max function in buggy.py. Let me first examine the file to understand the current implementation.
-
- - -
-
- - Trajectory Comparison -
- -
- - -
-
- -
-
- ⬇ Regression - - - - refactor - 96.5 → 20.0 - -76.5 - lexical 9.9% - semantic 12.0% - 58% confidence - -
-
- -
-
Baselineread_filetodowriteedit_fileedit_filetodowritebashtodowrite
-
CurrentNo tools
-
- - -
-
Why This Changed
-
- tool_removed -
-
Tools 'read_file', 'todowrite', 'edit_file', 'edit_file', 'todowrite', 'bash', 'todowrite' were expected but not called
-
Suggested fix: Check if your agent's prompt or logic still triggers 'read_file'. If the tool was intentionally removed, run `evalview snapshot` to update the baseline.
-
- -
-
Baseline Output
Output confirmed: `{'name': 'Alice', 'age': 30}` - -Here's a summary of what changed in `messy.py`: - -| Before | After | Reason | -|--------|-------|--------| -| `p(d)` | `process_dict(data)` | Descriptive...
-
Current Output
"Context size has been exceeded."
--- golden
+++ actual
@@ -1,11 +1 @@
-Output confirmed: `{'name': 'Alice', 'age': 30}` -
- -
-Here's a summary of what changed in `messy.py`: -
- -
-| Before | After | Reason | -
-|--------|-------|--------| -
-| `p(d)` | `process_dict(data)` | Descriptive names | -
-| `for k in d` | `for key, value in data.items()` | Unpack key/value directly | -
-| `if d[k] != None` | `if value is None: continue` | `is None` is idiomatic; early-continue flattens nesting | -
-| `type(d[k]) == str` | `isinstance(value, str)` | `isinstance` is correct (handles subclasses) | -
-| `if len(d[k]) > 0` / `else: pass` | `if stripped:` | Truthiness check on the already-stripped string; `pass` branch eliminated |
+"Context size has been exceeded."
-
- - -
-
- - Trajectory Comparison -
- -
- - -
-
- -
- - - -
- - -
-
- 2 - steps -
-
- 281135ms - avg latency -
-
- $0 - total cost -
-
- 3 - tests -
-
-
-
-
Step Latencies
-
-
-
-
Step Cost
-
-
-
- -
- - - -
- - - - -
- Built with EvalView | Agent testing & regression detection - - Share - Star - -
-
- - - \ No newline at end of file diff --git a/.evalview/latest-redesign-v2.html b/.evalview/latest-redesign-v2.html deleted file mode 100644 index 6e51fb0..0000000 --- a/.evalview/latest-redesign-v2.html +++ /dev/null @@ -1,681 +0,0 @@ - - - - - -EvalView Check Report - - - - - - - - - -
- -
- - - ✗ 1 Failed - - 5 Tests - -
-
- -
- -
- - - - - -
- - -
- -
-
-
-
Pass Rate
-
- - - - -
4/5
-
-
-
80.0%
-
4 of 5 tests
-
-
-
-
Avg Score
-
- - - - -
79
-
-
-
79.7
-
out of 100
-
-
-
Total Cost
-
$0.017
-
2,835 tokens (verified)
- -
in 2,170out 665
- - -
openai/gpt-4o-mini, openai/claude-3.5-sonnet
- -
-
-
Avg Latency
-
1360ms
-
per test
-
-
- -
-
-
Agent Model
-
openai/gpt-4o-mini, openai/claude-3.5-sonnet
-
5 tests in this run
-
- -
-
Token Usage
-
2,835 tokens
-
in 2,170 / out 665
-
- -
- - - -
-
-
EvalView Judge (gpt-4o-mini)
-
$0.006
-
1,320 tokens across 5 judge calls
-
-
-
Judge Token Breakdown
-
in 1,100 / out 220
-
Separate from agent trace cost
-
-
- - -
-
-
Score per Test
-
-
-
-
Distribution
-
-
-
- -
-
Execution Cost per Query
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
TestModelTrace CostTokensLatencyScore
refund-flowopenai/gpt-4o-mini$0.0034570 tokens1200ms92.5
order-lookupopenai/gpt-4o-mini$0.0021415 tokens800ms88.0
billing-disputeopenai/gpt-4o-mini$0.0058890 tokens2400ms45.0
password-resetopenai/gpt-4o-mini$0.0012260 tokens600ms95.0
vip-escalationopenai/claude-3.5-sonnet$0.0045700 tokens1800ms78.0
Total$0.017avg $0.003400 per query
-
- Trace cost comes from the agent execution trace only. Mock or non-metered tools will show $0 even when EvalView used a separate judge or local model during evaluation. - This check also used 5 EvalView judge calls (1320 tokens). -
-
- -
- - -
- - -
-
- - refund-flow -
- 92.5/100 - 💰 $0.0034 - ⚡ 1200ms - 570 tokens - 🧠 openai/gpt-4o-mini -
- -
-
-
- Model: openai/gpt-4o-mini - in 450 / out 120 tokens$0.0034 - - - -
- -
- QueryProcess refund for order 4812 -
- - -
- Direct response — no tools invoked -
- - - -
- ResponseRefund of $49.99 issued successfully. -
- -
-
- -
-
- - order-lookup -
- 88.0/100 - 💰 $0.0021 - ⚡ 800ms - 415 tokens - 🧠 openai/gpt-4o-mini -
- -
- -
- -
-
- - billing-dispute -
- 45.0/100 - 💰 $0.0058 - ⚡ 2400ms - 890 tokens - 🧠 openai/gpt-4o-mini -
- -
- -
- -
-
- - password-reset -
- 95.0/100 - 💰 $0.0012 - ⚡ 600ms - 260 tokens - 🧠 openai/gpt-4o-mini -
- -
- -
- -
-
- - vip-escalation -
- 78.0/100 - 💰 $0.0045 - ⚡ 1800ms - 700 tokens - 🧠 openai/claude-3.5-sonnet -
- -
- -
- - -
- - -
- -
No diffs yet — run evalview check to compare against a baseline
- -
- - -
- -
No step timing data
- -
- - - -
- - - - -
- Built with EvalView | Agent testing & regression detection - - - Share - - Star - -
-
- - - \ No newline at end of file diff --git a/.evalview/latest-redesign.html b/.evalview/latest-redesign.html deleted file mode 100644 index ea790bb..0000000 --- a/.evalview/latest-redesign.html +++ /dev/null @@ -1,956 +0,0 @@ - - - - - -EvalView Check Report - - - - - - - - - -
- -
- - - ✗ 1 Failed - - 5 Tests - -
-
- -
- -
- - - - - -
- - -
- -
-
-
-
Pass Rate
-
- - - - -
4/5
-
-
-
80.0%
-
4 of 5 tests
-
-
-
-
Avg Score
-
- - - - -
79
-
-
-
79.7
-
out of 100
-
-
-
-
Total Cost
-
-
$0.017
-
- 2,835 tokens (verified) -
openai/gpt-4o-mini, openai/claude-3.5-sonnet -
- -
- in 2,170 - out 665 -
- -
-
-
-
Avg Latency
-
-
1360ms
-
per test
-
-
- -
-
-
Agent Model
-
openai/gpt-4o-mini, openai/claude-3.5-sonnet
-
5 tests in this run
-
- -
-
Token Usage
-
2,835 tokens
-
in 2,170 / out 665
-
- -
- - - -
-
-
EvalView Judge (gpt-4o-mini)
-
- - $0.006 - -
-
- 1,320 tokens across 5 judge calls -
-
-
-
Judge Token Breakdown
-
in 1,100 / out 220
-
Separate from agent trace cost
-
-
- - - -
-
-
Score per Test
-
-
-
-
Distribution
-
-
-
- - -
-
Execution Cost per Query
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
TestModelTrace CostTokensLatencyScore
refund-flowopenai/gpt-4o-mini$0.0034570 tokens1200ms92.5
order-lookupopenai/gpt-4o-mini$0.0021415 tokens800ms88.0
billing-disputeopenai/gpt-4o-mini$0.0058890 tokens2400ms45.0
password-resetopenai/gpt-4o-mini$0.0012260 tokens600ms95.0
vip-escalationopenai/claude-3.5-sonnet$0.0045700 tokens1800ms78.0
Total$0.017avg $0.003400 per query
-
- Trace cost comes from the agent execution trace only. Mock or non-metered tools will show $0 even when EvalView used a separate judge or local model during evaluation. - This check also used 5 EvalView judge calls (1320 tokens). -
-
- - -
- - -
- - -
-
- - refund-flow -
- 92.5/100 - 💰 $0.0034 - ⚡ 1200ms - 570 tokens - 🧠 openai/gpt-4o-mini -
- -
-
-
- Model: openai/gpt-4o-mini - - in 450 / out 120 tokens - $0.0034 - - - - -
- -
- QueryProcess refund for order 4812 -
- - -
- - Direct response — no tools invoked - -
- - - -
- ResponseRefund of $49.99 issued successfully. -
- -
-
- -
-
- - order-lookup -
- 88.0/100 - 💰 $0.0021 - ⚡ 800ms - 415 tokens - 🧠 openai/gpt-4o-mini -
- -
- -
- -
-
- - billing-dispute -
- 45.0/100 - 💰 $0.0058 - ⚡ 2400ms - 890 tokens - 🧠 openai/gpt-4o-mini -
- -
- -
- -
-
- - password-reset -
- 95.0/100 - 💰 $0.0012 - ⚡ 600ms - 260 tokens - 🧠 openai/gpt-4o-mini -
- -
- -
- -
-
- - vip-escalation -
- 78.0/100 - 💰 $0.0045 - ⚡ 1800ms - 700 tokens - 🧠 openai/claude-3.5-sonnet -
- -
- -
- - -
- - -
- -
No diffs yet — run evalview check to compare against a baseline
- -
- - -
- -
No step timing data
- -
- - - - -
- - - - -
- - Built with EvalView - | - Agent testing & regression detection - - - - - Share - - - - Star - - -
-
- - - \ No newline at end of file diff --git a/.evalview/latest-v3.html b/.evalview/latest-v3.html deleted file mode 100644 index 4de07bf..0000000 --- a/.evalview/latest-v3.html +++ /dev/null @@ -1,681 +0,0 @@ - - - - - -EvalView Check Report - - - - - - - - - -
- -
- - - ✗ 1 Failed - - 5 Tests - -
-
- -
- -
- - - - - -
- - -
- -
-
-
-
Pass Rate
-
- - - - -
4/5
-
-
-
80.0%
-
4 of 5 tests
-
-
-
-
Avg Score
-
- - - - -
79
-
-
-
79.7
-
out of 100
-
-
-
Total Cost
-
$0.017
-
2,835 tokens (verified)
- -
in 2,170out 665
- - -
openai/gpt-4o-mini, openai/claude-3.5-sonnet
- -
-
-
Avg Latency
-
1360ms
-
per test
-
-
- -
-
-
Agent Model
-
openai/gpt-4o-mini, openai/claude-3.5-sonnet
-
5 tests in this run
-
- -
-
Token Usage
-
2,835 tokens
-
in 2,170 / out 665
-
- -
- - - -
-
-
EvalView Judge (gpt-4o-mini)
-
$0.006
-
1,320 tokens across 5 judge calls
-
-
-
Judge Token Breakdown
-
in 1,100 / out 220
-
Separate from agent trace cost
-
-
- - -
-
-
Score per Test
-
-
-
-
Distribution
-
-
-
- -
-
Execution Cost per Query
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
TestModelTrace CostTokensLatencyScore
refund-flowopenai/gpt-4o-mini$0.0034570 tokens1200ms92.5
order-lookupopenai/gpt-4o-mini$0.0021415 tokens800ms88.0
billing-disputeopenai/gpt-4o-mini$0.0058890 tokens2400ms45.0
password-resetopenai/gpt-4o-mini$0.0012260 tokens600ms95.0
vip-escalationopenai/claude-3.5-sonnet$0.0045700 tokens1800ms78.0
Total$0.017avg $0.003400 per query
-
- Trace cost comes from the agent execution trace only. Mock or non-metered tools will show $0 even when EvalView used a separate judge or local model during evaluation. - This check also used 5 EvalView judge calls (1320 tokens). -
-
- -
- - -
- - -
-
- - refund-flow -
- 92.5/100 - 💰 $0.0034 - ⚡ 1200ms - 570 tokens - 🧠 openai/gpt-4o-mini -
- -
-
-
- Model: openai/gpt-4o-mini - in 450 / out 120 tokens$0.0034 - - - -
- -
- QueryProcess refund for order 4812 -
- - -
- Direct response — no tools invoked -
- - - -
- ResponseRefund issued. -
- -
-
- -
-
- - order-lookup -
- 88.0/100 - 💰 $0.0021 - ⚡ 800ms - 415 tokens - 🧠 openai/gpt-4o-mini -
- -
- -
- -
-
- - billing-dispute -
- 45.0/100 - 💰 $0.0058 - ⚡ 2400ms - 890 tokens - 🧠 openai/gpt-4o-mini -
- -
- -
- -
-
- - password-reset -
- 95.0/100 - 💰 $0.0012 - ⚡ 600ms - 260 tokens - 🧠 openai/gpt-4o-mini -
- -
- -
- -
-
- - vip-escalation -
- 78.0/100 - 💰 $0.0045 - ⚡ 1800ms - 700 tokens - 🧠 openai/claude-3.5-sonnet -
- -
- -
- - -
- - -
- -
No diffs yet — run evalview check to compare against a baseline
- -
- - -
- -
No step timing data
- -
- - - -
- - - - -
- Built with EvalView | Agent testing & regression detection - - - Share - - Star - -
-
- - - \ No newline at end of file diff --git a/.evalview/latest-v4.html b/.evalview/latest-v4.html deleted file mode 100644 index 7084865..0000000 --- a/.evalview/latest-v4.html +++ /dev/null @@ -1,708 +0,0 @@ - - - - - -EvalView Check Report - - - - - - - - - -
- -
- -
- - - ✗ 1 Failed - - 5 Tests - -
-
- -
- -
- - - - - -
- - -
- - - -
-
-
Pass Rate
-
80.0%
-
4 of 5 tests passing
-
- - - - -
4/5
-
-
-
-
-
Avg Score
-
79.7/100
-
-
-
Total Cost
-
$0.017
-
2,835 tokens
-
-
-
Avg Latency
-
1360ms
-
per test
-
-
-
Model
-
openai/gpt-4o-mini, openai/claude-3.5-sonnet
- -
in 2,170 · out 665
- -
-
-
- - -
-
-
Agent Model
-
openai/gpt-4o-mini, openai/claude-3.5-sonnet
-
5 tests in this run
-
- -
-
Token Usage
-
2,835 tokens
-
in 2,170 / out 665
-
- -
- - - -
-
-
EvalView Judge (gpt-4o-mini)
-
$0.006
-
1,320 tokens across 5 judge calls
-
-
-
Judge Token Breakdown
-
in 1,100 / out 220
-
Separate from agent trace cost
-
-
- - - -
-
-
Score per Test
-
-
-
-
Distribution
-
-
-
- - -
-
Execution Cost per Query
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
TestModelTrace CostTokensLatencyScore
refund-flowopenai/gpt-4o-mini$0.0034570 tokens1200ms92.5
order-lookupopenai/gpt-4o-mini$0.0021415 tokens800ms88.0
billing-disputeopenai/gpt-4o-mini$0.0058890 tokens2400ms45.0
password-resetopenai/gpt-4o-mini$0.0012260 tokens600ms95.0
vip-escalationopenai/claude-3.5-sonnet$0.0045700 tokens1800ms78.0
Total$0.017avg $0.003400 per query
-
- Trace cost comes from the agent execution trace only. Mock or non-metered tools will show $0 even when EvalView used a separate judge or local model during evaluation. - This check also used 5 EvalView judge calls (1320 tokens). -
-
- -
- - -
- - -
-
- - refund-flow -
- 92.5/100 - 💰 $0.0034 - ⚡ 1200ms - 570 tokens - 🧠 openai/gpt-4o-mini -
- -
-
-
- Model: openai/gpt-4o-mini - in 450 / out 120 tokens$0.0034 - - - -
- -
- QueryProcess refund for order 4812 -
- - -
- Direct response — no tools invoked -
- - - -
- ResponseRefund issued. -
- -
-
- -
-
- - order-lookup -
- 88.0/100 - 💰 $0.0021 - ⚡ 800ms - 415 tokens - 🧠 openai/gpt-4o-mini -
- -
- -
- -
-
- - billing-dispute -
- 45.0/100 - 💰 $0.0058 - ⚡ 2400ms - 890 tokens - 🧠 openai/gpt-4o-mini -
- -
- -
- -
-
- - password-reset -
- 95.0/100 - 💰 $0.0012 - ⚡ 600ms - 260 tokens - 🧠 openai/gpt-4o-mini -
- -
- -
- -
-
- - vip-escalation -
- 78.0/100 - 💰 $0.0045 - ⚡ 1800ms - 700 tokens - 🧠 openai/claude-3.5-sonnet -
- -
- -
- - -
- - -
- -
No diffs yet — run evalview check to compare against a baseline
- -
- - -
- -
No step timing data
- -
- - - -
- - - - -
- Built with EvalView | Agent testing & regression detection - - Share - Star - -
-
- - - \ No newline at end of file diff --git a/.evalview/latest-v5.html b/.evalview/latest-v5.html deleted file mode 100644 index 2586400..0000000 --- a/.evalview/latest-v5.html +++ /dev/null @@ -1,689 +0,0 @@ - - - - - -EvalView Check Report - - - - - - - - - -
- -
- ✗ 1 Failed5 Tests -
-
- -
-
- - - - - -
- - -
- - - -
-
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-
80.0%
-
Pass Rate
-
-
-
4 of 5 tests passing
-
- - -
-
-
Avg Score
-
79.7
-
out of 100
-
-
-
Total Cost
-
$0.017
-
2,835 tokens (verified)
-
-
-
Avg Latency
-
1360ms
-
per test
-
-
-
Model
-
openai/gpt-4o-mini, openai/claude-3.5-sonnet
- -
in 2,170 · out 665
- -
-
- - -
-
-
Agent Model
-
openai/gpt-4o-mini, openai/claude-3.5-sonnet
-
5 tests in this run
-
- -
-
Token Usage
-
2,835 tokens
-
in 2,170 / out 665
-
- -
- - -
-
-
EvalView Judge (gpt-4o-mini)
-
$0.006
-
1,320 tokens across 5 judge calls
-
-
-
Judge Token Breakdown
-
in 1,100 / out 220
-
Separate from agent trace cost
-
-
- - - -
-
-
Score per Test
-
-
-
-
Distribution
-
-
-
- - -
-
Execution Cost per Query
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
TestModelTrace CostTokensLatencyScore
refund-flowopenai/gpt-4o-mini$0.0034570 tokens1200ms92.5
order-lookupopenai/gpt-4o-mini$0.0021415 tokens800ms88.0
billing-disputeopenai/gpt-4o-mini$0.0058890 tokens2400ms45.0
password-resetopenai/gpt-4o-mini$0.0012260 tokens600ms95.0
vip-escalationopenai/claude-3.5-sonnet$0.0045700 tokens1800ms78.0
Total$0.017avg $0.003400 per query
-
- Trace cost comes from the agent execution trace only. Mock or non-metered tools will show $0 even when EvalView used a separate judge or local model during evaluation. - This check also used 5 EvalView judge calls (1320 tokens). -
-
- -
- - -
- -
-
- - refund-flow -
- 92.5/100 - 💰 $0.0034 - ⚡ 1200ms - 570 tokens - 🧠 openai/gpt-4o-mini -
- -
-
-
- Model: openai/gpt-4o-mini - in 450 / out 120 tokens$0.0034 - - - -
- -
- QueryProcess refund for order 4812 -
-
◎ Direct response — no tools invoked
- - -
- ResponseRefund issued. -
-
-
- -
-
- - order-lookup -
- 88.0/100 - 💰 $0.0021 - ⚡ 800ms - 415 tokens - 🧠 openai/gpt-4o-mini -
- -
- -
- -
-
- - billing-dispute -
- 45.0/100 - 💰 $0.0058 - ⚡ 2400ms - 890 tokens - 🧠 openai/gpt-4o-mini -
- -
- -
- -
-
- - password-reset -
- 95.0/100 - 💰 $0.0012 - ⚡ 600ms - 260 tokens - 🧠 openai/gpt-4o-mini -
- -
- -
- -
-
- - vip-escalation -
- 78.0/100 - 💰 $0.0045 - ⚡ 1800ms - 700 tokens - 🧠 openai/claude-3.5-sonnet -
- -
- -
- -
- - -
-
No diffs yet — run evalview check to compare against a baseline
-
- - -
-
No step timing data
-
- - - -
- - - - -
- Built with EvalView | Agent testing & regression detection - - Share - Star - -
-
- - - \ No newline at end of file diff --git a/.evalview/latest-v6.html b/.evalview/latest-v6.html deleted file mode 100644 index aee40eb..0000000 --- a/.evalview/latest-v6.html +++ /dev/null @@ -1,616 +0,0 @@ - - - - - -EvalView Check Report - - - - - - - - - -
- -
- ✗ 1 Failed5 Tests -
-
- -
-
- - - - - -
- - -
- - - -
-
-
- - - - -
-
80.0%
-
Pass Rate
-
-
-
4 of 5 tests
-
-
-
-
Avg Score
-
79.7
-
out of 100
-
-
-
Total Cost
-
$0.017
-
2,835 tokens (verified)
-
-
-
Avg Latency
-
1360ms
-
per test
-
-
-
Model
-
openai/gpt-4o-mini, openai/claude-3.5-sonnet
- -
in 2,170 · out 665
- -
-
-
- - -
-
-
Agent Model
-
openai/gpt-4o-mini, openai/claude-3.5-sonnet
-
5 tests in this run
-
- -
-
Token Usage
-
2,835 tokens
-
in 2,170 / out 665
-
- -
- - -
-
-
EvalView Judge (gpt-4o-mini)
-
$0.006
-
1,320 tokens across 5 judge calls
-
-
-
Judge Token Breakdown
-
in 1,100 / out 220
-
Separate from agent trace cost
-
-
- - - -
-
-
Score per Test
-
-
-
-
Distribution
-
-
-
- - -
-
Execution Cost per Query
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
TestModelTrace CostTokensLatencyScore
refund-flowopenai/gpt-4o-mini$0.0034570 tokens1200ms92.5
order-lookupopenai/gpt-4o-mini$0.0021415 tokens800ms88.0
billing-disputeopenai/gpt-4o-mini$0.0058890 tokens2400ms45.0
password-resetopenai/gpt-4o-mini$0.0012260 tokens600ms95.0
vip-escalationopenai/claude-3.5-sonnet$0.0045700 tokens1800ms78.0
Total$0.017avg $0.003400 per query
-
- Trace cost comes from the agent execution trace only. Mock or non-metered tools will show $0 even when EvalView used a separate judge or local model during evaluation. - This check also used 5 EvalView judge calls (1320 tokens). -
-
- -
- - -
- -
-
- - refund-flow -
- 92.5/100 - 💰 $0.0034 - ⚡ 1200ms - 570 tokens - 🧠 openai/gpt-4o-mini -
- -
-
-
- Model: openai/gpt-4o-mini - in 450 / out 120 tokens$0.0034 - - - -
- -
- QueryProcess refund -
-
◎ Direct response — no tools invoked
- - -
- ResponseDone. -
-
-
- -
-
- - order-lookup -
- 88.0/100 - 💰 $0.0021 - ⚡ 800ms - 415 tokens - 🧠 openai/gpt-4o-mini -
- -
- -
- -
-
- - billing-dispute -
- 45.0/100 - 💰 $0.0058 - ⚡ 2400ms - 890 tokens - 🧠 openai/gpt-4o-mini -
- -
- -
- -
-
- - password-reset -
- 95.0/100 - 💰 $0.0012 - ⚡ 600ms - 260 tokens - 🧠 openai/gpt-4o-mini -
- -
- -
- -
-
- - vip-escalation -
- 78.0/100 - 💰 $0.0045 - ⚡ 1800ms - 700 tokens - 🧠 openai/claude-3.5-sonnet -
- -
- -
- -
- - -
-
No diffs yet — run evalview check to compare against a baseline
-
- - -
-
No step timing data
-
- - - -
- - - - -
- Built with EvalView | Agent testing & regression detection - - Share - Star - -
-
- - - \ No newline at end of file diff --git a/.evalview/latest-v6b.html b/.evalview/latest-v6b.html deleted file mode 100644 index e6e050e..0000000 --- a/.evalview/latest-v6b.html +++ /dev/null @@ -1,616 +0,0 @@ - - - - - -EvalView Check Report - - - - - - - - - -
- -
- ✗ 1 Failed5 Tests -
-
- -
-
- - - - - -
- - -
- - - -
-
-
- - - - -
-
80.0%
-
Pass Rate
-
-
-
4 of 5 tests
-
-
-
-
Avg Score
-
79.7
-
out of 100
-
-
-
Total Cost
-
$0.017
-
2,835 tokens (verified)
-
-
-
Avg Latency
-
1360ms
-
per test
-
-
-
Model
-
openai/gpt-4o-mini, openai/claude-3.5-sonnet
- -
in 2,170 · out 665
- -
-
-
- - -
-
-
Agent Model
-
openai/gpt-4o-mini, openai/claude-3.5-sonnet
-
5 tests in this run
-
- -
-
Token Usage
-
2,835 tokens
-
in 2,170 / out 665
-
- -
- - -
-
-
EvalView Judge (gpt-4o-mini)
-
$0.006
-
1,320 tokens across 5 judge calls
-
-
-
Judge Token Breakdown
-
in 1,100 / out 220
-
Separate from agent trace cost
-
-
- - - -
-
-
Score per Test
-
-
-
-
Distribution
-
-
-
- - -
-
Execution Cost per Query
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
TestModelTrace CostTokensLatencyScore
refund-flowopenai/gpt-4o-mini$0.0034570 tokens1200ms92.5
order-lookupopenai/gpt-4o-mini$0.0021415 tokens800ms88.0
billing-disputeopenai/gpt-4o-mini$0.0058890 tokens2400ms45.0
password-resetopenai/gpt-4o-mini$0.0012260 tokens600ms95.0
vip-escalationopenai/claude-3.5-sonnet$0.0045700 tokens1800ms78.0
Total$0.017avg $0.003400 per query
-
- Trace cost comes from the agent execution trace only. Mock or non-metered tools will show $0 even when EvalView used a separate judge or local model during evaluation. - This check also used 5 EvalView judge calls (1320 tokens). -
-
- -
- - -
- -
-
- - refund-flow -
- 92.5/100 - 💰 $0.0034 - ⚡ 1200ms - 570 tokens - 🧠 openai/gpt-4o-mini -
- -
-
-
- Model: openai/gpt-4o-mini - in 450 / out 120 tokens$0.0034 - - - -
- -
- QueryProcess refund -
-
◎ Direct response — no tools invoked
- - -
- ResponseDone. -
-
-
- -
-
- - order-lookup -
- 88.0/100 - 💰 $0.0021 - ⚡ 800ms - 415 tokens - 🧠 openai/gpt-4o-mini -
- -
- -
- -
-
- - billing-dispute -
- 45.0/100 - 💰 $0.0058 - ⚡ 2400ms - 890 tokens - 🧠 openai/gpt-4o-mini -
- -
- -
- -
-
- - password-reset -
- 95.0/100 - 💰 $0.0012 - ⚡ 600ms - 260 tokens - 🧠 openai/gpt-4o-mini -
- -
- -
- -
-
- - vip-escalation -
- 78.0/100 - 💰 $0.0045 - ⚡ 1800ms - 700 tokens - 🧠 openai/claude-3.5-sonnet -
- -
- -
- -
- - -
-
No diffs yet — run evalview check to compare against a baseline
-
- - -
-
No step timing data
-
- - - -
- - - - -
- Built with EvalView | Agent testing & regression detection - - Share - Star - -
-
- - - \ No newline at end of file diff --git a/.evalview/latest-v6c.html b/.evalview/latest-v6c.html deleted file mode 100644 index 5caff22..0000000 --- a/.evalview/latest-v6c.html +++ /dev/null @@ -1,603 +0,0 @@ - - - - - -EvalView Check Report - - - - - - - - - -
- -
- ✗ 1 Failed5 Tests -
-
- -
-
- - - - - -
- - -
- - - -
-
-
- - - - -
-
80.0%
-
Pass Rate
-
-
-
4 of 5 tests
-
-
-
-
Avg Score
-
79.7
-
out of 100
-
-
-
Total Cost
-
$0.017
-
2,835 tokens (verified)
-
-
-
Avg Latency
-
1360ms
-
per test
-
-
-
Model
-
anthropic/claude-sonnet-4-6
- -
in 2,170 · out 665
- -
-
-
- - -
-
-
Agent Model
-
anthropic/claude-sonnet-4-6
-
5 tests in this run
-
- -
-
Token Usage
-
2,835 tokens
-
in 2,170 / out 665
-
- -
- - - - -
-
-
Score per Test
-
-
-
-
Distribution
-
-
-
- - -
-
Execution Cost per Query
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
TestModelTrace CostTokensLatencyScore
refund-flowanthropic/claude-sonnet-4-6$0.0034570 tokens1200ms92.5
order-lookupanthropic/claude-sonnet-4-6$0.0021415 tokens800ms88.0
billing-disputeanthropic/claude-sonnet-4-6$0.0058890 tokens2400ms45.0
password-resetanthropic/claude-sonnet-4-6$0.0012260 tokens600ms95.0
vip-escalationanthropic/claude-sonnet-4-6$0.0045700 tokens1800ms78.0
Total$0.017avg $0.003400 per query
-
- Trace cost comes from the agent execution trace only. Mock or non-metered tools will show $0 even when EvalView used a separate judge or local model during evaluation. - -
-
- -
- - -
- -
-
- - refund-flow -
- 92.5/100 - 💰 $0.0034 - ⚡ 1200ms - 570 tokens - 🧠 anthropic/claude-sonnet-4-6 -
- -
-
-
- Model: anthropic/claude-sonnet-4-6 - in 450 / out 120 tokens$0.0034 - - - -
- -
- QueryProcess refund -
-
◎ Direct response — no tools invoked
- - -
- ResponseDone. -
-
-
- -
-
- - order-lookup -
- 88.0/100 - 💰 $0.0021 - ⚡ 800ms - 415 tokens - 🧠 anthropic/claude-sonnet-4-6 -
- -
- -
- -
-
- - billing-dispute -
- 45.0/100 - 💰 $0.0058 - ⚡ 2400ms - 890 tokens - 🧠 anthropic/claude-sonnet-4-6 -
- -
- -
- -
-
- - password-reset -
- 95.0/100 - 💰 $0.0012 - ⚡ 600ms - 260 tokens - 🧠 anthropic/claude-sonnet-4-6 -
- -
- -
- -
-
- - vip-escalation -
- 78.0/100 - 💰 $0.0045 - ⚡ 1800ms - 700 tokens - 🧠 anthropic/claude-sonnet-4-6 -
- -
- -
- -
- - -
-
No diffs yet — run evalview check to compare against a baseline
-
- - -
-
No step timing data
-
- - - -
- - - - -
- Built with EvalView | Agent testing & regression detection - - Share - Star - -
-
- - - \ No newline at end of file diff --git a/evalview/commands/check_display.py b/evalview/commands/check_display.py index c53fb48..f376c89 100644 --- a/evalview/commands/check_display.py +++ b/evalview/commands/check_display.py @@ -16,6 +16,96 @@ from evalview.core.model_runtime_detector import ModelRuntimeChangeSummary +def _aggregate_token_summary( + results: Optional[List["EvaluationResult"]], + golden_traces: Optional[Dict[str, "GoldenTrace"]] = None, +) -> Optional[Dict[str, Any]]: + """Aggregate token usage and cost across test results. + + Computes total input/output/cached tokens and cost from all results. + Optionally calculates baseline comparison if golden traces are provided. + + Args: + results: List of evaluation results with token data + golden_traces: Optional baseline traces for comparison + + Returns: + Dict with token_usage, total_cost, baseline_token_usage, and token_delta_pct, + or None if no token data is available. + """ + from evalview.core.types import TokenUsage + + if not results: + return None + + total_cost = 0.0 + total_input = 0 + total_output = 0 + total_cached = 0 + has_tokens = False + + for r in results: + try: + total_cost += float(getattr(r.trace.metrics, "total_cost", 0) or 0) + except Exception: + pass + + tu = getattr(getattr(r, "trace", None), "metrics", None) + tu = getattr(tu, "total_tokens", None) + if tu is None: + continue + + has_tokens = True + total_input += int(getattr(tu, "input_tokens", 0) or 0) + total_output += int(getattr(tu, "output_tokens", 0) or 0) + total_cached += int(getattr(tu, "cached_tokens", 0) or 0) + + if not has_tokens: + return None + + current = TokenUsage( + input_tokens=total_input, + output_tokens=total_output, + cached_tokens=total_cached, + ) + + baseline_usage: Optional[TokenUsage] = None + delta_pct: Optional[float] = None + if golden_traces: + base_input = 0 + base_output = 0 + base_cached = 0 + base_has_tokens = False + for r in results: + g = golden_traces.get(r.test_case) + if not g: + continue + gtu = getattr(getattr(g, "trace", None), "metrics", None) + gtu = getattr(gtu, "total_tokens", None) + if gtu is None: + continue + base_has_tokens = True + base_input += int(getattr(gtu, "input_tokens", 0) or 0) + base_output += int(getattr(gtu, "output_tokens", 0) or 0) + base_cached += int(getattr(gtu, "cached_tokens", 0) or 0) + + if base_has_tokens: + baseline_usage = TokenUsage( + input_tokens=base_input, + output_tokens=base_output, + cached_tokens=base_cached, + ) + if baseline_usage.total_tokens > 0: + delta_pct = (current.total_tokens - baseline_usage.total_tokens) / baseline_usage.total_tokens * 100.0 + + return { + "token_usage": current, + "total_cost": float(total_cost), + "baseline_token_usage": baseline_usage, + "token_delta_pct": delta_pct, + } + + def _print_parameter_diffs(tool_diffs: List["ToolDiff"]) -> None: """Print parameter-level differences for tool calls.""" from rich.table import Table @@ -400,6 +490,7 @@ def _display_check_results( behavior_summary = _build_behavior_summary(diffs, test_metadata, healing_summary) if json_output: + token_summary = _aggregate_token_summary(results, golden_traces) output = { "summary": { "total_tests": len(diffs), @@ -466,6 +557,13 @@ def _display_check_results( for name, diff in diffs ], } + if token_summary is not None: + output["summary"]["token_usage"] = token_summary["token_usage"].model_dump() + output["summary"]["total_cost"] = token_summary["total_cost"] + if token_summary.get("baseline_token_usage") is not None: + output["summary"]["baseline_token_usage"] = token_summary["baseline_token_usage"].model_dump() # type: ignore[union-attr] + if token_summary.get("token_delta_pct") is not None: + output["summary"]["token_delta_pct"] = token_summary["token_delta_pct"] if healing_summary: output["healing"] = { "total_healed": healing_summary.total_healed, @@ -547,6 +645,26 @@ def _display_check_results( ) console.print() + token_summary = _aggregate_token_summary(results, golden_traces) + if token_summary is not None: + from rich.table import Table + + tu = token_summary["token_usage"] + delta_pct = token_summary.get("token_delta_pct") + delta_str = "" + if delta_pct is not None: + sign = "+" if delta_pct > 0 else "" + color = "red" if delta_pct > 10 else "yellow" if delta_pct > 0 else "green" + delta_str = f" [{color}]({sign}{delta_pct:.0f}% tokens vs baseline)[/{color}]" + + table = Table(show_header=False, show_lines=False, padding=(0, 1)) + table.add_column("k", style="dim", width=12) + table.add_column("v") + table.add_row("Tokens", f"in {tu.input_tokens:,} out {tu.output_tokens:,} cached {tu.cached_tokens:,} total {tu.total_tokens:,}{delta_str}") + table.add_row("Cost", f"${float(token_summary['total_cost']):.4f}") + console.print(table) + console.print() + # --- Sparkline Trends --- if diffs and drift_tracker is not None: test_trends: Dict[str, List[float]] = {} diff --git a/tests/test_check_cmd.py b/tests/test_check_cmd.py index 2ed5928..1488747 100644 --- a/tests/test_check_cmd.py +++ b/tests/test_check_cmd.py @@ -6,6 +6,8 @@ from click.testing import CliRunner +import pytest + def test_check_dry_run_handles_golden_metadata_objects(monkeypatch, tmp_path): """Dry-run should count baselines by name without hashing metadata models.""" @@ -140,6 +142,228 @@ def test_check_does_not_report_clean_when_execution_failures_occur(monkeypatch, assert "execution failure" in result.output +def test_check_json_includes_token_usage_and_cost_summary(monkeypatch, tmp_path): + from evalview.commands.check_cmd import check + from evalview.core.diff import DiffStatus, TraceDiff + from evalview.core.golden import GoldenMetadata + from evalview.core.project_state import ProjectState + from evalview.core.types import ( + ContainsChecks, + CostEvaluation, + EvaluationResult, + Evaluations, + ExecutionMetrics, + ExecutionTrace, + LatencyEvaluation, + OutputEvaluation, + SequenceEvaluation, + TokenUsage, + ToolEvaluation, + ) + + project = tmp_path + monkeypatch.chdir(project) + + tests_dir = project / "tests" + tests_dir.mkdir() + (tests_dir / "sample.yaml").write_text( + "name: sample\ninput:\n query: hi\nexpected:\n tools: []\nthresholds:\n min_score: 0\n", + encoding="utf-8", + ) + + evalview_dir = project / ".evalview" + evalview_dir.mkdir() + (evalview_dir / "config.yaml").write_text( + "adapter: http\nendpoint: http://example.com\n", + encoding="utf-8", + ) + + now = datetime.now() + sample_result = EvaluationResult( + test_case="sample", + passed=True, + score=90.0, + evaluations=Evaluations( + tool_accuracy=ToolEvaluation(accuracy=1.0), + sequence_correctness=SequenceEvaluation(correct=True, expected_sequence=[], actual_sequence=[]), + output_quality=OutputEvaluation( + score=90.0, + rationale="ok", + contains_checks=ContainsChecks(), + not_contains_checks=ContainsChecks(), + ), + cost=CostEvaluation(total_cost=0.0123, threshold=1.0, passed=True), + latency=LatencyEvaluation(total_latency=10.0, threshold=1000.0, passed=True), + ), + trace=ExecutionTrace( + session_id="s1", + start_time=now, + end_time=now, + steps=[], + final_output="ok", + metrics=ExecutionMetrics( + total_cost=0.0123, + total_latency=10.0, + total_tokens=TokenUsage(input_tokens=100, output_tokens=50, cached_tokens=25), + ), + ), + timestamp=now, + ) + + diff = TraceDiff( + test_name="sample", + has_differences=False, + tool_diffs=[], + output_diff=None, + score_diff=0.0, + latency_diff=0.0, + overall_severity=DiffStatus.PASSED, + ) + + runner = CliRunner() + + monkeypatch.setattr("evalview.commands.check_cmd._cloud_pull", lambda store: None) + monkeypatch.setattr("evalview.commands.check_cmd._load_config_if_exists", lambda: None) + monkeypatch.setattr( + "evalview.core.golden.GoldenStore.list_golden", + lambda self: [GoldenMetadata(test_name="sample", blessed_at="2026-03-13T00:00:00Z", score=95.0)], + ) + + golden_tokens = TokenUsage(input_tokens=80, output_tokens=40, cached_tokens=20) + golden_traces = { + "sample": type( + "_G", + (), + { + "trace": type("_T", (), {"metrics": ExecutionMetrics(total_cost=0.01, total_latency=1.0, total_tokens=golden_tokens)})() + }, + )() + } + + monkeypatch.setattr( + "evalview.commands.check_cmd._execute_check_tests", + lambda test_cases, config, json_output, semantic_diff=False, timeout=30.0, skip_llm_judge=False, budget_tracker=None: ([ + ("sample", diff) + ], [sample_result], None, golden_traces), + ) + from evalview.core.project_state import ProjectStateStore + monkeypatch.setattr(ProjectStateStore, "load", lambda self: ProjectState()) + monkeypatch.setattr(ProjectStateStore, "update_check", lambda self, has_regressions, status="passed": ProjectState()) + + result = runner.invoke(check, ["tests", "--json"]) + assert result.exit_code == 0 + + import json as _json + payload = _json.loads(result.output) + assert "summary" in payload + assert payload["summary"]["token_usage"] == {"input_tokens": 100, "output_tokens": 50, "cached_tokens": 25} + assert payload["summary"]["total_cost"] == pytest.approx(0.0123) + assert payload["summary"]["baseline_token_usage"] == {"input_tokens": 80, "output_tokens": 40, "cached_tokens": 20} + assert payload["summary"]["token_delta_pct"] == pytest.approx(25.0) + + +def test_check_json_omits_token_usage_when_unavailable(monkeypatch, tmp_path): + from evalview.commands.check_cmd import check + from evalview.core.diff import DiffStatus, TraceDiff + from evalview.core.golden import GoldenMetadata + from evalview.core.project_state import ProjectState + from evalview.core.types import ( + ContainsChecks, + CostEvaluation, + EvaluationResult, + Evaluations, + ExecutionMetrics, + ExecutionTrace, + LatencyEvaluation, + OutputEvaluation, + SequenceEvaluation, + ToolEvaluation, + ) + + project = tmp_path + monkeypatch.chdir(project) + + tests_dir = project / "tests" + tests_dir.mkdir() + (tests_dir / "sample.yaml").write_text( + "name: sample\ninput:\n query: hi\nexpected:\n tools: []\nthresholds:\n min_score: 0\n", + encoding="utf-8", + ) + + evalview_dir = project / ".evalview" + evalview_dir.mkdir() + (evalview_dir / "config.yaml").write_text( + "adapter: http\nendpoint: http://example.com\n", + encoding="utf-8", + ) + + now = datetime.now() + sample_result = EvaluationResult( + test_case="sample", + passed=True, + score=90.0, + evaluations=Evaluations( + tool_accuracy=ToolEvaluation(accuracy=1.0), + sequence_correctness=SequenceEvaluation(correct=True, expected_sequence=[], actual_sequence=[]), + output_quality=OutputEvaluation( + score=90.0, + rationale="ok", + contains_checks=ContainsChecks(), + not_contains_checks=ContainsChecks(), + ), + cost=CostEvaluation(total_cost=0.0, threshold=1.0, passed=True), + latency=LatencyEvaluation(total_latency=10.0, threshold=1000.0, passed=True), + ), + trace=ExecutionTrace( + session_id="s1", + start_time=now, + end_time=now, + steps=[], + final_output="ok", + metrics=ExecutionMetrics(total_cost=0.0, total_latency=10.0, total_tokens=None), + ), + timestamp=now, + ) + + diff = TraceDiff( + test_name="sample", + has_differences=False, + tool_diffs=[], + output_diff=None, + score_diff=0.0, + latency_diff=0.0, + overall_severity=DiffStatus.PASSED, + ) + + runner = CliRunner() + + monkeypatch.setattr("evalview.commands.check_cmd._cloud_pull", lambda store: None) + monkeypatch.setattr("evalview.commands.check_cmd._load_config_if_exists", lambda: None) + monkeypatch.setattr( + "evalview.core.golden.GoldenStore.list_golden", + lambda self: [GoldenMetadata(test_name="sample", blessed_at="2026-03-13T00:00:00Z", score=95.0)], + ) + monkeypatch.setattr( + "evalview.commands.check_cmd._execute_check_tests", + lambda test_cases, config, json_output, semantic_diff=False, timeout=30.0, skip_llm_judge=False, budget_tracker=None: ([ + ("sample", diff) + ], [sample_result], None, {}), + ) + from evalview.core.project_state import ProjectStateStore + monkeypatch.setattr(ProjectStateStore, "load", lambda self: ProjectState()) + monkeypatch.setattr(ProjectStateStore, "update_check", lambda self, has_regressions, status="passed": ProjectState()) + + result = runner.invoke(check, ["tests", "--json"]) + assert result.exit_code == 0 + + import json as _json + payload = _json.loads(result.output) + assert "summary" in payload + assert "token_usage" not in payload["summary"] + assert "baseline_token_usage" not in payload["summary"] + assert "token_delta_pct" not in payload["summary"] + + def test_check_uses_active_test_path_when_no_path_is_given(monkeypatch, tmp_path): """Plain `check` should follow the remembered active suite instead of raw tests/.""" from evalview.commands.check_cmd import check From d9782603446e4a06bf3ac4f595c53fa2e3f946e5 Mon Sep 17 00:00:00 2001 From: gxt Date: Sun, 19 Apr 2026 15:22:05 +0530 Subject: [PATCH 2/5] Fix type check error --- evalview/commands/check_display.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/evalview/commands/check_display.py b/evalview/commands/check_display.py index f376c89..aa8fb44 100644 --- a/evalview/commands/check_display.py +++ b/evalview/commands/check_display.py @@ -649,7 +649,7 @@ def _display_check_results( if token_summary is not None: from rich.table import Table - tu = token_summary["token_usage"] + tu = token_summary["token_usage"] # type: ignore[assignment] delta_pct = token_summary.get("token_delta_pct") delta_str = "" if delta_pct is not None: From f5e87cc36f77cfa0ee41d6be37036501ccde50f0 Mon Sep 17 00:00:00 2001 From: gxt Date: Sun, 19 Apr 2026 15:22:05 +0530 Subject: [PATCH 3/5] Fix type check error --- evalview/commands/check_display.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/evalview/commands/check_display.py b/evalview/commands/check_display.py index f376c89..1be9f4b 100644 --- a/evalview/commands/check_display.py +++ b/evalview/commands/check_display.py @@ -98,7 +98,7 @@ def _aggregate_token_summary( if baseline_usage.total_tokens > 0: delta_pct = (current.total_tokens - baseline_usage.total_tokens) / baseline_usage.total_tokens * 100.0 - return { + return { # type: ignore[return-value] "token_usage": current, "total_cost": float(total_cost), "baseline_token_usage": baseline_usage, @@ -649,7 +649,7 @@ def _display_check_results( if token_summary is not None: from rich.table import Table - tu = token_summary["token_usage"] + tu = token_summary["token_usage"] # type: ignore[assignment] delta_pct = token_summary.get("token_delta_pct") delta_str = "" if delta_pct is not None: From 75342fbd761dd1decb5a6c4f81ce9c505a893245 Mon Sep 17 00:00:00 2001 From: gxt Date: Mon, 20 Apr 2026 20:24:15 +0530 Subject: [PATCH 4/5] Restore accidentally deleted files --- .env.example | 23 + .evalview/badge.json | 6 + .evalview/healing/2026-03-25T11-12-54.json | 72 ++ .evalview/healing/2026-03-25T11-13-40.json | 72 ++ .evalview/healing/2026-03-25T11-23-25.json | 45 + .evalview/healing/2026-03-28T22-06-19.json | 47 + .evalview/latest-check.html | 1002 ++++++++++++++++++++ .evalview/latest-redesign-v2.html | 681 +++++++++++++ .evalview/latest-redesign.html | 956 +++++++++++++++++++ .evalview/latest-v3.html | 681 +++++++++++++ .evalview/latest-v4.html | 708 ++++++++++++++ .evalview/latest-v5.html | 689 ++++++++++++++ .evalview/latest-v6.html | 616 ++++++++++++ .evalview/latest-v6b.html | 616 ++++++++++++ .evalview/latest-v6c.html | 603 ++++++++++++ 15 files changed, 6817 insertions(+) create mode 100644 .env.example create mode 100644 .evalview/badge.json create mode 100644 .evalview/healing/2026-03-25T11-12-54.json create mode 100644 .evalview/healing/2026-03-25T11-13-40.json create mode 100644 .evalview/healing/2026-03-25T11-23-25.json create mode 100644 .evalview/healing/2026-03-28T22-06-19.json create mode 100644 .evalview/latest-check.html create mode 100644 .evalview/latest-redesign-v2.html create mode 100644 .evalview/latest-redesign.html create mode 100644 .evalview/latest-v3.html create mode 100644 .evalview/latest-v4.html create mode 100644 .evalview/latest-v5.html create mode 100644 .evalview/latest-v6.html create mode 100644 .evalview/latest-v6b.html create mode 100644 .evalview/latest-v6c.html diff --git a/.env.example b/.env.example new file mode 100644 index 0000000..0752640 --- /dev/null +++ b/.env.example @@ -0,0 +1,23 @@ +# EvalView Environment Configuration +# Copy this file to .env and fill in your values: +# cp .env.example .env + +# OpenAI API Key (required for LLM-as-judge evaluation) +# Get yours at: https://platform.openai.com/api-keys +OPENAI_API_KEY=your-openai-api-key-here + +# Optional: Anthropic API Key (for Claude-based evaluation) +# ANTHROPIC_API_KEY=your-anthropic-api-key-here + +# Optional: Custom model for LLM-as-judge (default: gpt-4o-mini) +# EVALVIEW_JUDGE_MODEL=gpt-4o + +# Optional: Enable debug logging +# DEBUG=1 + +# ============================================ +# Telemetry +# ============================================ + +# Disable anonymous usage analytics (enabled by default) +# EVALVIEW_TELEMETRY_DISABLED=1 diff --git a/.evalview/badge.json b/.evalview/badge.json new file mode 100644 index 0000000..07ad528 --- /dev/null +++ b/.evalview/badge.json @@ -0,0 +1,6 @@ +{ + "label": "evalview", + "message": "3 regressions", + "color": "red", + "schemaVersion": 1 +} diff --git a/.evalview/healing/2026-03-25T11-12-54.json b/.evalview/healing/2026-03-25T11-12-54.json new file mode 100644 index 0000000..3fdc9b9 --- /dev/null +++ b/.evalview/healing/2026-03-25T11-12-54.json @@ -0,0 +1,72 @@ +{ + "results": [ + { + "test_name": "Service Outage Login Issue", + "original_status": "tools_changed", + "diagnosis": { + "action": "flag_review", + "trigger": "structural_change", + "reason": "tool change detected — review needed", + "root_cause_category": null, + "details": { + "tool_changes": [ + "added: ? -> escalate_to_human" + ] + } + }, + "attempted": false, + "healed": false, + "proposed": false, + "final_status": "tools_changed", + "original_score": 98.0, + "baseline_score": 96.5, + "retry_score": null, + "retry_status": null, + "baseline_model": "gpt-4o-mini", + "actual_model": "gpt-4o-mini", + "variant_saved": null, + "variant_path": null + }, + { + "test_name": "Refund for Damaged Blender", + "original_status": "tools_changed", + "diagnosis": { + "action": "flag_review", + "trigger": "param_change", + "reason": "parameter changes detected — review needed", + "root_cause_category": null, + "details": { + "param_diffs": 3 + } + }, + "attempted": false, + "healed": false, + "proposed": false, + "final_status": "tools_changed", + "original_score": 86.7, + "baseline_score": 89.55, + "retry_score": null, + "retry_status": null, + "baseline_model": "gpt-4o-mini", + "actual_model": "gpt-4o-mini", + "variant_saved": null, + "variant_path": null + } + ], + "total_healed": 0, + "total_proposed": 0, + "total_review": 2, + "total_blocked": 0, + "attempted_count": 0, + "unresolved_count": 2, + "failed_count": 2, + "policy_version": "v1", + "thresholds": { + "min_variant_score": 70.0, + "max_cost_multiplier": 2.0, + "max_latency_multiplier": 3.0, + "max_auto_variants": 3.0 + }, + "model_update": null, + "audit_path": null +} \ No newline at end of file diff --git a/.evalview/healing/2026-03-25T11-13-40.json b/.evalview/healing/2026-03-25T11-13-40.json new file mode 100644 index 0000000..9a400ee --- /dev/null +++ b/.evalview/healing/2026-03-25T11-13-40.json @@ -0,0 +1,72 @@ +{ + "results": [ + { + "test_name": "Service Outage Login Issue", + "original_status": "tools_changed", + "diagnosis": { + "action": "flag_review", + "trigger": "structural_change", + "reason": "tool change detected — review needed", + "root_cause_category": null, + "details": { + "tool_changes": [ + "added: ? -> escalate_to_human" + ] + } + }, + "attempted": false, + "healed": false, + "proposed": false, + "final_status": "tools_changed", + "original_score": 98.5, + "baseline_score": 96.5, + "retry_score": null, + "retry_status": null, + "baseline_model": "gpt-4o-mini", + "actual_model": "gpt-4o-mini", + "variant_saved": null, + "variant_path": null + }, + { + "test_name": "Refund for Damaged Blender", + "original_status": "regression", + "diagnosis": { + "action": "flag_review", + "trigger": "param_change", + "reason": "parameter changes detected — review needed", + "root_cause_category": null, + "details": { + "param_diffs": 3 + } + }, + "attempted": false, + "healed": false, + "proposed": false, + "final_status": "regression", + "original_score": 83.6, + "baseline_score": 89.55, + "retry_score": null, + "retry_status": null, + "baseline_model": "gpt-4o-mini", + "actual_model": "gpt-4o-mini", + "variant_saved": null, + "variant_path": null + } + ], + "total_healed": 0, + "total_proposed": 0, + "total_review": 2, + "total_blocked": 0, + "attempted_count": 0, + "unresolved_count": 2, + "failed_count": 2, + "policy_version": "v1", + "thresholds": { + "min_variant_score": 70.0, + "max_cost_multiplier": 2.0, + "max_latency_multiplier": 3.0, + "max_auto_variants": 3.0 + }, + "model_update": null, + "audit_path": null +} \ No newline at end of file diff --git a/.evalview/healing/2026-03-25T11-23-25.json b/.evalview/healing/2026-03-25T11-23-25.json new file mode 100644 index 0000000..915780b --- /dev/null +++ b/.evalview/healing/2026-03-25T11-23-25.json @@ -0,0 +1,45 @@ +{ + "results": [ + { + "test_name": "Refund for Damaged Blender", + "original_status": "regression", + "diagnosis": { + "action": "flag_review", + "trigger": "param_change", + "reason": "parameter changes detected — review needed", + "root_cause_category": null, + "details": { + "param_diffs": 3 + } + }, + "attempted": false, + "healed": false, + "proposed": false, + "final_status": "regression", + "original_score": 82.35, + "baseline_score": 89.55, + "retry_score": null, + "retry_status": null, + "baseline_model": "gpt-4o-mini", + "actual_model": "gpt-4o-mini", + "variant_saved": null, + "variant_path": null + } + ], + "total_healed": 0, + "total_proposed": 0, + "total_review": 1, + "total_blocked": 0, + "attempted_count": 0, + "unresolved_count": 1, + "failed_count": 1, + "policy_version": "v1", + "thresholds": { + "min_variant_score": 70.0, + "max_cost_multiplier": 2.0, + "max_latency_multiplier": 3.0, + "max_auto_variants": 3.0 + }, + "model_update": null, + "audit_path": null +} \ No newline at end of file diff --git a/.evalview/healing/2026-03-28T22-06-19.json b/.evalview/healing/2026-03-28T22-06-19.json new file mode 100644 index 0000000..b189b8a --- /dev/null +++ b/.evalview/healing/2026-03-28T22-06-19.json @@ -0,0 +1,47 @@ +{ + "results": [ + { + "test_name": "Office Sign-In Outage Check", + "original_status": "output_changed", + "diagnosis": { + "action": "propose_variant", + "trigger": "nondeterminism", + "reason": "saved candidate variant auto_heal_a3c6 (score 86.0)", + "root_cause_category": null, + "details": { + "retry_status": "regression", + "named_variant_count_before": 0, + "max_auto_variants": 3 + } + }, + "attempted": true, + "healed": false, + "proposed": true, + "final_status": "regression", + "original_score": 92.0, + "baseline_score": 93.5, + "retry_score": 86.0, + "retry_status": "regression", + "baseline_model": "gpt-4o-mini", + "actual_model": "gpt-4o-mini", + "variant_saved": "auto_heal_a3c6", + "variant_path": ".evalview/golden/Office_Sign-In_Outage_Check.variant_auto_heal_a3c6.golden.json" + } + ], + "total_healed": 0, + "total_proposed": 1, + "total_review": 0, + "total_blocked": 0, + "attempted_count": 1, + "unresolved_count": 1, + "failed_count": 1, + "policy_version": "v1", + "thresholds": { + "min_variant_score": 70.0, + "max_cost_multiplier": 2.0, + "max_latency_multiplier": 3.0, + "max_auto_variants": 3.0 + }, + "model_update": null, + "audit_path": null +} \ No newline at end of file diff --git a/.evalview/latest-check.html b/.evalview/latest-check.html new file mode 100644 index 0000000..47d4b9d --- /dev/null +++ b/.evalview/latest-check.html @@ -0,0 +1,1002 @@ + + + + + +EvalView Check Report + + + + + + + + + +
+ +
+ ✗ Final Outcome Failing + ✗ 3 Failed3 Tests +
+
+ +
+
+ + + + + +
+ + +
+ + + +
+
+ + 0.0% passed + (0/3) +
+
+ Avg 24.2/100 +
+
+ $0 + total +
+
+ 281135ms + avg +
+
+ Unknown + +
+
+ + +
+
+
Health Gauge
+
+
+ + + + + 0% +
+
+ +
3 failed
+
3 regressions
+ +
+
+
+ +
+
Score Trends
+
+
+ +
+ + + + + +
+
Behavior Summary
+ + + + + + + + + + + + + + +
BehaviorTotalPassedChangedRegressionsHealed
untagged30030
+
+ + + +
+ +
+
Baseline Snapshot
+
2026-04-06 13:26
+
Model: Not recorded in snapshot
+
+ + +
+
EvalView Judge (openai/gpt-5.4-mini)
+
$0.000305
+
2,489 tokens across 6 judge calls
+
+
+
Token Breakdown
+
in 2,301 / out 188
+
$0.1/M input, $0.4/M output
+
+ +
+ + + + +
+
Score per Test
+
+
+ + + +
+
Execution Cost per Query
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
TestModelTrace CostLatencyScore
implementUnknown$0360030ms20.0
bug-fixUnknown$0324936ms32.5
refactorUnknown$0158438ms20.0
Total$0avg $0.000000 per query
+
+ Trace cost comes from the agent execution trace only. Mock or non-metered tools will show $0 even when EvalView used a separate judge or local model during evaluation. + This check also used 6 EvalView judge calls (2489 tokens). +
+
+ +
+ + +
+ +
+
+ + implement +
+ + 20.0/100 + + ⚡ 360030ms + + 🧠 Unknown +
+ +
+
+
+ Model: Unknown + + + Baseline: 2026-04-06 13:26 + Baseline model: Not recorded in snapshot +
+ +
+
Score Breakdown
+
+
Tools 0.0% × 30%
+
Output 0.0/100 × 50%
+
Sequence Correct × 20%
+
= 20.0/100
+
+
The response does not implement group_by_key(), does not run the __main__ block, and simply repeats the timeout error. It fails to address the user’s request entirely.
+
+ +
+ QueryImplement the group_by_key() function in stub.py. The docstring and type signature are already there — the body is just pass. Implement it, then run the __main__ block to verify it prints the correctly grouped output. + +
+ +
+
Why it failed
+
  • Score 20.0 below minimum 70.0
  • Output quality: 0.0/100
  • Hallucination detected (70% confidence)
  • Tool accuracy: 0.0%
+
+
sequenceDiagram + participant User + participant Agent + participant T0 as error + User->>Agent: Implement the group_by_key f... + Agent-xT0: error + T0-->Agent: OpenCode timed out after 360... + Agent-->>User: OpenCode timed out after 360...
+ + +
+
Conversation Turns
+
+ +
Turn 1 · error · ⚡ 360030.9ms · 💰 $0
+
Implement the group_by_key() function in stub.py. The docstring and type signature are already there — the body is just pass. Implement it, then run the __main__ block to verify it prints the correctly grouped output. +
+
OpenCode timed out after 360s
+ +
+
+ +
+ 🔮 Hallucination detected · 70% · [Warning] - Tool 'error' failed/returned error, but agent did not acknowledge the failure · openai/gpt-5.4-mini + 🛡 Safe + + +
+
Unsupported claims: [Warning] - Tool 'error' failed/returned error, but agent did not acknowledge the failure +(Confidence 70% below threshold 98% - not blocking)
+ + +
+
+ +
+
+ + bug-fix +
+ + 32.5/100 + + ⚡ 324936ms + + 🧠 Unknown +
+ +
+
+
+ Model: Unknown + + + Baseline: 2026-04-06 13:26 + Baseline model: Not recorded in snapshot +
+ +
+
Score Breakdown
+
+
Tools 33.3% × 30%
+
Output 5.0/100 × 50%
+
Sequence Correct × 20%
+
= 32.5/100
+
+
The response is relevant and acknowledges the bug, but it does not actually fix the code or run the file to confirm the output. It only states an intention to inspect the file, so it fails to satisfy the requested task and provides no concrete correction.
+
+ +
+ QueryThere is a bug in buggy.py. The find_max function uses range(1, len(numbers) - 1) which means it never checks the last element. Fix the bug so the function correctly returns the maximum value including the last element. After fixing, run the file to confirm it prints 9. + +
+ +
+
Why it failed
+
  • Score 32.5 below minimum 70.0
  • Output quality: 5.0/100
  • Tool accuracy: 33.3%
+
+
sequenceDiagram + participant User + participant Agent + participant T0 as read_file + User->>Agent: There is a bug in buggy.py. + Agent->>T0: filePath=/Users/hidaibar-mor... + T0-->Agent: path/Users/hidaibar-mor/Down... + Agent-->>User: Ill fix the bug in the find_...
+ + +
+
Conversation Turns
+
+ +
Turn 1 · read_file · ⚡ 324936.4ms · 💰 $0
+
There is a bug in buggy.py. The find_max function uses range(1, len(numbers) - 1) which means it never checks the last element. Fix the bug so the function correctly returns the maximum value including the last element. After fixing, run the file to confirm it prints 9. +
+
I'll fix the bug in the find_max function in buggy.py. Let me first examine the file to understand the current implementation.
+ +
+
+ +
+ 🔮 No hallucination · No verifiable factual claims found in output. · openai/gpt-5.4-mini + 🛡 Safe + + +
+ + + +
+
+ +
+
+ + refactor +
+ + 20.0/100 + + ⚡ 158438ms + + 🧠 Unknown +
+ +
+
+
+ Model: Unknown + + + Baseline: 2026-04-06 13:26 + Baseline model: Not recorded in snapshot +
+ +
+
Score Breakdown
+
+
Tools 0.0% × 30%
+
Output 0.0/100 × 50%
+
Sequence Correct × 20%
+
= 20.0/100
+
+
The response does not answer the query, provide a refactoring, or confirm execution. It only contains a context-size error message and no code, so it fails completeness and relevance entirely.
+
+ +
+ QueryRefactor the function p() in messy.py. It has poor naming, deeply nested conditionals, and uses type() instead of isinstance(). Rewrite it to be clean and readable while preserving the exact same behaviour: filter out None values, strip whitespace from non-empty strings, and pass through all other types unchanged. Then run the file to confirm it still prints {'name': 'Alice', 'age': 30}. + +
+ +
+
Why it failed
+
  • Score 20.0 below minimum 70.0
  • Output quality: 0.0/100
  • Tool accuracy: 0.0%
+
+
◎ Direct response — no tools invoked
+ + +
+ 🔮 No hallucination · No verifiable factual claims found in output. · openai/gpt-5.4-mini + 🛡 Safe + + +
+ + + +
+ Response"Context size has been exceeded." +
+
+
+ +
+ + + +
+ +
+
+ ⬇ Regression + + + + implement + 96.0 → 20.0 + -76.0 + lexical 2.9% + semantic 0.4% + 70% confidence + +
+
+ +
+
Baselineglobread_fileedit_filebash
+
Currenterror
+
+ + +
+
Why This Changed
+
+ tool_reordered +
+
Tool sequence changed: 'glob' at step 1 was replaced by 'error'
+
Suggested fix: The agent is calling the right tools but in a different order. If the new order is valid, run `evalview snapshot --variant <name>` to accept it as an alternative.
+
+ +
+
Baseline Output
Output matches expected behavior: + +- `"fruit"` → `[apple, banana]` +- `"veggie"` → `[carrot]` + +The implementation iterates over each item, skips any missing the key, and appends to the appropriate grou...
+
Current Output
OpenCode timed out after 360s
--- golden
+++ actual
@@ -1,6 +1 @@
-Output matches expected behavior: +
- +
-- `"fruit"` → `[apple, banana]` +
-- `"veggie"` → `[carrot]` +
- +
-The implementation iterates over each item, skips any missing the key, and appends to the appropriate group in the result dict.
+OpenCode timed out after 360s
+
+ + +
+
+ + Trajectory Comparison +
+ +
+ + +
+
+ +
+
+ ⬇ Regression + + + + bug-fix + 100.0 → 32.5 + -67.5 + lexical 33.7% + semantic 35.2% + 99% confidence + +
+
+ +
+
Baselineread_fileedit_filebash
+
Currentread_file
+
+ + +
+
Why This Changed
+
+ tool_removed +
+
Tools 'edit_file', 'bash' were expected but not called
+
Suggested fix: Check if your agent's prompt or logic still triggers 'edit_file'. If the tool was intentionally removed, run `evalview snapshot` to update the baseline.
+
+ +
+
Baseline Output
Fixed. The bug was `range(1, len(numbers) - 1)` which excluded the last index. Changed to `range(1, len(numbers))` so all elements are checked. Output confirms `9`.
+
Current Output
I'll fix the bug in the find_max function in buggy.py. Let me first examine the file to understand the current implementation.
--- golden
+++ actual
@@ -1 +1 @@
-Fixed. The bug was `range(1, len(numbers) - 1)` which excluded the last index. Changed to `range(1, len(numbers))` so all elements are checked. Output confirms `9`.
+I'll fix the bug in the find_max function in buggy.py. Let me first examine the file to understand the current implementation.
+
+ + +
+
+ + Trajectory Comparison +
+ +
+ + +
+
+ +
+
+ ⬇ Regression + + + + refactor + 96.5 → 20.0 + -76.5 + lexical 9.9% + semantic 12.0% + 58% confidence + +
+
+ +
+
Baselineread_filetodowriteedit_fileedit_filetodowritebashtodowrite
+
CurrentNo tools
+
+ + +
+
Why This Changed
+
+ tool_removed +
+
Tools 'read_file', 'todowrite', 'edit_file', 'edit_file', 'todowrite', 'bash', 'todowrite' were expected but not called
+
Suggested fix: Check if your agent's prompt or logic still triggers 'read_file'. If the tool was intentionally removed, run `evalview snapshot` to update the baseline.
+
+ +
+
Baseline Output
Output confirmed: `{'name': 'Alice', 'age': 30}` + +Here's a summary of what changed in `messy.py`: + +| Before | After | Reason | +|--------|-------|--------| +| `p(d)` | `process_dict(data)` | Descriptive...
+
Current Output
"Context size has been exceeded."
--- golden
+++ actual
@@ -1,11 +1 @@
-Output confirmed: `{'name': 'Alice', 'age': 30}` +
- +
-Here's a summary of what changed in `messy.py`: +
- +
-| Before | After | Reason | +
-|--------|-------|--------| +
-| `p(d)` | `process_dict(data)` | Descriptive names | +
-| `for k in d` | `for key, value in data.items()` | Unpack key/value directly | +
-| `if d[k] != None` | `if value is None: continue` | `is None` is idiomatic; early-continue flattens nesting | +
-| `type(d[k]) == str` | `isinstance(value, str)` | `isinstance` is correct (handles subclasses) | +
-| `if len(d[k]) > 0` / `else: pass` | `if stripped:` | Truthiness check on the already-stripped string; `pass` branch eliminated |
+"Context size has been exceeded."
+
+ + +
+
+ + Trajectory Comparison +
+ +
+ + +
+
+ +
+ + + +
+ + +
+
+ 2 + steps +
+
+ 281135ms + avg latency +
+
+ $0 + total cost +
+
+ 3 + tests +
+
+
+
+
Step Latencies
+
+
+
+
Step Cost
+
+
+
+ +
+ + + +
+ + + + +
+ Built with EvalView | Agent testing & regression detection + + Share + Star + +
+
+ + + \ No newline at end of file diff --git a/.evalview/latest-redesign-v2.html b/.evalview/latest-redesign-v2.html new file mode 100644 index 0000000..6e51fb0 --- /dev/null +++ b/.evalview/latest-redesign-v2.html @@ -0,0 +1,681 @@ + + + + + +EvalView Check Report + + + + + + + + + +
+ +
+ + + ✗ 1 Failed + + 5 Tests + +
+
+ +
+ +
+ + + + + +
+ + +
+ +
+
+
+
Pass Rate
+
+ + + + +
4/5
+
+
+
80.0%
+
4 of 5 tests
+
+
+
+
Avg Score
+
+ + + + +
79
+
+
+
79.7
+
out of 100
+
+
+
Total Cost
+
$0.017
+
2,835 tokens (verified)
+ +
in 2,170out 665
+ + +
openai/gpt-4o-mini, openai/claude-3.5-sonnet
+ +
+
+
Avg Latency
+
1360ms
+
per test
+
+
+ +
+
+
Agent Model
+
openai/gpt-4o-mini, openai/claude-3.5-sonnet
+
5 tests in this run
+
+ +
+
Token Usage
+
2,835 tokens
+
in 2,170 / out 665
+
+ +
+ + + +
+
+
EvalView Judge (gpt-4o-mini)
+
$0.006
+
1,320 tokens across 5 judge calls
+
+
+
Judge Token Breakdown
+
in 1,100 / out 220
+
Separate from agent trace cost
+
+
+ + +
+
+
Score per Test
+
+
+
+
Distribution
+
+
+
+ +
+
Execution Cost per Query
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
TestModelTrace CostTokensLatencyScore
refund-flowopenai/gpt-4o-mini$0.0034570 tokens1200ms92.5
order-lookupopenai/gpt-4o-mini$0.0021415 tokens800ms88.0
billing-disputeopenai/gpt-4o-mini$0.0058890 tokens2400ms45.0
password-resetopenai/gpt-4o-mini$0.0012260 tokens600ms95.0
vip-escalationopenai/claude-3.5-sonnet$0.0045700 tokens1800ms78.0
Total$0.017avg $0.003400 per query
+
+ Trace cost comes from the agent execution trace only. Mock or non-metered tools will show $0 even when EvalView used a separate judge or local model during evaluation. + This check also used 5 EvalView judge calls (1320 tokens). +
+
+ +
+ + +
+ + +
+
+ + refund-flow +
+ 92.5/100 + 💰 $0.0034 + ⚡ 1200ms + 570 tokens + 🧠 openai/gpt-4o-mini +
+ +
+
+
+ Model: openai/gpt-4o-mini + in 450 / out 120 tokens$0.0034 + + + +
+ +
+ QueryProcess refund for order 4812 +
+ + +
+ Direct response — no tools invoked +
+ + + +
+ ResponseRefund of $49.99 issued successfully. +
+ +
+
+ +
+
+ + order-lookup +
+ 88.0/100 + 💰 $0.0021 + ⚡ 800ms + 415 tokens + 🧠 openai/gpt-4o-mini +
+ +
+ +
+ +
+
+ + billing-dispute +
+ 45.0/100 + 💰 $0.0058 + ⚡ 2400ms + 890 tokens + 🧠 openai/gpt-4o-mini +
+ +
+ +
+ +
+
+ + password-reset +
+ 95.0/100 + 💰 $0.0012 + ⚡ 600ms + 260 tokens + 🧠 openai/gpt-4o-mini +
+ +
+ +
+ +
+
+ + vip-escalation +
+ 78.0/100 + 💰 $0.0045 + ⚡ 1800ms + 700 tokens + 🧠 openai/claude-3.5-sonnet +
+ +
+ +
+ + +
+ + +
+ +
No diffs yet — run evalview check to compare against a baseline
+ +
+ + +
+ +
No step timing data
+ +
+ + + +
+ + + + +
+ Built with EvalView | Agent testing & regression detection + + + Share + + Star + +
+
+ + + \ No newline at end of file diff --git a/.evalview/latest-redesign.html b/.evalview/latest-redesign.html new file mode 100644 index 0000000..ea790bb --- /dev/null +++ b/.evalview/latest-redesign.html @@ -0,0 +1,956 @@ + + + + + +EvalView Check Report + + + + + + + + + +
+ +
+ + + ✗ 1 Failed + + 5 Tests + +
+
+ +
+ +
+ + + + + +
+ + +
+ +
+
+
+
Pass Rate
+
+ + + + +
4/5
+
+
+
80.0%
+
4 of 5 tests
+
+
+
+
Avg Score
+
+ + + + +
79
+
+
+
79.7
+
out of 100
+
+
+
+
Total Cost
+
+
$0.017
+
+ 2,835 tokens (verified) +
openai/gpt-4o-mini, openai/claude-3.5-sonnet +
+ +
+ in 2,170 + out 665 +
+ +
+
+
+
Avg Latency
+
+
1360ms
+
per test
+
+
+ +
+
+
Agent Model
+
openai/gpt-4o-mini, openai/claude-3.5-sonnet
+
5 tests in this run
+
+ +
+
Token Usage
+
2,835 tokens
+
in 2,170 / out 665
+
+ +
+ + + +
+
+
EvalView Judge (gpt-4o-mini)
+
+ + $0.006 + +
+
+ 1,320 tokens across 5 judge calls +
+
+
+
Judge Token Breakdown
+
in 1,100 / out 220
+
Separate from agent trace cost
+
+
+ + + +
+
+
Score per Test
+
+
+
+
Distribution
+
+
+
+ + +
+
Execution Cost per Query
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
TestModelTrace CostTokensLatencyScore
refund-flowopenai/gpt-4o-mini$0.0034570 tokens1200ms92.5
order-lookupopenai/gpt-4o-mini$0.0021415 tokens800ms88.0
billing-disputeopenai/gpt-4o-mini$0.0058890 tokens2400ms45.0
password-resetopenai/gpt-4o-mini$0.0012260 tokens600ms95.0
vip-escalationopenai/claude-3.5-sonnet$0.0045700 tokens1800ms78.0
Total$0.017avg $0.003400 per query
+
+ Trace cost comes from the agent execution trace only. Mock or non-metered tools will show $0 even when EvalView used a separate judge or local model during evaluation. + This check also used 5 EvalView judge calls (1320 tokens). +
+
+ + +
+ + +
+ + +
+
+ + refund-flow +
+ 92.5/100 + 💰 $0.0034 + ⚡ 1200ms + 570 tokens + 🧠 openai/gpt-4o-mini +
+ +
+
+
+ Model: openai/gpt-4o-mini + + in 450 / out 120 tokens + $0.0034 + + + + +
+ +
+ QueryProcess refund for order 4812 +
+ + +
+ + Direct response — no tools invoked + +
+ + + +
+ ResponseRefund of $49.99 issued successfully. +
+ +
+
+ +
+
+ + order-lookup +
+ 88.0/100 + 💰 $0.0021 + ⚡ 800ms + 415 tokens + 🧠 openai/gpt-4o-mini +
+ +
+ +
+ +
+
+ + billing-dispute +
+ 45.0/100 + 💰 $0.0058 + ⚡ 2400ms + 890 tokens + 🧠 openai/gpt-4o-mini +
+ +
+ +
+ +
+
+ + password-reset +
+ 95.0/100 + 💰 $0.0012 + ⚡ 600ms + 260 tokens + 🧠 openai/gpt-4o-mini +
+ +
+ +
+ +
+
+ + vip-escalation +
+ 78.0/100 + 💰 $0.0045 + ⚡ 1800ms + 700 tokens + 🧠 openai/claude-3.5-sonnet +
+ +
+ +
+ + +
+ + +
+ +
No diffs yet — run evalview check to compare against a baseline
+ +
+ + +
+ +
No step timing data
+ +
+ + + + +
+ + + + +
+ + Built with EvalView + | + Agent testing & regression detection + + + + + Share + + + + Star + + +
+
+ + + \ No newline at end of file diff --git a/.evalview/latest-v3.html b/.evalview/latest-v3.html new file mode 100644 index 0000000..4de07bf --- /dev/null +++ b/.evalview/latest-v3.html @@ -0,0 +1,681 @@ + + + + + +EvalView Check Report + + + + + + + + + +
+ +
+ + + ✗ 1 Failed + + 5 Tests + +
+
+ +
+ +
+ + + + + +
+ + +
+ +
+
+
+
Pass Rate
+
+ + + + +
4/5
+
+
+
80.0%
+
4 of 5 tests
+
+
+
+
Avg Score
+
+ + + + +
79
+
+
+
79.7
+
out of 100
+
+
+
Total Cost
+
$0.017
+
2,835 tokens (verified)
+ +
in 2,170out 665
+ + +
openai/gpt-4o-mini, openai/claude-3.5-sonnet
+ +
+
+
Avg Latency
+
1360ms
+
per test
+
+
+ +
+
+
Agent Model
+
openai/gpt-4o-mini, openai/claude-3.5-sonnet
+
5 tests in this run
+
+ +
+
Token Usage
+
2,835 tokens
+
in 2,170 / out 665
+
+ +
+ + + +
+
+
EvalView Judge (gpt-4o-mini)
+
$0.006
+
1,320 tokens across 5 judge calls
+
+
+
Judge Token Breakdown
+
in 1,100 / out 220
+
Separate from agent trace cost
+
+
+ + +
+
+
Score per Test
+
+
+
+
Distribution
+
+
+
+ +
+
Execution Cost per Query
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
TestModelTrace CostTokensLatencyScore
refund-flowopenai/gpt-4o-mini$0.0034570 tokens1200ms92.5
order-lookupopenai/gpt-4o-mini$0.0021415 tokens800ms88.0
billing-disputeopenai/gpt-4o-mini$0.0058890 tokens2400ms45.0
password-resetopenai/gpt-4o-mini$0.0012260 tokens600ms95.0
vip-escalationopenai/claude-3.5-sonnet$0.0045700 tokens1800ms78.0
Total$0.017avg $0.003400 per query
+
+ Trace cost comes from the agent execution trace only. Mock or non-metered tools will show $0 even when EvalView used a separate judge or local model during evaluation. + This check also used 5 EvalView judge calls (1320 tokens). +
+
+ +
+ + +
+ + +
+
+ + refund-flow +
+ 92.5/100 + 💰 $0.0034 + ⚡ 1200ms + 570 tokens + 🧠 openai/gpt-4o-mini +
+ +
+
+
+ Model: openai/gpt-4o-mini + in 450 / out 120 tokens$0.0034 + + + +
+ +
+ QueryProcess refund for order 4812 +
+ + +
+ Direct response — no tools invoked +
+ + + +
+ ResponseRefund issued. +
+ +
+
+ +
+
+ + order-lookup +
+ 88.0/100 + 💰 $0.0021 + ⚡ 800ms + 415 tokens + 🧠 openai/gpt-4o-mini +
+ +
+ +
+ +
+
+ + billing-dispute +
+ 45.0/100 + 💰 $0.0058 + ⚡ 2400ms + 890 tokens + 🧠 openai/gpt-4o-mini +
+ +
+ +
+ +
+
+ + password-reset +
+ 95.0/100 + 💰 $0.0012 + ⚡ 600ms + 260 tokens + 🧠 openai/gpt-4o-mini +
+ +
+ +
+ +
+
+ + vip-escalation +
+ 78.0/100 + 💰 $0.0045 + ⚡ 1800ms + 700 tokens + 🧠 openai/claude-3.5-sonnet +
+ +
+ +
+ + +
+ + +
+ +
No diffs yet — run evalview check to compare against a baseline
+ +
+ + +
+ +
No step timing data
+ +
+ + + +
+ + + + +
+ Built with EvalView | Agent testing & regression detection + + + Share + + Star + +
+
+ + + \ No newline at end of file diff --git a/.evalview/latest-v4.html b/.evalview/latest-v4.html new file mode 100644 index 0000000..7084865 --- /dev/null +++ b/.evalview/latest-v4.html @@ -0,0 +1,708 @@ + + + + + +EvalView Check Report + + + + + + + + + +
+ +
+ +
+ + + ✗ 1 Failed + + 5 Tests + +
+
+ +
+ +
+ + + + + +
+ + +
+ + + +
+
+
Pass Rate
+
80.0%
+
4 of 5 tests passing
+
+ + + + +
4/5
+
+
+
+
+
Avg Score
+
79.7/100
+
+
+
Total Cost
+
$0.017
+
2,835 tokens
+
+
+
Avg Latency
+
1360ms
+
per test
+
+
+
Model
+
openai/gpt-4o-mini, openai/claude-3.5-sonnet
+ +
in 2,170 · out 665
+ +
+
+
+ + +
+
+
Agent Model
+
openai/gpt-4o-mini, openai/claude-3.5-sonnet
+
5 tests in this run
+
+ +
+
Token Usage
+
2,835 tokens
+
in 2,170 / out 665
+
+ +
+ + + +
+
+
EvalView Judge (gpt-4o-mini)
+
$0.006
+
1,320 tokens across 5 judge calls
+
+
+
Judge Token Breakdown
+
in 1,100 / out 220
+
Separate from agent trace cost
+
+
+ + + +
+
+
Score per Test
+
+
+
+
Distribution
+
+
+
+ + +
+
Execution Cost per Query
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
TestModelTrace CostTokensLatencyScore
refund-flowopenai/gpt-4o-mini$0.0034570 tokens1200ms92.5
order-lookupopenai/gpt-4o-mini$0.0021415 tokens800ms88.0
billing-disputeopenai/gpt-4o-mini$0.0058890 tokens2400ms45.0
password-resetopenai/gpt-4o-mini$0.0012260 tokens600ms95.0
vip-escalationopenai/claude-3.5-sonnet$0.0045700 tokens1800ms78.0
Total$0.017avg $0.003400 per query
+
+ Trace cost comes from the agent execution trace only. Mock or non-metered tools will show $0 even when EvalView used a separate judge or local model during evaluation. + This check also used 5 EvalView judge calls (1320 tokens). +
+
+ +
+ + +
+ + +
+
+ + refund-flow +
+ 92.5/100 + 💰 $0.0034 + ⚡ 1200ms + 570 tokens + 🧠 openai/gpt-4o-mini +
+ +
+
+
+ Model: openai/gpt-4o-mini + in 450 / out 120 tokens$0.0034 + + + +
+ +
+ QueryProcess refund for order 4812 +
+ + +
+ Direct response — no tools invoked +
+ + + +
+ ResponseRefund issued. +
+ +
+
+ +
+
+ + order-lookup +
+ 88.0/100 + 💰 $0.0021 + ⚡ 800ms + 415 tokens + 🧠 openai/gpt-4o-mini +
+ +
+ +
+ +
+
+ + billing-dispute +
+ 45.0/100 + 💰 $0.0058 + ⚡ 2400ms + 890 tokens + 🧠 openai/gpt-4o-mini +
+ +
+ +
+ +
+
+ + password-reset +
+ 95.0/100 + 💰 $0.0012 + ⚡ 600ms + 260 tokens + 🧠 openai/gpt-4o-mini +
+ +
+ +
+ +
+
+ + vip-escalation +
+ 78.0/100 + 💰 $0.0045 + ⚡ 1800ms + 700 tokens + 🧠 openai/claude-3.5-sonnet +
+ +
+ +
+ + +
+ + +
+ +
No diffs yet — run evalview check to compare against a baseline
+ +
+ + +
+ +
No step timing data
+ +
+ + + +
+ + + + +
+ Built with EvalView | Agent testing & regression detection + + Share + Star + +
+
+ + + \ No newline at end of file diff --git a/.evalview/latest-v5.html b/.evalview/latest-v5.html new file mode 100644 index 0000000..2586400 --- /dev/null +++ b/.evalview/latest-v5.html @@ -0,0 +1,689 @@ + + + + + +EvalView Check Report + + + + + + + + + +
+ +
+ ✗ 1 Failed5 Tests +
+
+ +
+
+ + + + + +
+ + +
+ + + +
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+
80.0%
+
Pass Rate
+
+
+
4 of 5 tests passing
+
+ + +
+
+
Avg Score
+
79.7
+
out of 100
+
+
+
Total Cost
+
$0.017
+
2,835 tokens (verified)
+
+
+
Avg Latency
+
1360ms
+
per test
+
+
+
Model
+
openai/gpt-4o-mini, openai/claude-3.5-sonnet
+ +
in 2,170 · out 665
+ +
+
+ + +
+
+
Agent Model
+
openai/gpt-4o-mini, openai/claude-3.5-sonnet
+
5 tests in this run
+
+ +
+
Token Usage
+
2,835 tokens
+
in 2,170 / out 665
+
+ +
+ + +
+
+
EvalView Judge (gpt-4o-mini)
+
$0.006
+
1,320 tokens across 5 judge calls
+
+
+
Judge Token Breakdown
+
in 1,100 / out 220
+
Separate from agent trace cost
+
+
+ + + +
+
+
Score per Test
+
+
+
+
Distribution
+
+
+
+ + +
+
Execution Cost per Query
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
TestModelTrace CostTokensLatencyScore
refund-flowopenai/gpt-4o-mini$0.0034570 tokens1200ms92.5
order-lookupopenai/gpt-4o-mini$0.0021415 tokens800ms88.0
billing-disputeopenai/gpt-4o-mini$0.0058890 tokens2400ms45.0
password-resetopenai/gpt-4o-mini$0.0012260 tokens600ms95.0
vip-escalationopenai/claude-3.5-sonnet$0.0045700 tokens1800ms78.0
Total$0.017avg $0.003400 per query
+
+ Trace cost comes from the agent execution trace only. Mock or non-metered tools will show $0 even when EvalView used a separate judge or local model during evaluation. + This check also used 5 EvalView judge calls (1320 tokens). +
+
+ +
+ + +
+ +
+
+ + refund-flow +
+ 92.5/100 + 💰 $0.0034 + ⚡ 1200ms + 570 tokens + 🧠 openai/gpt-4o-mini +
+ +
+
+
+ Model: openai/gpt-4o-mini + in 450 / out 120 tokens$0.0034 + + + +
+ +
+ QueryProcess refund for order 4812 +
+
◎ Direct response — no tools invoked
+ + +
+ ResponseRefund issued. +
+
+
+ +
+
+ + order-lookup +
+ 88.0/100 + 💰 $0.0021 + ⚡ 800ms + 415 tokens + 🧠 openai/gpt-4o-mini +
+ +
+ +
+ +
+
+ + billing-dispute +
+ 45.0/100 + 💰 $0.0058 + ⚡ 2400ms + 890 tokens + 🧠 openai/gpt-4o-mini +
+ +
+ +
+ +
+
+ + password-reset +
+ 95.0/100 + 💰 $0.0012 + ⚡ 600ms + 260 tokens + 🧠 openai/gpt-4o-mini +
+ +
+ +
+ +
+
+ + vip-escalation +
+ 78.0/100 + 💰 $0.0045 + ⚡ 1800ms + 700 tokens + 🧠 openai/claude-3.5-sonnet +
+ +
+ +
+ +
+ + +
+
No diffs yet — run evalview check to compare against a baseline
+
+ + +
+
No step timing data
+
+ + + +
+ + + + +
+ Built with EvalView | Agent testing & regression detection + + Share + Star + +
+
+ + + \ No newline at end of file diff --git a/.evalview/latest-v6.html b/.evalview/latest-v6.html new file mode 100644 index 0000000..aee40eb --- /dev/null +++ b/.evalview/latest-v6.html @@ -0,0 +1,616 @@ + + + + + +EvalView Check Report + + + + + + + + + +
+ +
+ ✗ 1 Failed5 Tests +
+
+ +
+
+ + + + + +
+ + +
+ + + +
+
+
+ + + + +
+
80.0%
+
Pass Rate
+
+
+
4 of 5 tests
+
+
+
+
Avg Score
+
79.7
+
out of 100
+
+
+
Total Cost
+
$0.017
+
2,835 tokens (verified)
+
+
+
Avg Latency
+
1360ms
+
per test
+
+
+
Model
+
openai/gpt-4o-mini, openai/claude-3.5-sonnet
+ +
in 2,170 · out 665
+ +
+
+
+ + +
+
+
Agent Model
+
openai/gpt-4o-mini, openai/claude-3.5-sonnet
+
5 tests in this run
+
+ +
+
Token Usage
+
2,835 tokens
+
in 2,170 / out 665
+
+ +
+ + +
+
+
EvalView Judge (gpt-4o-mini)
+
$0.006
+
1,320 tokens across 5 judge calls
+
+
+
Judge Token Breakdown
+
in 1,100 / out 220
+
Separate from agent trace cost
+
+
+ + + +
+
+
Score per Test
+
+
+
+
Distribution
+
+
+
+ + +
+
Execution Cost per Query
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
TestModelTrace CostTokensLatencyScore
refund-flowopenai/gpt-4o-mini$0.0034570 tokens1200ms92.5
order-lookupopenai/gpt-4o-mini$0.0021415 tokens800ms88.0
billing-disputeopenai/gpt-4o-mini$0.0058890 tokens2400ms45.0
password-resetopenai/gpt-4o-mini$0.0012260 tokens600ms95.0
vip-escalationopenai/claude-3.5-sonnet$0.0045700 tokens1800ms78.0
Total$0.017avg $0.003400 per query
+
+ Trace cost comes from the agent execution trace only. Mock or non-metered tools will show $0 even when EvalView used a separate judge or local model during evaluation. + This check also used 5 EvalView judge calls (1320 tokens). +
+
+ +
+ + +
+ +
+
+ + refund-flow +
+ 92.5/100 + 💰 $0.0034 + ⚡ 1200ms + 570 tokens + 🧠 openai/gpt-4o-mini +
+ +
+
+
+ Model: openai/gpt-4o-mini + in 450 / out 120 tokens$0.0034 + + + +
+ +
+ QueryProcess refund +
+
◎ Direct response — no tools invoked
+ + +
+ ResponseDone. +
+
+
+ +
+
+ + order-lookup +
+ 88.0/100 + 💰 $0.0021 + ⚡ 800ms + 415 tokens + 🧠 openai/gpt-4o-mini +
+ +
+ +
+ +
+
+ + billing-dispute +
+ 45.0/100 + 💰 $0.0058 + ⚡ 2400ms + 890 tokens + 🧠 openai/gpt-4o-mini +
+ +
+ +
+ +
+
+ + password-reset +
+ 95.0/100 + 💰 $0.0012 + ⚡ 600ms + 260 tokens + 🧠 openai/gpt-4o-mini +
+ +
+ +
+ +
+
+ + vip-escalation +
+ 78.0/100 + 💰 $0.0045 + ⚡ 1800ms + 700 tokens + 🧠 openai/claude-3.5-sonnet +
+ +
+ +
+ +
+ + +
+
No diffs yet — run evalview check to compare against a baseline
+
+ + +
+
No step timing data
+
+ + + +
+ + + + +
+ Built with EvalView | Agent testing & regression detection + + Share + Star + +
+
+ + + \ No newline at end of file diff --git a/.evalview/latest-v6b.html b/.evalview/latest-v6b.html new file mode 100644 index 0000000..e6e050e --- /dev/null +++ b/.evalview/latest-v6b.html @@ -0,0 +1,616 @@ + + + + + +EvalView Check Report + + + + + + + + + +
+ +
+ ✗ 1 Failed5 Tests +
+
+ +
+
+ + + + + +
+ + +
+ + + +
+
+
+ + + + +
+
80.0%
+
Pass Rate
+
+
+
4 of 5 tests
+
+
+
+
Avg Score
+
79.7
+
out of 100
+
+
+
Total Cost
+
$0.017
+
2,835 tokens (verified)
+
+
+
Avg Latency
+
1360ms
+
per test
+
+
+
Model
+
openai/gpt-4o-mini, openai/claude-3.5-sonnet
+ +
in 2,170 · out 665
+ +
+
+
+ + +
+
+
Agent Model
+
openai/gpt-4o-mini, openai/claude-3.5-sonnet
+
5 tests in this run
+
+ +
+
Token Usage
+
2,835 tokens
+
in 2,170 / out 665
+
+ +
+ + +
+
+
EvalView Judge (gpt-4o-mini)
+
$0.006
+
1,320 tokens across 5 judge calls
+
+
+
Judge Token Breakdown
+
in 1,100 / out 220
+
Separate from agent trace cost
+
+
+ + + +
+
+
Score per Test
+
+
+
+
Distribution
+
+
+
+ + +
+
Execution Cost per Query
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
TestModelTrace CostTokensLatencyScore
refund-flowopenai/gpt-4o-mini$0.0034570 tokens1200ms92.5
order-lookupopenai/gpt-4o-mini$0.0021415 tokens800ms88.0
billing-disputeopenai/gpt-4o-mini$0.0058890 tokens2400ms45.0
password-resetopenai/gpt-4o-mini$0.0012260 tokens600ms95.0
vip-escalationopenai/claude-3.5-sonnet$0.0045700 tokens1800ms78.0
Total$0.017avg $0.003400 per query
+
+ Trace cost comes from the agent execution trace only. Mock or non-metered tools will show $0 even when EvalView used a separate judge or local model during evaluation. + This check also used 5 EvalView judge calls (1320 tokens). +
+
+ +
+ + +
+ +
+
+ + refund-flow +
+ 92.5/100 + 💰 $0.0034 + ⚡ 1200ms + 570 tokens + 🧠 openai/gpt-4o-mini +
+ +
+
+
+ Model: openai/gpt-4o-mini + in 450 / out 120 tokens$0.0034 + + + +
+ +
+ QueryProcess refund +
+
◎ Direct response — no tools invoked
+ + +
+ ResponseDone. +
+
+
+ +
+
+ + order-lookup +
+ 88.0/100 + 💰 $0.0021 + ⚡ 800ms + 415 tokens + 🧠 openai/gpt-4o-mini +
+ +
+ +
+ +
+
+ + billing-dispute +
+ 45.0/100 + 💰 $0.0058 + ⚡ 2400ms + 890 tokens + 🧠 openai/gpt-4o-mini +
+ +
+ +
+ +
+
+ + password-reset +
+ 95.0/100 + 💰 $0.0012 + ⚡ 600ms + 260 tokens + 🧠 openai/gpt-4o-mini +
+ +
+ +
+ +
+
+ + vip-escalation +
+ 78.0/100 + 💰 $0.0045 + ⚡ 1800ms + 700 tokens + 🧠 openai/claude-3.5-sonnet +
+ +
+ +
+ +
+ + +
+
No diffs yet — run evalview check to compare against a baseline
+
+ + +
+
No step timing data
+
+ + + +
+ + + + +
+ Built with EvalView | Agent testing & regression detection + + Share + Star + +
+
+ + + \ No newline at end of file diff --git a/.evalview/latest-v6c.html b/.evalview/latest-v6c.html new file mode 100644 index 0000000..5caff22 --- /dev/null +++ b/.evalview/latest-v6c.html @@ -0,0 +1,603 @@ + + + + + +EvalView Check Report + + + + + + + + + +
+ +
+ ✗ 1 Failed5 Tests +
+
+ +
+
+ + + + + +
+ + +
+ + + +
+
+
+ + + + +
+
80.0%
+
Pass Rate
+
+
+
4 of 5 tests
+
+
+
+
Avg Score
+
79.7
+
out of 100
+
+
+
Total Cost
+
$0.017
+
2,835 tokens (verified)
+
+
+
Avg Latency
+
1360ms
+
per test
+
+
+
Model
+
anthropic/claude-sonnet-4-6
+ +
in 2,170 · out 665
+ +
+
+
+ + +
+
+
Agent Model
+
anthropic/claude-sonnet-4-6
+
5 tests in this run
+
+ +
+
Token Usage
+
2,835 tokens
+
in 2,170 / out 665
+
+ +
+ + + + +
+
+
Score per Test
+
+
+
+
Distribution
+
+
+
+ + +
+
Execution Cost per Query
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
TestModelTrace CostTokensLatencyScore
refund-flowanthropic/claude-sonnet-4-6$0.0034570 tokens1200ms92.5
order-lookupanthropic/claude-sonnet-4-6$0.0021415 tokens800ms88.0
billing-disputeanthropic/claude-sonnet-4-6$0.0058890 tokens2400ms45.0
password-resetanthropic/claude-sonnet-4-6$0.0012260 tokens600ms95.0
vip-escalationanthropic/claude-sonnet-4-6$0.0045700 tokens1800ms78.0
Total$0.017avg $0.003400 per query
+
+ Trace cost comes from the agent execution trace only. Mock or non-metered tools will show $0 even when EvalView used a separate judge or local model during evaluation. + +
+
+ +
+ + +
+ +
+
+ + refund-flow +
+ 92.5/100 + 💰 $0.0034 + ⚡ 1200ms + 570 tokens + 🧠 anthropic/claude-sonnet-4-6 +
+ +
+
+
+ Model: anthropic/claude-sonnet-4-6 + in 450 / out 120 tokens$0.0034 + + + +
+ +
+ QueryProcess refund +
+
◎ Direct response — no tools invoked
+ + +
+ ResponseDone. +
+
+
+ +
+
+ + order-lookup +
+ 88.0/100 + 💰 $0.0021 + ⚡ 800ms + 415 tokens + 🧠 anthropic/claude-sonnet-4-6 +
+ +
+ +
+ +
+
+ + billing-dispute +
+ 45.0/100 + 💰 $0.0058 + ⚡ 2400ms + 890 tokens + 🧠 anthropic/claude-sonnet-4-6 +
+ +
+ +
+ +
+
+ + password-reset +
+ 95.0/100 + 💰 $0.0012 + ⚡ 600ms + 260 tokens + 🧠 anthropic/claude-sonnet-4-6 +
+ +
+ +
+ +
+
+ + vip-escalation +
+ 78.0/100 + 💰 $0.0045 + ⚡ 1800ms + 700 tokens + 🧠 anthropic/claude-sonnet-4-6 +
+ +
+ +
+ +
+ + +
+
No diffs yet — run evalview check to compare against a baseline
+
+ + +
+
No step timing data
+
+ + + +
+ + + + +
+ Built with EvalView | Agent testing & regression detection + + Share + Star + +
+
+ + + \ No newline at end of file From 393741614430ad5429958546a9f533e78bfe6da4 Mon Sep 17 00:00:00 2001 From: gxt Date: Mon, 20 Apr 2026 20:32:44 +0530 Subject: [PATCH 5/5] Fix type error by adding explicit annotation --- evalview/commands/check_display.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/evalview/commands/check_display.py b/evalview/commands/check_display.py index 1be9f4b..ff2b582 100644 --- a/evalview/commands/check_display.py +++ b/evalview/commands/check_display.py @@ -491,7 +491,7 @@ def _display_check_results( if json_output: token_summary = _aggregate_token_summary(results, golden_traces) - output = { + output: Dict[str, Any] = { "summary": { "total_tests": len(diffs), "unchanged": sum(1 for _, d in diffs if d.overall_severity == DiffStatus.PASSED), @@ -561,7 +561,7 @@ def _display_check_results( output["summary"]["token_usage"] = token_summary["token_usage"].model_dump() output["summary"]["total_cost"] = token_summary["total_cost"] if token_summary.get("baseline_token_usage") is not None: - output["summary"]["baseline_token_usage"] = token_summary["baseline_token_usage"].model_dump() # type: ignore[union-attr] + output["summary"]["baseline_token_usage"] = token_summary["baseline_token_usage"].model_dump() if token_summary.get("token_delta_pct") is not None: output["summary"]["token_delta_pct"] = token_summary["token_delta_pct"] if healing_summary: