From 05858680b18cb8bdaa95212896f1380533c98b8e Mon Sep 17 00:00:00 2001 From: gxt Date: Tue, 21 Apr 2026 02:11:50 +0530 Subject: [PATCH] Remove generated artifacts from git tracking --- .evalview/badge.json | 6 - .evalview/healing/2026-03-25T11-12-54.json | 72 -- .evalview/healing/2026-03-25T11-13-40.json | 72 -- .evalview/healing/2026-03-25T11-23-25.json | 45 - .evalview/healing/2026-03-28T22-06-19.json | 47 - .evalview/latest-check.html | 1002 -------------------- .evalview/latest-redesign-v2.html | 681 ------------- .evalview/latest-redesign.html | 956 ------------------- .evalview/latest-v3.html | 681 ------------- .evalview/latest-v4.html | 708 -------------- .evalview/latest-v5.html | 689 -------------- .evalview/latest-v6.html | 616 ------------ .evalview/latest-v6b.html | 616 ------------ .evalview/latest-v6c.html | 603 ------------ .gitignore | 4 + 15 files changed, 4 insertions(+), 6794 deletions(-) delete mode 100644 .evalview/badge.json delete mode 100644 .evalview/healing/2026-03-25T11-12-54.json delete mode 100644 .evalview/healing/2026-03-25T11-13-40.json delete mode 100644 .evalview/healing/2026-03-25T11-23-25.json delete mode 100644 .evalview/healing/2026-03-28T22-06-19.json delete mode 100644 .evalview/latest-check.html delete mode 100644 .evalview/latest-redesign-v2.html delete mode 100644 .evalview/latest-redesign.html delete mode 100644 .evalview/latest-v3.html delete mode 100644 .evalview/latest-v4.html delete mode 100644 .evalview/latest-v5.html delete mode 100644 .evalview/latest-v6.html delete mode 100644 .evalview/latest-v6b.html delete mode 100644 .evalview/latest-v6c.html diff --git a/.evalview/badge.json b/.evalview/badge.json deleted file mode 100644 index 07ad528..0000000 --- a/.evalview/badge.json +++ /dev/null @@ -1,6 +0,0 @@ -{ - "label": "evalview", - "message": "3 regressions", - "color": "red", - "schemaVersion": 1 -} diff --git a/.evalview/healing/2026-03-25T11-12-54.json b/.evalview/healing/2026-03-25T11-12-54.json deleted file mode 100644 index 3fdc9b9..0000000 --- a/.evalview/healing/2026-03-25T11-12-54.json +++ /dev/null @@ -1,72 +0,0 @@ -{ - "results": [ - { - "test_name": "Service Outage Login Issue", - "original_status": "tools_changed", - "diagnosis": { - "action": "flag_review", - "trigger": "structural_change", - "reason": "tool change detected — review needed", - "root_cause_category": null, - "details": { - "tool_changes": [ - "added: ? -> escalate_to_human" - ] - } - }, - "attempted": false, - "healed": false, - "proposed": false, - "final_status": "tools_changed", - "original_score": 98.0, - "baseline_score": 96.5, - "retry_score": null, - "retry_status": null, - "baseline_model": "gpt-4o-mini", - "actual_model": "gpt-4o-mini", - "variant_saved": null, - "variant_path": null - }, - { - "test_name": "Refund for Damaged Blender", - "original_status": "tools_changed", - "diagnosis": { - "action": "flag_review", - "trigger": "param_change", - "reason": "parameter changes detected — review needed", - "root_cause_category": null, - "details": { - "param_diffs": 3 - } - }, - "attempted": false, - "healed": false, - "proposed": false, - "final_status": "tools_changed", - "original_score": 86.7, - "baseline_score": 89.55, - "retry_score": null, - "retry_status": null, - "baseline_model": "gpt-4o-mini", - "actual_model": "gpt-4o-mini", - "variant_saved": null, - "variant_path": null - } - ], - "total_healed": 0, - "total_proposed": 0, - "total_review": 2, - "total_blocked": 0, - "attempted_count": 0, - "unresolved_count": 2, - "failed_count": 2, - "policy_version": "v1", - "thresholds": { - "min_variant_score": 70.0, - "max_cost_multiplier": 2.0, - "max_latency_multiplier": 3.0, - "max_auto_variants": 3.0 - }, - "model_update": null, - "audit_path": null -} \ No newline at end of file diff --git a/.evalview/healing/2026-03-25T11-13-40.json b/.evalview/healing/2026-03-25T11-13-40.json deleted file mode 100644 index 9a400ee..0000000 --- a/.evalview/healing/2026-03-25T11-13-40.json +++ /dev/null @@ -1,72 +0,0 @@ -{ - "results": [ - { - "test_name": "Service Outage Login Issue", - "original_status": "tools_changed", - "diagnosis": { - "action": "flag_review", - "trigger": "structural_change", - "reason": "tool change detected — review needed", - "root_cause_category": null, - "details": { - "tool_changes": [ - "added: ? -> escalate_to_human" - ] - } - }, - "attempted": false, - "healed": false, - "proposed": false, - "final_status": "tools_changed", - "original_score": 98.5, - "baseline_score": 96.5, - "retry_score": null, - "retry_status": null, - "baseline_model": "gpt-4o-mini", - "actual_model": "gpt-4o-mini", - "variant_saved": null, - "variant_path": null - }, - { - "test_name": "Refund for Damaged Blender", - "original_status": "regression", - "diagnosis": { - "action": "flag_review", - "trigger": "param_change", - "reason": "parameter changes detected — review needed", - "root_cause_category": null, - "details": { - "param_diffs": 3 - } - }, - "attempted": false, - "healed": false, - "proposed": false, - "final_status": "regression", - "original_score": 83.6, - "baseline_score": 89.55, - "retry_score": null, - "retry_status": null, - "baseline_model": "gpt-4o-mini", - "actual_model": "gpt-4o-mini", - "variant_saved": null, - "variant_path": null - } - ], - "total_healed": 0, - "total_proposed": 0, - "total_review": 2, - "total_blocked": 0, - "attempted_count": 0, - "unresolved_count": 2, - "failed_count": 2, - "policy_version": "v1", - "thresholds": { - "min_variant_score": 70.0, - "max_cost_multiplier": 2.0, - "max_latency_multiplier": 3.0, - "max_auto_variants": 3.0 - }, - "model_update": null, - "audit_path": null -} \ No newline at end of file diff --git a/.evalview/healing/2026-03-25T11-23-25.json b/.evalview/healing/2026-03-25T11-23-25.json deleted file mode 100644 index 915780b..0000000 --- a/.evalview/healing/2026-03-25T11-23-25.json +++ /dev/null @@ -1,45 +0,0 @@ -{ - "results": [ - { - "test_name": "Refund for Damaged Blender", - "original_status": "regression", - "diagnosis": { - "action": "flag_review", - "trigger": "param_change", - "reason": "parameter changes detected — review needed", - "root_cause_category": null, - "details": { - "param_diffs": 3 - } - }, - "attempted": false, - "healed": false, - "proposed": false, - "final_status": "regression", - "original_score": 82.35, - "baseline_score": 89.55, - "retry_score": null, - "retry_status": null, - "baseline_model": "gpt-4o-mini", - "actual_model": "gpt-4o-mini", - "variant_saved": null, - "variant_path": null - } - ], - "total_healed": 0, - "total_proposed": 0, - "total_review": 1, - "total_blocked": 0, - "attempted_count": 0, - "unresolved_count": 1, - "failed_count": 1, - "policy_version": "v1", - "thresholds": { - "min_variant_score": 70.0, - "max_cost_multiplier": 2.0, - "max_latency_multiplier": 3.0, - "max_auto_variants": 3.0 - }, - "model_update": null, - "audit_path": null -} \ No newline at end of file diff --git a/.evalview/healing/2026-03-28T22-06-19.json b/.evalview/healing/2026-03-28T22-06-19.json deleted file mode 100644 index b189b8a..0000000 --- a/.evalview/healing/2026-03-28T22-06-19.json +++ /dev/null @@ -1,47 +0,0 @@ -{ - "results": [ - { - "test_name": "Office Sign-In Outage Check", - "original_status": "output_changed", - "diagnosis": { - "action": "propose_variant", - "trigger": "nondeterminism", - "reason": "saved candidate variant auto_heal_a3c6 (score 86.0)", - "root_cause_category": null, - "details": { - "retry_status": "regression", - "named_variant_count_before": 0, - "max_auto_variants": 3 - } - }, - "attempted": true, - "healed": false, - "proposed": true, - "final_status": "regression", - "original_score": 92.0, - "baseline_score": 93.5, - "retry_score": 86.0, - "retry_status": "regression", - "baseline_model": "gpt-4o-mini", - "actual_model": "gpt-4o-mini", - "variant_saved": "auto_heal_a3c6", - "variant_path": ".evalview/golden/Office_Sign-In_Outage_Check.variant_auto_heal_a3c6.golden.json" - } - ], - "total_healed": 0, - "total_proposed": 1, - "total_review": 0, - "total_blocked": 0, - "attempted_count": 1, - "unresolved_count": 1, - "failed_count": 1, - "policy_version": "v1", - "thresholds": { - "min_variant_score": 70.0, - "max_cost_multiplier": 2.0, - "max_latency_multiplier": 3.0, - "max_auto_variants": 3.0 - }, - "model_update": null, - "audit_path": null -} \ No newline at end of file diff --git a/.evalview/latest-check.html b/.evalview/latest-check.html deleted file mode 100644 index 47d4b9d..0000000 --- a/.evalview/latest-check.html +++ /dev/null @@ -1,1002 +0,0 @@ - - - - - -EvalView Check Report - - - - - - - - - -
- -
- ✗ Final Outcome Failing - ✗ 3 Failed3 Tests -
-
- -
-
- - - - - -
- - -
- - - -
-
- - 0.0% passed - (0/3) -
-
- Avg 24.2/100 -
-
- $0 - total -
-
- 281135ms - avg -
-
- Unknown - -
-
- - -
-
-
Health Gauge
-
-
- - - - - 0% -
-
- -
3 failed
-
3 regressions
- -
-
-
- -
-
Score Trends
-
-
- -
- - - - - -
-
Behavior Summary
- - - - - - - - - - - - - - -
BehaviorTotalPassedChangedRegressionsHealed
untagged30030
-
- - - -
- -
-
Baseline Snapshot
-
2026-04-06 13:26
-
Model: Not recorded in snapshot
-
- - -
-
EvalView Judge (openai/gpt-5.4-mini)
-
$0.000305
-
2,489 tokens across 6 judge calls
-
-
-
Token Breakdown
-
in 2,301 / out 188
-
$0.1/M input, $0.4/M output
-
- -
- - - - -
-
Score per Test
-
-
- - - -
-
Execution Cost per Query
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
TestModelTrace CostLatencyScore
implementUnknown$0360030ms20.0
bug-fixUnknown$0324936ms32.5
refactorUnknown$0158438ms20.0
Total$0avg $0.000000 per query
-
- Trace cost comes from the agent execution trace only. Mock or non-metered tools will show $0 even when EvalView used a separate judge or local model during evaluation. - This check also used 6 EvalView judge calls (2489 tokens). -
-
- -
- - -
- -
-
- - implement -
- - 20.0/100 - - ⚡ 360030ms - - 🧠 Unknown -
- -
-
-
- Model: Unknown - - - Baseline: 2026-04-06 13:26 - Baseline model: Not recorded in snapshot -
- -
-
Score Breakdown
-
-
Tools 0.0% × 30%
-
Output 0.0/100 × 50%
-
Sequence Correct × 20%
-
= 20.0/100
-
-
The response does not implement group_by_key(), does not run the __main__ block, and simply repeats the timeout error. It fails to address the user’s request entirely.
-
- -
- QueryImplement the group_by_key() function in stub.py. The docstring and type signature are already there — the body is just pass. Implement it, then run the __main__ block to verify it prints the correctly grouped output. - -
- -
-
Why it failed
-
  • Score 20.0 below minimum 70.0
  • Output quality: 0.0/100
  • Hallucination detected (70% confidence)
  • Tool accuracy: 0.0%
-
-
sequenceDiagram - participant User - participant Agent - participant T0 as error - User->>Agent: Implement the group_by_key f... - Agent-xT0: error - T0-->Agent: OpenCode timed out after 360... - Agent-->>User: OpenCode timed out after 360...
- - -
-
Conversation Turns
-
- -
Turn 1 · error · ⚡ 360030.9ms · 💰 $0
-
Implement the group_by_key() function in stub.py. The docstring and type signature are already there — the body is just pass. Implement it, then run the __main__ block to verify it prints the correctly grouped output. -
-
OpenCode timed out after 360s
- -
-
- -
- 🔮 Hallucination detected · 70% · [Warning] - Tool 'error' failed/returned error, but agent did not acknowledge the failure · openai/gpt-5.4-mini - 🛡 Safe - - -
-
Unsupported claims: [Warning] - Tool 'error' failed/returned error, but agent did not acknowledge the failure -(Confidence 70% below threshold 98% - not blocking)
- - -
-
- -
-
- - bug-fix -
- - 32.5/100 - - ⚡ 324936ms - - 🧠 Unknown -
- -
-
-
- Model: Unknown - - - Baseline: 2026-04-06 13:26 - Baseline model: Not recorded in snapshot -
- -
-
Score Breakdown
-
-
Tools 33.3% × 30%
-
Output 5.0/100 × 50%
-
Sequence Correct × 20%
-
= 32.5/100
-
-
The response is relevant and acknowledges the bug, but it does not actually fix the code or run the file to confirm the output. It only states an intention to inspect the file, so it fails to satisfy the requested task and provides no concrete correction.
-
- -
- QueryThere is a bug in buggy.py. The find_max function uses range(1, len(numbers) - 1) which means it never checks the last element. Fix the bug so the function correctly returns the maximum value including the last element. After fixing, run the file to confirm it prints 9. - -
- -
-
Why it failed
-
  • Score 32.5 below minimum 70.0
  • Output quality: 5.0/100
  • Tool accuracy: 33.3%
-
-
sequenceDiagram - participant User - participant Agent - participant T0 as read_file - User->>Agent: There is a bug in buggy.py. - Agent->>T0: filePath=/Users/hidaibar-mor... - T0-->Agent: path/Users/hidaibar-mor/Down... - Agent-->>User: Ill fix the bug in the find_...
- - -
-
Conversation Turns
-
- -
Turn 1 · read_file · ⚡ 324936.4ms · 💰 $0
-
There is a bug in buggy.py. The find_max function uses range(1, len(numbers) - 1) which means it never checks the last element. Fix the bug so the function correctly returns the maximum value including the last element. After fixing, run the file to confirm it prints 9. -
-
I'll fix the bug in the find_max function in buggy.py. Let me first examine the file to understand the current implementation.
- -
-
- -
- 🔮 No hallucination · No verifiable factual claims found in output. · openai/gpt-5.4-mini - 🛡 Safe - - -
- - - -
-
- -
-
- - refactor -
- - 20.0/100 - - ⚡ 158438ms - - 🧠 Unknown -
- -
-
-
- Model: Unknown - - - Baseline: 2026-04-06 13:26 - Baseline model: Not recorded in snapshot -
- -
-
Score Breakdown
-
-
Tools 0.0% × 30%
-
Output 0.0/100 × 50%
-
Sequence Correct × 20%
-
= 20.0/100
-
-
The response does not answer the query, provide a refactoring, or confirm execution. It only contains a context-size error message and no code, so it fails completeness and relevance entirely.
-
- -
- QueryRefactor the function p() in messy.py. It has poor naming, deeply nested conditionals, and uses type() instead of isinstance(). Rewrite it to be clean and readable while preserving the exact same behaviour: filter out None values, strip whitespace from non-empty strings, and pass through all other types unchanged. Then run the file to confirm it still prints {'name': 'Alice', 'age': 30}. - -
- -
-
Why it failed
-
  • Score 20.0 below minimum 70.0
  • Output quality: 0.0/100
  • Tool accuracy: 0.0%
-
-
◎ Direct response — no tools invoked
- - -
- 🔮 No hallucination · No verifiable factual claims found in output. · openai/gpt-5.4-mini - 🛡 Safe - - -
- - - -
- Response"Context size has been exceeded." -
-
-
- -
- - - -
- -
-
- ⬇ Regression - - - - implement - 96.0 → 20.0 - -76.0 - lexical 2.9% - semantic 0.4% - 70% confidence - -
-
- -
-
Baselineglobread_fileedit_filebash
-
Currenterror
-
- - -
-
Why This Changed
-
- tool_reordered -
-
Tool sequence changed: 'glob' at step 1 was replaced by 'error'
-
Suggested fix: The agent is calling the right tools but in a different order. If the new order is valid, run `evalview snapshot --variant <name>` to accept it as an alternative.
-
- -
-
Baseline Output
Output matches expected behavior: - -- `"fruit"` → `[apple, banana]` -- `"veggie"` → `[carrot]` - -The implementation iterates over each item, skips any missing the key, and appends to the appropriate grou...
-
Current Output
OpenCode timed out after 360s
--- golden
+++ actual
@@ -1,6 +1 @@
-Output matches expected behavior: -
- -
-- `"fruit"` → `[apple, banana]` -
-- `"veggie"` → `[carrot]` -
- -
-The implementation iterates over each item, skips any missing the key, and appends to the appropriate group in the result dict.
+OpenCode timed out after 360s
-
- - -
-
- - Trajectory Comparison -
- -
- - -
-
- -
-
- ⬇ Regression - - - - bug-fix - 100.0 → 32.5 - -67.5 - lexical 33.7% - semantic 35.2% - 99% confidence - -
-
- -
-
Baselineread_fileedit_filebash
-
Currentread_file
-
- - -
-
Why This Changed
-
- tool_removed -
-
Tools 'edit_file', 'bash' were expected but not called
-
Suggested fix: Check if your agent's prompt or logic still triggers 'edit_file'. If the tool was intentionally removed, run `evalview snapshot` to update the baseline.
-
- -
-
Baseline Output
Fixed. The bug was `range(1, len(numbers) - 1)` which excluded the last index. Changed to `range(1, len(numbers))` so all elements are checked. Output confirms `9`.
-
Current Output
I'll fix the bug in the find_max function in buggy.py. Let me first examine the file to understand the current implementation.
--- golden
+++ actual
@@ -1 +1 @@
-Fixed. The bug was `range(1, len(numbers) - 1)` which excluded the last index. Changed to `range(1, len(numbers))` so all elements are checked. Output confirms `9`.
+I'll fix the bug in the find_max function in buggy.py. Let me first examine the file to understand the current implementation.
-
- - -
-
- - Trajectory Comparison -
- -
- - -
-
- -
-
- ⬇ Regression - - - - refactor - 96.5 → 20.0 - -76.5 - lexical 9.9% - semantic 12.0% - 58% confidence - -
-
- -
-
Baselineread_filetodowriteedit_fileedit_filetodowritebashtodowrite
-
CurrentNo tools
-
- - -
-
Why This Changed
-
- tool_removed -
-
Tools 'read_file', 'todowrite', 'edit_file', 'edit_file', 'todowrite', 'bash', 'todowrite' were expected but not called
-
Suggested fix: Check if your agent's prompt or logic still triggers 'read_file'. If the tool was intentionally removed, run `evalview snapshot` to update the baseline.
-
- -
-
Baseline Output
Output confirmed: `{'name': 'Alice', 'age': 30}` - -Here's a summary of what changed in `messy.py`: - -| Before | After | Reason | -|--------|-------|--------| -| `p(d)` | `process_dict(data)` | Descriptive...
-
Current Output
"Context size has been exceeded."
--- golden
+++ actual
@@ -1,11 +1 @@
-Output confirmed: `{'name': 'Alice', 'age': 30}` -
- -
-Here's a summary of what changed in `messy.py`: -
- -
-| Before | After | Reason | -
-|--------|-------|--------| -
-| `p(d)` | `process_dict(data)` | Descriptive names | -
-| `for k in d` | `for key, value in data.items()` | Unpack key/value directly | -
-| `if d[k] != None` | `if value is None: continue` | `is None` is idiomatic; early-continue flattens nesting | -
-| `type(d[k]) == str` | `isinstance(value, str)` | `isinstance` is correct (handles subclasses) | -
-| `if len(d[k]) > 0` / `else: pass` | `if stripped:` | Truthiness check on the already-stripped string; `pass` branch eliminated |
+"Context size has been exceeded."
-
- - -
-
- - Trajectory Comparison -
- -
- - -
-
- -
- - - -
- - -
-
- 2 - steps -
-
- 281135ms - avg latency -
-
- $0 - total cost -
-
- 3 - tests -
-
-
-
-
Step Latencies
-
-
-
-
Step Cost
-
-
-
- -
- - - -
- - - - -
- Built with EvalView | Agent testing & regression detection - - Share - Star - -
-
- - - \ No newline at end of file diff --git a/.evalview/latest-redesign-v2.html b/.evalview/latest-redesign-v2.html deleted file mode 100644 index 6e51fb0..0000000 --- a/.evalview/latest-redesign-v2.html +++ /dev/null @@ -1,681 +0,0 @@ - - - - - -EvalView Check Report - - - - - - - - - -
- -
- - - ✗ 1 Failed - - 5 Tests - -
-
- -
- -
- - - - - -
- - -
- -
-
-
-
Pass Rate
-
- - - - -
4/5
-
-
-
80.0%
-
4 of 5 tests
-
-
-
-
Avg Score
-
- - - - -
79
-
-
-
79.7
-
out of 100
-
-
-
Total Cost
-
$0.017
-
2,835 tokens (verified)
- -
in 2,170out 665
- - -
openai/gpt-4o-mini, openai/claude-3.5-sonnet
- -
-
-
Avg Latency
-
1360ms
-
per test
-
-
- -
-
-
Agent Model
-
openai/gpt-4o-mini, openai/claude-3.5-sonnet
-
5 tests in this run
-
- -
-
Token Usage
-
2,835 tokens
-
in 2,170 / out 665
-
- -
- - - -
-
-
EvalView Judge (gpt-4o-mini)
-
$0.006
-
1,320 tokens across 5 judge calls
-
-
-
Judge Token Breakdown
-
in 1,100 / out 220
-
Separate from agent trace cost
-
-
- - -
-
-
Score per Test
-
-
-
-
Distribution
-
-
-
- -
-
Execution Cost per Query
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
TestModelTrace CostTokensLatencyScore
refund-flowopenai/gpt-4o-mini$0.0034570 tokens1200ms92.5
order-lookupopenai/gpt-4o-mini$0.0021415 tokens800ms88.0
billing-disputeopenai/gpt-4o-mini$0.0058890 tokens2400ms45.0
password-resetopenai/gpt-4o-mini$0.0012260 tokens600ms95.0
vip-escalationopenai/claude-3.5-sonnet$0.0045700 tokens1800ms78.0
Total$0.017avg $0.003400 per query
-
- Trace cost comes from the agent execution trace only. Mock or non-metered tools will show $0 even when EvalView used a separate judge or local model during evaluation. - This check also used 5 EvalView judge calls (1320 tokens). -
-
- -
- - -
- - -
-
- - refund-flow -
- 92.5/100 - 💰 $0.0034 - ⚡ 1200ms - 570 tokens - 🧠 openai/gpt-4o-mini -
- -
-
-
- Model: openai/gpt-4o-mini - in 450 / out 120 tokens$0.0034 - - - -
- -
- QueryProcess refund for order 4812 -
- - -
- Direct response — no tools invoked -
- - - -
- ResponseRefund of $49.99 issued successfully. -
- -
-
- -
-
- - order-lookup -
- 88.0/100 - 💰 $0.0021 - ⚡ 800ms - 415 tokens - 🧠 openai/gpt-4o-mini -
- -
- -
- -
-
- - billing-dispute -
- 45.0/100 - 💰 $0.0058 - ⚡ 2400ms - 890 tokens - 🧠 openai/gpt-4o-mini -
- -
- -
- -
-
- - password-reset -
- 95.0/100 - 💰 $0.0012 - ⚡ 600ms - 260 tokens - 🧠 openai/gpt-4o-mini -
- -
- -
- -
-
- - vip-escalation -
- 78.0/100 - 💰 $0.0045 - ⚡ 1800ms - 700 tokens - 🧠 openai/claude-3.5-sonnet -
- -
- -
- - -
- - -
- -
No diffs yet — run evalview check to compare against a baseline
- -
- - -
- -
No step timing data
- -
- - - -
- - - - -
- Built with EvalView | Agent testing & regression detection - - - Share - - Star - -
-
- - - \ No newline at end of file diff --git a/.evalview/latest-redesign.html b/.evalview/latest-redesign.html deleted file mode 100644 index ea790bb..0000000 --- a/.evalview/latest-redesign.html +++ /dev/null @@ -1,956 +0,0 @@ - - - - - -EvalView Check Report - - - - - - - - - -
- -
- - - ✗ 1 Failed - - 5 Tests - -
-
- -
- -
- - - - - -
- - -
- -
-
-
-
Pass Rate
-
- - - - -
4/5
-
-
-
80.0%
-
4 of 5 tests
-
-
-
-
Avg Score
-
- - - - -
79
-
-
-
79.7
-
out of 100
-
-
-
-
Total Cost
-
-
$0.017
-
- 2,835 tokens (verified) -
openai/gpt-4o-mini, openai/claude-3.5-sonnet -
- -
- in 2,170 - out 665 -
- -
-
-
-
Avg Latency
-
-
1360ms
-
per test
-
-
- -
-
-
Agent Model
-
openai/gpt-4o-mini, openai/claude-3.5-sonnet
-
5 tests in this run
-
- -
-
Token Usage
-
2,835 tokens
-
in 2,170 / out 665
-
- -
- - - -
-
-
EvalView Judge (gpt-4o-mini)
-
- - $0.006 - -
-
- 1,320 tokens across 5 judge calls -
-
-
-
Judge Token Breakdown
-
in 1,100 / out 220
-
Separate from agent trace cost
-
-
- - - -
-
-
Score per Test
-
-
-
-
Distribution
-
-
-
- - -
-
Execution Cost per Query
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
TestModelTrace CostTokensLatencyScore
refund-flowopenai/gpt-4o-mini$0.0034570 tokens1200ms92.5
order-lookupopenai/gpt-4o-mini$0.0021415 tokens800ms88.0
billing-disputeopenai/gpt-4o-mini$0.0058890 tokens2400ms45.0
password-resetopenai/gpt-4o-mini$0.0012260 tokens600ms95.0
vip-escalationopenai/claude-3.5-sonnet$0.0045700 tokens1800ms78.0
Total$0.017avg $0.003400 per query
-
- Trace cost comes from the agent execution trace only. Mock or non-metered tools will show $0 even when EvalView used a separate judge or local model during evaluation. - This check also used 5 EvalView judge calls (1320 tokens). -
-
- - -
- - -
- - -
-
- - refund-flow -
- 92.5/100 - 💰 $0.0034 - ⚡ 1200ms - 570 tokens - 🧠 openai/gpt-4o-mini -
- -
-
-
- Model: openai/gpt-4o-mini - - in 450 / out 120 tokens - $0.0034 - - - - -
- -
- QueryProcess refund for order 4812 -
- - -
- - Direct response — no tools invoked - -
- - - -
- ResponseRefund of $49.99 issued successfully. -
- -
-
- -
-
- - order-lookup -
- 88.0/100 - 💰 $0.0021 - ⚡ 800ms - 415 tokens - 🧠 openai/gpt-4o-mini -
- -
- -
- -
-
- - billing-dispute -
- 45.0/100 - 💰 $0.0058 - ⚡ 2400ms - 890 tokens - 🧠 openai/gpt-4o-mini -
- -
- -
- -
-
- - password-reset -
- 95.0/100 - 💰 $0.0012 - ⚡ 600ms - 260 tokens - 🧠 openai/gpt-4o-mini -
- -
- -
- -
-
- - vip-escalation -
- 78.0/100 - 💰 $0.0045 - ⚡ 1800ms - 700 tokens - 🧠 openai/claude-3.5-sonnet -
- -
- -
- - -
- - -
- -
No diffs yet — run evalview check to compare against a baseline
- -
- - -
- -
No step timing data
- -
- - - - -
- - - - -
- - Built with EvalView - | - Agent testing & regression detection - - - - - Share - - - - Star - - -
-
- - - \ No newline at end of file diff --git a/.evalview/latest-v3.html b/.evalview/latest-v3.html deleted file mode 100644 index 4de07bf..0000000 --- a/.evalview/latest-v3.html +++ /dev/null @@ -1,681 +0,0 @@ - - - - - -EvalView Check Report - - - - - - - - - -
- -
- - - ✗ 1 Failed - - 5 Tests - -
-
- -
- -
- - - - - -
- - -
- -
-
-
-
Pass Rate
-
- - - - -
4/5
-
-
-
80.0%
-
4 of 5 tests
-
-
-
-
Avg Score
-
- - - - -
79
-
-
-
79.7
-
out of 100
-
-
-
Total Cost
-
$0.017
-
2,835 tokens (verified)
- -
in 2,170out 665
- - -
openai/gpt-4o-mini, openai/claude-3.5-sonnet
- -
-
-
Avg Latency
-
1360ms
-
per test
-
-
- -
-
-
Agent Model
-
openai/gpt-4o-mini, openai/claude-3.5-sonnet
-
5 tests in this run
-
- -
-
Token Usage
-
2,835 tokens
-
in 2,170 / out 665
-
- -
- - - -
-
-
EvalView Judge (gpt-4o-mini)
-
$0.006
-
1,320 tokens across 5 judge calls
-
-
-
Judge Token Breakdown
-
in 1,100 / out 220
-
Separate from agent trace cost
-
-
- - -
-
-
Score per Test
-
-
-
-
Distribution
-
-
-
- -
-
Execution Cost per Query
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
TestModelTrace CostTokensLatencyScore
refund-flowopenai/gpt-4o-mini$0.0034570 tokens1200ms92.5
order-lookupopenai/gpt-4o-mini$0.0021415 tokens800ms88.0
billing-disputeopenai/gpt-4o-mini$0.0058890 tokens2400ms45.0
password-resetopenai/gpt-4o-mini$0.0012260 tokens600ms95.0
vip-escalationopenai/claude-3.5-sonnet$0.0045700 tokens1800ms78.0
Total$0.017avg $0.003400 per query
-
- Trace cost comes from the agent execution trace only. Mock or non-metered tools will show $0 even when EvalView used a separate judge or local model during evaluation. - This check also used 5 EvalView judge calls (1320 tokens). -
-
- -
- - -
- - -
-
- - refund-flow -
- 92.5/100 - 💰 $0.0034 - ⚡ 1200ms - 570 tokens - 🧠 openai/gpt-4o-mini -
- -
-
-
- Model: openai/gpt-4o-mini - in 450 / out 120 tokens$0.0034 - - - -
- -
- QueryProcess refund for order 4812 -
- - -
- Direct response — no tools invoked -
- - - -
- ResponseRefund issued. -
- -
-
- -
-
- - order-lookup -
- 88.0/100 - 💰 $0.0021 - ⚡ 800ms - 415 tokens - 🧠 openai/gpt-4o-mini -
- -
- -
- -
-
- - billing-dispute -
- 45.0/100 - 💰 $0.0058 - ⚡ 2400ms - 890 tokens - 🧠 openai/gpt-4o-mini -
- -
- -
- -
-
- - password-reset -
- 95.0/100 - 💰 $0.0012 - ⚡ 600ms - 260 tokens - 🧠 openai/gpt-4o-mini -
- -
- -
- -
-
- - vip-escalation -
- 78.0/100 - 💰 $0.0045 - ⚡ 1800ms - 700 tokens - 🧠 openai/claude-3.5-sonnet -
- -
- -
- - -
- - -
- -
No diffs yet — run evalview check to compare against a baseline
- -
- - -
- -
No step timing data
- -
- - - -
- - - - -
- Built with EvalView | Agent testing & regression detection - - - Share - - Star - -
-
- - - \ No newline at end of file diff --git a/.evalview/latest-v4.html b/.evalview/latest-v4.html deleted file mode 100644 index 7084865..0000000 --- a/.evalview/latest-v4.html +++ /dev/null @@ -1,708 +0,0 @@ - - - - - -EvalView Check Report - - - - - - - - - -
- -
- -
- - - ✗ 1 Failed - - 5 Tests - -
-
- -
- -
- - - - - -
- - -
- - - -
-
-
Pass Rate
-
80.0%
-
4 of 5 tests passing
-
- - - - -
4/5
-
-
-
-
-
Avg Score
-
79.7/100
-
-
-
Total Cost
-
$0.017
-
2,835 tokens
-
-
-
Avg Latency
-
1360ms
-
per test
-
-
-
Model
-
openai/gpt-4o-mini, openai/claude-3.5-sonnet
- -
in 2,170 · out 665
- -
-
-
- - -
-
-
Agent Model
-
openai/gpt-4o-mini, openai/claude-3.5-sonnet
-
5 tests in this run
-
- -
-
Token Usage
-
2,835 tokens
-
in 2,170 / out 665
-
- -
- - - -
-
-
EvalView Judge (gpt-4o-mini)
-
$0.006
-
1,320 tokens across 5 judge calls
-
-
-
Judge Token Breakdown
-
in 1,100 / out 220
-
Separate from agent trace cost
-
-
- - - -
-
-
Score per Test
-
-
-
-
Distribution
-
-
-
- - -
-
Execution Cost per Query
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
TestModelTrace CostTokensLatencyScore
refund-flowopenai/gpt-4o-mini$0.0034570 tokens1200ms92.5
order-lookupopenai/gpt-4o-mini$0.0021415 tokens800ms88.0
billing-disputeopenai/gpt-4o-mini$0.0058890 tokens2400ms45.0
password-resetopenai/gpt-4o-mini$0.0012260 tokens600ms95.0
vip-escalationopenai/claude-3.5-sonnet$0.0045700 tokens1800ms78.0
Total$0.017avg $0.003400 per query
-
- Trace cost comes from the agent execution trace only. Mock or non-metered tools will show $0 even when EvalView used a separate judge or local model during evaluation. - This check also used 5 EvalView judge calls (1320 tokens). -
-
- -
- - -
- - -
-
- - refund-flow -
- 92.5/100 - 💰 $0.0034 - ⚡ 1200ms - 570 tokens - 🧠 openai/gpt-4o-mini -
- -
-
-
- Model: openai/gpt-4o-mini - in 450 / out 120 tokens$0.0034 - - - -
- -
- QueryProcess refund for order 4812 -
- - -
- Direct response — no tools invoked -
- - - -
- ResponseRefund issued. -
- -
-
- -
-
- - order-lookup -
- 88.0/100 - 💰 $0.0021 - ⚡ 800ms - 415 tokens - 🧠 openai/gpt-4o-mini -
- -
- -
- -
-
- - billing-dispute -
- 45.0/100 - 💰 $0.0058 - ⚡ 2400ms - 890 tokens - 🧠 openai/gpt-4o-mini -
- -
- -
- -
-
- - password-reset -
- 95.0/100 - 💰 $0.0012 - ⚡ 600ms - 260 tokens - 🧠 openai/gpt-4o-mini -
- -
- -
- -
-
- - vip-escalation -
- 78.0/100 - 💰 $0.0045 - ⚡ 1800ms - 700 tokens - 🧠 openai/claude-3.5-sonnet -
- -
- -
- - -
- - -
- -
No diffs yet — run evalview check to compare against a baseline
- -
- - -
- -
No step timing data
- -
- - - -
- - - - -
- Built with EvalView | Agent testing & regression detection - - Share - Star - -
-
- - - \ No newline at end of file diff --git a/.evalview/latest-v5.html b/.evalview/latest-v5.html deleted file mode 100644 index 2586400..0000000 --- a/.evalview/latest-v5.html +++ /dev/null @@ -1,689 +0,0 @@ - - - - - -EvalView Check Report - - - - - - - - - -
- -
- ✗ 1 Failed5 Tests -
-
- -
-
- - - - - -
- - -
- - - -
-
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-
80.0%
-
Pass Rate
-
-
-
4 of 5 tests passing
-
- - -
-
-
Avg Score
-
79.7
-
out of 100
-
-
-
Total Cost
-
$0.017
-
2,835 tokens (verified)
-
-
-
Avg Latency
-
1360ms
-
per test
-
-
-
Model
-
openai/gpt-4o-mini, openai/claude-3.5-sonnet
- -
in 2,170 · out 665
- -
-
- - -
-
-
Agent Model
-
openai/gpt-4o-mini, openai/claude-3.5-sonnet
-
5 tests in this run
-
- -
-
Token Usage
-
2,835 tokens
-
in 2,170 / out 665
-
- -
- - -
-
-
EvalView Judge (gpt-4o-mini)
-
$0.006
-
1,320 tokens across 5 judge calls
-
-
-
Judge Token Breakdown
-
in 1,100 / out 220
-
Separate from agent trace cost
-
-
- - - -
-
-
Score per Test
-
-
-
-
Distribution
-
-
-
- - -
-
Execution Cost per Query
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
TestModelTrace CostTokensLatencyScore
refund-flowopenai/gpt-4o-mini$0.0034570 tokens1200ms92.5
order-lookupopenai/gpt-4o-mini$0.0021415 tokens800ms88.0
billing-disputeopenai/gpt-4o-mini$0.0058890 tokens2400ms45.0
password-resetopenai/gpt-4o-mini$0.0012260 tokens600ms95.0
vip-escalationopenai/claude-3.5-sonnet$0.0045700 tokens1800ms78.0
Total$0.017avg $0.003400 per query
-
- Trace cost comes from the agent execution trace only. Mock or non-metered tools will show $0 even when EvalView used a separate judge or local model during evaluation. - This check also used 5 EvalView judge calls (1320 tokens). -
-
- -
- - -
- -
-
- - refund-flow -
- 92.5/100 - 💰 $0.0034 - ⚡ 1200ms - 570 tokens - 🧠 openai/gpt-4o-mini -
- -
-
-
- Model: openai/gpt-4o-mini - in 450 / out 120 tokens$0.0034 - - - -
- -
- QueryProcess refund for order 4812 -
-
◎ Direct response — no tools invoked
- - -
- ResponseRefund issued. -
-
-
- -
-
- - order-lookup -
- 88.0/100 - 💰 $0.0021 - ⚡ 800ms - 415 tokens - 🧠 openai/gpt-4o-mini -
- -
- -
- -
-
- - billing-dispute -
- 45.0/100 - 💰 $0.0058 - ⚡ 2400ms - 890 tokens - 🧠 openai/gpt-4o-mini -
- -
- -
- -
-
- - password-reset -
- 95.0/100 - 💰 $0.0012 - ⚡ 600ms - 260 tokens - 🧠 openai/gpt-4o-mini -
- -
- -
- -
-
- - vip-escalation -
- 78.0/100 - 💰 $0.0045 - ⚡ 1800ms - 700 tokens - 🧠 openai/claude-3.5-sonnet -
- -
- -
- -
- - -
-
No diffs yet — run evalview check to compare against a baseline
-
- - -
-
No step timing data
-
- - - -
- - - - -
- Built with EvalView | Agent testing & regression detection - - Share - Star - -
-
- - - \ No newline at end of file diff --git a/.evalview/latest-v6.html b/.evalview/latest-v6.html deleted file mode 100644 index aee40eb..0000000 --- a/.evalview/latest-v6.html +++ /dev/null @@ -1,616 +0,0 @@ - - - - - -EvalView Check Report - - - - - - - - - -
- -
- ✗ 1 Failed5 Tests -
-
- -
-
- - - - - -
- - -
- - - -
-
-
- - - - -
-
80.0%
-
Pass Rate
-
-
-
4 of 5 tests
-
-
-
-
Avg Score
-
79.7
-
out of 100
-
-
-
Total Cost
-
$0.017
-
2,835 tokens (verified)
-
-
-
Avg Latency
-
1360ms
-
per test
-
-
-
Model
-
openai/gpt-4o-mini, openai/claude-3.5-sonnet
- -
in 2,170 · out 665
- -
-
-
- - -
-
-
Agent Model
-
openai/gpt-4o-mini, openai/claude-3.5-sonnet
-
5 tests in this run
-
- -
-
Token Usage
-
2,835 tokens
-
in 2,170 / out 665
-
- -
- - -
-
-
EvalView Judge (gpt-4o-mini)
-
$0.006
-
1,320 tokens across 5 judge calls
-
-
-
Judge Token Breakdown
-
in 1,100 / out 220
-
Separate from agent trace cost
-
-
- - - -
-
-
Score per Test
-
-
-
-
Distribution
-
-
-
- - -
-
Execution Cost per Query
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
TestModelTrace CostTokensLatencyScore
refund-flowopenai/gpt-4o-mini$0.0034570 tokens1200ms92.5
order-lookupopenai/gpt-4o-mini$0.0021415 tokens800ms88.0
billing-disputeopenai/gpt-4o-mini$0.0058890 tokens2400ms45.0
password-resetopenai/gpt-4o-mini$0.0012260 tokens600ms95.0
vip-escalationopenai/claude-3.5-sonnet$0.0045700 tokens1800ms78.0
Total$0.017avg $0.003400 per query
-
- Trace cost comes from the agent execution trace only. Mock or non-metered tools will show $0 even when EvalView used a separate judge or local model during evaluation. - This check also used 5 EvalView judge calls (1320 tokens). -
-
- -
- - -
- -
-
- - refund-flow -
- 92.5/100 - 💰 $0.0034 - ⚡ 1200ms - 570 tokens - 🧠 openai/gpt-4o-mini -
- -
-
-
- Model: openai/gpt-4o-mini - in 450 / out 120 tokens$0.0034 - - - -
- -
- QueryProcess refund -
-
◎ Direct response — no tools invoked
- - -
- ResponseDone. -
-
-
- -
-
- - order-lookup -
- 88.0/100 - 💰 $0.0021 - ⚡ 800ms - 415 tokens - 🧠 openai/gpt-4o-mini -
- -
- -
- -
-
- - billing-dispute -
- 45.0/100 - 💰 $0.0058 - ⚡ 2400ms - 890 tokens - 🧠 openai/gpt-4o-mini -
- -
- -
- -
-
- - password-reset -
- 95.0/100 - 💰 $0.0012 - ⚡ 600ms - 260 tokens - 🧠 openai/gpt-4o-mini -
- -
- -
- -
-
- - vip-escalation -
- 78.0/100 - 💰 $0.0045 - ⚡ 1800ms - 700 tokens - 🧠 openai/claude-3.5-sonnet -
- -
- -
- -
- - -
-
No diffs yet — run evalview check to compare against a baseline
-
- - -
-
No step timing data
-
- - - -
- - - - -
- Built with EvalView | Agent testing & regression detection - - Share - Star - -
-
- - - \ No newline at end of file diff --git a/.evalview/latest-v6b.html b/.evalview/latest-v6b.html deleted file mode 100644 index e6e050e..0000000 --- a/.evalview/latest-v6b.html +++ /dev/null @@ -1,616 +0,0 @@ - - - - - -EvalView Check Report - - - - - - - - - -
- -
- ✗ 1 Failed5 Tests -
-
- -
-
- - - - - -
- - -
- - - -
-
-
- - - - -
-
80.0%
-
Pass Rate
-
-
-
4 of 5 tests
-
-
-
-
Avg Score
-
79.7
-
out of 100
-
-
-
Total Cost
-
$0.017
-
2,835 tokens (verified)
-
-
-
Avg Latency
-
1360ms
-
per test
-
-
-
Model
-
openai/gpt-4o-mini, openai/claude-3.5-sonnet
- -
in 2,170 · out 665
- -
-
-
- - -
-
-
Agent Model
-
openai/gpt-4o-mini, openai/claude-3.5-sonnet
-
5 tests in this run
-
- -
-
Token Usage
-
2,835 tokens
-
in 2,170 / out 665
-
- -
- - -
-
-
EvalView Judge (gpt-4o-mini)
-
$0.006
-
1,320 tokens across 5 judge calls
-
-
-
Judge Token Breakdown
-
in 1,100 / out 220
-
Separate from agent trace cost
-
-
- - - -
-
-
Score per Test
-
-
-
-
Distribution
-
-
-
- - -
-
Execution Cost per Query
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
TestModelTrace CostTokensLatencyScore
refund-flowopenai/gpt-4o-mini$0.0034570 tokens1200ms92.5
order-lookupopenai/gpt-4o-mini$0.0021415 tokens800ms88.0
billing-disputeopenai/gpt-4o-mini$0.0058890 tokens2400ms45.0
password-resetopenai/gpt-4o-mini$0.0012260 tokens600ms95.0
vip-escalationopenai/claude-3.5-sonnet$0.0045700 tokens1800ms78.0
Total$0.017avg $0.003400 per query
-
- Trace cost comes from the agent execution trace only. Mock or non-metered tools will show $0 even when EvalView used a separate judge or local model during evaluation. - This check also used 5 EvalView judge calls (1320 tokens). -
-
- -
- - -
- -
-
- - refund-flow -
- 92.5/100 - 💰 $0.0034 - ⚡ 1200ms - 570 tokens - 🧠 openai/gpt-4o-mini -
- -
-
-
- Model: openai/gpt-4o-mini - in 450 / out 120 tokens$0.0034 - - - -
- -
- QueryProcess refund -
-
◎ Direct response — no tools invoked
- - -
- ResponseDone. -
-
-
- -
-
- - order-lookup -
- 88.0/100 - 💰 $0.0021 - ⚡ 800ms - 415 tokens - 🧠 openai/gpt-4o-mini -
- -
- -
- -
-
- - billing-dispute -
- 45.0/100 - 💰 $0.0058 - ⚡ 2400ms - 890 tokens - 🧠 openai/gpt-4o-mini -
- -
- -
- -
-
- - password-reset -
- 95.0/100 - 💰 $0.0012 - ⚡ 600ms - 260 tokens - 🧠 openai/gpt-4o-mini -
- -
- -
- -
-
- - vip-escalation -
- 78.0/100 - 💰 $0.0045 - ⚡ 1800ms - 700 tokens - 🧠 openai/claude-3.5-sonnet -
- -
- -
- -
- - -
-
No diffs yet — run evalview check to compare against a baseline
-
- - -
-
No step timing data
-
- - - -
- - - - -
- Built with EvalView | Agent testing & regression detection - - Share - Star - -
-
- - - \ No newline at end of file diff --git a/.evalview/latest-v6c.html b/.evalview/latest-v6c.html deleted file mode 100644 index 5caff22..0000000 --- a/.evalview/latest-v6c.html +++ /dev/null @@ -1,603 +0,0 @@ - - - - - -EvalView Check Report - - - - - - - - - -
- -
- ✗ 1 Failed5 Tests -
-
- -
-
- - - - - -
- - -
- - - -
-
-
- - - - -
-
80.0%
-
Pass Rate
-
-
-
4 of 5 tests
-
-
-
-
Avg Score
-
79.7
-
out of 100
-
-
-
Total Cost
-
$0.017
-
2,835 tokens (verified)
-
-
-
Avg Latency
-
1360ms
-
per test
-
-
-
Model
-
anthropic/claude-sonnet-4-6
- -
in 2,170 · out 665
- -
-
-
- - -
-
-
Agent Model
-
anthropic/claude-sonnet-4-6
-
5 tests in this run
-
- -
-
Token Usage
-
2,835 tokens
-
in 2,170 / out 665
-
- -
- - - - -
-
-
Score per Test
-
-
-
-
Distribution
-
-
-
- - -
-
Execution Cost per Query
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
TestModelTrace CostTokensLatencyScore
refund-flowanthropic/claude-sonnet-4-6$0.0034570 tokens1200ms92.5
order-lookupanthropic/claude-sonnet-4-6$0.0021415 tokens800ms88.0
billing-disputeanthropic/claude-sonnet-4-6$0.0058890 tokens2400ms45.0
password-resetanthropic/claude-sonnet-4-6$0.0012260 tokens600ms95.0
vip-escalationanthropic/claude-sonnet-4-6$0.0045700 tokens1800ms78.0
Total$0.017avg $0.003400 per query
-
- Trace cost comes from the agent execution trace only. Mock or non-metered tools will show $0 even when EvalView used a separate judge or local model during evaluation. - -
-
- -
- - -
- -
-
- - refund-flow -
- 92.5/100 - 💰 $0.0034 - ⚡ 1200ms - 570 tokens - 🧠 anthropic/claude-sonnet-4-6 -
- -
-
-
- Model: anthropic/claude-sonnet-4-6 - in 450 / out 120 tokens$0.0034 - - - -
- -
- QueryProcess refund -
-
◎ Direct response — no tools invoked
- - -
- ResponseDone. -
-
-
- -
-
- - order-lookup -
- 88.0/100 - 💰 $0.0021 - ⚡ 800ms - 415 tokens - 🧠 anthropic/claude-sonnet-4-6 -
- -
- -
- -
-
- - billing-dispute -
- 45.0/100 - 💰 $0.0058 - ⚡ 2400ms - 890 tokens - 🧠 anthropic/claude-sonnet-4-6 -
- -
- -
- -
-
- - password-reset -
- 95.0/100 - 💰 $0.0012 - ⚡ 600ms - 260 tokens - 🧠 anthropic/claude-sonnet-4-6 -
- -
- -
- -
-
- - vip-escalation -
- 78.0/100 - 💰 $0.0045 - ⚡ 1800ms - 700 tokens - 🧠 anthropic/claude-sonnet-4-6 -
- -
- -
- -
- - -
-
No diffs yet — run evalview check to compare against a baseline
-
- - -
-
No step timing data
-
- - - -
- - - - -
- Built with EvalView | Agent testing & regression detection - - Share - Star - -
-
- - - \ No newline at end of file diff --git a/.gitignore b/.gitignore index 866773b..c2ceaa2 100644 --- a/.gitignore +++ b/.gitignore @@ -47,6 +47,10 @@ env/ .evalview/config.yaml .evalview/golden/ .evalview/history.jsonl +# Generated artifacts that should not be tracked +.evalview/*.html +.evalview/badge.json +.evalview/healing/ tests/test-cases/*.yaml !tests/test-cases/example.yaml