diff --git a/evalview/commands/check_display.py b/evalview/commands/check_display.py
index c53fb48..ff2b582 100644
--- a/evalview/commands/check_display.py
+++ b/evalview/commands/check_display.py
@@ -16,6 +16,96 @@
 from evalview.core.model_runtime_detector import ModelRuntimeChangeSummary
 
 
+def _aggregate_token_summary(
+    results: Optional[List["EvaluationResult"]],
+    golden_traces: Optional[Dict[str, "GoldenTrace"]] = None,
+) -> Optional[Dict[str, Any]]:
+    """Aggregate token usage and cost across test results.
+
+    Computes total input/output/cached tokens and cost from all results.
+    Optionally calculates baseline comparison if golden traces are provided.
+
+    Args:
+        results: List of evaluation results with token data
+        golden_traces: Optional baseline traces for comparison
+
+    Returns:
+        Dict with token_usage, total_cost, baseline_token_usage, and token_delta_pct,
+        or None if no token data is available.
+    """
+    from evalview.core.types import TokenUsage
+
+    if not results:
+        return None
+
+    total_cost = 0.0
+    total_input = 0
+    total_output = 0
+    total_cached = 0
+    has_tokens = False
+
+    for r in results:
+        try:
+            total_cost += float(getattr(r.trace.metrics, "total_cost", 0) or 0)
+        except Exception:
+            pass
+
+        tu = getattr(getattr(r, "trace", None), "metrics", None)
+        tu = getattr(tu, "total_tokens", None)
+        if tu is None:
+            continue
+
+        has_tokens = True
+        total_input += int(getattr(tu, "input_tokens", 0) or 0)
+        total_output += int(getattr(tu, "output_tokens", 0) or 0)
+        total_cached += int(getattr(tu, "cached_tokens", 0) or 0)
+
+    if not has_tokens:
+        return None
+
+    current = TokenUsage(
+        input_tokens=total_input,
+        output_tokens=total_output,
+        cached_tokens=total_cached,
+    )
+
+    baseline_usage: Optional[TokenUsage] = None
+    delta_pct: Optional[float] = None
+    if golden_traces:
+        base_input = 0
+        base_output = 0
+        base_cached = 0
+        base_has_tokens = False
+        for r in results:
+            g = golden_traces.get(r.test_case)
+            if not g:
+                continue
+            gtu = getattr(getattr(g, "trace", None), "metrics", None)
+            gtu = getattr(gtu, "total_tokens", None)
+            if gtu is None:
+                continue
+            base_has_tokens = True
+            base_input += int(getattr(gtu, "input_tokens", 0) or 0)
+            base_output += int(getattr(gtu, "output_tokens", 0) or 0)
+            base_cached += int(getattr(gtu, "cached_tokens", 0) or 0)
+
+        if base_has_tokens:
+            baseline_usage = TokenUsage(
+                input_tokens=base_input,
+                output_tokens=base_output,
+                cached_tokens=base_cached,
+            )
+            if baseline_usage.total_tokens > 0:
+                delta_pct = (current.total_tokens - baseline_usage.total_tokens) / baseline_usage.total_tokens * 100.0
+
+    return {  # type: ignore[return-value]
+        "token_usage": current,
+        "total_cost": float(total_cost),
+        "baseline_token_usage": baseline_usage,
+        "token_delta_pct": delta_pct,
+    }
+
+
 def _print_parameter_diffs(tool_diffs: List["ToolDiff"]) -> None:
     """Print parameter-level differences for tool calls."""
     from rich.table import Table
@@ -400,7 +490,8 @@ def _display_check_results(
     behavior_summary = _build_behavior_summary(diffs, test_metadata, healing_summary)
 
     if json_output:
-        output = {
+        token_summary = _aggregate_token_summary(results, golden_traces)
+        output: Dict[str, Any] = {
             "summary": {
                 "total_tests": len(diffs),
                 "unchanged": sum(1 for _, d in diffs if d.overall_severity == DiffStatus.PASSED),
@@ -466,6 +557,13 @@ def _display_check_results(
                 for name, diff in diffs
             ],
        }
+        if token_summary is not None:
+            output["summary"]["token_usage"] = token_summary["token_usage"].model_dump()
+            output["summary"]["total_cost"] = token_summary["total_cost"]
+            if token_summary.get("baseline_token_usage") is not None:
+                output["summary"]["baseline_token_usage"] = token_summary["baseline_token_usage"].model_dump()
+            if token_summary.get("token_delta_pct") is not None:
+                output["summary"]["token_delta_pct"] = token_summary["token_delta_pct"]
         if healing_summary:
             output["healing"] = {
                 "total_healed": healing_summary.total_healed,
@@ -547,6 +645,26 @@ def _display_check_results(
     )
     console.print()
 
+    token_summary = _aggregate_token_summary(results, golden_traces)
+    if token_summary is not None:
+        from rich.table import Table
+
+        tu = token_summary["token_usage"]  # type: ignore[assignment]
+        delta_pct = token_summary.get("token_delta_pct")
+        delta_str = ""
+        if delta_pct is not None:
+            sign = "+" if delta_pct > 0 else ""
+            color = "red" if delta_pct > 10 else "yellow" if delta_pct > 0 else "green"
+            delta_str = f" [{color}]({sign}{delta_pct:.0f}% tokens vs baseline)[/{color}]"
+
+        table = Table(show_header=False, show_lines=False, padding=(0, 1))
+        table.add_column("k", style="dim", width=12)
+        table.add_column("v")
+        table.add_row("Tokens", f"in {tu.input_tokens:,} out {tu.output_tokens:,} cached {tu.cached_tokens:,} total {tu.total_tokens:,}{delta_str}")
+        table.add_row("Cost", f"${float(token_summary['total_cost']):.4f}")
+        console.print(table)
+        console.print()
+
     # --- Sparkline Trends ---
     if diffs and drift_tracker is not None:
         test_trends: Dict[str, List[float]] = {}
diff --git a/tests/test_check_cmd.py b/tests/test_check_cmd.py
index 2ed5928..1488747 100644
--- a/tests/test_check_cmd.py
+++ b/tests/test_check_cmd.py
@@ -6,6 +6,8 @@
 from click.testing import CliRunner
 
+import pytest
+
 
 def test_check_dry_run_handles_golden_metadata_objects(monkeypatch, tmp_path):
     """Dry-run should count baselines by name without hashing metadata models."""
@@ -140,6 +142,228 @@ def test_check_does_not_report_clean_when_execution_failures_occur(monkeypatch,
     assert "execution failure" in result.output
 
 
+
+def test_check_json_includes_token_usage_and_cost_summary(monkeypatch, tmp_path):
+    from evalview.commands.check_cmd import check
+    from evalview.core.diff import DiffStatus, TraceDiff
+    from evalview.core.golden import GoldenMetadata
+    from evalview.core.project_state import ProjectState
+    from evalview.core.types import (
+        ContainsChecks,
+        CostEvaluation,
+        EvaluationResult,
+        Evaluations,
+        ExecutionMetrics,
+        ExecutionTrace,
+        LatencyEvaluation,
+        OutputEvaluation,
+        SequenceEvaluation,
+        TokenUsage,
+        ToolEvaluation,
+    )
+
+    project = tmp_path
+    monkeypatch.chdir(project)
+
+    tests_dir = project / "tests"
+    tests_dir.mkdir()
+    (tests_dir / "sample.yaml").write_text(
+        "name: sample\ninput:\n  query: hi\nexpected:\n  tools: []\nthresholds:\n  min_score: 0\n",
+        encoding="utf-8",
+    )
+
+    evalview_dir = project / ".evalview"
+    evalview_dir.mkdir()
+    (evalview_dir / "config.yaml").write_text(
+        "adapter: http\nendpoint: http://example.com\n",
+        encoding="utf-8",
+    )
+
+    now = datetime.now()
+    sample_result = EvaluationResult(
+        test_case="sample",
+        passed=True,
+        score=90.0,
+        evaluations=Evaluations(
+            tool_accuracy=ToolEvaluation(accuracy=1.0),
+            sequence_correctness=SequenceEvaluation(correct=True, expected_sequence=[], actual_sequence=[]),
+            output_quality=OutputEvaluation(
+                score=90.0,
+                rationale="ok",
+                contains_checks=ContainsChecks(),
+                not_contains_checks=ContainsChecks(),
+            ),
+            cost=CostEvaluation(total_cost=0.0123, threshold=1.0, passed=True),
+            latency=LatencyEvaluation(total_latency=10.0, threshold=1000.0, passed=True),
+        ),
+        trace=ExecutionTrace(
+            session_id="s1",
+            start_time=now,
+            end_time=now,
+            steps=[],
+            final_output="ok",
+            metrics=ExecutionMetrics(
+                total_cost=0.0123,
+                total_latency=10.0,
+                total_tokens=TokenUsage(input_tokens=100, output_tokens=50, cached_tokens=25),
+            ),
+        ),
+        timestamp=now,
+    )
+
+    diff = TraceDiff(
+        test_name="sample",
+        has_differences=False,
+        tool_diffs=[],
+        output_diff=None,
+        score_diff=0.0,
+        latency_diff=0.0,
+        overall_severity=DiffStatus.PASSED,
+    )
+
+    runner = CliRunner()
+
+    monkeypatch.setattr("evalview.commands.check_cmd._cloud_pull", lambda store: None)
+    monkeypatch.setattr("evalview.commands.check_cmd._load_config_if_exists", lambda: None)
+    monkeypatch.setattr(
+        "evalview.core.golden.GoldenStore.list_golden",
+        lambda self: [GoldenMetadata(test_name="sample", blessed_at="2026-03-13T00:00:00Z", score=95.0)],
+    )
+
+    golden_tokens = TokenUsage(input_tokens=80, output_tokens=40, cached_tokens=20)
+    golden_traces = {
+        "sample": type(
+            "_G",
+            (),
+            {
+                "trace": type("_T", (), {"metrics": ExecutionMetrics(total_cost=0.01, total_latency=1.0, total_tokens=golden_tokens)})()
+            },
+        )()
+    }
+
+    monkeypatch.setattr(
+        "evalview.commands.check_cmd._execute_check_tests",
+        lambda test_cases, config, json_output, semantic_diff=False, timeout=30.0, skip_llm_judge=False, budget_tracker=None: ([
+            ("sample", diff)
+        ], [sample_result], None, golden_traces),
+    )
+    from evalview.core.project_state import ProjectStateStore
+    monkeypatch.setattr(ProjectStateStore, "load", lambda self: ProjectState())
+    monkeypatch.setattr(ProjectStateStore, "update_check", lambda self, has_regressions, status="passed": ProjectState())
+
+    result = runner.invoke(check, ["tests", "--json"])
+    assert result.exit_code == 0
+
+    import json as _json
+    payload = _json.loads(result.output)
+    assert "summary" in payload
+    assert payload["summary"]["token_usage"] == {"input_tokens": 100, "output_tokens": 50, "cached_tokens": 25}
+    assert payload["summary"]["total_cost"] == pytest.approx(0.0123)
+    assert payload["summary"]["baseline_token_usage"] == {"input_tokens": 80, "output_tokens": 40, "cached_tokens": 20}
+    assert payload["summary"]["token_delta_pct"] == pytest.approx(25.0)
+
+
+def test_check_json_omits_token_usage_when_unavailable(monkeypatch, tmp_path):
+    from evalview.commands.check_cmd import check
+    from evalview.core.diff import DiffStatus, TraceDiff
+    from evalview.core.golden import GoldenMetadata
+    from evalview.core.project_state import ProjectState
+    from evalview.core.types import (
+        ContainsChecks,
+        CostEvaluation,
+        EvaluationResult,
+        Evaluations,
+        ExecutionMetrics,
+        ExecutionTrace,
+        LatencyEvaluation,
+        OutputEvaluation,
+        SequenceEvaluation,
+        ToolEvaluation,
+    )
+
+    project = tmp_path
+    monkeypatch.chdir(project)
+
+    tests_dir = project / "tests"
+    tests_dir.mkdir()
+    (tests_dir / "sample.yaml").write_text(
+        "name: sample\ninput:\n  query: hi\nexpected:\n  tools: []\nthresholds:\n  min_score: 0\n",
+        encoding="utf-8",
+    )
+
+    evalview_dir = project / ".evalview"
+    evalview_dir.mkdir()
+    (evalview_dir / "config.yaml").write_text(
+        "adapter: http\nendpoint: http://example.com\n",
+        encoding="utf-8",
+    )
+
+    now = datetime.now()
+    sample_result = EvaluationResult(
+        test_case="sample",
+        passed=True,
+        score=90.0,
+        evaluations=Evaluations(
+            tool_accuracy=ToolEvaluation(accuracy=1.0),
+            sequence_correctness=SequenceEvaluation(correct=True, expected_sequence=[], actual_sequence=[]),
+            output_quality=OutputEvaluation(
+                score=90.0,
+                rationale="ok",
+                contains_checks=ContainsChecks(),
+                not_contains_checks=ContainsChecks(),
+            ),
+            cost=CostEvaluation(total_cost=0.0, threshold=1.0, passed=True),
+            latency=LatencyEvaluation(total_latency=10.0, threshold=1000.0, passed=True),
+        ),
+        trace=ExecutionTrace(
+            session_id="s1",
+            start_time=now,
+            end_time=now,
+            steps=[],
+            final_output="ok",
+            metrics=ExecutionMetrics(total_cost=0.0, total_latency=10.0, total_tokens=None),
+        ),
+        timestamp=now,
+    )
+
+    diff = TraceDiff(
+        test_name="sample",
+        has_differences=False,
+        tool_diffs=[],
+        output_diff=None,
+        score_diff=0.0,
+        latency_diff=0.0,
+        overall_severity=DiffStatus.PASSED,
+    )
+
+    runner = CliRunner()
+
+    monkeypatch.setattr("evalview.commands.check_cmd._cloud_pull", lambda store: None)
+    monkeypatch.setattr("evalview.commands.check_cmd._load_config_if_exists", lambda: None)
+    monkeypatch.setattr(
+        "evalview.core.golden.GoldenStore.list_golden",
+        lambda self: [GoldenMetadata(test_name="sample", blessed_at="2026-03-13T00:00:00Z", score=95.0)],
+    )
+    monkeypatch.setattr(
+        "evalview.commands.check_cmd._execute_check_tests",
+        lambda test_cases, config, json_output, semantic_diff=False, timeout=30.0, skip_llm_judge=False, budget_tracker=None: ([
+            ("sample", diff)
+        ], [sample_result], None, {}),
+    )
+    from evalview.core.project_state import ProjectStateStore
+    monkeypatch.setattr(ProjectStateStore, "load", lambda self: ProjectState())
+    monkeypatch.setattr(ProjectStateStore, "update_check", lambda self, has_regressions, status="passed": ProjectState())
+
+    result = runner.invoke(check, ["tests", "--json"])
+    assert result.exit_code == 0
+
+    import json as _json
+    payload = _json.loads(result.output)
+    assert "summary" in payload
+    assert "token_usage" not in payload["summary"]
+    assert "baseline_token_usage" not in payload["summary"]
+    assert "token_delta_pct" not in payload["summary"]
+
+
 def test_check_uses_active_test_path_when_no_path_is_given(monkeypatch, tmp_path):
     """Plain `check` should follow the remembered active suite instead of raw tests/."""
     from evalview.commands.check_cmd import check
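
Worked example of the delta math exercised by the first new test: the current run totals 175 tokens (100 input + 50 output + 25 cached) against a baseline of 140 (80 + 40 + 20), so token_delta_pct = (175 - 140) / 140 * 100 = 25.0, which is exactly what the pytest.approx(25.0) assertion checks. The sketch below shows one way the new --json summary fields could be consumed in CI; it is a minimal illustration, assuming the package exposes an `evalview` console script wrapping the `check` command, and the 10% budget simply mirrors the red threshold used for terminal display rather than anything defined by this change.

    import json
    import subprocess
    import sys

    # Run the suite and capture the machine-readable summary
    # (assumes an `evalview` entry point; adjust to however `check` is invoked).
    proc = subprocess.run(
        ["evalview", "check", "tests", "--json"],
        capture_output=True,
        text=True,
    )
    summary = json.loads(proc.stdout)["summary"]

    # token_delta_pct is only emitted when both current and baseline token
    # counts are available, so guard against its absence.
    delta = summary.get("token_delta_pct")
    if delta is not None and delta > 10:  # assumed CI budget, not part of the diff
        print(
            f"Token usage up {delta:.0f}% vs baseline "
            f"(total cost ${summary['total_cost']:.4f})",
            file=sys.stderr,
        )
        sys.exit(1)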