From 8eb3874263bb7a2f92d3a92ee06780c491ea09c7 Mon Sep 17 00:00:00 2001 From: Claude Date: Mon, 20 Apr 2026 07:35:46 +0000 Subject: [PATCH] fix(snapshot): harden --json output for CI consumers MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Follow-up to Matt's --json flag: - Keep stdout clean in JSON mode by plumbing json_output into _execute_snapshot_tests and running without the Rich live spinner. Previously per-test prints and spinner frames leaked into the payload, making stdout unparseable. - Track the real set of saved tests via a {name -> Path} map so per-test `saved` and `golden_file` reflect actual disk writes — previously any passing test was reported as saved whenever at least one sibling was saved, and the path guessed `.yaml` instead of the variant-aware `.golden.json` that GoldenStore actually writes. - Reject `--preview --json` upfront with a JSON error and non-zero exit rather than silently dropping --json. - Pretty-print JSON (indent=2) to match `skill`/`model-check` output, tidy the --json help text to document the suppression and auto-approve behavior, and keep `import json` with the other stdlib imports. - Cover the contract with tests/test_snapshot_json_output.py: parseable payload, accurate per-test saved/golden_file tracking (including partial save failures), variant-aware paths, --preview rejection, and the empty-suite error shape. 
https://claude.ai/code/session_01YPVyciLBGFEpKoMV7y8zFQ --- evalview/commands/shared.py | 18 ++- evalview/commands/snapshot_cmd.py | 78 +++++---- tests/test_snapshot_json_output.py | 247 +++++++++++++++++++++++++++++ 3 files changed, 310 insertions(+), 33 deletions(-) create mode 100644 tests/test_snapshot_json_output.py diff --git a/evalview/commands/shared.py b/evalview/commands/shared.py index a1dabe4..e45da53 100644 --- a/evalview/commands/shared.py +++ b/evalview/commands/shared.py @@ -552,8 +552,13 @@ def _execute_snapshot_tests( config: Optional["EvalViewConfig"], timeout: float = 30.0, skip_llm_judge: bool = False, + json_output: bool = False, ) -> List["EvaluationResult"]: - """Execute tests and evaluate results for snapshot/benchmark commands.""" + """Execute tests and evaluate results for snapshot/benchmark commands. + + When json_output=True, per-test console output is suppressed so stdout + stays clean for JSON consumers. + """ from evalview.evaluators.evaluator import Evaluator results = [] @@ -563,10 +568,12 @@ async def _run_one(tc: "TestCase") -> Optional["EvaluationResult"]: try: adapter = _build_adapter_for_tc(tc, config, timeout) except ValueError as e: - console.print(f"[yellow]⚠ Skipping {tc.name}: {e}[/yellow]") + if not json_output: + console.print(f"[yellow]⚠ Skipping {tc.name}: {e}[/yellow]") return None if adapter is None: - console.print(f"[yellow]⚠ Skipping {tc.name}: No adapter/endpoint configured[/yellow]") + if not json_output: + console.print(f"[yellow]⚠ Skipping {tc.name}: No adapter/endpoint configured[/yellow]") return None trace = await _execute_agent_with_slow_warning(tc, adapter, timeout) @@ -581,6 +588,8 @@ async def _run_all() -> List[Any]: if isinstance(outcome, BaseException): error_str = str(outcome) endpoint = tc.endpoint or (config.endpoint if config else None) or "" + if json_output: + continue if isinstance(outcome, (asyncio.TimeoutError, asyncio.CancelledError)): console.print(f"[red]✗ {tc.name}: Async execution 
failed - {outcome}[/red]") else: @@ -597,6 +606,9 @@ async def _run_all() -> List[Any]: result = outcome results.append(result) + if json_output: + continue + if result.passed: console.print(f"[green]✓ {tc.name}:[/green] {result.score:.1f}/100") else: diff --git a/evalview/commands/snapshot_cmd.py b/evalview/commands/snapshot_cmd.py index 4a2426b..feeca5d 100644 --- a/evalview/commands/snapshot_cmd.py +++ b/evalview/commands/snapshot_cmd.py @@ -1,12 +1,11 @@ """Snapshot command — run tests and save passing results as baseline.""" from __future__ import annotations +import json from datetime import datetime, timezone from pathlib import Path from typing import Dict, List, Optional, TYPE_CHECKING -import json - import click import yaml # type: ignore[import-untyped] @@ -27,11 +26,14 @@ def _save_snapshot_results( notes: Optional[str], variant: Optional[str] = None, quiet: bool = False, -) -> int: +) -> Dict[str, Path]: """Save passing test results as golden baselines. Returns: - Number of tests successfully saved + Mapping of test_case name to the saved golden file path. Only + tests that were actually written appear in the result — tests + that raised during save are omitted so callers can report + accurate per-test status. 
""" from evalview.core.golden import GoldenStore @@ -52,30 +54,30 @@ def _save_snapshot_results( if not timed_out and not low_score: console.print("[dim] Run evalview run to see detailed failure reasons, then fix and retry.[/dim]") console.print() - return 0 + return {} # Save passing results as golden if not quiet: console.print() - saved_count = 0 - saved_names = [] + saved: Dict[str, Path] = {} for result in passing: try: - store.save_golden(result, notes=notes, variant_name=variant) + path = store.save_golden(result, notes=notes, variant_name=variant) variant_label = f" (variant: {variant})" if variant else "" if not quiet: console.print(f"[green]✓ Snapshotted:[/green] {result.test_case}{variant_label}") - saved_count += 1 - saved_names.append(result.test_case) + # save_golden returns a Path on success; fall back to the + # deterministic path helper if an older implementation returns None. + saved[result.test_case] = path if path is not None else store._get_golden_path(result.test_case, variant) except Exception as e: if not quiet: console.print(f"[red]❌ Failed to save {result.test_case}: {e}[/red]") # Silent cloud push — never blocks or fails the snapshot - if saved_names: - _cloud_push(saved_names) + if saved: + _cloud_push(list(saved.keys())) - return saved_count + return saved def _is_generated_draft(test_case) -> bool: @@ -195,7 +197,7 @@ def _group_tests_by_target(test_cases: List, config) -> Dict[tuple[str, str], li @click.option("--no-judge", "no_judge", is_flag=True, default=False, help="Skip LLM-as-judge evaluation. Uses deterministic scoring only (scores capped at 75). No API key required.") @click.option("--timeout", default=30.0, type=float, help="Timeout in seconds per test (default: 30).") @click.option("--preview", is_flag=True, help="Show what would change without saving. 
Dry-run mode for snapshot.") -@click.option("--json", "json_output", is_flag=True, help="Output JSON for CI") +@click.option("--json", "json_output", is_flag=True, help="Emit a JSON payload on stdout for CI. Suppresses Rich output, auto-approves generated drafts, and skips the dashboard prompt.") @track_command("snapshot") @click.pass_context def snapshot(ctx: click.Context, test_path: str, notes: str, test: str, variant: str, approve_generated: bool, reset: bool, judge_model: Optional[str], no_judge: bool, timeout: float, preview: bool, json_output: bool): @@ -225,6 +227,15 @@ def snapshot(ctx: click.Context, test_path: str, notes: str, test: str, variant: from evalview.core.messages import get_random_snapshot_message from evalview.skills.ui_utils import print_evalview_banner + # --preview and --json collide: preview emits human-readable diff output, + # --json promises a parseable payload. Fail fast rather than silently + # drop one of them. + if json_output and preview: + print(json.dumps( + {"error": "--preview cannot be combined with --json"}, indent=2 + )) + ctx.exit(2) + if not json_output: print_evalview_banner(console, subtitle="[dim]Catch agent regressions before you ship[/dim]") @@ -265,12 +276,12 @@ def snapshot(ctx: click.Context, test_path: str, notes: str, test: str, variant: console.print(f"[red]❌ Failed to load test cases: {e}[/red]\n") Celebrations.no_tests_found() else: - print(json.dumps({"error": str(e)})) + print(json.dumps({"error": str(e)}, indent=2)) return if not test_cases: if json_output: - print(json.dumps({"error": "no tests found"})) + print(json.dumps({"error": "no tests found"}, indent=2)) else: Celebrations.no_tests_found() return @@ -280,7 +291,7 @@ def snapshot(ctx: click.Context, test_path: str, notes: str, test: str, variant: test_cases = [tc for tc in test_cases if tc.name == test] if not test_cases: if json_output: - print(json.dumps({"error": f"no test found with name: {test}"})) + print(json.dumps({"error": f"no test found 
with name: {test}"}, indent=2)) else: console.print(f"[red]❌ No test found with name: {test}[/red]\n") return @@ -329,13 +340,20 @@ def snapshot(ctx: click.Context, test_path: str, notes: str, test: str, variant: endpoints, adapters = _summarize_mixed_targets(test_cases, config) target_groups = _group_tests_by_target(test_cases, config) - # Execute tests with spinner - from evalview.commands.shared import run_with_spinner - results = run_with_spinner( - lambda: _execute_snapshot_tests(test_cases, config, timeout=timeout, skip_llm_judge=no_judge), - "Snapshotting", - len(test_cases), - ) + # Execute tests. In JSON mode we skip the live spinner — it writes Rich + # frames to the same console stream as our JSON payload and would make + # stdout unparseable. + if json_output: + results = _execute_snapshot_tests( + test_cases, config, timeout=timeout, skip_llm_judge=no_judge, json_output=True + ) + else: + from evalview.commands.shared import run_with_spinner + results = run_with_spinner( + lambda: _execute_snapshot_tests(test_cases, config, timeout=timeout, skip_llm_judge=no_judge), + "Snapshotting", + len(test_cases), + ) failed_count = len(test_cases) - len(results) # Preview mode: show what would change without saving @@ -380,11 +398,11 @@ def snapshot(ctx: click.Context, test_path: str, notes: str, test: str, variant: return # Save passing results as golden - saved_count = _save_snapshot_results(results, notes, variant=variant, quiet=json_output) + saved_paths = _save_snapshot_results(results, notes, variant=variant, quiet=json_output) + saved_count = len(saved_paths) # JSON output mode if json_output: - golden_store = GoldenStore() snapshot_data = { "snapshot": { "timestamp": datetime.now(timezone.utc).isoformat(), @@ -401,15 +419,15 @@ def snapshot(ctx: click.Context, test_path: str, notes: str, test: str, variant: "name": result.test_case, "passed": result.passed, "score": result.score, - "saved": result.passed and saved_count > 0, - "golden_file": 
str(golden_store.golden_dir / f"{result.test_case}.yaml") - if result.passed and saved_count > 0 + "saved": result.test_case in saved_paths, + "golden_file": str(saved_paths[result.test_case]) + if result.test_case in saved_paths else None, } for result in results ], } - print(json.dumps(snapshot_data)) + print(json.dumps(snapshot_data, indent=2)) state_store.update_snapshot(test_count=saved_count) return diff --git a/tests/test_snapshot_json_output.py b/tests/test_snapshot_json_output.py new file mode 100644 index 0000000..be7187a --- /dev/null +++ b/tests/test_snapshot_json_output.py @@ -0,0 +1,247 @@ +"""Tests for `evalview snapshot --json` CI output contract.""" + +from __future__ import annotations + +import json +from datetime import datetime +from pathlib import Path + +from click.testing import CliRunner + +from evalview.core.types import ( + ContainsChecks, + CostEvaluation, + EvaluationResult, + Evaluations, + ExecutionMetrics, + ExecutionTrace, + LatencyEvaluation, + OutputEvaluation, + SequenceEvaluation, + ToolEvaluation, +) + + +def _result(name: str, *, passed: bool, score: float = 90.0) -> EvaluationResult: + now = datetime.now() + return EvaluationResult( + test_case=name, + passed=passed, + score=score, + evaluations=Evaluations( + tool_accuracy=ToolEvaluation(accuracy=1.0 if passed else 0.0), + sequence_correctness=SequenceEvaluation( + correct=passed, expected_sequence=[], actual_sequence=[] + ), + output_quality=OutputEvaluation( + score=score, + rationale="ok", + contains_checks=ContainsChecks(), + not_contains_checks=ContainsChecks(), + ), + cost=CostEvaluation(total_cost=0.0, threshold=1.0, passed=True), + latency=LatencyEvaluation(total_latency=10.0, threshold=1000.0, passed=True), + ), + trace=ExecutionTrace( + session_id="s1", + start_time=now, + end_time=now, + steps=[], + final_output="ok", + metrics=ExecutionMetrics(total_cost=0.0, total_latency=10.0), + ), + timestamp=now, + ) + + +def _write_yaml(path: Path, name: str) -> None: + 
path.write_text( + f"""name: {name} +input: + query: hi +expected: + output: + contains: + - ok +thresholds: + min_score: 0 +""", + encoding="utf-8", + ) + + +def _patch_pipeline(monkeypatch, results): + """Stub out the slow moving parts so the command runs deterministically.""" + from evalview.commands import snapshot_cmd + + monkeypatch.setattr(snapshot_cmd, "_load_config_if_exists", lambda: None) + monkeypatch.setattr( + snapshot_cmd, + "_execute_snapshot_tests", + lambda test_cases, config, **kwargs: list(results), + ) + monkeypatch.setattr(snapshot_cmd, "_cloud_push", lambda saved_names: None) + monkeypatch.setattr( + "evalview.core.project_state.ProjectStateStore.is_first_snapshot", + lambda self: False, + ) + monkeypatch.setattr( + "evalview.core.project_state.ProjectStateStore.update_snapshot", + lambda self, test_count=1: None, + ) + + +def test_snapshot_json_emits_parseable_payload(monkeypatch, tmp_path): + """`--json` must put a single parseable JSON document on stdout with no Rich output.""" + from evalview.commands.snapshot_cmd import snapshot + + monkeypatch.chdir(tmp_path) + tests_dir = tmp_path / "tests" + tests_dir.mkdir() + _write_yaml(tests_dir / "alpha.yaml", "alpha") + _write_yaml(tests_dir / "beta.yaml", "beta") + + saved_paths: dict[str, Path] = {} + + def fake_save(self, result, notes=None, variant_name=None): + path = tmp_path / ".evalview" / "golden" / f"{result.test_case}.golden.json" + path.parent.mkdir(parents=True, exist_ok=True) + path.write_text("{}", encoding="utf-8") + saved_paths[result.test_case] = path + return path + + monkeypatch.setattr("evalview.core.golden.GoldenStore.save_golden", fake_save) + _patch_pipeline( + monkeypatch, + [_result("alpha", passed=True), _result("beta", passed=False, score=10.0)], + ) + + runner = CliRunner() + result = runner.invoke(snapshot, ["--path", "tests", "--json"]) + + assert result.exit_code == 0, result.output + + # Stdout must be a single, parseable JSON document — no banner, spinner, + # 
or Rich markup before/after it. + payload = json.loads(result.output) + + assert payload["snapshot"]["total_tests"] == 2 + assert payload["snapshot"]["passing"] == 1 + assert payload["snapshot"]["saved"] == 1 + assert payload["snapshot"]["test_path"] == "tests" + + by_name = {t["name"]: t for t in payload["tests"]} + assert by_name["alpha"]["passed"] is True + assert by_name["alpha"]["saved"] is True + assert by_name["alpha"]["golden_file"] == str(saved_paths["alpha"]) + assert by_name["beta"]["passed"] is False + assert by_name["beta"]["saved"] is False + assert by_name["beta"]["golden_file"] is None + + # No banner/spinner/Rich markers should leak into stdout. + assert "Catch agent regressions" not in result.output + assert "Snapshotting" not in result.output + assert "✓" not in result.output + + +def test_snapshot_json_marks_failed_saves_accurately(monkeypatch, tmp_path): + """If save_golden raises for one passing test, that test's `saved` must be False.""" + from evalview.commands.snapshot_cmd import snapshot + + monkeypatch.chdir(tmp_path) + tests_dir = tmp_path / "tests" + tests_dir.mkdir() + _write_yaml(tests_dir / "good.yaml", "good") + _write_yaml(tests_dir / "broken.yaml", "broken") + + def flaky_save(self, result, notes=None, variant_name=None): + if result.test_case == "broken": + raise RuntimeError("disk full") + path = tmp_path / ".evalview" / "golden" / f"{result.test_case}.golden.json" + path.parent.mkdir(parents=True, exist_ok=True) + path.write_text("{}", encoding="utf-8") + return path + + monkeypatch.setattr("evalview.core.golden.GoldenStore.save_golden", flaky_save) + _patch_pipeline( + monkeypatch, + [_result("good", passed=True), _result("broken", passed=True)], + ) + + runner = CliRunner() + result = runner.invoke(snapshot, ["--path", "tests", "--json"]) + + assert result.exit_code == 0, result.output + payload = json.loads(result.output) + by_name = {t["name"]: t for t in payload["tests"]} + + assert payload["snapshot"]["passing"] == 2 + 
assert payload["snapshot"]["saved"] == 1 + assert by_name["good"]["saved"] is True + assert by_name["good"]["golden_file"] is not None + assert by_name["broken"]["saved"] is False + assert by_name["broken"]["golden_file"] is None + + +def test_snapshot_json_uses_variant_aware_golden_path(monkeypatch, tmp_path): + """Per-test `golden_file` must match the path GoldenStore actually wrote to.""" + from evalview.commands.snapshot_cmd import snapshot + + monkeypatch.chdir(tmp_path) + tests_dir = tmp_path / "tests" + tests_dir.mkdir() + _write_yaml(tests_dir / "alpha.yaml", "alpha") + + written: dict[str, Path] = {} + + def fake_save(self, result, notes=None, variant_name=None): + suffix = f".variant_{variant_name}" if variant_name else "" + path = tmp_path / ".evalview" / "golden" / f"{result.test_case}{suffix}.golden.json" + path.parent.mkdir(parents=True, exist_ok=True) + path.write_text("{}", encoding="utf-8") + written[result.test_case] = path + return path + + monkeypatch.setattr("evalview.core.golden.GoldenStore.save_golden", fake_save) + _patch_pipeline(monkeypatch, [_result("alpha", passed=True)]) + + runner = CliRunner() + result = runner.invoke(snapshot, ["--path", "tests", "--json", "--variant", "slow"]) + + assert result.exit_code == 0, result.output + payload = json.loads(result.output) + alpha = payload["tests"][0] + assert alpha["golden_file"] == str(written["alpha"]) + assert "variant_slow" in alpha["golden_file"] + assert payload["snapshot"]["variant"] == "slow" + + +def test_snapshot_json_rejects_preview_combo(monkeypatch, tmp_path): + """`--preview --json` is mutually exclusive — must exit non-zero with JSON error.""" + from evalview.commands.snapshot_cmd import snapshot + + monkeypatch.chdir(tmp_path) + (tmp_path / "tests").mkdir() + + runner = CliRunner() + result = runner.invoke(snapshot, ["--path", "tests", "--json", "--preview"]) + + assert result.exit_code != 0 + payload = json.loads(result.output) + assert "error" in payload + assert "preview" 
in payload["error"].lower() + + +def test_snapshot_json_no_tests_emits_error_payload(monkeypatch, tmp_path): + """Empty suite should yield a parseable JSON error rather than Rich output.""" + from evalview.commands.snapshot_cmd import snapshot + + monkeypatch.chdir(tmp_path) + (tmp_path / "tests").mkdir() + + runner = CliRunner() + result = runner.invoke(snapshot, ["--path", "tests", "--json"]) + + assert result.exit_code == 0 + payload = json.loads(result.output) + assert payload == {"error": "no tests found"}