From 8eb3874263bb7a2f92d3a92ee06780c491ea09c7 Mon Sep 17 00:00:00 2001 From: Claude Date: Mon, 20 Apr 2026 07:35:46 +0000 Subject: [PATCH] fix(snapshot): harden --json output for CI consumers MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Follow-up to Matt's --json flag: - Keep stdout clean in JSON mode by plumbing json_output into _execute_snapshot_tests and running without the Rich live spinner. Previously per-test prints and spinner frames leaked into the payload, making stdout unparseable. - Track the real set of saved tests via a {name -> Path} map so per-test `saved` and `golden_file` reflect actual disk writes — previously any passing test was reported as saved whenever at least one sibling was saved, and the path guessed `.yaml` instead of the variant-aware `.golden.json` that GoldenStore actually writes. - Reject `--preview --json` upfront with a JSON error and non-zero exit rather than silently dropping --json. - Pretty-print JSON (indent=2) to match `skill`/`model-check` output, tidy the --json help text to document the suppression and auto-approve behavior, and keep `import json` with the other stdlib imports. - Cover the contract with tests/test_snapshot_json_output.py: parseable payload, accurate per-test saved/golden_file tracking (including partial save failures), variant-aware paths, --preview rejection, and the empty-suite error shape. 
https://claude.ai/code/session_01YPVyciLBGFEpKoMV7y8zFQ --- evalview/commands/shared.py | 18 ++- evalview/commands/snapshot_cmd.py | 78 +++++---- tests/test_snapshot_json_output.py | 247 +++++++++++++++++++++++++++++ 3 files changed, 310 insertions(+), 33 deletions(-) create mode 100644 tests/test_snapshot_json_output.py diff --git a/evalview/commands/shared.py b/evalview/commands/shared.py index a1dabe4..e45da53 100644 --- a/evalview/commands/shared.py +++ b/evalview/commands/shared.py @@ -552,8 +552,13 @@ def _execute_snapshot_tests( config: Optional["EvalViewConfig"], timeout: float = 30.0, skip_llm_judge: bool = False, + json_output: bool = False, ) -> List["EvaluationResult"]: - """Execute tests and evaluate results for snapshot/benchmark commands.""" + """Execute tests and evaluate results for snapshot/benchmark commands. + + When json_output=True, per-test console output is suppressed so stdout + stays clean for JSON consumers. + """ from evalview.evaluators.evaluator import Evaluator results = [] @@ -563,10 +568,12 @@ async def _run_one(tc: "TestCase") -> Optional["EvaluationResult"]: try: adapter = _build_adapter_for_tc(tc, config, timeout) except ValueError as e: - console.print(f"[yellow]⚠ Skipping {tc.name}: {e}[/yellow]") + if not json_output: + console.print(f"[yellow]⚠ Skipping {tc.name}: {e}[/yellow]") return None if adapter is None: - console.print(f"[yellow]⚠ Skipping {tc.name}: No adapter/endpoint configured[/yellow]") + if not json_output: + console.print(f"[yellow]⚠ Skipping {tc.name}: No adapter/endpoint configured[/yellow]") return None trace = await _execute_agent_with_slow_warning(tc, adapter, timeout) @@ -581,6 +588,8 @@ async def _run_all() -> List[Any]: if isinstance(outcome, BaseException): error_str = str(outcome) endpoint = tc.endpoint or (config.endpoint if config else None) or "" + if json_output: + continue if isinstance(outcome, (asyncio.TimeoutError, asyncio.CancelledError)): console.print(f"[red]✗ {tc.name}: Async execution 
failed - {outcome}[/red]") else: @@ -597,6 +606,9 @@ async def _run_all() -> List[Any]: result = outcome results.append(result) + if json_output: + continue + if result.passed: console.print(f"[green]✓ {tc.name}:[/green] {result.score:.1f}/100") else: diff --git a/evalview/commands/snapshot_cmd.py b/evalview/commands/snapshot_cmd.py index 4a2426b..feeca5d 100644 --- a/evalview/commands/snapshot_cmd.py +++ b/evalview/commands/snapshot_cmd.py @@ -1,12 +1,11 @@ """Snapshot command — run tests and save passing results as baseline.""" from __future__ import annotations +import json from datetime import datetime, timezone from pathlib import Path from typing import Dict, List, Optional, TYPE_CHECKING -import json - import click import yaml # type: ignore[import-untyped] @@ -27,11 +26,14 @@ def _save_snapshot_results( notes: Optional[str], variant: Optional[str] = None, quiet: bool = False, -) -> int: +) -> Dict[str, Path]: """Save passing test results as golden baselines. Returns: - Number of tests successfully saved + Mapping of test_case name to the saved golden file path. Only + tests that were actually written appear in the result — tests + that raised during save are omitted so callers can report + accurate per-test status. 
""" from evalview.core.golden import GoldenStore @@ -52,30 +54,30 @@ def _save_snapshot_results( if not timed_out and not low_score: console.print("[dim] Run evalview run to see detailed failure reasons, then fix and retry.[/dim]") console.print() - return 0 + return {} # Save passing results as golden if not quiet: console.print() - saved_count = 0 - saved_names = [] + saved: Dict[str, Path] = {} for result in passing: try: - store.save_golden(result, notes=notes, variant_name=variant) + path = store.save_golden(result, notes=notes, variant_name=variant) variant_label = f" (variant: {variant})" if variant else "" if not quiet: console.print(f"[green]✓ Snapshotted:[/green] {result.test_case}{variant_label}") - saved_count += 1 - saved_names.append(result.test_case) + # save_golden returns a Path on success; fall back to the + # deterministic path helper if an older implementation returns None. + saved[result.test_case] = path if path is not None else store._get_golden_path(result.test_case, variant) except Exception as e: if not quiet: console.print(f"[red]❌ Failed to save {result.test_case}: {e}[/red]") # Silent cloud push — never blocks or fails the snapshot - if saved_names: - _cloud_push(saved_names) + if saved: + _cloud_push(list(saved.keys())) - return saved_count + return saved def _is_generated_draft(test_case) -> bool: @@ -195,7 +197,7 @@ def _group_tests_by_target(test_cases: List, config) -> Dict[tuple[str, str], li @click.option("--no-judge", "no_judge", is_flag=True, default=False, help="Skip LLM-as-judge evaluation. Uses deterministic scoring only (scores capped at 75). No API key required.") @click.option("--timeout", default=30.0, type=float, help="Timeout in seconds per test (default: 30).") @click.option("--preview", is_flag=True, help="Show what would change without saving. 
Dry-run mode for snapshot.") -@click.option("--json", "json_output", is_flag=True, help="Output JSON for CI") +@click.option("--json", "json_output", is_flag=True, help="Emit a JSON payload on stdout for CI. Suppresses Rich output, auto-approves generated drafts, and skips the dashboard prompt.") @track_command("snapshot") @click.pass_context def snapshot(ctx: click.Context, test_path: str, notes: str, test: str, variant: str, approve_generated: bool, reset: bool, judge_model: Optional[str], no_judge: bool, timeout: float, preview: bool, json_output: bool): @@ -225,6 +227,15 @@ def snapshot(ctx: click.Context, test_path: str, notes: str, test: str, variant: from evalview.core.messages import get_random_snapshot_message from evalview.skills.ui_utils import print_evalview_banner + # --preview and --json collide: preview emits human-readable diff output, + # --json promises a parseable payload. Fail fast rather than silently + # drop one of them. + if json_output and preview: + print(json.dumps( + {"error": "--preview cannot be combined with --json"}, indent=2 + )) + ctx.exit(2) + if not json_output: print_evalview_banner(console, subtitle="[dim]Catch agent regressions before you ship[/dim]") @@ -265,12 +276,12 @@ def snapshot(ctx: click.Context, test_path: str, notes: str, test: str, variant: console.print(f"[red]❌ Failed to load test cases: {e}[/red]\n") Celebrations.no_tests_found() else: - print(json.dumps({"error": str(e)})) + print(json.dumps({"error": str(e)}, indent=2)) return if not test_cases: if json_output: - print(json.dumps({"error": "no tests found"})) + print(json.dumps({"error": "no tests found"}, indent=2)) else: Celebrations.no_tests_found() return @@ -280,7 +291,7 @@ def snapshot(ctx: click.Context, test_path: str, notes: str, test: str, variant: test_cases = [tc for tc in test_cases if tc.name == test] if not test_cases: if json_output: - print(json.dumps({"error": f"no test found with name: {test}"})) + print(json.dumps({"error": f"no test found 
with name: {test}"}, indent=2)) else: console.print(f"[red]❌ No test found with name: {test}[/red]\n") return @@ -329,13 +340,20 @@ def snapshot(ctx: click.Context, test_path: str, notes: str, test: str, variant: endpoints, adapters = _summarize_mixed_targets(test_cases, config) target_groups = _group_tests_by_target(test_cases, config) - # Execute tests with spinner - from evalview.commands.shared import run_with_spinner - results = run_with_spinner( - lambda: _execute_snapshot_tests(test_cases, config, timeout=timeout, skip_llm_judge=no_judge), - "Snapshotting", - len(test_cases), - ) + # Execute tests. In JSON mode we skip the live spinner — it writes Rich + # frames to the same console stream as our JSON payload and would make + # stdout unparseable. + if json_output: + results = _execute_snapshot_tests( + test_cases, config, timeout=timeout, skip_llm_judge=no_judge, json_output=True + ) + else: + from evalview.commands.shared import run_with_spinner + results = run_with_spinner( + lambda: _execute_snapshot_tests(test_cases, config, timeout=timeout, skip_llm_judge=no_judge), + "Snapshotting", + len(test_cases), + ) failed_count = len(test_cases) - len(results) # Preview mode: show what would change without saving @@ -380,11 +398,11 @@ def snapshot(ctx: click.Context, test_path: str, notes: str, test: str, variant: return # Save passing results as golden - saved_count = _save_snapshot_results(results, notes, variant=variant, quiet=json_output) + saved_paths = _save_snapshot_results(results, notes, variant=variant, quiet=json_output) + saved_count = len(saved_paths) # JSON output mode if json_output: - golden_store = GoldenStore() snapshot_data = { "snapshot": { "timestamp": datetime.now(timezone.utc).isoformat(), @@ -401,15 +419,15 @@ def snapshot(ctx: click.Context, test_path: str, notes: str, test: str, variant: "name": result.test_case, "passed": result.passed, "score": result.score, - "saved": result.passed and saved_count > 0, - "golden_file": 
str(golden_store.golden_dir / f"{result.test_case}.yaml") - if result.passed and saved_count > 0 + "saved": result.test_case in saved_paths, + "golden_file": str(saved_paths[result.test_case]) + if result.test_case in saved_paths else None, } for result in results ], } - print(json.dumps(snapshot_data)) + print(json.dumps(snapshot_data, indent=2)) state_store.update_snapshot(test_count=saved_count) return diff --git a/tests/test_snapshot_json_output.py b/tests/test_snapshot_json_output.py new file mode 100644 index 0000000..be7187a --- /dev/null +++ b/tests/test_snapshot_json_output.py @@ -0,0 +1,247 @@ +"""Tests for `evalview snapshot --json` CI output contract.""" + +from __future__ import annotations + +import json +from datetime import datetime +from pathlib import Path + +from click.testing import CliRunner + +from evalview.core.types import ( + ContainsChecks, + CostEvaluation, + EvaluationResult, + Evaluations, + ExecutionMetrics, + ExecutionTrace, + LatencyEvaluation, + OutputEvaluation, + SequenceEvaluation, + ToolEvaluation, +) + + +def _result(name: str, *, passed: bool, score: float = 90.0) -> EvaluationResult: + now = datetime.now() + return EvaluationResult( + test_case=name, + passed=passed, + score=score, + evaluations=Evaluations( + tool_accuracy=ToolEvaluation(accuracy=1.0 if passed else 0.0), + sequence_correctness=SequenceEvaluation( + correct=passed, expected_sequence=[], actual_sequence=[] + ), + output_quality=OutputEvaluation( + score=score, + rationale="ok", + contains_checks=ContainsChecks(), + not_contains_checks=ContainsChecks(), + ), + cost=CostEvaluation(total_cost=0.0, threshold=1.0, passed=True), + latency=LatencyEvaluation(total_latency=10.0, threshold=1000.0, passed=True), + ), + trace=ExecutionTrace( + session_id="s1", + start_time=now, + end_time=now, + steps=[], + final_output="ok", + metrics=ExecutionMetrics(total_cost=0.0, total_latency=10.0), + ), + timestamp=now, + ) + + +def _write_yaml(path: Path, name: str) -> None: + 
path.write_text( + f"""name: {name} +input: + query: hi +expected: + output: + contains: + - ok +thresholds: + min_score: 0 +""", + encoding="utf-8", + ) + + +def _patch_pipeline(monkeypatch, results): + """Stub out the slow moving parts so the command runs deterministically.""" + from evalview.commands import snapshot_cmd + + monkeypatch.setattr(snapshot_cmd, "_load_config_if_exists", lambda: None) + monkeypatch.setattr( + snapshot_cmd, + "_execute_snapshot_tests", + lambda test_cases, config, **kwargs: list(results), + ) + monkeypatch.setattr(snapshot_cmd, "_cloud_push", lambda saved_names: None) + monkeypatch.setattr( + "evalview.core.project_state.ProjectStateStore.is_first_snapshot", + lambda self: False, + ) + monkeypatch.setattr( + "evalview.core.project_state.ProjectStateStore.update_snapshot", + lambda self, test_count=1: None, + ) + + +def test_snapshot_json_emits_parseable_payload(monkeypatch, tmp_path): + """`--json` must put a single parseable JSON document on stdout with no Rich output.""" + from evalview.commands.snapshot_cmd import snapshot + + monkeypatch.chdir(tmp_path) + tests_dir = tmp_path / "tests" + tests_dir.mkdir() + _write_yaml(tests_dir / "alpha.yaml", "alpha") + _write_yaml(tests_dir / "beta.yaml", "beta") + + saved_paths: dict[str, Path] = {} + + def fake_save(self, result, notes=None, variant_name=None): + path = tmp_path / ".evalview" / "golden" / f"{result.test_case}.golden.json" + path.parent.mkdir(parents=True, exist_ok=True) + path.write_text("{}", encoding="utf-8") + saved_paths[result.test_case] = path + return path + + monkeypatch.setattr("evalview.core.golden.GoldenStore.save_golden", fake_save) + _patch_pipeline( + monkeypatch, + [_result("alpha", passed=True), _result("beta", passed=False, score=10.0)], + ) + + runner = CliRunner() + result = runner.invoke(snapshot, ["--path", "tests", "--json"]) + + assert result.exit_code == 0, result.output + + # Stdout must be a single, parseable JSON document — no banner, spinner, + # 
or Rich markup before/after it. + payload = json.loads(result.output) + + assert payload["snapshot"]["total_tests"] == 2 + assert payload["snapshot"]["passing"] == 1 + assert payload["snapshot"]["saved"] == 1 + assert payload["snapshot"]["test_path"] == "tests" + + by_name = {t["name"]: t for t in payload["tests"]} + assert by_name["alpha"]["passed"] is True + assert by_name["alpha"]["saved"] is True + assert by_name["alpha"]["golden_file"] == str(saved_paths["alpha"]) + assert by_name["beta"]["passed"] is False + assert by_name["beta"]["saved"] is False + assert by_name["beta"]["golden_file"] is None + + # No banner/spinner/Rich markers should leak into stdout. + assert "Catch agent regressions" not in result.output + assert "Snapshotting" not in result.output + assert "✓" not in result.output + + +def test_snapshot_json_marks_failed_saves_accurately(monkeypatch, tmp_path): + """If save_golden raises for one passing test, that test's `saved` must be False.""" + from evalview.commands.snapshot_cmd import snapshot + + monkeypatch.chdir(tmp_path) + tests_dir = tmp_path / "tests" + tests_dir.mkdir() + _write_yaml(tests_dir / "good.yaml", "good") + _write_yaml(tests_dir / "broken.yaml", "broken") + + def flaky_save(self, result, notes=None, variant_name=None): + if result.test_case == "broken": + raise RuntimeError("disk full") + path = tmp_path / ".evalview" / "golden" / f"{result.test_case}.golden.json" + path.parent.mkdir(parents=True, exist_ok=True) + path.write_text("{}", encoding="utf-8") + return path + + monkeypatch.setattr("evalview.core.golden.GoldenStore.save_golden", flaky_save) + _patch_pipeline( + monkeypatch, + [_result("good", passed=True), _result("broken", passed=True)], + ) + + runner = CliRunner() + result = runner.invoke(snapshot, ["--path", "tests", "--json"]) + + assert result.exit_code == 0, result.output + payload = json.loads(result.output) + by_name = {t["name"]: t for t in payload["tests"]} + + assert payload["snapshot"]["passing"] == 2 + 
assert payload["snapshot"]["saved"] == 1 + assert by_name["good"]["saved"] is True + assert by_name["good"]["golden_file"] is not None + assert by_name["broken"]["saved"] is False + assert by_name["broken"]["golden_file"] is None + + +def test_snapshot_json_uses_variant_aware_golden_path(monkeypatch, tmp_path): + """Per-test `golden_file` must match the path GoldenStore actually wrote to.""" + from evalview.commands.snapshot_cmd import snapshot + + monkeypatch.chdir(tmp_path) + tests_dir = tmp_path / "tests" + tests_dir.mkdir() + _write_yaml(tests_dir / "alpha.yaml", "alpha") + + written: dict[str, Path] = {} + + def fake_save(self, result, notes=None, variant_name=None): + suffix = f".variant_{variant_name}" if variant_name else "" + path = tmp_path / ".evalview" / "golden" / f"{result.test_case}{suffix}.golden.json" + path.parent.mkdir(parents=True, exist_ok=True) + path.write_text("{}", encoding="utf-8") + written[result.test_case] = path + return path + + monkeypatch.setattr("evalview.core.golden.GoldenStore.save_golden", fake_save) + _patch_pipeline(monkeypatch, [_result("alpha", passed=True)]) + + runner = CliRunner() + result = runner.invoke(snapshot, ["--path", "tests", "--json", "--variant", "slow"]) + + assert result.exit_code == 0, result.output + payload = json.loads(result.output) + alpha = payload["tests"][0] + assert alpha["golden_file"] == str(written["alpha"]) + assert "variant_slow" in alpha["golden_file"] + assert payload["snapshot"]["variant"] == "slow" + + +def test_snapshot_json_rejects_preview_combo(monkeypatch, tmp_path): + """`--preview --json` is mutually exclusive — must exit non-zero with JSON error.""" + from evalview.commands.snapshot_cmd import snapshot + + monkeypatch.chdir(tmp_path) + (tmp_path / "tests").mkdir() + + runner = CliRunner() + result = runner.invoke(snapshot, ["--path", "tests", "--json", "--preview"]) + + assert result.exit_code != 0 + payload = json.loads(result.output) + assert "error" in payload + assert "preview" 
in payload["error"].lower() + + +def test_snapshot_json_no_tests_emits_error_payload(monkeypatch, tmp_path): + """Empty suite should yield a parseable JSON error rather than Rich output.""" + from evalview.commands.snapshot_cmd import snapshot + + monkeypatch.chdir(tmp_path) + (tmp_path / "tests").mkdir() + + runner = CliRunner() + result = runner.invoke(snapshot, ["--path", "tests", "--json"]) + + assert result.exit_code == 0 + payload = json.loads(result.output) + assert payload == {"error": "no tests found"}