diff --git a/.gitignore b/.gitignore index 3d147ad..ae37427 100644 --- a/.gitignore +++ b/.gitignore @@ -41,3 +41,4 @@ logs/ tmp/ temp/ *.tmp +.worktrees/ diff --git a/docs/scenario-pipeline/README.md b/docs/scenario-pipeline/README.md new file mode 100644 index 0000000..8da5109 --- /dev/null +++ b/docs/scenario-pipeline/README.md @@ -0,0 +1,180 @@ +--- +commissioned-by: spacedock@0.8.2 +entity-type: eval_scenario +entity-label: scenario +entity-label-plural: scenarios +id-style: sequential +stages: + defaults: + worktree: false + concurrency: 2 + states: + - name: draft + initial: true + - name: ground-truth + - name: eval-run + - name: validated + gate: true + feedback-to: draft + - name: integrated + terminal: true +--- + +# Recce eval scenario pipeline + +Design, verify, and validate eval scenarios from jaffle-shop-simulator issues to build a comprehensive benchmark for measuring Recce plugin effectiveness at data PR review. + +## File Naming + +Each scenario is a markdown file named `{slug}.md` — lowercase, hyphens, no spaces. Example: `exclude-zero-orders-v1.md`. + +## Schema + +Every scenario file has YAML frontmatter with these fields: + +```yaml +--- +id: +title: Human-readable name +status: draft +assignee: +source: +started: +completed: +verdict: +score: +worktree: +issue: +pr: +jaffle_issue: GitHub issue number in jaffle-shop-simulator +patch_file: Path to the reverse patch file +scenario_yaml: Path to the scenario YAML definition +prompt_file: Path to the eval prompt file +--- +``` + +### Field Reference + +| Field | Type | Description | +|-------|------|-------------| +| `id` | string | Unique identifier, format determined by id-style in README frontmatter | +| `title` | string | Human-readable scenario name | +| `status` | enum | One of: draft, ground-truth, eval-run, validated, integrated | +| `assignee` | string | Who is working on this scenario (GitHub username). Claim by setting + commit/push. 
| +| `source` | string | Where this scenario came from | +| `started` | ISO 8601 | When active work began | +| `completed` | ISO 8601 | When the scenario reached terminal status | +| `verdict` | enum | PASSED or REJECTED — set at final stage | +| `score` | number | Priority score, 0.0–1.0 (optional) | +| `worktree` | string | Worktree path while a dispatched agent is active, empty otherwise | +| `issue` | string | GitHub issue reference (e.g., `#42` or `owner/repo#42`). Optional cross-reference, set manually. | +| `pr` | string | GitHub PR reference (e.g., `#57` or `owner/repo#57`). Set when a PR is created for this entity's worktree branch. | +| `jaffle_issue` | number | Source issue number in DataRecce/jaffle-shop-simulator | +| `patch_file` | string | Relative path to the reverse patch file | +| `scenario_yaml` | string | Relative path to the scenario YAML definition | +| `prompt_file` | string | Relative path to the eval prompt template | + +## Stages + +### `draft` + +A new scenario has been conceived. The worker designs a subtle, plausible bug variant based on a jaffle-shop-simulator issue, creates the reverse patch, writes the scenario YAML, and prepares the eval prompt. 
+ +- **Inputs:** jaffle-shop-simulator issue description, existing model SQL, existing scenario YAMLs as reference (r1/r2) +- **Outputs:** Patch file that applies cleanly and introduces a plausible bug; scenario YAML with all required fields (ground_truth values may be estimates); prompt file adapted to the scenario's story; dbt tests still pass after applying the patch +- **Good:** Bug is subtle enough that code review would approve; PR description is misleading but plausible; detection requires data comparison not just code reading +- **Bad:** Bug is obvious from code reading alone; dbt tests catch the bug; patch doesn't apply cleanly; scenario is a duplicate of an existing one + +### `ground-truth` + +The worker verifies the scenario's ground truth numbers by building dual-schema state (prod=clean, dev=buggy) and running SQL queries to confirm exact affected_row_count and model classification. + +- **Inputs:** Patch file from draft stage, scenario YAML with estimated ground_truth +- **Outputs:** Exact affected_row_count from SQL query (not estimated); every model in impacted_models verified to have changed rows; every model in not_impacted_models verified to have 0 changed rows; dashboard_impact verified against dashboard column list +- **Good:** Numbers come from actual SQL queries against dual-schema data; model classification is exhaustive (every model in DAG checked) +- **Bad:** Using estimated or rounded numbers; assuming model impact from code reading without SQL verification; forgetting to check downstream models + +### `eval-run` + +The worker runs the eval batch (N=3, Mode A tool-only) using run-case.sh and scores each run with score-deterministic.sh. Records pass rates, failure patterns, and cost. 
+ +- **Inputs:** Verified scenario YAML with exact ground_truth, prompt file, MCP config, recce package installed in jaffle-shop-simulator venv +- **Outputs:** N=3 batch completed with all runs producing valid JSON output; each run scored with pass/fail per criterion; pass rate and failure pattern summary recorded in entity body; cost per run recorded +- **Good:** All 3 runs produce parseable JSON; scoring matches ground truth criteria; failures are analyzed not just counted +- **Bad:** Runs fail due to infrastructure issues (DuckDB lock, MCP timeout) rather than agent judgment; JSON extraction failures treated as agent errors + +### `validated` + +Captain reviews the eval results to confirm the scenario is good enough for the benchmark suite. This is a human approval gate. + +- **Inputs:** Eval-run results with pass rates, failure patterns, and cost +- **Outputs:** Captain's approval or rejection with feedback +- **Good:** Pass rate ≥80% on Mode A (scenario is solvable but challenging); failure patterns are about agent judgment not infrastructure; scenario tests something different from existing scenarios +- **Bad:** Pass rate too low (ground truth may be wrong); all failures are the same JSON extraction issue; scenario is redundant with existing ones + +### `integrated` + +The scenario is part of the official benchmark suite. Patch, YAML, and prompt files are committed to recce-claude-plugin and included in future batch runs. + +- **Inputs:** Approved scenario from validated stage +- **Outputs:** All scenario files committed to the repo +- **Good:** Scenario adds meaningful coverage to the benchmark +- **Bad:** N/A — terminal stage + +## Workflow State + +View the workflow overview: + +```bash +docs/scenario-pipeline/status +``` + +Output columns: ID, SLUG, STATUS, TITLE, SCORE, SOURCE. 
+ +Include archived scenarios with `--archived`: + +```bash +docs/scenario-pipeline/status --archived +``` + +Find dispatchable scenarios ready for their next stage: + +```bash +docs/scenario-pipeline/status --next +``` + +Find scenarios in a specific stage: + +```bash +grep -l "status: ground-truth" docs/scenario-pipeline/*.md +``` + +## Scenario Template + +```yaml +--- +id: +title: Scenario name here +status: draft +assignee: +source: +started: +completed: +verdict: +score: +worktree: +issue: +pr: +jaffle_issue: +patch_file: +scenario_yaml: +prompt_file: +--- + +Description of this scenario — what bug is introduced, why it's plausible, and what the agent needs to find. +``` + +## Commit Discipline + +- Commit status changes at dispatch and merge boundaries +- Commit scenario body updates when substantive diff --git a/docs/scenario-pipeline/exclude-zero-orders-v1.md b/docs/scenario-pipeline/exclude-zero-orders-v1.md new file mode 100644 index 0000000..a15f14f --- /dev/null +++ b/docs/scenario-pipeline/exclude-zero-orders-v1.md @@ -0,0 +1,32 @@ +--- +id: "001" +title: "Exclude $0 Orders: filter on subtotal" +status: integrated +assignee: kent +source: commission seed +started: 2026-03-30T16:00:00+08:00 +completed: 2026-03-30T18:00:00+08:00 +verdict: PASSED +score: 0.8 +worktree: +issue: +pr: +jaffle_issue: 8 +patch_file: plugins/recce-dev/skills/recce-eval/scenarios/v2/patches/r8-exclude-zero-orders-wrong-column.patch +scenario_yaml: plugins/recce-dev/skills/recce-eval/scenarios/v2/r8-exclude-zero-orders-wrong-column.yaml +prompt_file: +--- + +## Bug Variant + +**Source issue**: jaffle-shop-simulator#8 — VP of Operations requests excluding complimentary ($0) orders from all mart models. + +**Plausible bug**: Filter on `WHERE subtotal > 0` instead of `WHERE order_total > 0` in stg_orders. The PR uses the wrong column — subtotal (pre-tax item total) instead of order_total (amount charged). 
With current data both produce identical results (all 4,155 zero-total orders also have zero subtotal), making it a semantic/spec deviation bug. + +**PR description**: "Filter out $0 comp orders at staging layer — add WHERE subtotal > 0 to stg_orders for clean downstream metrics" + +**Why it's hard**: Data comparison shows correct results. The bug is a specification deviation, not a data correctness issue. Agent must compare PR code against the issue spec to catch the wrong column. + +**Ground truth**: 4,155 rows filtered. stg_orders/orders lose rows. customers affected (236 have lower count_lifetime_orders). order_items unchanged (comp orders have no line items). Dashboard impacted (AOV changes). + +**Difficulty**: hard — detection requires spec comparison, not just data comparison. diff --git a/docs/scenario-pipeline/financial-columns-wrong-formula.md b/docs/scenario-pipeline/financial-columns-wrong-formula.md new file mode 100644 index 0000000..763019e --- /dev/null +++ b/docs/scenario-pipeline/financial-columns-wrong-formula.md @@ -0,0 +1,30 @@ +--- +id: "003" +title: "Financial Columns: wrong gross_profit formula" +status: draft +assignee: +source: commission seed +started: +completed: +verdict: +score: 0.6 +worktree: +issue: +pr: +jaffle_issue: 6 +patch_file: +scenario_yaml: +prompt_file: +--- + +## Bug Variant + +**Source issue**: jaffle-shop-simulator#6 — Accounting Manager requests audit-compliant financial_orders model with proper terminology. + +**Plausible bug**: Calculate `gross_profit = revenue_excl_tax - tax_collected` instead of `gross_profit = revenue_excl_tax - cost_of_goods_sold`. The formula subtracts tax instead of COGS — a classic accounting error that produces a number that looks like a margin but is completely wrong. 
+ +**PR description**: "Add financial_orders mart with audit-compliant columns — gross profit computed as revenue minus tax" + +**Why it's subtle**: The PR creates a new model (not modifying existing ones), so there's no baseline to compare against. The formula `revenue - tax` produces positive numbers that look like reasonable margins. You need to know that gross_profit should use COGS, not tax. + +**Detection requires**: Domain knowledge that gross_profit = revenue - COGS, then comparing against the correct calculation using supply_cost data. This scenario tests whether the agent applies accounting domain knowledge, not just data comparison. diff --git a/docs/scenario-pipeline/status b/docs/scenario-pipeline/status new file mode 100755 index 0000000..725c503 --- /dev/null +++ b/docs/scenario-pipeline/status @@ -0,0 +1,339 @@ +#!/usr/bin/env python3 +# commissioned-by: spacedock@0.8.2 +# ABOUTME: Workflow status viewer — shows entity overview from YAML frontmatter. +# ABOUTME: Supports default table, --archived, and --next (dispatchable entity detection). +# +# goal: Show one-line-per-scenario workflow overview from YAML frontmatter. +# +# instruction: For every .md file in this directory (excluding README.md), +# extract slug (filename without .md), id, status, title, score, source from YAML frontmatter. +# Print aligned table with columns: ID, SLUG, STATUS, TITLE, SCORE, SOURCE. +# Sorted by stage order ascending, then score descending. +# Default: scan only $DIR/*.md. With --archived, also scan $DIR/_archive/*.md. +# With --next, read stage metadata from README frontmatter and output dispatchable entities. +# +# YAML parsing: Read lines between the first and second "---" delimiters. +# For each field, split on first ":" to get key and value. +# This handles both "field: value" and "field:" (empty). No PyYAML dependency. +# +# Empty field handling: YAML frontmatter fields may be empty (e.g., "score:" with nothing +# after the colon). 
After extraction, empty fields yield an empty string. +# Display empty fields as blank (empty string), NOT as "-" or "0" or the field name. +# For sort purposes, treat empty scores as lowest priority (sort after all scored items). +# +# Stage ordering: Map each status to a numeric order from the stages list. +# Unknown statuses get order 99 (sort last). +# +# Output format: Use fixed column widths. Header row, then separator row +# of dashes, then data rows. +# +# --next dispatch rules: An entity is dispatchable if ALL of: +# 1. Not terminal — its current stage has a defined next stage +# 2. Not gate-blocked — its current stage does NOT have gate: true +# 3. Not actively worked — entity does NOT have a non-empty worktree field +# 4. Concurrency available — count of actively-worked entities (non-empty worktree) in the next stage < that stage's concurrency limit +# +# constraints: Python 3 stdlib only (no PyYAML), resolves paths relative to this script, skips README.md. +# valid status values: draft, ground-truth, eval-run, validated, integrated. + +import glob +import os +import sys + + +def parse_frontmatter(filepath): + """Extract YAML frontmatter fields from a markdown file.""" + fields = {} + in_fm = False + with open(filepath, 'r') as f: + for line in f: + line = line.rstrip('\n') + if line == '---': + if in_fm: + break + in_fm = True + continue + if in_fm: + if ':' in line: + key, _, val = line.partition(':') + key = key.strip() + val = val.strip() + # Only capture top-level keys (no leading whitespace) + if not line[0].isspace(): + fields[key] = val + return fields + + +def parse_stages_block(filepath): + """Parse the stages block from README frontmatter. + + Returns a list of stage dicts with keys: name, worktree, concurrency, gate, terminal, initial. + Returns None if no stages block exists. 
+ """ + lines = [] + in_fm = False + with open(filepath, 'r') as f: + for line in f: + line = line.rstrip('\n') + if line == '---': + if in_fm: + break + in_fm = True + continue + if in_fm: + lines.append(line) + + # Find the stages: block + stages_start = None + for i, line in enumerate(lines): + if line.rstrip() == 'stages:': + stages_start = i + break + + if stages_start is None: + return None + + # Determine the indentation level of children under stages: + # Find defaults: and states: sections + defaults = {} + states = [] + + # Parse the stages block by indentation + i = stages_start + 1 + stages_indent = None + while i < len(lines): + line = lines[i] + stripped = line.lstrip() + if not stripped: + i += 1 + continue + indent = len(line) - len(stripped) + + # First non-empty child sets the indent level + if stages_indent is None: + stages_indent = indent + elif indent < stages_indent: + break # Exited the stages block + + if indent == stages_indent: + if stripped == 'defaults:': + i += 1 + while i < len(lines): + dline = lines[i] + dstripped = dline.lstrip() + if not dstripped: + i += 1 + continue + dindent = len(dline) - len(dstripped) + if dindent <= stages_indent: + break + if ':' in dstripped: + k, _, v = dstripped.partition(':') + defaults[k.strip()] = v.strip() + i += 1 + continue + elif stripped == 'states:': + i += 1 + current_state = None + while i < len(lines): + sline = lines[i] + sstripped = sline.lstrip() + if not sstripped: + i += 1 + continue + sindent = len(sline) - len(sstripped) + if sindent <= stages_indent: + break + if sstripped.startswith('- name:'): + _, _, name = sstripped.partition('- name:') + current_state = {'name': name.strip()} + states.append(current_state) + elif current_state is not None and ':' in sstripped and not sstripped.startswith('- '): + k, _, v = sstripped.partition(':') + current_state[k.strip()] = v.strip() + i += 1 + continue + i += 1 + + if not states: + return None + + # Apply defaults and normalize types + 
default_worktree = defaults.get('worktree', 'false').lower() == 'true' + default_concurrency = int(defaults.get('concurrency', '2')) + + result = [] + for state in states: + stage = { + 'name': state['name'], + 'worktree': state.get('worktree', str(default_worktree)).lower() == 'true', + 'concurrency': int(state.get('concurrency', str(default_concurrency))), + 'gate': state.get('gate', 'false').lower() == 'true', + 'terminal': state.get('terminal', 'false').lower() == 'true', + 'initial': state.get('initial', 'false').lower() == 'true', + } + result.append(stage) + + return result + + +def scan_entities(directory): + """Scan a directory for .md entity files (excluding README.md).""" + entities = [] + pattern = os.path.join(directory, '*.md') + for filepath in sorted(glob.glob(pattern)): + if os.path.basename(filepath) == 'README.md': + continue + slug = os.path.splitext(os.path.basename(filepath))[0] + fields = parse_frontmatter(filepath) + entities.append({ + 'slug': slug, + 'id': fields.get('id', ''), + 'status': fields.get('status', ''), + 'title': fields.get('title', ''), + 'score': fields.get('score', ''), + 'source': fields.get('source', ''), + 'worktree': fields.get('worktree', ''), + }) + return entities + + +def stage_order(status, stages): + """Map status to numeric order based on stages list. 
Unknown = 99.""" + if stages: + for i, stage in enumerate(stages): + if stage['name'] == status: + return i + 1 + return 99 + + +def sort_key_default(entity, stages): + """Sort key for default mode: stage order ascending, score descending.""" + order = stage_order(entity['status'], stages) + score_str = entity['score'] + if score_str: + try: + score_val = -float(score_str) + except ValueError: + score_val = 0 + else: + score_val = 1 # empty scores sort last (higher = later) + return (order, score_val) + + +def sort_key_next(entity): + """Sort key for --next mode: score descending.""" + score_str = entity['score'] + if score_str: + try: + return -float(score_str) + except ValueError: + return 0 + return 1 # empty scores sort last + + +def print_status_table(entities, stages): + """Print the default status table.""" + fmt = '%-6s %-40s %-20s %-45s %-8s %s' + print(fmt % ('ID', 'SLUG', 'STATUS', 'TITLE', 'SCORE', 'SOURCE')) + print(fmt % ('--', '----', '------', '-----', '-----', '------')) + sorted_entities = sorted(entities, key=lambda e: sort_key_default(e, stages)) + for e in sorted_entities: + print(fmt % (e['id'], e['slug'], e['status'], e['title'], e['score'], e['source'])) + + +def print_next_table(entities, stages): + """Print the --next dispatchable entities table.""" + # Build stage lookup + stage_by_name = {s['name']: s for s in stages} + stage_names = [s['name'] for s in stages] + + # Count actively-worked entities per stage (non-empty worktree = active ensign) + active_counts = {} + for e in entities: + if e['worktree']: + st = e['status'] + active_counts[st] = active_counts.get(st, 0) + 1 + + # Sort all candidates by score descending first + candidates = sorted(entities, key=sort_key_next) + + # Determine dispatchable entities, tracking concurrency as we go + next_stage_counts = dict(active_counts) + dispatchable = [] + for e in candidates: + status = e['status'] + if status not in stage_by_name: + continue + stage_idx = stage_names.index(status) + + # 
Rule 1: Not terminal + stage = stage_by_name[status] + if stage.get('terminal', False): + continue + + # Rule 2: Not gate-blocked + if stage.get('gate', False): + continue + + # Rule 3: Not actively worked + if e['worktree']: + continue + + # Next stage + if stage_idx + 1 >= len(stage_names): + continue + next_stage_name = stage_names[stage_idx + 1] + next_stage = stage_by_name[next_stage_name] + + # Rule 4: Concurrency available + current_count = next_stage_counts.get(next_stage_name, 0) + if current_count >= next_stage['concurrency']: + continue + + # This entity is dispatchable + next_stage_counts[next_stage_name] = current_count + 1 + dispatchable.append({ + **e, + 'next': next_stage_name, + 'next_worktree': 'yes' if next_stage['worktree'] else 'no', + }) + + fmt = '%-6s %-40s %-20s %-20s %s' + print(fmt % ('ID', 'SLUG', 'CURRENT', 'NEXT', 'WORKTREE')) + print(fmt % ('--', '----', '-------', '----', '--------')) + for e in dispatchable: + print(fmt % (e['id'], e['slug'], e['status'], e['next'], e['next_worktree'])) + + +def main(): + pipeline_dir = os.environ.get('PIPELINE_DIR') or os.path.dirname(os.path.abspath(__file__)) + + args = sys.argv[1:] + include_archive = '--archived' in args + show_next = '--next' in args + + readme_path = os.path.join(pipeline_dir, 'README.md') + stages = None + if os.path.exists(readme_path): + stages = parse_stages_block(readme_path) + + if show_next: + if stages is None: + print('Error: README.md has no stages block. 
--next requires stage metadata.', file=sys.stderr) + sys.exit(1) + + entities = scan_entities(pipeline_dir) + if include_archive: + archive_dir = os.path.join(pipeline_dir, '_archive') + if os.path.isdir(archive_dir): + entities.extend(scan_entities(archive_dir)) + + if show_next: + print_next_table(entities, stages) + else: + print_status_table(entities, stages) + + +if __name__ == '__main__': + main() diff --git a/docs/scenario-pipeline/tax-summary-wrong-base.md b/docs/scenario-pipeline/tax-summary-wrong-base.md new file mode 100644 index 0000000..01cb1bd --- /dev/null +++ b/docs/scenario-pipeline/tax-summary-wrong-base.md @@ -0,0 +1,30 @@ +--- +id: "002" +title: "Tax Summary: wrong tax base" +status: draft +assignee: +source: commission seed +started: +completed: +verdict: +score: 0.7 +worktree: +issue: +pr: +jaffle_issue: 2 +patch_file: +scenario_yaml: +prompt_file: +--- + +## Bug Variant + +**Source issue**: jaffle-shop-simulator#2 — Accounting Manager requests a tax summary report with tax rate validation. + +**Plausible bug**: Calculate effective tax rate as `tax_paid / order_total` instead of `tax_paid / subtotal`. Since `order_total = subtotal + tax_paid`, using order_total as the denominator systematically understates the tax rate (e.g., 6% appears as ~5.66%). The formula is mathematically valid — it's just dividing by the wrong base. + +**PR description**: "Add tax rate validation — compute effective rate as tax collected divided by order total" + +**Why it's subtle**: `tax_paid / order_total` is a reasonable-looking formula. The error is a common confusion between tax-inclusive and tax-exclusive bases. The resulting rates are close to correct (off by ~0.3pp) — easy to miss without comparing against expected rates. + +**Detection requires**: Compare computed tax rate against expected rate per location, or verify that `tax_paid / subtotal` matches while `tax_paid / order_total` doesn't. 
diff --git a/plugins/recce-dev/skills/recce-eval/prompts/review.md b/plugins/recce-dev/skills/recce-eval/prompts/review.md new file mode 100644 index 0000000..b6459b7 --- /dev/null +++ b/plugins/recce-dev/skills/recce-eval/prompts/review.md @@ -0,0 +1,29 @@ +You are a senior data engineer at Jaffle Shop, a restaurant chain with 6 locations. +The dbt pipeline runs on {adapter_description}. + +{stakeholder_name} requested: "{stakeholder_request}" + +A teammate submitted a PR with the description: "{pr_description}" +All dbt tests pass. + +The Executive Dashboard (Streamlit app used by management) reads from these mart columns: +order_id, order_total, subtotal, tax_paid, ordered_at, customer_id, location_id, +is_food_order, is_drink_order, count_food_items, count_drink_items + +Your job is to review the data impact of this change: +1. Examine the code changes in the affected models +2. Check whether data values are correct by comparing current state against the base state +3. Identify which models are affected and which are not +4. Quantify the number of rows with changed data +5. Assess whether the Executive Dashboard would be impacted by this change +6. 
Report your findings + +IMPORTANT: Your very last message MUST be a text response (not a tool call) +containing a fenced JSON block with exactly these keys: + "issue_found": true or false, + "root_cause": "description of the root cause", + "impacted_models": ["list", "of", "impacted", "models"], + "not_impacted_models": ["list", "of", "models", "not", "impacted"], + "affected_row_count": number (rows where values differ), + "dashboard_impact": true or false (would the Executive Dashboard break or show wrong data?), + "evidence_summary": "describe what data-level evidence you used to reach your conclusion" diff --git a/plugins/recce-dev/skills/recce-eval/references/scoring-rubric.md b/plugins/recce-dev/skills/recce-eval/references/scoring-rubric.md index 28333f6..c8fc920 100644 --- a/plugins/recce-dev/skills/recce-eval/references/scoring-rubric.md +++ b/plugins/recce-dev/skills/recce-eval/references/scoring-rubric.md @@ -27,7 +27,42 @@ - `pass_count` = number of PASS checks - `fail_count` = number of FAIL checks -## LLM Judge Scoring +## v2 Deterministic Scoring + +### Layer 1 (Review) — case_type: problem_exists + +| Check Name | Logic | PASS condition | +|------------|-------|---------------| +| `issue_found` | exact match | agent says `true`, ground truth says `true` | +| `root_cause_keywords` | any keyword match (case-insensitive) | agent's `root_cause` contains at least one keyword from `root_cause_keywords` | +| `impacted: ` | set membership | each model in ground truth `impacted_models` appears in agent's `impacted_models` | +| `not_impacted: ` | set exclusion | each model in ground truth `not_impacted_models` does NOT appear in agent's `impacted_models` | +| `affected_row_count` | ±20% tolerance | agent's count is within 20% of ground truth count | +| `dashboard_impact` | exact match | agent's `dashboard_impact` matches ground truth | + +**Changes from v1:** +- `affected_row_count` uses ±20% tolerance (v1 used exact match) +- `dashboard_impact` is a new 
check (v1 had no dashboard dimension) +- `all_tests_pass` removed (v2 scenarios always have passing tests by design) +- `not_impacted_models` list reduced to ambiguous cases only (v1 listed all models) + +### v2 LLM Judge + +Two dimensions on a 0-3 scale (replaces v1's 5 dimensions on 1-5 scale): + +**Evidence Quality** (both layers): +- 0: Conclusion based only on code reading, no data evidence +- 1: Mentions data but no specific numbers +- 2: Has row count or value comparison, but incomplete +- 3: Complete data evidence (row counts + value diff + distribution analysis) + +**Self-Verification** (Layer 2 only, future): +- 0: No verification after completion +- 1: Ran dbt test only +- 2: Some data-level verification, incomplete +- 3: Systematic — confirmed new data correct, existing data unchanged, downstream safe + +## v1 LLM Judge Scoring See `${CLAUDE_PLUGIN_ROOT}/agents/eval-judge.md` for the 5 scoring dimensions and calibration anchors. diff --git a/plugins/recce-dev/skills/recce-eval/scenarios/ch1-healthy-audit.yaml b/plugins/recce-dev/skills/recce-eval/scenarios/ch1-healthy-audit.yaml index c437e2f..64a793d 100644 --- a/plugins/recce-dev/skills/recce-eval/scenarios/ch1-healthy-audit.yaml +++ b/plugins/recce-dev/skills/recce-eval/scenarios/ch1-healthy-audit.yaml @@ -12,18 +12,28 @@ prompt: | You are a new accounting hire at Jaffle Shop. The dbt pipeline runs on {adapter_description}. The pipeline has already been fixed and all tests pass. - Please audit October 2025 orders specifically: + Please audit October 2025 orders to confirm the pipeline is healthy: 1. Check row counts for October orders - 2. Check for NULL or zero amounts (note: orders with status='placed' have not been paid yet, so NULL/zero payment amounts for those orders are expected business behavior, not a data quality issue) + 2. Check for NULL or zero amounts 3. Check payment method distribution 4. Check order status distribution - 5. Report: Are there any data quality issues in October 2025? 
+ 5. Report whether you found any PIPELINE BUGS (code errors that produce wrong data) - At the end of your response, output a fenced JSON block with exactly these keys: - "issue_found": true or false, - "issues": ["list of issues found, or empty array"], + Known expected data patterns (NOT bugs — do not report these): + - Orders with status='placed' have not been paid yet, so NULL/zero payment amounts for placed orders are normal + - Some placed orders may show non-zero amounts from partial payment processing — this is expected + - Small variations in daily/monthly distributions are normal for generated data + + Only report issue_found: true if you find a concrete code bug that causes + incorrect data transformation. Data patterns that are consistent across all + months are features of the source data, not pipeline bugs. + + IMPORTANT: Your very last message MUST be a text response (not a tool call) + containing a fenced JSON block with exactly these keys: + "issue_found": true or false (true ONLY for pipeline code bugs, not data patterns), + "issues": ["list of pipeline bugs found, or empty array"], "evidence": "summary of evidence checked", - "conclusion": "no_issues or description of issues" + "conclusion": "no_issues or description of pipeline bugs" headless: max_budget_usd: 3.00 @@ -31,11 +41,12 @@ headless: ground_truth: issue_found: false - false_positive_keywords: ["bug", "broken", "incorrect", "wrong", "missing data"] + false_positive_keywords: ["bug", "broken", "incorrect", "wrong", "missing data", "corrupted", "data loss", "regression"] judge_criteria: - - "Agent does NOT hallucinate problems that don't exist" - - "Agent provides concrete evidence for its 'no issues' conclusion (checked row counts, NULLs, distributions)" + - "Agent does NOT hallucinate pipeline bugs that don't exist" + - "Agent distinguishes between inherent data patterns and pipeline code errors" + - "Agent provides concrete evidence for its conclusion (checked row counts, NULLs, 
distributions)" - "Agent's tone is appropriately confident, not hedging with vague concerns" teardown: diff --git a/plugins/recce-dev/skills/recce-eval/scenarios/ch1-null-amounts.yaml b/plugins/recce-dev/skills/recce-eval/scenarios/ch1-null-amounts.yaml index a632db3..ceba00c 100644 --- a/plugins/recce-dev/skills/recce-eval/scenarios/ch1-null-amounts.yaml +++ b/plugins/recce-dev/skills/recce-eval/scenarios/ch1-null-amounts.yaml @@ -1,36 +1,36 @@ id: ch1-null-amounts name: "Chapter 1: NULL Amount Orders" -description: "Find and fix NULL amount orders caused by LEFT JOIN without COALESCE" +description: "Review data impact of removed COALESCE guards — NULL amounts from LEFT JOIN" chapter: 1 case_type: problem_exists setup: strategy: git_patch patch_reverse_file: patches/ch1-add-coalesce.patch - skip_context: true # Agent must run dbt itself (fix-the-bug workflow) dbt_commands: - "dbt run --target {target}" prompt: | - You are a new accounting hire at Jaffle Shop. The dbt pipeline runs on - {adapter_description}. Your job is to audit the data pipeline for - financial reporting accuracy. - - 1. Run the pipeline: dbt run --target {target} - 2. Run the tests: dbt test --target {target} - 3. Investigate any test failures - 4. Fix the root cause - 5. Re-run and confirm all tests pass - 6. Report your findings. - - At the end of your response, output a fenced JSON block with exactly these keys: + You are a senior data engineer at Jaffle Shop. The dbt pipeline runs on + {adapter_description}. A colleague submitted a PR that modifies the orders + model. They say "simplified the SELECT, removed unnecessary COALESCE calls." + dbt tests caught some failures. + + Your job is to thoroughly review the data impact: + 1. Examine the code changes in the orders model + 2. Even though tests failed, investigate the full scope of data impact + 3. Identify which models are affected and which are not + 4. Quantify the number of rows with incorrect data + 5. 
Report your findings + + IMPORTANT: Your very last message MUST be a text response (not a tool call) + containing a fenced JSON block with exactly these keys: "issue_found": true or false, "root_cause": "description of the root cause", - "fix_applied": "description of the fix", "impacted_models": ["list", "of", "impacted", "models"], "not_impacted_models": ["list", "of", "models", "not", "impacted"], - "affected_row_count": number, - "all_tests_pass": true or false + "affected_row_count": number (rows where values differ — use your best analysis, including manual SQL if tools report 0), + "all_tests_pass": true or false (report the ACTUAL dbt test result, not whether tests should exist) headless: max_budget_usd: 5.00 @@ -42,12 +42,12 @@ ground_truth: impacted_models: ["orders", "orders_daily_summary"] not_impacted_models: ["customers", "customer_segments", "customer_order_pattern"] affected_row_count: 1584 - all_tests_pass: true + all_tests_pass: false judge_criteria: - "Agent correctly traces the causal chain: LEFT JOIN → missing payments → NULL amounts" - "Agent does NOT claim models that read from stg_orders/stg_payments (not orders) are impacted" - - "Fix is minimal and correct (coalesce, not INNER JOIN or WHERE filter)" + - "Agent quantifies the impact using data comparison, not just code reading" teardown: restore_files: ["models/orders.sql"] diff --git a/plugins/recce-dev/skills/recce-eval/scenarios/v2/eval-config.yaml b/plugins/recce-dev/skills/recce-eval/scenarios/v2/eval-config.yaml new file mode 100644 index 0000000..00857bb --- /dev/null +++ b/plugins/recce-dev/skills/recce-eval/scenarios/v2/eval-config.yaml @@ -0,0 +1,25 @@ +version: 2 +environment: + repo: DataRecce/jaffle-shop-simulator + commit_sha: "021e2d38f42abda7301f1babe25604b0d0839ee4" + adapter: duckdb + target: dev + db_path: data/jaffle-shop.duckdb + schema: dev + adapter_description: "DuckDB (local file database, target: dev)" + +dashboard: + # Columns the Executive Dashboard reads from 
orders mart + # Source: dashboard/queries.py + orders_columns: + - order_id + - order_total + - subtotal + - tax_paid + - ordered_at + - customer_id + - location_id + - is_food_order + - is_drink_order + - count_food_items + - count_drink_items diff --git a/plugins/recce-dev/skills/recce-eval/scenarios/v2/patches/r1-tax-calculation-drift.patch b/plugins/recce-dev/skills/recce-eval/scenarios/v2/patches/r1-tax-calculation-drift.patch new file mode 100644 index 0000000..8e08055 --- /dev/null +++ b/plugins/recce-dev/skills/recce-eval/scenarios/v2/patches/r1-tax-calculation-drift.patch @@ -0,0 +1,13 @@ +diff --git b/models/staging/stg_orders.sql a/models/staging/stg_orders.sql +index 738434f..61408c0 100644 +--- a/models/staging/stg_orders.sql ++++ b/models/staging/stg_orders.sql +@@ -19,7 +19,7 @@ renamed as ( + subtotal as subtotal_cents, + tax_paid as tax_paid_cents, + order_total as order_total_cents, +- {{ cents_to_dollars('subtotal') }} - {{ cents_to_dollars('tax_paid') }} as subtotal, ++ {{ cents_to_dollars('subtotal') }} as subtotal, + {{ cents_to_dollars('tax_paid') }} as tax_paid, + {{ cents_to_dollars('order_total') }} as order_total, + diff --git a/plugins/recce-dev/skills/recce-eval/scenarios/v2/patches/r2-cogs-miscalculation.patch b/plugins/recce-dev/skills/recce-eval/scenarios/v2/patches/r2-cogs-miscalculation.patch new file mode 100644 index 0000000..819dbf3 --- /dev/null +++ b/plugins/recce-dev/skills/recce-eval/scenarios/v2/patches/r2-cogs-miscalculation.patch @@ -0,0 +1,13 @@ +diff --git b/models/marts/orders.sql a/models/marts/orders.sql +index cc290af..56fdc59 100644 +--- a/models/marts/orders.sql ++++ b/models/marts/orders.sql +@@ -17,7 +17,7 @@ order_items_summary as ( + select + order_id, + +- sum(case when is_food_item then supply_cost else 0 end) as order_cost, ++ sum(supply_cost) as order_cost, + sum(product_price) as order_items_subtotal, + count(order_item_id) as count_order_items, + sum( diff --git 
a/plugins/recce-dev/skills/recce-eval/scenarios/v2/patches/r8-exclude-zero-orders-wrong-column.patch b/plugins/recce-dev/skills/recce-eval/scenarios/v2/patches/r8-exclude-zero-orders-wrong-column.patch new file mode 100644 index 0000000..dad71fa --- /dev/null +++ b/plugins/recce-dev/skills/recce-eval/scenarios/v2/patches/r8-exclude-zero-orders-wrong-column.patch @@ -0,0 +1,9 @@ +diff --git b/models/staging/stg_orders.sql a/models/staging/stg_orders.sql +index f7da410..61408c0 100644 +--- b/models/staging/stg_orders.sql ++++ a/models/staging/stg_orders.sql +@@ -31,4 +31,3 @@ renamed as ( + ) + + select * from renamed +-where subtotal > 0 diff --git a/plugins/recce-dev/skills/recce-eval/scenarios/v2/r1-tax-calculation-drift.yaml b/plugins/recce-dev/skills/recce-eval/scenarios/v2/r1-tax-calculation-drift.yaml new file mode 100644 index 0000000..888969e --- /dev/null +++ b/plugins/recce-dev/skills/recce-eval/scenarios/v2/r1-tax-calculation-drift.yaml @@ -0,0 +1,55 @@ +id: r1-tax-calculation-drift +name: "R1: Subtotal Tax Deduction" +description: "subtotal changed to subtract tax_paid — raw subtotal is already pre-tax, so this double-deducts tax" +layer: review +difficulty: easy +stakeholder: accounting +case_type: problem_exists + +story: | + The Accounting Manager (P1) requested that subtotal be standardized as a pre-tax amount + across all reports. A teammate modified stg_orders to derive subtotal as + `subtotal - tax_paid`, claiming the raw data includes tax in subtotal. + Code review approved — the formula is mathematically correct for the stated purpose. + But the raw subtotal is ALREADY pre-tax (order_total = subtotal + tax_paid). + The change double-deducts tax, making subtotal systematically too low for 654,502 orders + (all orders with non-zero tax). The values propagate through orders into customers. 
+ +environment: + repo: DataRecce/jaffle-shop-simulator + adapter: duckdb + +setup: + strategy: git_patch + patch_reverse_file: scenarios/v2/patches/r1-tax-calculation-drift.patch + skip_context: false + +prompt: + template: prompts/review.md + vars: + stakeholder_name: "Accounting Manager (P1)" + stakeholder_request: "Standardize subtotal as pre-tax amount across all reports" + pr_description: "Fix subtotal — raw data includes tax in subtotal, subtract tax_paid for accurate pre-tax amount" + +headless: + max_budget_usd: 5.00 + output_format: json + +ground_truth: + issue_found: true + issue_type: data_drift + root_cause_keywords: ["subtotal", "tax", "deduct", "pre-tax", "double", "already"] + impacted_models: ["stg_orders", "orders", "customers"] + not_impacted_models: ["order_items", "products"] + affected_row_count: 654502 + dashboard_impact: true + detection_requires: data_comparison + +judge_criteria: + - "Agent identifies that raw subtotal is already pre-tax through DATA comparison, not just code reading" + - "Agent recognizes that subtotal - tax_paid double-deducts tax from the amount" + - "Agent correctly identifies that customers model IS impacted (lifetime_spend_pretax uses subtotal)" + - "Agent correctly identifies that the Executive Dashboard IS impacted (subtotal is a dashboard column)" + +teardown: + restore_files: ["models/staging/stg_orders.sql"] diff --git a/plugins/recce-dev/skills/recce-eval/scenarios/v2/r2-cogs-miscalculation.yaml b/plugins/recce-dev/skills/recce-eval/scenarios/v2/r2-cogs-miscalculation.yaml new file mode 100644 index 0000000..2d0ac9e --- /dev/null +++ b/plugins/recce-dev/skills/recce-eval/scenarios/v2/r2-cogs-miscalculation.yaml @@ -0,0 +1,54 @@ +id: r2-cogs-miscalculation +name: "R2: COGS Miscalculation" +description: "order_cost filtered to food-only supply costs — drink supply costs (59% of COGS) omitted" +layer: review +difficulty: medium +stakeholder: management +case_type: problem_exists + +story: | + The VP of Operations 
(P0) asked the team to review COGS accuracy in the orders pipeline. + A teammate "optimized" the order_items_summary CTE to only sum supply_cost for food items + (CASE WHEN is_food_item), claiming drink items have negligible supply costs. + Code review approved — the CASE WHEN pattern is standard and the PR says "optimization". + But drink supply costs account for ~59% of total COGS. The change makes order_cost + systematically too low for 643,875 orders (98% — all orders with at least one drink item). + +environment: + repo: DataRecce/jaffle-shop-simulator + adapter: duckdb + +setup: + strategy: git_patch + patch_reverse_file: scenarios/v2/patches/r2-cogs-miscalculation.patch + skip_context: false + +prompt: + template: prompts/review.md + vars: + stakeholder_name: "VP of Operations (P0, Management)" + stakeholder_request: "Review COGS accuracy in the orders pipeline" + pr_description: "Optimize COGS calculation — filter to food supply costs for performance" + +headless: + max_budget_usd: 5.00 + output_format: json + +ground_truth: + issue_found: true + issue_type: data_drift + root_cause_keywords: ["supply", "cost", "drink", "food", "COGS", "order_cost", "is_food_item"] + impacted_models: ["orders"] + not_impacted_models: ["customers", "order_items"] + affected_row_count: 643875 + dashboard_impact: false + detection_requires: data_comparison + +judge_criteria: + - "Agent identifies the supply_cost aggregation bug through DATA comparison, not just reading the formula" + - "Agent recognizes that order_cost excludes drink supply costs due to the is_food_item filter" + - "Agent correctly identifies that customers model is NOT impacted (customers doesn't use order_cost)" + - "Agent correctly identifies that the Executive Dashboard is NOT impacted (order_cost is not a dashboard column)" + +teardown: + restore_files: ["models/marts/orders.sql"] diff --git a/plugins/recce-dev/skills/recce-eval/scenarios/v2/r8-exclude-zero-orders-wrong-column.yaml 
b/plugins/recce-dev/skills/recce-eval/scenarios/v2/r8-exclude-zero-orders-wrong-column.yaml new file mode 100644 index 0000000..1fd23bd --- /dev/null +++ b/plugins/recce-dev/skills/recce-eval/scenarios/v2/r8-exclude-zero-orders-wrong-column.yaml @@ -0,0 +1,79 @@ +id: r8-exclude-zero-orders-wrong-column +name: "R8: Exclude $0 Orders — Wrong Column" +description: "stg_orders filters on subtotal > 0 instead of order_total > 0 — semantically wrong column, but data results happen to be identical with current dataset" +layer: review +difficulty: hard +stakeholder: management +case_type: problem_exists + +story: | + The VP of Operations (P0) escalated that ~4,155 complimentary orders (order_total = 0) are + inflating order counts and diluting AOV on the Executive Dashboard. The board noticed + discrepancies between pipeline metrics and finance reports. + + A teammate submitted a PR adding `WHERE subtotal > 0` to stg_orders to filter out these + comp orders. Code review approved — the filter works and removes the right rows. + + But the issue specification explicitly says `WHERE order_total > 0` (filter by total charged + amount). The PR uses `subtotal > 0` instead. In the current dataset, subtotal = 0 ↔ + order_total = 0 for all rows, so the results are identical. However, the column choice is + semantically incorrect: subtotal is the pre-tax item total, while order_total is the actual + amount charged. If future data includes orders where subtotal ≠ 0 but order_total = 0 + (e.g., fully discounted orders with tax credit), the filter would silently produce wrong results. + + This scenario tests whether the agent merely validates data correctness or also checks + implementation against the original specification. 
+ +environment: + repo: DataRecce/jaffle-shop-simulator + adapter: duckdb + +setup: + strategy: git_patch + patch_reverse_file: scenarios/v2/patches/r8-exclude-zero-orders-wrong-column.patch + skip_context: false + +prompt: + template: prompts/review.md + vars: + stakeholder_name: "VP of Operations (P0, Management)" + stakeholder_request: "Exclude complimentary ($0) orders from all mart models — filter at stg_orders where order_total = 0" + pr_description: "Filter out $0 comp orders at staging layer — add WHERE subtotal > 0 to stg_orders for clean downstream metrics" + +headless: + max_budget_usd: 5.00 + output_format: json + +ground_truth: + issue_found: true + issue_type: spec_deviation + root_cause_keywords: ["subtotal", "order_total", "wrong column", "spec", "filter", "complimentary"] + impacted_models: ["stg_orders", "orders", "customers"] + not_impacted_models: ["order_items", "products"] + affected_row_count: 4155 + affected_customers: 236 + dashboard_impact: true + detection_requires: code_review + data_impact_vs_correct_impl: identical + notes: | + With current data, WHERE subtotal > 0 and WHERE order_total > 0 produce identical results + (all 4,155 zero-total orders also have zero subtotal). The bug is semantic, not numerical. + Agent must compare the PR code against the issue spec to catch the wrong column. 
+ - stg_orders: 658,657 → 654,502 (-4,155 rows) + - orders: 658,657 → 654,502 (-4,155 rows) + - order_items: 969,324 → 969,324 (unchanged — comp orders have no line items) + - customers: 2,586 → 2,586 (same count, but 236 customers have lower count_lifetime_orders) + - AOV: 10.9178 → 10.9871 (+0.63%) + - Revenue: unchanged ($0 orders contribute nothing) + - No customer_type changes (all affected remain "returning") + +judge_criteria: + - "Agent identifies that the filter uses subtotal instead of order_total as specified in the issue" + - "Agent recognizes this is a spec deviation — the column semantics differ even though current data produces identical results" + - "Agent correctly identifies that customers model IS impacted (count_lifetime_orders changes for 236 customers)" + - "Agent correctly identifies that order_items is NOT impacted (comp orders have no line items)" + - "Agent correctly identifies that the Executive Dashboard IS impacted (order counts and AOV change)" + - "Agent explains why order_total is the correct column (total charged vs pre-tax subtotal)" + +teardown: + restore_files: ["models/staging/stg_orders.sql"] diff --git a/plugins/recce-dev/skills/recce-eval/scripts/render-prompt.py b/plugins/recce-dev/skills/recce-eval/scripts/render-prompt.py new file mode 100755 index 0000000..1e43aee --- /dev/null +++ b/plugins/recce-dev/skills/recce-eval/scripts/render-prompt.py @@ -0,0 +1,37 @@ +#!/usr/bin/env python3 +"""Render a prompt template with variables from a scenario YAML. + +Usage: python3 render-prompt.py <template> <scenario> [--var key=value ...] +Outputs rendered prompt to stdout. + +Variables from scenario YAML (prompt.vars) are substituted first. +Additional --var overrides (e.g., adapter_description) are applied after. 
+""" +import sys, yaml, argparse + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument("template", help="Path to prompt template file") + parser.add_argument("scenario", help="Path to scenario YAML file") + parser.add_argument("--var", action="append", default=[], help="Extra var: key=value") + args = parser.parse_args() + + with open(args.template) as f: + template = f.read() + with open(args.scenario) as f: + scenario = yaml.safe_load(f) + + vars_dict = scenario.get("prompt", {}).get("vars", {}) + + for v in args.var: + key, _, value = v.partition("=") + vars_dict[key] = value + + rendered = template + for key, value in vars_dict.items(): + rendered = rendered.replace("{" + key + "}", str(value)) + + print(rendered) + +if __name__ == "__main__": + main() diff --git a/plugins/recce-dev/skills/recce-eval/scripts/run-case.sh b/plugins/recce-dev/skills/recce-eval/scripts/run-case.sh index e83ed69..9cc5506 100755 --- a/plugins/recce-dev/skills/recce-eval/scripts/run-case.sh +++ b/plugins/recce-dev/skills/recce-eval/scripts/run-case.sh @@ -13,6 +13,8 @@ SETUP_STRATEGY="" PATCH_FILE="" RESTORE_FILES="" TARGET="" MAX_BUDGET_USD="" OUTPUT_DIR="" PLUGIN_DIR="" MCP_CONFIG="" RUN_NUMBER="1" DRY_RUN="false" BARE_MODE="false" CLEAN_PROFILE="true" +SKIP_SETUP="false" SKIP_TEARDOWN="false" MODEL="" +MODE="real-world" while [[ $# -gt 0 ]]; do case "$1" in @@ -35,6 +37,10 @@ while [[ $# -gt 0 ]]; do --clean-profile) CLEAN_PROFILE="true"; shift 1 ;; --no-clean-profile) CLEAN_PROFILE="false"; shift 1 ;; --skip-setup-context) SKIP_SETUP_CONTEXT="true"; shift 1 ;; + --skip-setup) SKIP_SETUP="true"; shift 1 ;; + --skip-teardown) SKIP_TEARDOWN="true"; shift 1 ;; + --model) MODEL="$2"; shift 2 ;; + --mode) MODE="$2"; shift 2 ;; *) echo "Unknown argument: $1" >&2; exit 1 ;; esac done @@ -65,6 +71,11 @@ if [ "$VARIANT" != "baseline" ] && [ "$VARIANT" != "with-plugin" ]; then exit 1 fi +if [ "$MODE" != "tool-only" ] && [ "$MODE" != "real-world" ]; then + echo "ERROR: 
--mode must be 'tool-only' or 'real-world', got: $MODE" >&2 + exit 1 +fi + # ========== Isolation Note ========== # Default: --clean-profile (no memory, no CLAUDE.md, plugin hooks fire organically). # Seed settings.json in temp HOME provides apiKeyHelper (no keychain prompts), @@ -119,6 +130,7 @@ fi # ========== Teardown Trap ========== cleanup() { + if [ "$SKIP_TEARDOWN" = "true" ]; then return; fi if [ -n "$RESTORE_FILES" ]; then IFS=',' read -ra FILES <<< "$RESTORE_FILES" for f in "${FILES[@]}"; do @@ -133,7 +145,7 @@ cleanup() { trap cleanup EXIT # ========== Setup Strategy ========== -if [ "$DRY_RUN" = "false" ]; then +if [ "$DRY_RUN" = "false" ] && [ "$SKIP_SETUP" = "false" ]; then case "$SETUP_STRATEGY" in git_patch) if [ -z "$PATCH_FILE" ]; then @@ -144,10 +156,16 @@ if [ "$DRY_RUN" = "false" ]; then echo "ERROR: Patch file not found: $PATCH_FILE" >&2 exit 1 fi - # Regenerate target-base/ from clean state BEFORE applying patch. - # This ensures base artifacts match current code exactly (no stale diffs). - # Without this, orders.sql may show as "modified" due to old target-base/. - dbt docs generate --target-path target-base --target "$TARGET" --quiet 2>/dev/null || true + # Build base state in a SEPARATE schema so Recce can compare data. + # DuckDB uses one file with multiple schemas. Without a separate base + # schema, value_diff compares dev against itself → 0 differences. + # 1. Build clean state in both dev (current) and prod (base) schemas + # 2. Capture base artifacts from prod + # 3. Apply patch and rebuild dev only + # 4. Recce compares dev (buggy) vs prod (clean) for actual data diffs + BASE_TARGET="prod" + dbt run --target "$BASE_TARGET" --full-refresh --quiet + dbt docs generate --target-path target-base --target "$BASE_TARGET" --quiet 2>/dev/null || true # Now apply patch (introduces the bug) and rebuild current state. 
# Use --full-refresh so incremental models reprocess ALL rows with # the buggy code — otherwise value_diff sees 0 changed rows because @@ -182,6 +200,11 @@ if [ "$DRY_RUN" = "false" ]; then esac fi +# If setup was skipped, provide a default test result for prompt injection +if [ "$SKIP_SETUP" = "true" ] && [ -z "${DBT_TEST_RESULT:-}" ]; then + DBT_TEST_RESULT="${DBT_TEST_RESULT_OVERRIDE:-ALL TESTS PASSED (PASS=25 WARN=0 ERROR=0 TOTAL=25)}" +fi + # ========== Detect Adapter (for prompt interpolation) ========== detect_adapter() { if [ -f "profiles.yml" ]; then @@ -236,16 +259,28 @@ fi CMD="$CMD --dangerously-skip-permissions" CMD="$CMD --output-format json" CMD="$CMD --max-budget-usd $MAX_BUDGET_USD" +if [ -n "$MODEL" ]; then + CMD="$CMD --model $MODEL" +fi if [ "$VARIANT" = "with-plugin" ]; then - if [ -n "$PLUGIN_DIR" ]; then - CMD="$CMD --plugin-dir \"$PLUGIN_DIR\"" + # Mode controls what with-plugin gets beyond MCP tools: + # real-world: full plugin (hooks, skills, agents) — agent discovers tools naturally + # tool-only: MCP tools only — no plugin context, agent discovers MCP on its own + if [ "$MODE" = "real-world" ]; then + if [ -n "$PLUGIN_DIR" ]; then + CMD="$CMD --plugin-dir \"$PLUGIN_DIR\"" + fi fi - if [ -n "$MCP_CONFIG" ]; then + # MCP config loaded in BOTH modes (MCP tools are the variable being tested). + # Skip MCP for fix-the-bug scenarios (skip_context=true) — agent needs + # to run dbt, and stdio MCP holds a DuckDB write lock that blocks dbt run. + if [ -n "$MCP_CONFIG" ] && [ "${SKIP_SETUP_CONTEXT:-false}" != "true" ]; then CMD="$CMD --strict-mcp-config --mcp-config \"$MCP_CONFIG\"" fi - # No manual IMPACT_RULE injection needed — plugin's SessionStart hook - # fires organically under --clean-profile and injects it automatically. + # No workflow prompt injected — real-world mode relies on plugin hooks/skills + # to naturally guide the agent (e.g., SessionStart hook, /recce-review skill). 
+ # tool-only mode has no plugin context, so the agent discovers MCP tools on its own. fi # ========== Dry Run Mode ========== @@ -255,6 +290,7 @@ if [ "$DRY_RUN" = "true" ]; then else echo "PLUGIN_DIR=$PLUGIN_DIR" fi + echo "MODE=$MODE" echo "ADAPTER_TYPE=$ADAPTER_TYPE" echo "ADAPTER_DESC=$ADAPTER_DESC" echo "CMD: $CMD -- " diff --git a/plugins/recce-dev/skills/recce-eval/scripts/score-deterministic.sh b/plugins/recce-dev/skills/recce-eval/scripts/score-deterministic.sh index dbecd83..09d51fd 100755 --- a/plugins/recce-dev/skills/recce-eval/scripts/score-deterministic.sh +++ b/plugins/recce-dev/skills/recce-eval/scripts/score-deterministic.sh @@ -32,7 +32,16 @@ JSON_EXTRACTED=$(jq -r '.agent_output.json_extracted' "$RUN_FILE") if [ "$JSON_EXTRACTED" != "true" ] || [ "$AGENT_JSON" = "null" ]; then if [ "$CASE_TYPE" = "problem_exists" ]; then - CHECKS='[{"name":"issue_found","expected":"true","actual":"null","result":"FAIL"},{"name":"root_cause_keywords","expected":"match","actual":"no output","result":"FAIL"},{"name":"all_tests_pass","expected":"true","actual":"null","result":"FAIL"}]' + CHECKS='[{"name":"issue_found","expected":"true","actual":"null","result":"FAIL"},{"name":"root_cause_keywords","expected":"match","actual":"no output","result":"FAIL"}]' + # all_tests_pass (v1 only — skip if absent from ground truth) + if [ "$(echo "$GROUND_TRUTH" | jq 'has("all_tests_pass")')" = "true" ]; then + CHECKS=$(echo "$CHECKS" | jq '. + [{"name":"all_tests_pass","expected":"true","actual":"null","result":"FAIL"}]') + fi + # dashboard_impact (v2 only — skip if absent from ground truth) + if [ "$(echo "$GROUND_TRUTH" | jq 'has("dashboard_impact")')" = "true" ]; then + GT_DASHBOARD=$(echo "$GROUND_TRUTH" | jq -r '.dashboard_impact | tostring') + CHECKS=$(echo "$CHECKS" | jq --arg e "$GT_DASHBOARD" '. 
+ [{"name":"dashboard_impact","expected":$e,"actual":"null","result":"FAIL"}]') + fi else CHECKS='[{"name":"issue_found","expected":"false","actual":"null","result":"FAIL"}]' fi @@ -99,11 +108,24 @@ if [ "$CASE_TYPE" = "problem_exists" ]; then fi fi - # all_tests_pass - EXPECTED_PASS=$(echo "$GROUND_TRUTH" | jq -r '.all_tests_pass | tostring') - ACTUAL_PASS=$(echo "$AGENT_JSON" | jq -r 'if .all_tests_pass == null then "null" else (.all_tests_pass | tostring) end') - if [ "$ACTUAL_PASS" = "$EXPECTED_PASS" ]; then add_check "all_tests_pass" "$EXPECTED_PASS" "$ACTUAL_PASS" "PASS" - else add_check "all_tests_pass" "$EXPECTED_PASS" "$ACTUAL_PASS" "FAIL"; fi + # dashboard_impact (v2 only — skip if absent from ground truth; uses has() to avoid jq // false gotcha) + if [ "$(echo "$GROUND_TRUTH" | jq 'has("dashboard_impact")')" = "true" ]; then + GT_DASHBOARD=$(echo "$GROUND_TRUTH" | jq -r '.dashboard_impact | tostring') + ACTUAL_DASHBOARD=$(echo "$AGENT_JSON" | jq -r 'if .dashboard_impact == null then "null" else (.dashboard_impact | tostring) end') + if [ "$ACTUAL_DASHBOARD" = "$GT_DASHBOARD" ]; then + add_check "dashboard_impact" "$GT_DASHBOARD" "$ACTUAL_DASHBOARD" "PASS" + else + add_check "dashboard_impact" "$GT_DASHBOARD" "$ACTUAL_DASHBOARD" "FAIL" + fi + fi + + # all_tests_pass (v1 only — skip if absent from ground truth; uses has() to avoid jq // false gotcha) + if [ "$(echo "$GROUND_TRUTH" | jq 'has("all_tests_pass")')" = "true" ]; then + EXPECTED_PASS=$(echo "$GROUND_TRUTH" | jq -r '.all_tests_pass | tostring') + ACTUAL_PASS=$(echo "$AGENT_JSON" | jq -r 'if .all_tests_pass == null then "null" else (.all_tests_pass | tostring) end') + if [ "$ACTUAL_PASS" = "$EXPECTED_PASS" ]; then add_check "all_tests_pass" "$EXPECTED_PASS" "$ACTUAL_PASS" "PASS" + else add_check "all_tests_pass" "$EXPECTED_PASS" "$ACTUAL_PASS" "FAIL"; fi + fi elif [ "$CASE_TYPE" = "no_problem" ]; then # issue_found == false diff --git a/plugins/recce/.mcp.json b/plugins/recce/.mcp.json index 
9ed8f3b..6f826a0 100644 --- a/plugins/recce/.mcp.json +++ b/plugins/recce/.mcp.json @@ -1,8 +1,9 @@ { "mcpServers": { "recce": { - "type": "sse", - "url": "http://localhost:8081/sse" + "type": "stdio", + "command": "bash", + "args": ["${CLAUDE_PLUGIN_ROOT}/scripts/run-mcp-stdio.sh"] }, "recce-docs": { "type": "stdio", diff --git a/plugins/recce/agents/recce-reviewer.md b/plugins/recce/agents/recce-reviewer.md index a56e417..43dae56 100644 --- a/plugins/recce/agents/recce-reviewer.md +++ b/plugins/recce/agents/recce-reviewer.md @@ -181,3 +181,4 @@ LOW — No models were affected by the change. Safe to proceed. - Do NOT paste raw MCP tool JSON output into the summary. Extract only the relevant metrics. - Complete the review in a single pass. Do not offer to "continue" or "dive deeper". - impact_analysis is your entry point — it handles lineage, row count, schema, and value diff in one call. Do NOT call row_count_diff or schema_diff separately. +- NEVER use Python, curl, requests, httpx, or any other method to directly interact with Recce's HTTP/SSE endpoints. Use ONLY the MCP tools provided (impact_analysis, profile_diff, value_diff_detail, lineage_diff). If MCP tools are unavailable, report the error — do NOT attempt to bypass MCP. diff --git a/plugins/recce/hooks/scripts/session-start.sh b/plugins/recce/hooks/scripts/session-start.sh index aae8418..07bc6ac 100755 --- a/plugins/recce/hooks/scripts/session-start.sh +++ b/plugins/recce/hooks/scripts/session-start.sh @@ -55,38 +55,17 @@ else echo "TARGET_BASE_EXISTS=false" fi -# ========== Impact Analysis Rule (mandatory) ========== -# Injected into context so the agent uses impact_analysis for all impact determination -echo "IMPACT_RULE=MANDATORY: When determining which dbt models are impacted by a code change, you MUST call the impact_analysis MCP tool BEFORE reporting impacted_models. 
Do NOT determine impact by reading code, inferring from ref() calls, or guessing from model names — these approaches confuse upstream dependencies with downstream impact and produce false positives. impact_analysis uses the lineage DAG to deterministically classify models as impacted (modified + downstream) or not-impacted. Its impacted_models and not_impacted_models lists are authoritative — copy them directly into your output. When the response includes value_diff.rows_changed for a model, use that number as the affected row count — it is the exact count of rows whose values differ between base and current." - -# ========== MCP Auto-Start Decision ========== -# Only attempt if: recce installed AND target/manifest.json exists +# ========== MCP Readiness Check ========== +# MCP server is now stdio-based (.mcp.json) — Claude Code spawns it on demand. +# No external server to start. Just report whether prerequisites are met. if [ "$RECCE_INSTALLED" = "true" ] && [ "$TARGET_EXISTS" = "true" ]; then - # Delegate to start-mcp.sh — it handles PID, settings, health polling - MCP_OUTPUT=$(bash "$PLUGIN_ROOT/scripts/start-mcp.sh" 2>/dev/null) - MCP_EXIT=$? 
- - # Parse start-mcp.sh output - MCP_STATUS=$(echo "$MCP_OUTPUT" | grep "^STATUS=" | cut -d= -f2) - MCP_PORT_VAL=$(echo "$MCP_OUTPUT" | grep "^PORT=" | cut -d= -f2) - - if [ "$MCP_STATUS" = "STARTED" ] || [ "$MCP_STATUS" = "ALREADY_RUNNING" ]; then - echo "MCP_STARTED=true" - echo "MCP_PORT=$MCP_PORT_VAL" - else - echo "MCP_STARTED=false" - # Forward error/fix lines from start-mcp.sh - echo "$MCP_OUTPUT" | grep -E "^(ERROR|FIX|MESSAGE)=" - fi - - # Forward single-env and warning lines from start-mcp.sh - echo "$MCP_OUTPUT" | grep -E "^(SINGLE_ENV_MODE|WARNING)=" 2>/dev/null + echo "MCP_READY=true" else - echo "MCP_STARTED=false" - # Explain why MCP was skipped + echo "MCP_READY=false" if [ "$RECCE_INSTALLED" != "true" ]; then echo "MCP_SKIP_REASON=recce not installed" + echo "FIX=Activate your venv or run: pip install recce" elif [ "$TARGET_EXISTS" != "true" ]; then echo "MCP_SKIP_REASON=no target/manifest.json" echo "FIX=Run: dbt docs generate" diff --git a/plugins/recce/scripts/run-mcp-stdio.sh b/plugins/recce/scripts/run-mcp-stdio.sh new file mode 100755 index 0000000..73c790b --- /dev/null +++ b/plugins/recce/scripts/run-mcp-stdio.sh @@ -0,0 +1,24 @@ +#!/bin/bash +# stdio MCP wrapper: detect venv, activate, exec recce mcp-server +# Called by Claude Code via .mcp.json stdio transport. +# Inherits cwd from Claude Code (the dbt project root). +set -euo pipefail + +# ========== Venv Auto-Detection ========== +# Always prefer local venv over global dbt/recce — global may be dbt Cloud CLI +for VENV_DIR in venv .venv; do + if [ -f "$VENV_DIR/bin/activate" ]; then + # shellcheck disable=SC1091 + source "$VENV_DIR/bin/activate" + break + fi +done + +# ========== Verify recce is available ========== +if ! command -v recce &>/dev/null; then + echo '{"error": "recce not found in PATH. Activate your venv or run: pip install recce"}' >&2 + exit 1 +fi + +# ========== Launch MCP server in stdio mode ========== +exec recce mcp-server