103 changes: 103 additions & 0 deletions .github/workflows/eval.yml
@@ -0,0 +1,103 @@
name: Skill Evaluations

on:
  pull_request:
    paths:
      - 'hope/**'
      - 'product/**'
      - 'wordsmith/**'
      - 'founder/**'
      - 'career/**'
      - 'eval/**'
  workflow_dispatch:

jobs:
  eval:
    runs-on: ubuntu-latest
    permissions:
      contents: read
      id-token: write

    strategy:
      fail-fast: false
      matrix:
        test:
          - hope-gate-completion
          - hope-soul-planning
          - hope-trace-debugging
          - product-prd-request
          - wordsmith-edit-request

    steps:
      - name: Checkout repository
        uses: actions/checkout@v4

      - name: Run Eval - ${{ matrix.test }}
        id: eval
        uses: anthropics/claude-code-action@v1
        with:
          claude_code_oauth_token: ${{ secrets.CLAUDE_CODE_OAUTH_TOKEN }}
          prompt: |
            You are evaluating skill auto-triggering for the moo.md plugin marketplace.

            ## Instructions
            1. Read eval/cases/skill-triggers/${{ matrix.test }}.yaml
            2. Extract the "prompt" and "expected_behaviors" fields
            3. Process the prompt as if a user sent it (let skills auto-trigger naturally)
            4. Self-evaluate your response against expected behaviors

            ## Required Output Format
            You MUST end your response with exactly one of these lines:
            - `VERDICT: PASS` - if skill triggered AND all expected behaviors observed
            - `VERDICT: PARTIAL` - if skill triggered but some behaviors missing
            - `VERDICT: FAIL` - if skill did not trigger or wrong skill triggered

            Before the verdict, explain your reasoning briefly.
          claude_args: '--plugin-dir ./hope --plugin-dir ./product --plugin-dir ./wordsmith --plugin-dir ./founder --plugin-dir ./career'
          show_full_output: true

      - name: Check Verdict
        env:
          EXECUTION_FILE: ${{ steps.eval.outputs.execution_file }}
          SESSION_ID: ${{ steps.eval.outputs.session_id }}
          TEST_NAME: ${{ matrix.test }}
        run: |
          echo "=== Debug Info ==="
          echo "Test: $TEST_NAME"
          echo "Execution file: $EXECUTION_FILE"
          echo "Session ID: $SESSION_ID"
          echo ""

          # Check if execution file exists
          if [ -z "$EXECUTION_FILE" ]; then
            echo "::error::execution_file output is empty"
            echo "This likely means claude-code-action did not run successfully."
            exit 1
          fi

          if [ ! -f "$EXECUTION_FILE" ]; then
            echo "::error::Execution file does not exist: $EXECUTION_FILE"
            echo "Available files in workspace:"
            ls -la
            exit 1
          fi

          echo "=== Execution Output ==="
          RESULT=$(cat "$EXECUTION_FILE")
          echo "$RESULT"
          echo ""
          echo "=== Verdict Check ==="

          # Check for verdict in output
          if echo "$RESULT" | grep -q "VERDICT: PASS"; then
            echo "✓ $TEST_NAME: PASS"
          elif echo "$RESULT" | grep -q "VERDICT: PARTIAL"; then
            echo "⚠ $TEST_NAME: PARTIAL"
          elif echo "$RESULT" | grep -q "VERDICT: FAIL"; then
            echo "::error::Test $TEST_NAME failed"
            exit 1
          else
            echo "::warning::No verdict found in output for $TEST_NAME"
            echo "The output may not contain the expected VERDICT line."
            exit 1
          fi
6 changes: 6 additions & 0 deletions CHANGELOG.md
@@ -9,6 +9,12 @@ Versioning follows [Semantic Versioning](https://semver.org/spec/v2.0.0.html).

## [Unreleased]

### Added
- **eval**: Skill evaluation framework using claude-code-action
- CI workflow runs on PRs touching plugin code
- Local testing via `./eval/run.sh --simple`
- 5 test cases for skill auto-triggering (hope:gate, hope:soul, hope:trace, product, wordsmith)

---

## [0.4.5] - 2025-12-15
7 changes: 7 additions & 0 deletions CLAUDE.md
@@ -38,6 +38,13 @@ Plugin discovery uses `.claude-plugin/marketplace.json` at repo root (lists all
/plugin install hope@moo.md
```

## Evaluations

Run `./eval/run.sh --simple` to test skill auto-triggering locally. CI runs evals on PRs touching plugin code.

**When adding skills:** Add test case to `eval/cases/skill-triggers/`.
**When adding plugins:** Add `--plugin-dir ./new-plugin` to `.github/workflows/eval.yml`.

## Conventions

### Frontmatter (Required)
107 changes: 107 additions & 0 deletions docs/dev/evaluations.md
@@ -0,0 +1,107 @@
# Skill Evaluations

Developer documentation for the moo.md evaluation system.

## Philosophy

**Claude evaluates Claude.** Instead of external test frameworks, we use Claude Code's own capabilities:

1. Claude processes test prompts (letting skills auto-trigger)
2. Claude self-evaluates against expected behaviors
3. JSON schema enforces structured output via constrained decoding

This matches Claude Code's "evaluation-through-usage" philosophy.

## Architecture

```
Test Case (YAML) → claude-code-action → JSON Schema → Pass/Fail
```

### Components

| Component | Location | Purpose |
|-----------|----------|---------|
| Test cases | `eval/cases/skill-triggers/*.yaml` | Define prompts and expected behaviors |
| JSON schema | `eval/schema.json` | Enforce structured eval output |
| CI workflow | `.github/workflows/eval.yml` | Run evals on PRs |

## CI Workflow

Runs automatically on PRs touching:
- `hope/**`, `product/**`, `wordsmith/**`, `founder/**`, `career/**`
- `eval/**`

Also available via manual `workflow_dispatch`.

### Matrix Strategy

Each test runs in parallel:
- `hope-gate-completion`
- `hope-soul-planning`
- `hope-trace-debugging`
- `product-prd-request`
- `wordsmith-edit-request`

## Local Testing

```bash
# Run all tests
./eval/run.sh

# Run single test
./eval/run.sh hope-gate-completion
```

See `eval/README.md` for more details.

## Adding Tests

1. Create `eval/cases/skill-triggers/<name>.yaml`
2. Add to matrix in `.github/workflows/eval.yml`

### Test Case Format

```yaml
name: kebab-case-name
description: What this validates
plugin: hope
skill: gate
prompt: "User message that should trigger the skill"
expected_behaviors:
- "Specific observable behavior"
- "Another behavior to check"
```

### Expected Behaviors Guidelines

- Be specific and observable
- Reference actual skill output markers (e.g., "Silent Audit", "Evidence Hierarchy")
- Avoid subjective criteria

## Troubleshooting

### Skill not triggering

- Check skill description in SKILL.md frontmatter
- Ensure prompt contains trigger keywords from description
- Verify skill is properly registered in plugin.json

### Flaky tests

- Make expected_behaviors more specific
- Add more context to the test prompt
- Check if skill description needs refinement

## Why Not promptfoo?

- Third-party tool (not Anthropic-endorsed)
- Adds dependency and config files
- Slower, more brittle in practice
- claude-code-action already handles what we need

## Why Not Claude Agent SDK?

- Requires `ANTHROPIC_API_KEY` (separate billing)
- Claude Max only provides OAuth token for claude-code-action
- Would add unnecessary cost
67 changes: 67 additions & 0 deletions eval/README.md
@@ -0,0 +1,67 @@
# Skill Evaluations

Automated testing for skill triggering and output quality using claude-code-action.

## How It Works

1. **Test cases** define prompts and expected behaviors
2. **Claude evaluates Claude** - processes the prompt and self-evaluates results
3. **JSON schema** enforces structured output via constrained decoding
4. **CI runs on every PR** touching plugin code

## Running Locally

```bash
# Run all tests (simple mode - works with OAuth)
./eval/run.sh --simple

# Run single test
./eval/run.sh --simple hope-gate-completion

# With JSON schema (requires API key, may not work with OAuth)
./eval/run.sh hope-gate-completion

# Help
./eval/run.sh --help
```

**Note:** Use `--simple` for local testing with Claude Max (OAuth). JSON schema mode requires API key.

## Test Cases

| File | Skill | Purpose |
|------|-------|---------|
| hope-soul-planning.yaml | hope:soul | Planning tasks trigger soul |
| hope-trace-debugging.yaml | hope:trace | Debugging triggers trace |
| hope-gate-completion.yaml | hope:gate | Completion claims trigger gate |
| product-prd-request.yaml | product:product | PRD requests trigger product |
| wordsmith-edit-request.yaml | wordsmith:writing | Edit requests trigger writing |

## Adding New Tests

Create `eval/cases/skill-triggers/<test-name>.yaml`:

```yaml
name: descriptive-test-name
description: What this test validates
plugin: hope
skill: gate
prompt: "User message that should trigger the skill"
expected_behaviors:
- "Observable behavior 1"
- "Observable behavior 2"
```

Then add to CI matrix in `.github/workflows/eval.yml`.
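Concretely, registering a new case means extending the existing `matrix.test` list in the workflow (the last entry below is a placeholder name, not a real test):

```yaml
strategy:
  fail-fast: false
  matrix:
    test:
      - hope-gate-completion
      - hope-soul-planning
      - hope-trace-debugging
      - product-prd-request
      - wordsmith-edit-request
      - my-new-test  # placeholder: your case's filename without .yaml
```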

## Interpreting Results

| Verdict | Meaning |
|---------|---------|
| PASS | Expected skill triggered, all behaviors observed |
| PARTIAL | Skill triggered but some behaviors missing |
| FAIL | Wrong/no skill triggered |

## Schema

See `eval/schema.json` for the structured output format.
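The schema file itself is not shown in this PR. As a rough illustration of what a constrained-decoding schema for this verdict format might look like (field names here are hypothetical, not the actual contents of `eval/schema.json`):

```json
{
  "type": "object",
  "properties": {
    "skill_triggered": { "type": "boolean" },
    "triggered_skill": { "type": "string" },
    "behaviors_observed": {
      "type": "array",
      "items": { "type": "string" }
    },
    "verdict": { "enum": ["PASS", "PARTIAL", "FAIL"] },
    "reasoning": { "type": "string" }
  },
  "required": ["verdict", "reasoning"]
}
```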
26 changes: 26 additions & 0 deletions eval/cases/skill-triggers/hope-gate-completion.yaml
@@ -0,0 +1,26 @@
# Test: hope:gate triggers on completion claims
---
name: hope-gate-triggers-on-completion
description: Gate skill should auto-activate when claiming work is done/fixed/complete
category: skill-trigger
plugin: hope
skill: gate

prompt: "I've finished implementing the login feature. The code is done and ready to merge."

model_tier: sonnet
timeout_ms: 120000

evaluators:
  - trigger-evaluator:
      expected_skill: "hope:gate"
      pass_threshold: 0.70

expected_behaviors:
- "Gate skill activates on 'done/finished/complete' claims"
- "Verification checklist should appear"
- "Evidence hierarchy mentioned"
- "Asks for proof before accepting completion"

tags: [core, regression, phase1]
created: 2024-12-15
26 changes: 26 additions & 0 deletions eval/cases/skill-triggers/hope-soul-planning.yaml
@@ -0,0 +1,26 @@
# Test: hope:soul triggers on planning tasks
---
name: hope-soul-triggers-on-planning
description: Soul skill should auto-activate on planning/build tasks
category: skill-trigger
plugin: hope
skill: soul

prompt: "Help me plan a new user dashboard feature for our SaaS app"

model_tier: sonnet
timeout_ms: 120000

evaluators:
  - trigger-evaluator:
      expected_skill: "hope:soul"
      pass_threshold: 0.70

expected_behaviors:
- "Soul skill auto-activates due to planning context"
- "Silent Audit checklist should appear"
- "Confidence percentages used"
- "Quality Footer present"

tags: [core, regression, phase1]
created: 2024-12-15
26 changes: 26 additions & 0 deletions eval/cases/skill-triggers/hope-trace-debugging.yaml
@@ -0,0 +1,26 @@
# Test: hope:trace triggers on debugging/root cause tasks
---
name: hope-trace-triggers-on-debugging
description: Trace skill should auto-activate for root cause analysis and debugging
category: skill-trigger
plugin: hope
skill: trace

prompt: "Why does the API keep returning 500 errors? The bug keeps coming back after we fix it."

model_tier: sonnet
timeout_ms: 120000

evaluators:
  - trigger-evaluator:
      expected_skill: "hope:trace"
      pass_threshold: 0.70

expected_behaviors:
- "Trace skill activates on 'why' questions about bugs"
- "Effect -> Cause -> Root chain mentioned"
- "Confidence levels for potential causes"
- "Five Whys or similar root cause technique"

tags: [core, regression, phase1]
created: 2024-12-15