diff --git a/.github/patches/creator.diff b/.github/patches/creator.diff new file mode 100644 index 0000000..3e06484 --- /dev/null +++ b/.github/patches/creator.diff @@ -0,0 +1,26 @@ +diff --git a/mcpdoc/cli.py b/mcpdoc/cli.py +index 2634dd4..d5b2796 100644 +--- a/mcpdoc/cli.py ++++ b/mcpdoc/cli.py +@@ -179,12 +179,18 @@ def create_doc_sources_from_urls(urls: List[str]) -> List[DocSource]: + for entry in urls: + if not entry.strip(): + continue +- if ":" in entry and not entry.startswith(("http:", "https:")): ++ parts = entry.split(":", 1) ++ is_windows_drive = len(parts[0]) == 1 and parts[0].isalpha() ++ if ( ++ ":" in entry ++ and not entry.startswith(("http:", "https:", "file:")) ++ and not is_windows_drive ++ ): + # Format is name:url +- name, url = entry.split(":", 1) ++ name, url = parts + doc_sources.append({"name": name, "llms_txt": url}) + else: +- # Format is just url ++ # Format is just url or file: URL or Windows drive path + doc_sources.append({"llms_txt": entry}) + return doc_sources + diff --git a/.github/workflows/phase_a_gate.yml b/.github/workflows/phase_a_gate.yml new file mode 100644 index 0000000..17e6da5 --- /dev/null +++ b/.github/workflows/phase_a_gate.yml @@ -0,0 +1,81 @@ +# Phase A — Human PR Validation +# +# Runs when a human annotator opens a PR with a creator patch + F2P tests. +# Validates that the patch meets architectural quality standards before the +# task is frozen for Phase B agent evaluation. +# +# Checks enforced: +# ✓ api_surface_score — no breaking public API changes +# ✓ new_dependencies — no unexpected new external packages +# ✓ dead_code_count — no dead code introduced +# +# Not enforced (always 1.0 — no agent reference exists yet): +# ~ scope_score — N/A in Phase A +# ~ blast_ratio — N/A in Phase A +# +# The architectural_gate package is installed from aneesh-spec/Merge-Bench-Scripts (central repo). +# This workflow file is the only architectural gate artifact that lives in each task repo. 
+ +name: Architecture Gate — Phase A (Human PR) + +on: + pull_request: + branches: + - main + +permissions: + contents: read + actions: write + +jobs: + phase-a-gate: + runs-on: ubuntu-latest + + steps: + - name: Checkout task repo + uses: actions/checkout@v4 + with: + fetch-depth: 0 + + - name: Checkout Architectural_Gate (architectural_gate source) + uses: actions/checkout@v4 + with: + repository: aneesh-spec/Merge-Bench-Scripts + path: Merge-Bench-Scripts + token: ${{ secrets.MERGE_BENCH_TOKEN }} + + - name: Setup Python + uses: actions/setup-python@v5 + with: + python-version: "3.11" + + - name: Install architectural_gate from Merge-Bench-Scripts + run: pip install ./Merge-Bench-Scripts + + - name: Materialize initial vs changed trees + shell: bash + run: | + set -euo pipefail + echo "ARCH_GATE_JSON_LOG=${GITHUB_WORKSPACE}/architectural-gate-log.json" >> "${GITHUB_ENV}" + I="${RUNNER_TEMP}/arch-gate-initial" + C="${RUNNER_TEMP}/arch-gate-changed" + rm -rf "$I" "$C" + BASE_SHA="${{ github.event.pull_request.base.sha }}" + HEAD_SHA="${{ github.event.pull_request.head.sha }}" + git fetch --no-tags --depth=50 origin "${BASE_SHA}" "${HEAD_SHA}" 2>/dev/null || git fetch --no-tags origin # fallback is a plain fetch: git rejects --depth=0 as "not a positive number" + git worktree add "$I" "${BASE_SHA}" + git worktree add "$C" "${HEAD_SHA}" + echo "ARCH_GATE_REPO_INITIAL=${I}" >> "${GITHUB_ENV}" + echo "ARCH_GATE_REPO_CHANGED=${C}" >> "${GITHUB_ENV}" + # ARCH_GATE_CREATOR_DIFF is intentionally NOT set — triggers Phase A mode + + - name: Run Architectural Gate (Phase A) + run: python -m architectural_gate + + - name: Upload gate log + uses: actions/upload-artifact@v4 + if: always() + with: + name: architectural-gate-log-phase-a + path: architectural-gate-log.json + if-no-files-found: ignore diff --git a/.github/workflows/phase_b_gate.yml b/.github/workflows/phase_b_gate.yml new file mode 100644 index 0000000..64386cb --- /dev/null +++ b/.github/workflows/phase_b_gate.yml @@ -0,0 +1,148 @@ +# Phase B — Agent Patch Evaluation +# +# Runs when an 
AI agent produces a patch for a frozen task from Phase A. +# Evaluates the agent's patch against the creator's reference patch. +# +# Checks enforced (all five): +# ✓ scope_score — agent stayed within creator's file scope +# ✓ blast_ratio — agent wrote proportionally as much code as creator +# ✓ api_surface_score — no breaking public API changes +# ✓ new_dependencies — no unexpected new external packages +# ✓ dead_code_count — no dead code introduced +# +# The architectural_gate package is installed from aneesh-spec/Merge-Bench-Scripts. +# +# Agent diff is built automatically from the PR's base..head diff. +# Creator diff must exist in the repo at the path specified by creator_diff_path; if it is missing, the gate steps are skipped. +# +# Trigger: automatically on pull requests targeting main, or manually via workflow_dispatch on the agent's PR branch. +# +# Example workflow_dispatch inputs: +# creator_diff_path: .github/patches/creator.diff +# base_sha: main +# head_sha: (leave blank to use current HEAD) + +name: Architecture Gate — Phase B (Agent Evaluation) + +on: + pull_request: + branches: + - main + workflow_dispatch: + inputs: + creator_diff_path: + description: "Repo-relative path to creator unified diff (e.g. .github/patches/creator.diff)" + required: true + default: ".github/patches/creator.diff" + base_sha: + description: "Base branch/SHA for baseline tree and agent diff base" + required: false + default: "main" + head_sha: + description: "Head branch/SHA for agent diff and changed tree (leave blank for current HEAD)" + required: false + default: "" + +permissions: + contents: read + actions: write + +jobs: + phase-b-gate: + runs-on: ubuntu-latest + + steps: + - name: Checkout task repo + uses: actions/checkout@v4 + with: + fetch-depth: 0 + + - name: Check creator diff exists + id: check_creator + shell: bash + run: | + CREATOR_PATH="${{ inputs.creator_diff_path || '.github/patches/creator.diff' }}" + if [ ! 
-f "${GITHUB_WORKSPACE}/${CREATOR_PATH}" ]; then + echo "exists=false" >> "${GITHUB_OUTPUT}" + echo "Creator diff not found at ${CREATOR_PATH} — skipping Phase B gate." + else + echo "exists=true" >> "${GITHUB_OUTPUT}" + echo "Creator diff found at ${CREATOR_PATH} — running Phase B gate." + fi + + - name: Checkout Merge-Bench-Scripts (architectural_gate source) + if: steps.check_creator.outputs.exists == 'true' + uses: actions/checkout@v4 + with: + repository: aneesh-spec/Merge-Bench-Scripts + path: Merge-Bench-Scripts + token: ${{ secrets.MERGE_BENCH_TOKEN }} + + - name: Setup Python + if: steps.check_creator.outputs.exists == 'true' + uses: actions/setup-python@v5 + with: + python-version: "3.11" + + - name: Install architectural_gate from Merge-Bench-Scripts + if: steps.check_creator.outputs.exists == 'true' + run: pip install ./Merge-Bench-Scripts + + - name: Materialize before/after trees and build agent diff + if: steps.check_creator.outputs.exists == 'true' + shell: bash + run: | + set -euo pipefail + I="${RUNNER_TEMP}/arch-gate-initial" + C="${RUNNER_TEMP}/arch-gate-changed" + rm -rf "$I" "$C" + + # pull_request: SHAs provided directly by GitHub event context. + # workflow_dispatch: fetch base branch explicitly by name. 
+ + PR_BASE="${{ github.event.pull_request.base.sha }}" + PR_HEAD="${{ github.event.pull_request.head.sha }}" + + if [ -n "${PR_BASE}" ]; then + BASE_SHA="${PR_BASE}" + HEAD_SHA="${PR_HEAD}" + else + BASE_INPUT="${{ inputs.base_sha || 'main' }}" + git fetch --no-tags --depth=50 origin "${BASE_INPUT}" + BASE_SHA=$(git rev-parse --verify --quiet "origin/${BASE_INPUT}^{commit}" || git rev-parse --verify "${BASE_INPUT}^{commit}") # accepts a branch name or a raw SHA, as the base_sha input promises + HEAD_INPUT="${{ inputs.head_sha }}" + HEAD_SHA=$(git rev-parse "${HEAD_INPUT:-HEAD}") + fi + + echo "Baseline: ${BASE_SHA}" + echo "Head: ${HEAD_SHA}" + + git worktree add "$I" "${BASE_SHA}" + git worktree add "$C" "${HEAD_SHA}" + + git diff "${BASE_SHA}".."${HEAD_SHA}" > "${RUNNER_TEMP}/agent.diff" + echo "Agent diff lines: $(wc -l < "${RUNNER_TEMP}/agent.diff")" + + echo "ARCH_GATE_REPO_INITIAL=${I}" >> "${GITHUB_ENV}" + echo "ARCH_GATE_REPO_CHANGED=${C}" >> "${GITHUB_ENV}" + echo "ARCH_GATE_CHANGE_DIFF=${RUNNER_TEMP}/agent.diff" >> "${GITHUB_ENV}" + + - name: Set creator diff and log path + if: steps.check_creator.outputs.exists == 'true' + shell: bash + run: | + set -euo pipefail + CREATOR_PATH="${{ inputs.creator_diff_path || '.github/patches/creator.diff' }}" + echo "ARCH_GATE_JSON_LOG=${GITHUB_WORKSPACE}/architectural-gate-log.json" >> "${GITHUB_ENV}" + echo "ARCH_GATE_CREATOR_DIFF=${GITHUB_WORKSPACE}/${CREATOR_PATH}" >> "${GITHUB_ENV}" + + - name: Run Architectural Gate (Phase B) + if: steps.check_creator.outputs.exists == 'true' + run: python -m architectural_gate + + - name: Upload gate log + uses: actions/upload-artifact@v4 + if: always() && steps.check_creator.outputs.exists == 'true' # always(): keep the log even when the gate step fails + with: + name: architectural-gate-log-phase-b + path: architectural-gate-log.json + if-no-files-found: ignore diff --git a/mcpdoc/cli.py b/mcpdoc/cli.py index 2634dd4..7ea5c3e 100644 --- a/mcpdoc/cli.py +++ b/mcpdoc/cli.py @@ -165,6 +165,23 @@ def load_config_file(file_path: str, file_format: str) -> List[Dict[str, str]]: sys.exit(1) +def _is_label_prefix(entry: str) -> bool: + """Return True only if the first colon in entry is a label 
separator. + + Colons that are NOT label separators: + - URL schemes: http:, https:, file: + - Windows drive letters: C:, D:, etc. (single alpha char before colon) + """ + if ":" not in entry: + return False + if entry.startswith(("http:", "https:", "file:")): + return False + prefix = entry.split(":", 1)[0] + if len(prefix) == 1 and prefix.isalpha(): + return False + return True + + def create_doc_sources_from_urls(urls: List[str]) -> List[DocSource]: """Create doc sources from a list of URLs or file paths with optional names. @@ -179,12 +196,12 @@ def create_doc_sources_from_urls(urls: List[str]) -> List[DocSource]: for entry in urls: if not entry.strip(): continue - if ":" in entry and not entry.startswith(("http:", "https:")): - # Format is name:url + if _is_label_prefix(entry): + # Format is name:url_or_path name, url = entry.split(":", 1) doc_sources.append({"name": name, "llms_txt": url}) else: - # Format is just url + # Format is just url or path (file:, http:, https:, Windows drive, unix path) doc_sources.append({"llms_txt": entry}) return doc_sources diff --git a/tests/unit_tests/test_cli.py b/tests/unit_tests/test_cli.py new file mode 100644 index 0000000..adc35a4 --- /dev/null +++ b/tests/unit_tests/test_cli.py @@ -0,0 +1,130 @@ +"""Tests for mcpdoc.cli module — focused on create_doc_sources_from_urls parsing. + +These are fail-to-pass (F2P) tests: they fail on main (bug present) and pass +on the aneesh-fix branch (bug fixed). + +Bug: --urls tokens are split on the first ':' even when that colon is part of +a 'file:' URL scheme or a Windows drive letter, not a label separator. 
+""" + +from mcpdoc.cli import create_doc_sources_from_urls + + +# --------------------------------------------------------------------------- +# file: URL handling +# --------------------------------------------------------------------------- + + +def test_file_url_not_split_into_label(): + """file:///path/to/llms.txt must NOT be treated as label='file', url='///path/to/llms.txt'. + + On main this fails because 'file:' is not excluded from the colon-split logic. + """ + sources = create_doc_sources_from_urls(["file:///path/to/llms.txt"]) + assert len(sources) == 1 + assert sources[0]["llms_txt"] == "file:///path/to/llms.txt" + assert "name" not in sources[0] + + +def test_file_url_preserves_full_value(): + """The full file: URL must reach llms_txt unchanged.""" + url = "file:///home/user/docs/llms.txt" + sources = create_doc_sources_from_urls([url]) + assert sources[0]["llms_txt"] == url + + +# --------------------------------------------------------------------------- +# Windows drive path handling +# --------------------------------------------------------------------------- + + +def test_windows_drive_path_not_split_into_label(): + """C:/Users/docs/llms.txt must NOT be treated as label='C', url='/Users/docs/llms.txt'. + + On main this fails because a single-letter prefix before ':' is not excluded. + """ + sources = create_doc_sources_from_urls(["C:/Users/docs/llms.txt"]) + assert len(sources) == 1 + assert sources[0]["llms_txt"] == "C:/Users/docs/llms.txt" + assert "name" not in sources[0] + + +def test_windows_drive_path_lowercase(): + """Lowercase drive letter (c:/...) 
should also be treated as a path, not a label.""" + sources = create_doc_sources_from_urls(["c:/path/to/llms.txt"]) + assert len(sources) == 1 + assert sources[0]["llms_txt"] == "c:/path/to/llms.txt" + assert "name" not in sources[0] + + +# --------------------------------------------------------------------------- +# Correct label:url format must still work +# --------------------------------------------------------------------------- + + +def test_label_with_http_url(): + """LangGraph:https://example.com/llms.txt must still parse label + URL correctly.""" + sources = create_doc_sources_from_urls( + ["LangGraph:https://langchain-ai.github.io/langgraph/llms.txt"] + ) + assert len(sources) == 1 + assert sources[0]["name"] == "LangGraph" + assert sources[0]["llms_txt"] == "https://langchain-ai.github.io/langgraph/llms.txt" + + +def test_label_with_local_path(): + """MyDocs:/path/to/llms.txt — multi-char label with unix path must split correctly.""" + sources = create_doc_sources_from_urls(["MyDocs:/path/to/llms.txt"]) + assert len(sources) == 1 + assert sources[0]["name"] == "MyDocs" + assert sources[0]["llms_txt"] == "/path/to/llms.txt" + + +# --------------------------------------------------------------------------- +# Plain URLs / paths with no label +# --------------------------------------------------------------------------- + + +def test_plain_http_url_no_label(): + """A bare https URL with no label prefix must be stored as-is.""" + sources = create_doc_sources_from_urls( + ["https://langchain-ai.github.io/langgraph/llms.txt"] + ) + assert len(sources) == 1 + assert sources[0]["llms_txt"] == "https://langchain-ai.github.io/langgraph/llms.txt" + assert "name" not in sources[0] + + +def test_plain_unix_path_no_label(): + """A bare unix path with no colon must be stored as-is.""" + sources = create_doc_sources_from_urls(["/home/user/llms.txt"]) + assert len(sources) == 1 + assert sources[0]["llms_txt"] == "/home/user/llms.txt" + assert "name" not in sources[0] + 
+ +# --------------------------------------------------------------------------- +# Mixed input — multiple entries together +# --------------------------------------------------------------------------- + + +def test_mixed_entries_parsed_correctly(): + """When file: URL, Windows path, and a labelled URL are combined, all parse correctly. + + On main this fails because the first two entries are mis-parsed. + """ + inputs = [ + "file:///opt/docs/llms.txt", + "C:/docs/llms.txt", + "LangGraph:https://langchain-ai.github.io/langgraph/llms.txt", + "https://plain-url.com/llms.txt", + ] + sources = create_doc_sources_from_urls(inputs) + + assert sources[0] == {"llms_txt": "file:///opt/docs/llms.txt"} + assert sources[1] == {"llms_txt": "C:/docs/llms.txt"} + assert sources[2] == { + "name": "LangGraph", + "llms_txt": "https://langchain-ai.github.io/langgraph/llms.txt", + } + assert sources[3] == {"llms_txt": "https://plain-url.com/llms.txt"}