From b3dbd0e42862110bb414cf3ba99485024da992b5 Mon Sep 17 00:00:00 2001 From: Christopher Lusk Date: Sun, 22 Mar 2026 09:45:00 -0400 Subject: [PATCH 01/10] =?UTF-8?q?feat:=20fluxgate=20v0.4.0=20=E2=80=94=20c?= =?UTF-8?q?lose=20mitigation=20detection=20gaps=20from=20manual=20triage?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Four new detection capabilities based on Red Hat triage analysis: - Gap 1: Actor guards — detect github.actor == 'bot[bot]' gates (→ info) and human actor restrictions (→ downgrade by 1) - Gap 2: Action-based permission gates — recognize actions-cool/check-user-permission and similar third-party permission-checking actions as maintainer checks - Gap 3: Cross-job needs: gating — follow needs: chains to detect environment approval gates on upstream authorize jobs (→ downgrade by 1) - Gap 4: Path isolation — detect fork code checked out to subdirectory with no direct execution, downgrade confidence to pattern-only Also adds fork guard to Fluxgate's own CI workflow (fixes FG-006 self-finding). Validated against 11 triage findings: 5 false criticals corrected automatically, 2 false positives eliminated, 2 clean criticals unchanged. 
Co-Authored-By: Claude Opus 4.6 --- .github/workflows/ci.yaml | 1 + cmd/fluxgate/main.go | 2 +- fluxgate-v040-mitigation-gaps-spec.md | 802 ++++++++++++++++++ internal/scanner/rules.go | 166 +++- internal/scanner/rules_test.go | 114 +++ internal/scanner/workflow.go | 26 +- .../pwn-request-action-perm-gate.yaml | 21 + .../fixtures/pwn-request-actor-guard-bot.yaml | 13 + .../pwn-request-actor-guard-human.yaml | 13 + test/fixtures/pwn-request-needs-gate.yaml | 19 + test/fixtures/pwn-request-path-exec.yaml | 11 + test/fixtures/pwn-request-path-isolated.yaml | 15 + 12 files changed, 1196 insertions(+), 7 deletions(-) create mode 100644 fluxgate-v040-mitigation-gaps-spec.md create mode 100644 test/fixtures/pwn-request-action-perm-gate.yaml create mode 100644 test/fixtures/pwn-request-actor-guard-bot.yaml create mode 100644 test/fixtures/pwn-request-actor-guard-human.yaml create mode 100644 test/fixtures/pwn-request-needs-gate.yaml create mode 100644 test/fixtures/pwn-request-path-exec.yaml create mode 100644 test/fixtures/pwn-request-path-isolated.yaml diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index 789b3fc..1a371ea 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -11,6 +11,7 @@ permissions: jobs: build-and-test: + if: github.event.pull_request.head.repo.full_name == github.repository || github.event_name == 'push' runs-on: ubuntu-latest steps: - name: Checkout diff --git a/cmd/fluxgate/main.go b/cmd/fluxgate/main.go index 6fe0f62..ec34177 100644 --- a/cmd/fluxgate/main.go +++ b/cmd/fluxgate/main.go @@ -14,7 +14,7 @@ import ( "github.com/spf13/cobra" ) -var version = "0.3.0" +var version = "0.4.0" func main() { rootCmd := &cobra.Command{ diff --git a/fluxgate-v040-mitigation-gaps-spec.md b/fluxgate-v040-mitigation-gaps-spec.md new file mode 100644 index 0000000..b62138a --- /dev/null +++ b/fluxgate-v040-mitigation-gaps-spec.md @@ -0,0 +1,802 @@ +# Fluxgate v0.4.0 — Mitigation Detection Gaps + +**Date:** 2026-03-22 
+**Author:** Generated from triage-analysis.txt findings +**Status:** Spec + +--- + +## Problem Statement + +Manual triage of 11 FG-001 findings from the Red Hat rescan revealed four classes +of defensive controls that Fluxgate v0.3.0 does not detect. This causes: + +- **False criticals:** Repos with solid gates (rhdh-operator environment approvals, + image-builder-cli permission checks) report as critical instead of medium/high. +- **False positives:** Bot-gated workflows (backstage-community-plugins, + rhdh-plugins) that can never fire from external forks report as critical + instead of being suppressed. +- **Missed nuance:** Data-only fork checkouts (quarkusio l10n repos) where base + branch scripts run on fork *data*, not fork *code*, report as confirmed + code execution. + +The manual triage downgraded or dismissed 9 of 11 findings. Fluxgate should +catch at least 7 of these automatically. + +--- + +## Gap 1: Actor Guards + +### What Fluxgate misses + +Job-level `if:` conditions that restrict execution to specific bot accounts: + +```yaml +if: github.actor == 'renovate[bot]' && github.repository == 'redhat-developer/rhdh-plugins' +``` + +```yaml +if: github.actor == 'backstage-goalie[bot]' && github.repository == 'backstage/community-plugins' +``` + +These make the workflow unreachable from external fork PRs — only the named bot +can trigger execution. + +### Real-world examples from triage + +| # | Repo | Guard | Triage result | +|---|------|-------|---------------| +| 7 | redhat-appstudio/backstage-community-plugins | `github.actor == 'backstage-goalie[bot]'` | FALSE POSITIVE | +| 11 | redhat-developer/rhdh-plugins | `github.actor == 'renovate[bot]'` | FALSE POSITIVE | + +### Detection approach + +Add actor guard detection to `analyzeMitigations` alongside the existing fork +guard and label check logic. 
+ +**New field on `MitigationAnalysis`:** + +```go +ActorGuard bool // Job if: restricts execution to specific actor(s) +``` + +**New helper function `containsActorGuard`:** + +```go +func containsActorGuard(ifExpr string) bool { + // Match patterns like: + // github.actor == 'name[bot]' + // github.actor == "name[bot]" + // github.triggering_actor == 'name[bot]' + // Only treat as a guard if the actor is a bot account ([bot] suffix) + // or a known service account pattern. + actorPatterns := []string{ + "github.actor ==", + "github.triggering_actor ==", + } + lower := strings.ToLower(ifExpr) + for _, p := range actorPatterns { + if strings.Contains(lower, p) && strings.Contains(lower, "[bot]") { + return true + } + } + return false +} +``` + +**Severity adjustment:** + +Actor guard to a bot account is a strong defense — external users cannot +impersonate GitHub App bots. Treat as equivalent to fork guard: + +``` +ActorGuard (bot) → severity = info, confidence = pattern-only +``` + +Rationale: A `github.actor == 'renovate[bot]'` check is *stronger* than a fork +guard — it restricts to a single bot identity, not just "any internal +contributor." The only bypass requires compromising the bot's GitHub App +credentials. + +**Edge case — non-bot actors:** + +If the `if:` checks `github.actor` against a human username (not `[bot]`), treat +it as a weaker mitigation (equivalent to maintainer check, downgrade by 1). Human +accounts can be compromised via phishing, leaked credentials, etc. + +```go +func containsActorGuard(ifExpr string) (isBot bool, isHuman bool) { + // Returns (true, false) for bot guards, (false, true) for human guards +} +``` + +**Also detect repo guards:** + +Several actor-gated workflows also include `github.repository == 'org/repo'` +checks. 
When the workflow is in a *different* repo (e.g., backstage-community-plugins +has `github.repository == 'backstage/community-plugins'` but lives in +redhat-appstudio), the job can *never* execute — it's dead code. Fluxgate +could detect this by comparing the guard value against the scanned repo's +`owner/name`, but this requires passing the repo identity into the scanner. + +For now, note this as a possible enhancement. The actor guard alone is sufficient +to suppress the false positive. + +### Parser changes + +None. The existing `job.If` field already captures the raw `if:` string. The +new detection is purely in `analyzeMitigations`. + +### Test fixtures needed + +**`test/fixtures/pwn-request-actor-guard-bot.yaml`:** +```yaml +name: Bot Only +on: + pull_request_target: + paths: ['**/yarn.lock'] +jobs: + generate: + runs-on: ubuntu-latest + if: github.actor == 'renovate[bot]' + steps: + - uses: actions/checkout@v4 + with: + ref: ${{ github.head_ref }} + - run: node ./scripts/generate-changesets.js +``` +Expected: FG-001 info severity (actor guard to bot). + +**`test/fixtures/pwn-request-actor-guard-human.yaml`:** +```yaml +name: Allowed User +on: + pull_request_target: + types: [opened, synchronize] +jobs: + build: + runs-on: ubuntu-latest + if: contains(fromJSON('["alice","bob"]'), github.actor) + steps: + - uses: actions/checkout@v4 + with: + ref: ${{ github.event.pull_request.head.sha }} + - run: make build +``` +Expected: FG-001 high severity (downgraded by 1 for human actor guard). 
+ +--- + +## Gap 2: Action-Based Permission Gates + +### What Fluxgate misses + +Pre-checkout steps that use third-party actions to verify the triggering user +has write/admin permission on the repo: + +```yaml +- name: Get User Permission + id: checkAccess + uses: actions-cool/check-user-permission@v2 + with: + require: write + username: ${{ github.triggering_actor }} +- name: Check User Permission + if: steps.checkAccess.outputs.require-result == 'false' + run: exit 1 +``` + +This is functionally identical to the `getCollaboratorPermissionLevel` pattern +that Fluxgate already detects via `containsMaintainerCheck`, but uses a +pre-built action instead of `actions/github-script`. + +### Real-world examples from triage + +| # | Repo | Action | Triage result | +|---|------|--------|---------------| +| 3 | osbuild/image-builder-cli | `actions-cool/check-user-permission@v2` | HIGH (mitigated) | + +### Detection approach + +Expand `containsMaintainerCheck` to recognize common permission-checking actions +by their action reference, not just by script content. + +**Updated `containsMaintainerCheck`:** + +```go +func containsMaintainerCheck(step Step) bool { + // 1. Check for known permission-checking actions + permissionActions := []string{ + "actions-cool/check-user-permission", + "prince-chrismc/check-actor-permissions-action", + "lannonbr/repo-permission-check-action", + "TheModdingInquisition/actions-team-membership", + } + for _, action := range permissionActions { + if strings.Contains(step.Uses, action) { + return true + } + } + + // 2. Check for permission verification in subsequent if: conditions + // (the step after the action checks outputs.require-result) + // — handled separately, see below + + // 3. 
Existing: check script content for API calls + checkPatterns := []string{ + "getCollaboratorPermissionLevel", + "repos.getCollaboratorPermission", + "permission.permission", + } + searchText := step.Run + if step.Uses != "" && strings.Contains(step.Uses, "actions/github-script") { + if script, ok := step.With["script"]; ok { + searchText = script + } + } + for _, p := range checkPatterns { + if strings.Contains(searchText, p) { + return true + } + } + return false +} +``` + +**Additional detection — exit-on-fail pattern:** + +The permission action alone is not a gate unless a subsequent step exits on +failure. Look for the common pattern: + +```yaml +- if: steps.checkAccess.outputs.require-result == 'false' + run: exit 1 +``` + +Update `analyzeMitigations` to scan pre-checkout steps as a sequence: if step N +is a permission action and step N+1 has an `if:` that references the permission +step's output and runs `exit 1`, treat the pair as a maintainer check. + +```go +// In analyzeMitigations, after the existing pre-checkout loop: +for i, step := range preCheckoutSteps { + if containsMaintainerCheck(step) { + m.MaintainerCheck = true + m.Details = append(m.Details, fmt.Sprintf( + "pre-checkout permission check via %s (line %d)", + truncate(step.Uses, 60), step.Line, + )) + break + } + // Also check for exit-on-fail after a permission action + if i+1 < len(preCheckoutSteps) { + next := preCheckoutSteps[i+1] + if strings.Contains(next.If, step.Uses) || strings.Contains(next.If, "checkAccess") { + if strings.Contains(next.Run, "exit 1") { + m.MaintainerCheck = true + m.Details = append(m.Details, fmt.Sprintf( + "pre-checkout permission gate with exit-on-fail (line %d)", + step.Line, + )) + break + } + } + } +} +``` + +**Severity adjustment:** + +Same as existing maintainer check — downgrade by 1 level. + +### Parser changes + +None. Step `Uses`, `If`, and `Run` fields are already captured. 
+ +### Test fixtures needed + +**`test/fixtures/pwn-request-action-perm-gate.yaml`:** +```yaml +name: RHEL Test +on: + pull_request_target: + types: [opened, synchronize, reopened] +jobs: + test: + runs-on: ubuntu-latest + steps: + - name: Get User Permission + id: checkAccess + uses: actions-cool/check-user-permission@v2 + with: + require: write + username: ${{ github.triggering_actor }} + - name: Check User Permission + if: steps.checkAccess.outputs.require-result == 'false' + run: exit 1 + - uses: actions/checkout@v4 + with: + ref: ${{ github.event.pull_request.head.sha }} + - run: pytest test/ +``` +Expected: FG-001 high severity (downgraded from critical by 1 for maintainer +check via action). + +--- + +## Gap 3: Cross-Job `needs:` Gating (Authorize Pattern) + +### What Fluxgate misses + +A common pattern where an "authorize" job gates all subsequent jobs via `needs:`. +The authorize job uses GitHub environment protection to require manual approval +for external fork PRs: + +```yaml +jobs: + authorize: + environment: + ${{ (github.event.pull_request.head.repo.full_name == github.repository || + contains(fromJSON('["user1","user2"]'), github.event.pull_request.user.login)) + && 'internal' || 'external' }} + runs-on: ubuntu-latest + steps: + - run: echo "✓" + + build: + needs: authorize # <-- THIS IS THE GATE + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + with: + ref: ${{ github.event.pull_request.head.sha }} + - run: make build +``` + +The `build` job cannot run until `authorize` completes. The `authorize` job uses +the `external` environment (configured with required reviewers) for fork PRs. +This is a solid gate — a maintainer must manually approve before fork code +executes. + +Fluxgate currently checks `job.Environment` on the *executing* job but misses +the case where the environment gate is on an *upstream dependency* job. 
+ +### Real-world examples from triage + +| # | Repo | Pattern | Triage result | +|---|------|---------|---------------| +| 8 | redhat-developer/rhdh-operator | `needs: authorize` → env `internal`/`external` | MEDIUM-HIGH | +| 9 | redhat-developer/rhdh-operator | `needs: authorize` → env `internal`/`external` | MEDIUM-HIGH | +| 10 | redhat-developer/rhdh-operator | `needs: authorize` → env `internal`/`external` | MEDIUM-HIGH | + +All three rhdh-operator findings should have been downgraded from critical. + +### Detection approach + +#### Parser changes — add `Needs` field to Job + +**Updated `rawJob`:** +```go +type rawJob struct { + Name string `yaml:"name"` + If string `yaml:"if"` + Environment yaml.Node `yaml:"environment"` + Permissions yaml.Node `yaml:"permissions"` + Steps []rawStep `yaml:"steps"` + Secrets string `yaml:"secrets"` + Needs yaml.Node `yaml:"needs"` // NEW +} +``` + +**Updated `Job`:** +```go +type Job struct { + Name string + If string + Environment string + Permissions PermissionsConfig + Steps []Step + Secrets string + Needs []string // NEW — list of job IDs this job depends on +} +``` + +**Parsing `needs:`:** + +The `needs` field can be either a string or a list: + +```yaml +needs: authorize # string +needs: [authorize, lint] # list +``` + +```go +func parseNeeds(node yaml.Node) []string { + if node.Kind == yaml.ScalarNode { + return []string{node.Value} + } + if node.Kind == yaml.SequenceNode { + var needs []string + for _, n := range node.Content { + needs = append(needs, n.Value) + } + return needs + } + return nil +} +``` + +#### Mitigation detection — follow `needs:` chain + +**New field on `MitigationAnalysis`:** +```go +NeedsGate bool // Job depends on an upstream job with environment/fork guard +``` + +**Updated `analyzeMitigations` signature:** + +The function currently receives a single `Job`. 
To follow `needs:` chains, it +needs access to all jobs in the workflow: + +```go +func analyzeMitigations(wf *Workflow, job Job, checkoutIdx int, + postCheckoutSteps []Step, allJobs map[string]Job) MitigationAnalysis { +``` + +**New logic in `analyzeMitigations`:** + +```go +// 6. Check needs: chain for upstream environment/fork gates +for _, depName := range job.Needs { + if dep, ok := allJobs[depName]; ok { + if dep.Environment != "" { + m.NeedsGate = true + m.EnvironmentGated = true + m.Details = append(m.Details, fmt.Sprintf( + "depends on job '%s' with environment '%s' (requires approval)", + depName, dep.Environment, + )) + } + if dep.If != "" && containsForkGuard(dep.If) { + m.NeedsGate = true + m.ForkGuard = true + m.Details = append(m.Details, fmt.Sprintf( + "depends on job '%s' with fork guard", + depName, + )) + } + } +} +``` + +**Handling dynamic environment expressions:** + +The rhdh-operator authorize job uses a *ternary expression* for the environment +name: + +```yaml +environment: + ${{ (...) && 'internal' || 'external' }} +``` + +This evaluates to either `internal` or `external` at runtime. The parser +currently captures this as the literal string +`${{ (...) && 'internal' || 'external' }}`. That's fine — any non-empty +environment value should be treated as potentially gated. The key signal is +that an environment exists, not which specific environment name it is. + +**Depth limit:** + +Only follow one level of `needs:`. Deeper chains are rare and add complexity +without proportional value. If job A `needs: B` and job B `needs: C`, only +check B's gates, not C's. 
+ +**Severity adjustment:** + +Same as direct environment gate — the protection is equally strong whether it's +on the job itself or an upstream dependency: + +``` +NeedsGate with environment → same as EnvironmentGated (downgrade by 1) +NeedsGate with fork guard → same as ForkGuard (severity = info) +``` + +### Caller changes + +`CheckPwnRequest` needs to pass `wf.Jobs` (as a map) to `analyzeMitigations`. +The `Workflow.Jobs` field is currently a `[]Job` (slice). Either: + +(a) Change `Workflow.Jobs` to `map[string]Job` — breaking change but more + natural for job lookups. +(b) Keep the slice, add a `JobsByName map[string]Job` field populated during + parsing. +(c) Build the map in `CheckPwnRequest` before calling `analyzeMitigations`. + +**Recommendation:** Option (c) — minimal blast radius. Build a local map: + +```go +jobMap := make(map[string]Job, len(wf.Jobs)) +for _, j := range wf.Jobs { + jobMap[j.Name] = j +} +``` + +**Important:** The job map key must be the YAML key (e.g., `authorize`), not the +`name:` field (e.g., `"PR Bundle Manifests Validator"`). Currently `Job.Name` is +set from the YAML key in the parser. Verify this is the case. + +### Test fixtures needed + +**`test/fixtures/pwn-request-needs-gate.yaml`:** +```yaml +name: Gated Build +on: + pull_request_target: + types: [opened, synchronize, reopened] +jobs: + authorize: + environment: + ${{ github.event.pull_request.head.repo.full_name == github.repository && 'internal' || 'external' }} + runs-on: ubuntu-latest + steps: + - run: echo "approved" + build: + needs: authorize + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + with: + ref: ${{ github.event.pull_request.head.sha }} + - run: make build +``` +Expected: FG-001 high severity (downgraded from critical by 1 for environment +gate via needs chain). Mitigations should list: "depends on job 'authorize' with +environment '...' (requires approval)". 
+ +**`test/fixtures/pwn-request-needs-fork-guard.yaml`:** +```yaml +name: Fork Gated +on: + pull_request_target: + types: [opened, synchronize] +jobs: + check: + if: github.event.pull_request.head.repo.full_name == github.repository + runs-on: ubuntu-latest + steps: + - run: echo "internal PR" + build: + needs: check + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + with: + ref: ${{ github.event.pull_request.head.sha }} + - run: npm ci && npm test +``` +Expected: FG-001 info severity (fork guard via needs chain). + +--- + +## Gap 4: Data-Only Fork Checkout + +### What Fluxgate misses + +Workflows that checkout fork code to a subdirectory and only copy *data files* +from it, while executing scripts exclusively from the base branch: + +```yaml +steps: + - name: Checkout base branch + uses: actions/checkout@v3 # base branch (safe) + - name: Checkout fork to 'merged' dir + uses: actions/checkout@v3 + with: + ref: ${{ github.event.pull_request.head.sha }} + path: merged # fork code isolated in subdir + - run: cp -r merged/l10n ./ # copy DATA only + - run: bin/setup-build-env-on-ubuntu # base branch script + - run: vendor/quarkus-l10n-utils/bin/build-for-preview # base branch script +``` + +Fluxgate sees `actions/checkout` with `ref: PR head` and flags it. Then +`analyzePostCheckoutExecution` sees `run:` blocks and classifies them as +confirmed execution. But the `run:` blocks execute base branch scripts, not +fork code. + +### Real-world examples from triage + +| # | Repo | Pattern | Triage result | +|---|------|---------|---------------| +| 5 | quarkusio/ja.quarkus.io | fork to `merged/`, copy `l10n/`, base scripts | HIGH (pattern-only) | +| 6 | quarkusio/pt.quarkus.io | same pattern | HIGH (pattern-only) | + +### Detection approach + +This is the hardest gap to close reliably. 
The challenge is distinguishing: + +- `cp -r merged/l10n ./` (data copy — no execution) +- `cd merged && make build` (direct fork code execution) +- `pip install -e merged/` (indirect fork code execution via setup.py) + +#### Approach: Detect `path:` isolation on checkout + +When a checkout uses the `path:` parameter, fork code is isolated in that +subdirectory. If no subsequent `run:` block references that path in an +execution context, the risk is lower. + +**Step 1 — Track checkout path in `CheckPwnRequest`:** + +```go +checkoutPath := step.With["path"] // e.g., "merged" +``` + +**Step 2 — Classify post-checkout commands by fork code interaction:** + +Add a new analysis function: + +```go +type ForkCodeInteraction int + +const ( + ForkExecDirect ForkCodeInteraction = iota // cd merged && make + ForkExecIndirect // pip install merged/ + ForkDataOnly // cp merged/data ./ + ForkNoInteraction // runs unrelated commands +) + +func classifyForkInteraction(step Step, checkoutPath string) ForkCodeInteraction +``` + +**Classification rules:** + +| Pattern | Classification | Example | +|---------|---------------|---------| +| `cd <path>` followed by build command | ForkExecDirect | `cd merged && make build` | +| `<path>/script` or `./<path>/script` | ForkExecDirect | `./merged/build.sh` | +| `pip install <path>` / `pip install -e <path>` | ForkExecIndirect | `pip install -e merged/` | +| `npm install --prefix <path>` | ForkExecIndirect | | +| `go test ./<path>/...` | ForkExecDirect | | +| `cp -r <path>/subdir ./` | ForkDataOnly | `cp -r merged/l10n ./` | +| `mv <path>/file ./` | ForkDataOnly | | +| `rsync <path>/dir ./dest` | ForkDataOnly | | +| No reference to `<path>` | ForkNoInteraction | `bin/setup-build-env` | + +**Step 3 — Adjust confidence and severity:** + +If the checkout has a `path:` and all post-checkout steps are `ForkDataOnly` or +`ForkNoInteraction`: + +```go +if checkoutPath != "" && allStepsAreDataOnlyOrNoInteraction { + confidence = ConfidencePatternOnly + severity = downgradeBy(severity, 1) + // Add detail: "fork code 
checked out to 'merged/' — only data files copied" +} +``` + +### Caveats and limitations + +1. **Symlink attacks:** `cp -r merged/l10n ./` will follow symlinks. A malicious + fork could place a symlink at `merged/l10n/evil -> ../../.github/workflows/` + to overwrite base branch workflow files. Fluxgate should note this as a + residual risk in the details, not suppress the finding entirely. + +2. **Build tool config poisoning:** Even if only data files are copied, some + build tools read config from the working directory. If `merged/l10n/` contains + a `.babelrc`, `tsconfig.json`, or `Makefile`, the base branch build scripts + might pick it up. This is hard to detect statically. + +3. **False negatives:** If the classification is wrong (e.g., a `cp` command + copies executable scripts that are later run), we'd suppress a real finding. + +**Recommendation:** Implement the `path:` isolation detection but only downgrade +confidence, not severity. Change the finding message to indicate the fork code +is path-isolated but note the residual symlink/config risk. + +```go +if checkoutPath != "" && noDirectForkExec { + confidence = ConfidencePatternOnly // not "confirmed" + msg += " (fork code isolated to '" + checkoutPath + "/' — " + + "no direct execution detected, verify data-only usage)" +} +``` + +This avoids the false "confirmed" classification while keeping the finding +visible for manual review. + +### Parser changes + +None. The `With` map already captures `path`. + +### Test fixtures needed + +**`test/fixtures/pwn-request-path-isolated.yaml`:** +```yaml +name: L10n Preview +on: pull_request_target +jobs: + deploy: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - uses: actions/checkout@v4 + with: + ref: ${{ github.event.pull_request.head.sha }} + path: merged + - run: | + rm -rf l10n + cp -r merged/l10n ./ + - run: bin/build-preview +``` +Expected: FG-001 high severity, confidence pattern-only (not confirmed). Details +should note path isolation. 
+ +**`test/fixtures/pwn-request-path-exec.yaml`:** +```yaml +name: Fork Build +on: pull_request_target +jobs: + build: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + with: + ref: ${{ github.event.pull_request.head.sha }} + path: pr + - run: cd pr && make build +``` +Expected: FG-001 critical, confidence confirmed. Path isolation does not help +when the fork code is directly executed. + +--- + +## Implementation Priority + +| Gap | Impact | Effort | Priority | +|-----|--------|--------|----------| +| Gap 1: Actor Guards | 2 false positives eliminated | Small — string matching only | P1 | +| Gap 3: Needs Gating | 3 false criticals fixed | Medium — parser + detection | P1 | +| Gap 2: Action Perm Gates | 1 false critical fixed | Small — pattern list expansion | P2 | +| Gap 4: Path Isolation | 2 confidence corrections | Large — command classification | P3 | + +**Recommended order:** Gap 1 → Gap 3 → Gap 2 → Gap 4 + +Gaps 1-3 are high-confidence detections with clear true/false semantics. Gap 4 +involves heuristic command classification and should be approached conservatively. 
+ +--- + +## Validation Plan + +After implementation, rescan all 11 triage repos and verify: + +| # | Repo | v0.3.0 | Expected v0.4.0 | +|---|------|--------|-----------------| +| 1 | konflux-ci/konflux-ui | critical | critical (custom shell gate — not detectable without deep analysis) | +| 2 | openshift/openstack-resource-controller | critical | critical (no change — unmitigated) | +| 3 | osbuild/image-builder-cli | critical | high (action perm gate detected) | +| 4 | osbuild/images | critical (×2) | critical (no change — unmitigated) | +| 5 | quarkusio/ja.quarkus.io | critical | critical or high, confidence → pattern-only (path isolation) | +| 6 | quarkusio/pt.quarkus.io | critical | critical or high, confidence → pattern-only (path isolation) | +| 7 | redhat-appstudio/backstage-community-plugins | critical | info (actor guard to bot) | +| 8 | redhat-developer/rhdh-operator | critical | high (needs gate → environment) | +| 9 | redhat-developer/rhdh-operator | critical | high (needs gate → environment) | +| 10 | redhat-developer/rhdh-operator | critical | high (needs gate → environment) | +| 11 | redhat-developer/rhdh-plugins | critical | info (actor guard to bot) | + +Also rescan the 20 validation repos to verify no regressions. + +--- + +## Non-Goals + +- **Custom shell script analysis** (konflux-ui's bash-based allowed users list) — + too fragile to parse arbitrary shell logic. The allowed users pattern in #1 is + bespoke and rare. +- **GitHub repository settings detection** (required reviewers, branch protection) — + this requires API calls beyond workflow file analysis. +- **Deep `needs:` chain traversal** — only follow one level. Multi-hop chains + are rare. +- **Makefile/script content analysis** — analyzing what `make build` actually + executes is out of scope for a workflow-level scanner. 
diff --git a/internal/scanner/rules.go b/internal/scanner/rules.go index d5a40e2..15879d8 100644 --- a/internal/scanner/rules.go +++ b/internal/scanner/rules.go @@ -46,7 +46,11 @@ type MitigationAnalysis struct { EnvironmentGated bool MaintainerCheck bool ForkGuard bool + ActorGuard bool // Job if: restricts execution to specific bot actor(s) + ActorGuardHuman bool // Job if: restricts to specific human actor(s) (weaker) + NeedsGate bool // Job depends on upstream job with environment/fork gate TokenBlanked bool + PathIsolated bool // Fork code checked out to subdirectory, no direct execution Details []string } @@ -102,12 +106,14 @@ func CheckPwnRequest(wf *Workflow) []Finding { checkoutIdx := -1 checkoutLine := 0 checkoutRef := "" + checkoutPath := "" for i, step := range job.Steps { if isCheckoutAction(step.Uses) && refPointsToPRHead(step.With["ref"]) { checkoutIdx = i checkoutLine = step.Line checkoutRef = step.With["ref"] + checkoutPath = step.With["path"] break } } @@ -132,21 +138,27 @@ func CheckPwnRequest(wf *Workflow) []Finding { } // Analyze mitigations and adjust severity - mitigation := analyzeMitigations(wf, job, checkoutIdx, postCheckoutSteps) + mitigation := analyzeMitigations(wf, job, checkoutIdx, postCheckoutSteps, checkoutPath) mitigated := false - if mitigation.ForkGuard { + if mitigation.ForkGuard || mitigation.ActorGuard { severity = SeverityInfo confidence = ConfidencePatternOnly mitigated = true } else if mitigation.LabelGated && mitigation.EnvironmentGated { severity = downgradeBy(severity, 2) mitigated = true - } else if mitigation.LabelGated || mitigation.EnvironmentGated || mitigation.MaintainerCheck { + } else if mitigation.LabelGated || mitigation.EnvironmentGated || mitigation.MaintainerCheck || mitigation.ActorGuardHuman { severity = downgradeBy(severity, 1) mitigated = true } + // Path isolation adjusts confidence, not severity + if mitigation.PathIsolated && confidence == ConfidenceConfirmed { + confidence = ConfidencePatternOnly + 
mitigated = true + } + msg := fmt.Sprintf("Pwn Request: pull_request_target with fork checkout [%s]", confidence) if execResult.Detail != "" { msg += " — " + execResult.Detail @@ -629,7 +641,7 @@ func CheckTokenExposure(wf *Workflow) []Finding { } // analyzeMitigations detects defensive controls on a workflow/job. -func analyzeMitigations(wf *Workflow, job Job, checkoutIdx int, postCheckoutSteps []Step) MitigationAnalysis { +func analyzeMitigations(wf *Workflow, job Job, checkoutIdx int, postCheckoutSteps []Step, checkoutPath string) MitigationAnalysis { m := MitigationAnalysis{} // 1. Check pull_request_target types filter @@ -638,7 +650,7 @@ func analyzeMitigations(wf *Workflow, job Job, checkoutIdx int, postCheckoutStep m.Details = append(m.Details, "trigger requires 'labeled' event (maintainer action)") } - // 2. Check job-level if: for label gates and fork guards + // 2. Check job-level if: for label gates, fork guards, and actor guards if job.If != "" { if containsLabelCheck(job.If) { m.LabelGated = true @@ -648,6 +660,14 @@ func analyzeMitigations(wf *Workflow, job Job, checkoutIdx int, postCheckoutStep m.ForkGuard = true m.Details = append(m.Details, "job if: contains fork guard") } + isBot, isHuman := containsActorGuard(job.If) + if isBot { + m.ActorGuard = true + m.Details = append(m.Details, fmt.Sprintf("job if: restricts to bot actor (%s)", truncate(job.If, 80))) + } else if isHuman { + m.ActorGuardHuman = true + m.Details = append(m.Details, fmt.Sprintf("job if: restricts to specific actor(s) (%s)", truncate(job.If, 80))) + } } // 3. Check environment protection @@ -679,6 +699,50 @@ func analyzeMitigations(wf *Workflow, job Job, checkoutIdx int, postCheckoutStep m.Details = append(m.Details, "GITHUB_TOKEN explicitly blanked on execution steps") } + // 6. 
Check needs: chain for upstream environment/fork gates + for _, depName := range job.Needs { + if dep, ok := wf.Jobs[depName]; ok { + if dep.Environment != "" { + m.NeedsGate = true + m.EnvironmentGated = true + m.Details = append(m.Details, fmt.Sprintf( + "depends on job '%s' with environment '%s' (requires approval)", + depName, truncate(dep.Environment, 60))) + } + if dep.If != "" && containsForkGuard(dep.If) { + m.NeedsGate = true + m.ForkGuard = true + m.Details = append(m.Details, fmt.Sprintf( + "depends on job '%s' with fork guard", depName)) + } + if dep.If != "" { + isBot, _ := containsActorGuard(dep.If) + if isBot { + m.NeedsGate = true + m.ActorGuard = true + m.Details = append(m.Details, fmt.Sprintf( + "depends on job '%s' with bot actor guard", depName)) + } + } + } + } + + // 7. Check path isolation — fork code in subdirectory with no direct execution + if checkoutPath != "" { + hasForkExec := false + for _, step := range postCheckoutSteps { + if step.Run != "" && referencesForkPath(step.Run, checkoutPath) { + hasForkExec = true + break + } + } + if !hasForkExec { + m.PathIsolated = true + m.Details = append(m.Details, fmt.Sprintf( + "fork code checked out to '%s/' — no direct execution of fork path detected", checkoutPath)) + } + } + return m } @@ -748,8 +812,59 @@ func containsForkGuard(ifExpr string) bool { return false } +// containsActorGuard detects actor-based gating in if: conditionals. +// Returns (isBot, isHuman): bot guards are strong (info), human guards are weaker (downgrade by 1). 
+func containsActorGuard(ifExpr string) (isBot bool, isHuman bool) { + lower := strings.ToLower(ifExpr) + actorPrefixes := []string{ + "github.actor ==", + "github.triggering_actor ==", + } + hasActorCheck := false + for _, p := range actorPrefixes { + if strings.Contains(lower, p) { + hasActorCheck = true + break + } + } + if !hasActorCheck { + // Also detect contains(fromJSON(...), github.actor) pattern + if strings.Contains(lower, "github.actor") && strings.Contains(lower, "contains(") { + hasActorCheck = true + } + } + if !hasActorCheck { + return false, false + } + // Bot accounts have [bot] suffix + if strings.Contains(lower, "[bot]") { + return true, false + } + return false, true +} + // containsMaintainerCheck detects permission verification steps. func containsMaintainerCheck(step Step) bool { + // 1. Check for known permission-checking actions + permissionActions := []string{ + "actions-cool/check-user-permission", + "prince-chrismc/check-actor-permissions-action", + "lannonbr/repo-permission-check-action", + "themoddinginquisition/actions-team-membership", + } + if step.Uses != "" { + actionName := step.Uses + if idx := strings.Index(actionName, "@"); idx != -1 { + actionName = actionName[:idx] + } + for _, action := range permissionActions { + if strings.EqualFold(actionName, action) { + return true + } + } + } + + // 2. Check script content for API calls checkPatterns := []string{ "getCollaboratorPermissionLevel", "repos.getCollaboratorPermission", @@ -769,6 +884,47 @@ func containsMaintainerCheck(step Step) bool { return false } +// referencesForkPath checks if a run block executes code from the fork checkout path. 
+func referencesForkPath(run string, checkoutPath string) bool { + execPatterns := []string{ + "cd " + checkoutPath, + "./" + checkoutPath + "/", + checkoutPath + "/", + "pip install " + checkoutPath, + "pip install -e " + checkoutPath, + "npm install --prefix " + checkoutPath, + } + for _, p := range execPatterns { + if strings.Contains(run, p) { + // Distinguish data-only operations from execution + // cp, mv, rsync are data operations, not execution + lines := strings.Split(run, "\n") + for _, line := range lines { + line = strings.TrimSpace(line) + if strings.Contains(line, checkoutPath) { + if isDataOnlyCommand(line) { + continue + } + return true + } + } + } + } + return false +} + +// isDataOnlyCommand checks if a shell line is a data-copy operation (not execution). +func isDataOnlyCommand(line string) bool { + dataOps := []string{"cp ", "cp -r ", "mv ", "rsync ", "rm ", "rm -rf ", "ln ", "mkdir "} + trimmed := strings.TrimSpace(line) + for _, op := range dataOps { + if strings.HasPrefix(trimmed, op) { + return true + } + } + return false +} + // isTokenBlanked checks if a step explicitly sets GITHUB_TOKEN to empty. 
func isTokenBlanked(step Step) bool { if token, ok := step.Env["GITHUB_TOKEN"]; ok { diff --git a/internal/scanner/rules_test.go b/internal/scanner/rules_test.go index dc7114c..be97aed 100644 --- a/internal/scanner/rules_test.go +++ b/internal/scanner/rules_test.go @@ -4,6 +4,7 @@ import ( "os" "path/filepath" "runtime" + "strings" "testing" ) @@ -345,6 +346,119 @@ func TestCheckTokenExposure_SafeWorkflow(t *testing.T) { } } +// --- Gap 1: Actor guard tests --- + +func TestCheckPwnRequest_ActorGuardBot(t *testing.T) { + wf := loadFixture(t, "pwn-request-actor-guard-bot.yaml") + findings := CheckPwnRequest(wf) + + if len(findings) != 1 { + t.Fatalf("expected 1 finding, got %d", len(findings)) + } + f := findings[0] + if f.Severity != SeverityInfo { + t.Errorf("expected info severity for bot actor guard, got %s", f.Severity) + } + if len(f.Mitigations) == 0 { + t.Error("expected mitigations to be populated") + } +} + +func TestCheckPwnRequest_ActorGuardHuman(t *testing.T) { + wf := loadFixture(t, "pwn-request-actor-guard-human.yaml") + findings := CheckPwnRequest(wf) + + if len(findings) != 1 { + t.Fatalf("expected 1 finding, got %d", len(findings)) + } + f := findings[0] + if f.Severity != SeverityHigh { + t.Errorf("expected high severity (downgraded from critical by 1 for human actor guard), got %s", f.Severity) + } +} + +// --- Gap 2: Action permission gate tests --- + +func TestCheckPwnRequest_ActionPermGate(t *testing.T) { + wf := loadFixture(t, "pwn-request-action-perm-gate.yaml") + findings := CheckPwnRequest(wf) + + if len(findings) != 1 { + t.Fatalf("expected 1 finding, got %d", len(findings)) + } + f := findings[0] + if f.Severity != SeverityHigh { + t.Errorf("expected high severity (downgraded from critical by 1 for maintainer check via action), got %s", f.Severity) + } + if len(f.Mitigations) == 0 { + t.Error("expected mitigations to be populated") + } +} + +// --- Gap 3: Cross-job needs gate tests --- + +func TestCheckPwnRequest_NeedsGate(t *testing.T) { 
+ wf := loadFixture(t, "pwn-request-needs-gate.yaml") + findings := CheckPwnRequest(wf) + + if len(findings) != 1 { + t.Fatalf("expected 1 finding, got %d", len(findings)) + } + f := findings[0] + if f.Severity != SeverityHigh { + t.Errorf("expected high severity (downgraded from critical by 1 for environment gate via needs), got %s", f.Severity) + } + if len(f.Mitigations) == 0 { + t.Error("expected mitigations to be populated") + } + // Verify the mitigation mentions the authorize job + found := false + for _, m := range f.Mitigations { + if strings.Contains(m, "authorize") { + found = true + } + } + if !found { + t.Error("expected mitigations to reference 'authorize' job") + } +} + +// --- Gap 4: Path isolation tests --- + +func TestCheckPwnRequest_PathIsolated(t *testing.T) { + wf := loadFixture(t, "pwn-request-path-isolated.yaml") + findings := CheckPwnRequest(wf) + + if len(findings) != 1 { + t.Fatalf("expected 1 finding, got %d", len(findings)) + } + f := findings[0] + // Path isolation downgrades confidence to pattern-only, not severity + if f.Confidence != ConfidencePatternOnly { + t.Errorf("expected pattern-only confidence for path-isolated checkout, got %s", f.Confidence) + } + if len(f.Mitigations) == 0 { + t.Error("expected mitigations to mention path isolation") + } +} + +func TestCheckPwnRequest_PathExec(t *testing.T) { + wf := loadFixture(t, "pwn-request-path-exec.yaml") + findings := CheckPwnRequest(wf) + + if len(findings) != 1 { + t.Fatalf("expected 1 finding, got %d", len(findings)) + } + f := findings[0] + // Path with direct execution should still be confirmed/critical + if f.Severity != SeverityCritical { + t.Errorf("expected critical severity for path with direct execution, got %s", f.Severity) + } + if f.Confidence != ConfidenceConfirmed { + t.Errorf("expected confirmed confidence for path with direct execution, got %s", f.Confidence) + } +} + // --- Filter tests --- func TestSeverityFilter(t *testing.T) { diff --git 
a/internal/scanner/workflow.go b/internal/scanner/workflow.go index 67f6bc2..eb04b00 100644 --- a/internal/scanner/workflow.go +++ b/internal/scanner/workflow.go @@ -43,7 +43,8 @@ type Job struct { Environment string // environment protection name, if set Permissions PermissionsConfig Steps []Step - Secrets string // "inherit" or empty + Secrets string // "inherit" or empty + Needs []string // job IDs this job depends on } // Step represents a single step in a job. @@ -72,6 +73,7 @@ type rawJob struct { Permissions yaml.Node `yaml:"permissions"` Steps []rawStep `yaml:"steps"` Secrets string `yaml:"secrets"` + Needs yaml.Node `yaml:"needs"` } type rawStep struct { @@ -125,6 +127,7 @@ func ParseWorkflow(data []byte, path string) (*Workflow, error) { Environment: parseEnvironment(&rawJ.Environment), Permissions: parsePermissions(&rawJ.Permissions), Secrets: rawJ.Secrets, + Needs: parseNeeds(&rawJ.Needs), } for i, rawS := range rawJ.Steps { step := Step{ @@ -294,6 +297,27 @@ func parseEnvironment(node *yaml.Node) string { return "" } +// parseNeeds extracts job dependency names from the needs: field. +// Handles both string ("needs: job1") and list ("needs: [job1, job2]") forms. +func parseNeeds(node *yaml.Node) []string { + if node == nil || node.Kind == 0 { + return nil + } + switch node.Kind { + case yaml.ScalarNode: + if node.Value != "" { + return []string{node.Value} + } + case yaml.SequenceNode: + var needs []string + for _, n := range node.Content { + needs = append(needs, n.Value) + } + return needs + } + return nil +} + // HasElevatedPermissions checks if the workflow/job has write permissions. 
func HasElevatedPermissions(wfPerms, jobPerms PermissionsConfig) bool { if wfPerms.WriteAll || jobPerms.WriteAll { diff --git a/test/fixtures/pwn-request-action-perm-gate.yaml b/test/fixtures/pwn-request-action-perm-gate.yaml new file mode 100644 index 0000000..7728eeb --- /dev/null +++ b/test/fixtures/pwn-request-action-perm-gate.yaml @@ -0,0 +1,21 @@ +name: Permission Gated +on: + pull_request_target: + types: [opened, synchronize, reopened] +jobs: + test: + runs-on: ubuntu-latest + steps: + - name: Get User Permission + id: checkAccess + uses: actions-cool/check-user-permission@v2 + with: + require: write + username: ${{ github.triggering_actor }} + - name: Check User Permission + if: steps.checkAccess.outputs.require-result == 'false' + run: exit 1 + - uses: actions/checkout@v4 + with: + ref: ${{ github.event.pull_request.head.sha }} + - run: pytest test/ diff --git a/test/fixtures/pwn-request-actor-guard-bot.yaml b/test/fixtures/pwn-request-actor-guard-bot.yaml new file mode 100644 index 0000000..ab77649 --- /dev/null +++ b/test/fixtures/pwn-request-actor-guard-bot.yaml @@ -0,0 +1,13 @@ +name: Bot Only +on: + pull_request_target: + paths: ['**/yarn.lock'] +jobs: + generate: + runs-on: ubuntu-latest + if: github.actor == 'renovate[bot]' + steps: + - uses: actions/checkout@v4 + with: + ref: ${{ github.head_ref }} + - run: node ./scripts/generate-changesets.js diff --git a/test/fixtures/pwn-request-actor-guard-human.yaml b/test/fixtures/pwn-request-actor-guard-human.yaml new file mode 100644 index 0000000..6a2c9dc --- /dev/null +++ b/test/fixtures/pwn-request-actor-guard-human.yaml @@ -0,0 +1,13 @@ +name: Allowed User +on: + pull_request_target: + types: [opened, synchronize] +jobs: + build: + runs-on: ubuntu-latest + if: contains(fromJSON('["alice","bob"]'), github.actor) + steps: + - uses: actions/checkout@v4 + with: + ref: ${{ github.event.pull_request.head.sha }} + - run: make build diff --git a/test/fixtures/pwn-request-needs-gate.yaml 
b/test/fixtures/pwn-request-needs-gate.yaml new file mode 100644 index 0000000..21d486a --- /dev/null +++ b/test/fixtures/pwn-request-needs-gate.yaml @@ -0,0 +1,19 @@ +name: Gated Build +on: + pull_request_target: + types: [opened, synchronize, reopened] +jobs: + authorize: + environment: + ${{ github.event.pull_request.head.repo.full_name == github.repository && 'internal' || 'external' }} + runs-on: ubuntu-latest + steps: + - run: echo "approved" + build: + needs: authorize + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + with: + ref: ${{ github.event.pull_request.head.sha }} + - run: make build diff --git a/test/fixtures/pwn-request-path-exec.yaml b/test/fixtures/pwn-request-path-exec.yaml new file mode 100644 index 0000000..59d288c --- /dev/null +++ b/test/fixtures/pwn-request-path-exec.yaml @@ -0,0 +1,11 @@ +name: Fork Build +on: pull_request_target +jobs: + build: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + with: + ref: ${{ github.event.pull_request.head.sha }} + path: pr + - run: cd pr && make build diff --git a/test/fixtures/pwn-request-path-isolated.yaml b/test/fixtures/pwn-request-path-isolated.yaml new file mode 100644 index 0000000..9509a74 --- /dev/null +++ b/test/fixtures/pwn-request-path-isolated.yaml @@ -0,0 +1,15 @@ +name: L10n Preview +on: pull_request_target +jobs: + deploy: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - uses: actions/checkout@v4 + with: + ref: ${{ github.event.pull_request.head.sha }} + path: merged + - run: | + rm -rf l10n + cp -r merged/l10n ./ + - run: bin/build-preview From b1cf94372c4d4735624e4ebfa915a0c9bd30b59c Mon Sep 17 00:00:00 2001 From: Christopher Lusk Date: Sun, 22 Mar 2026 20:53:46 -0400 Subject: [PATCH 02/10] =?UTF-8?q?feat:=20fluxgate=20v0.5.0=20=E2=80=94=20c?= =?UTF-8?q?ross-platform=20CI/CD=20analysis,=203=20new=20rules,=20bulk=20s?= =?UTF-8?q?canning?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit New 
GitHub Actions rules: - FG-008: OIDC misconfiguration (id-token:write on fork-accessible triggers) - FG-009: Self-hosted runner exposure on PR/PRT workflows - FG-010: Cache poisoning via actions/cache on external triggers Cross-platform CI/CD support: - Platform-agnostic Pipeline interface (internal/cicd) - GitLab CI parser with 4 rules (GL-001 MR secrets, GL-002 script injection, GL-003 unsafe includes, GL-009 self-hosted MR runner) Bulk scanning infrastructure: - BigQuery ingest command for large-scale workflow analysis - Gato-X import command for converting discovery output to repo lists - Analysis SQL queries for scan campaign reporting Fixes: - Route all warning output to stderr (fixes JSON output corruption) - Fix runs-on group+labels parser to prefer labels over group name - Version bump to v0.5.0 Co-Authored-By: Claude Opus 4.6 --- cmd/fluxgate/main.go | 245 +++++++++++++++- internal/cicd/gitlab.go | 331 ++++++++++++++++++++++ internal/cicd/gitlab_rules.go | 248 ++++++++++++++++ internal/cicd/gitlab_test.go | 148 ++++++++++ internal/cicd/platform.go | 82 ++++++ internal/github/batch.go | 2 +- internal/github/client.go | 5 +- internal/github/discover.go | 3 +- internal/report/table.go | 2 +- internal/scanner/rules.go | 328 +++++++++++++++++++++ internal/scanner/rules_test.go | 130 +++++++++ internal/scanner/scanner.go | 120 ++++++-- internal/scanner/workflow.go | 42 +++ internal/scanner/workflow_test.go | 124 ++++++++ scripts/analysis-queries.sql | 160 +++++++++++ scripts/bigquery-prt-workflows.sql | 43 +++ test/fixtures/cache-poisoning-pr.yaml | 15 + test/fixtures/cache-poisoning-prt.yaml | 17 ++ test/fixtures/cache-setup-action.yaml | 15 + test/fixtures/gitlab-mr-secrets.yml | 16 ++ test/fixtures/gitlab-safe.yml | 9 + test/fixtures/gitlab-script-injection.yml | 12 + test/fixtures/gitlab-unsafe-include.yml | 13 + test/fixtures/oidc-prt-fork-checkout.yaml | 20 ++ test/fixtures/oidc-prt-no-fork.yaml | 17 ++ test/fixtures/oidc-push-only.yaml | 18 ++ 
test/fixtures/self-hosted-pr.yaml | 12 + test/fixtures/self-hosted-prt-fork.yaml | 13 + test/fixtures/self-hosted-push-only.yaml | 11 + 29 files changed, 2169 insertions(+), 32 deletions(-) create mode 100644 internal/cicd/gitlab.go create mode 100644 internal/cicd/gitlab_rules.go create mode 100644 internal/cicd/gitlab_test.go create mode 100644 internal/cicd/platform.go create mode 100644 internal/scanner/workflow_test.go create mode 100644 scripts/analysis-queries.sql create mode 100644 scripts/bigquery-prt-workflows.sql create mode 100644 test/fixtures/cache-poisoning-pr.yaml create mode 100644 test/fixtures/cache-poisoning-prt.yaml create mode 100644 test/fixtures/cache-setup-action.yaml create mode 100644 test/fixtures/gitlab-mr-secrets.yml create mode 100644 test/fixtures/gitlab-safe.yml create mode 100644 test/fixtures/gitlab-script-injection.yml create mode 100644 test/fixtures/gitlab-unsafe-include.yml create mode 100644 test/fixtures/oidc-prt-fork-checkout.yaml create mode 100644 test/fixtures/oidc-prt-no-fork.yaml create mode 100644 test/fixtures/oidc-push-only.yaml create mode 100644 test/fixtures/self-hosted-pr.yaml create mode 100644 test/fixtures/self-hosted-prt-fork.yaml create mode 100644 test/fixtures/self-hosted-push-only.yaml diff --git a/cmd/fluxgate/main.go b/cmd/fluxgate/main.go index ec34177..2b26934 100644 --- a/cmd/fluxgate/main.go +++ b/cmd/fluxgate/main.go @@ -1,7 +1,9 @@ package main import ( + "bufio" "context" + "encoding/json" "fmt" "os" "strings" @@ -14,7 +16,7 @@ import ( "github.com/spf13/cobra" ) -var version = "0.4.0" +var version = "0.5.0" func main() { rootCmd := &cobra.Command{ @@ -28,6 +30,8 @@ func main() { rootCmd.AddCommand(newRemoteCmd()) rootCmd.AddCommand(newBatchCmd()) rootCmd.AddCommand(newDiscoverCmd()) + rootCmd.AddCommand(newIngestCmd()) + rootCmd.AddCommand(newGatoxImportCmd()) if err := rootCmd.Execute(); err != nil { os.Exit(1) @@ -354,6 +358,245 @@ func writeRepoList(repos []ghclient.RepoInfo, output string) 
error { return nil } +// ingestRecord represents a single workflow file from BigQuery export. +type ingestRecord struct { + Repo string `json:"repo"` + Path string `json:"path"` + Content string `json:"content"` +} + +func newIngestCmd() *cobra.Command { + var ( + dbPath string + severities string + rules string + source string + ) + + cmd := &cobra.Command{ + Use: "ingest [file.jsonl]", + Short: "Ingest pre-extracted workflow YAML from JSONL", + Long: `Ingest workflow YAML from a JSONL file (e.g., BigQuery export). +Each line must be a JSON object: {"repo": "owner/repo", "path": ".github/workflows/ci.yml", "content": "..."} +Results are stored in SQLite for analysis.`, + Args: cobra.ExactArgs(1), + RunE: func(cmd *cobra.Command, args []string) error { + db, err := store.Open(dbPath) + if err != nil { + return fmt.Errorf("opening database: %w", err) + } + defer db.Close() + + f, err := os.Open(args[0]) + if err != nil { + return fmt.Errorf("opening input: %w", err) + } + defer f.Close() + + opts := parseScanOpts(severities, rules) + sc := bufio.NewScanner(f) + sc.Buffer(make([]byte, 0), 10*1024*1024) // 10MB max line + + var total, scanned, withFindings, errors int + // Track workflows per repo for batch saving + repoWorkflows := make(map[string][]ingestRecord) + + for sc.Scan() { + total++ + var rec ingestRecord + if err := json.Unmarshal(sc.Bytes(), &rec); err != nil { + errors++ + continue + } + if rec.Repo == "" || rec.Content == "" { + errors++ + continue + } + repoWorkflows[rec.Repo] = append(repoWorkflows[rec.Repo], rec) + } + if err := sc.Err(); err != nil { + return fmt.Errorf("reading input: %w", err) + } + + fmt.Printf("Loaded %d workflows from %d repos (%d errors)\n", total, len(repoWorkflows), errors) + + for repo, records := range repoWorkflows { + parts := strings.SplitN(repo, "/", 2) + if len(parts) != 2 { + errors++ + continue + } + owner, name := parts[0], parts[1] + + result := &scanner.ScanResult{ + Path: repo, + Workflows: len(records), + } + + 
for _, rec := range records { + findings, err := scanner.ScanWorkflowBytes([]byte(rec.Content), rec.Path, opts) + if err != nil { + continue + } + result.Findings = append(result.Findings, findings...) + } + + scanned++ + if len(result.Findings) > 0 { + withFindings++ + } + + if err := db.SaveResultWithSource(owner, name, 0, "", result, source); err != nil { + fmt.Fprintf(os.Stderr, "Error saving %s: %v\n", repo, err) + errors++ + } + + if scanned%1000 == 0 { + fmt.Printf(" Processed %d/%d repos...\n", scanned, len(repoWorkflows)) + } + } + + fmt.Printf("\nIngest complete: %d repos scanned, %d with findings, %d errors\n", + scanned, withFindings, errors) + + return nil + }, + } + + cmd.Flags().StringVar(&dbPath, "db", "ingest.db", "SQLite database path") + cmd.Flags().StringVar(&severities, "severity", "", "Filter by severity (comma-separated)") + cmd.Flags().StringVar(&rules, "rules", "", "Filter by rule ID (comma-separated)") + cmd.Flags().StringVar(&source, "source", "bigquery", "Source tag for these records") + + return cmd +} + +func newGatoxImportCmd() *cobra.Command { + var output string + + cmd := &cobra.Command{ + Use: "gatox-import [file.json]", + Short: "Import Gato-X enumeration results as a repo list", + Long: `Convert Gato-X JSON output to a Fluxgate repo list. +Reads Gato-X enumeration output and extracts unique repos for batch scanning. + +Usage: + gato-x enumerate -t -o gatox-results.json + fluxgate gatox-import gatox-results.json -o repos.txt + fluxgate batch --list repos.txt --db gatox-scan.db --resume`, + Args: cobra.ExactArgs(1), + RunE: func(cmd *cobra.Command, args []string) error { + data, err := os.ReadFile(args[0]) + if err != nil { + return fmt.Errorf("reading input: %w", err) + } + + // Gato-X outputs various JSON formats depending on the command. + // Try to extract repo names from common patterns. 
+ repos := extractGatoxRepos(data) + + if len(repos) == 0 { + return fmt.Errorf("no repos found in input file") + } + + var w *os.File + if output != "" { + w, err = os.Create(output) + if err != nil { + return err + } + defer w.Close() + } else { + w = os.Stdout + } + + for _, repo := range repos { + fmt.Fprintln(w, repo) + } + + fmt.Fprintf(os.Stderr, "Extracted %d unique repos\n", len(repos)) + return nil + }, + } + + cmd.Flags().StringVarP(&output, "output", "o", "", "Output repo list file (default: stdout)") + return cmd +} + +// extractGatoxRepos parses various Gato-X output formats and extracts repo names. +func extractGatoxRepos(data []byte) []string { + seen := make(map[string]bool) + var repos []string + + // Try parsing as a JSON array of objects with "repo" or "full_name" fields + var records []map[string]interface{} + if err := json.Unmarshal(data, &records); err == nil { + for _, rec := range records { + for _, key := range []string{"repo", "full_name", "repository", "repo_name"} { + if val, ok := rec[key]; ok { + if s, ok := val.(string); ok && strings.Contains(s, "/") && !seen[s] { + seen[s] = true + repos = append(repos, s) + } + } + } + } + return repos + } + + // Try parsing as a JSON object with nested repo references + var obj map[string]interface{} + if err := json.Unmarshal(data, &obj); err == nil { + extractReposFromMap(obj, seen, &repos) + return repos + } + + // Try line-by-line JSON (JSONL) + for _, line := range strings.Split(string(data), "\n") { + line = strings.TrimSpace(line) + if line == "" { + continue + } + var rec map[string]interface{} + if err := json.Unmarshal([]byte(line), &rec); err == nil { + for _, key := range []string{"repo", "full_name", "repository"} { + if val, ok := rec[key]; ok { + if s, ok := val.(string); ok && strings.Contains(s, "/") && !seen[s] { + seen[s] = true + repos = append(repos, s) + } + } + } + } + } + + return repos +} + +func extractReposFromMap(obj map[string]interface{}, seen map[string]bool, repos 
*[]string) { + for _, key := range []string{"repo", "full_name", "repository"} { + if val, ok := obj[key]; ok { + if s, ok := val.(string); ok && strings.Contains(s, "/") && !seen[s] { + seen[s] = true + *repos = append(*repos, s) + } + } + } + // Recurse into nested objects and arrays + for _, val := range obj { + switch v := val.(type) { + case map[string]interface{}: + extractReposFromMap(v, seen, repos) + case []interface{}: + for _, item := range v { + if m, ok := item.(map[string]interface{}); ok { + extractReposFromMap(m, seen, repos) + } + } + } + } +} + func generateReport(db *store.DB, path string) error { stats, err := db.GetReportStats() if err != nil { diff --git a/internal/cicd/gitlab.go b/internal/cicd/gitlab.go new file mode 100644 index 0000000..fd1cf8a --- /dev/null +++ b/internal/cicd/gitlab.go @@ -0,0 +1,331 @@ +package cicd + +import ( + "fmt" + "strings" + + "gopkg.in/yaml.v3" +) + +// GitLabPipeline represents a parsed .gitlab-ci.yml file. +type GitLabPipeline struct { + path string + triggers []TriggerType + jobs []PipelineJob + includes []GitLabInclude + variables map[string]string +} + +// GitLabInclude represents an include directive in .gitlab-ci.yml. +type GitLabInclude struct { + Local string + Remote string + Project string + File string + Ref string +} + +// rawGitLabCI is the intermediate representation for YAML unmarshalling. 
+type rawGitLabCI struct { + Stages []string `yaml:"stages"` + Variables map[string]interface{} `yaml:"variables"` + Include yaml.Node `yaml:"include"` + Workflow *rawGitLabWorkflow `yaml:"workflow"` + // Jobs are top-level keys that aren't reserved keywords +} + +type rawGitLabWorkflow struct { + Rules []rawGitLabRule `yaml:"rules"` +} + +type rawGitLabRule struct { + If string `yaml:"if"` + When string `yaml:"when"` + Exists string `yaml:"exists"` + Changes string `yaml:"changes"` + Variables map[string]string `yaml:"variables"` +} + +type rawGitLabJob struct { + Stage string `yaml:"stage"` + Image string `yaml:"image"` + Tags []string `yaml:"tags"` + Script yaml.Node `yaml:"script"` + BeforeScript yaml.Node `yaml:"before_script"` + AfterScript yaml.Node `yaml:"after_script"` + Rules []rawGitLabRule `yaml:"rules"` + Only yaml.Node `yaml:"only"` + Except yaml.Node `yaml:"except"` + Environment yaml.Node `yaml:"environment"` + Secrets yaml.Node `yaml:"secrets"` + Variables map[string]string `yaml:"variables"` + Needs []string `yaml:"needs"` + Services yaml.Node `yaml:"services"` +} + +// Reserved GitLab CI keywords that are NOT job names. +var gitlabReservedKeys = map[string]bool{ + "stages": true, "variables": true, "include": true, "workflow": true, + "default": true, "image": true, "services": true, "cache": true, + "before_script": true, "after_script": true, "pages": true, +} + +// ParseGitLabCI parses a .gitlab-ci.yml file. 
+func ParseGitLabCI(data []byte, path string) (*GitLabPipeline, error) { + var doc yaml.Node + if err := yaml.Unmarshal(data, &doc); err != nil { + return nil, fmt.Errorf("parsing %s: %w", path, err) + } + if doc.Kind != yaml.DocumentNode || len(doc.Content) == 0 { + return nil, fmt.Errorf("parsing %s: empty document", path) + } + + // First pass: parse reserved keys + var raw rawGitLabCI + if err := doc.Content[0].Decode(&raw); err != nil { + return nil, fmt.Errorf("decoding %s: %w", path, err) + } + + pipeline := &GitLabPipeline{ + path: path, + variables: make(map[string]string), + } + + // Parse variables + for k, v := range raw.Variables { + switch val := v.(type) { + case string: + pipeline.variables[k] = val + case map[string]interface{}: + if value, ok := val["value"]; ok { + pipeline.variables[k] = fmt.Sprintf("%v", value) + } + } + } + + // Parse includes + pipeline.includes = parseGitLabIncludes(&raw.Include) + + // Determine triggers from workflow rules + pipeline.triggers = inferGitLabTriggers(raw.Workflow) + + // Second pass: extract jobs (top-level keys that aren't reserved) + root := doc.Content[0] + if root.Kind == yaml.MappingNode { + for i := 0; i < len(root.Content)-1; i += 2 { + key := root.Content[i].Value + if gitlabReservedKeys[key] || strings.HasPrefix(key, ".") { + continue // Skip reserved keys and hidden jobs (templates) + } + + var rawJob rawGitLabJob + if err := root.Content[i+1].Decode(&rawJob); err != nil { + continue + } + + job := convertGitLabJob(key, &rawJob, root.Content[i+1]) + pipeline.jobs = append(pipeline.jobs, job) + } + } + + return pipeline, nil +} + +// Platform implements Pipeline. +func (p *GitLabPipeline) Platform() string { return "gitlab" } + +// FilePath implements Pipeline. +func (p *GitLabPipeline) FilePath() string { return p.path } + +// Triggers implements Pipeline. +func (p *GitLabPipeline) Triggers() []TriggerType { return p.triggers } + +// HasExternalTrigger implements Pipeline. 
+func (p *GitLabPipeline) HasExternalTrigger() bool { + for _, t := range p.triggers { + if t == TriggerExternalPR || t == TriggerComment { + return true + } + } + return false +} + +// Jobs implements Pipeline. +func (p *GitLabPipeline) Jobs() []PipelineJob { return p.jobs } + +// Includes returns external includes (potential supply chain risk). +func (p *GitLabPipeline) Includes() []GitLabInclude { return p.includes } + +// Variables returns pipeline-level variables. +func (p *GitLabPipeline) Variables() map[string]string { return p.variables } + +func inferGitLabTriggers(workflow *rawGitLabWorkflow) []TriggerType { + triggers := []TriggerType{TriggerPush} // GitLab pipelines run on push by default + + if workflow == nil { + // Default: pipelines run on push, MR, and merge result + triggers = append(triggers, TriggerInternalPR) + return triggers + } + + for _, rule := range workflow.Rules { + lower := strings.ToLower(rule.If) + if strings.Contains(lower, "ci_pipeline_source") { + if strings.Contains(lower, "merge_request_event") { + triggers = append(triggers, TriggerInternalPR) + } + if strings.Contains(lower, "schedule") { + triggers = append(triggers, TriggerSchedule) + } + if strings.Contains(lower, "web") || strings.Contains(lower, "api") { + triggers = append(triggers, TriggerAPI) + } + if strings.Contains(lower, "trigger") { + triggers = append(triggers, TriggerAPI) + } + } + } + + return triggers +} + +func convertGitLabJob(name string, raw *rawGitLabJob, node *yaml.Node) PipelineJob { + job := PipelineJob{ + Name: name, + DependsOn: raw.Needs, + } + + // Runner type from tags + for _, tag := range raw.Tags { + if strings.ToLower(tag) == "self-hosted" || strings.ToLower(tag) == "shell" { + job.RunnerType = "self-hosted" + break + } + } + if job.RunnerType == "" && len(raw.Tags) > 0 { + job.RunnerType = strings.Join(raw.Tags, ",") + } + + // Environment + job.Environment = parseGitLabEnvironment(&raw.Environment) + + // Rules as conditions + for _, rule := 
range raw.Rules { + if rule.If != "" { + job.Conditions = append(job.Conditions, rule.If) + } + } + + // Extract script steps + job.Steps = append(job.Steps, extractScriptSteps("before_script", &raw.BeforeScript, node)...) + job.Steps = append(job.Steps, extractScriptSteps("script", &raw.Script, node)...) + job.Steps = append(job.Steps, extractScriptSteps("after_script", &raw.AfterScript, node)...) + + // Extract secret references from variables + for _, v := range raw.Variables { + if strings.HasPrefix(v, "$") || strings.Contains(v, "CI_JOB_TOKEN") { + job.Secrets = append(job.Secrets, v) + } + } + + return job +} + +func extractScriptSteps(name string, node *yaml.Node, jobNode *yaml.Node) []PipelineStep { + if node == nil || node.Kind == 0 { + return nil + } + + var steps []PipelineStep + switch node.Kind { + case yaml.ScalarNode: + steps = append(steps, PipelineStep{ + Name: name, + Type: StepScript, + Command: node.Value, + Line: node.Line, + }) + case yaml.SequenceNode: + for _, item := range node.Content { + steps = append(steps, PipelineStep{ + Name: name, + Type: StepScript, + Command: item.Value, + Line: item.Line, + }) + } + } + return steps +} + +func parseGitLabEnvironment(node *yaml.Node) string { + if node == nil || node.Kind == 0 { + return "" + } + switch node.Kind { + case yaml.ScalarNode: + return node.Value + case yaml.MappingNode: + for i := 0; i < len(node.Content)-1; i += 2 { + if node.Content[i].Value == "name" { + return node.Content[i+1].Value + } + } + } + return "" +} + +func parseGitLabIncludes(node *yaml.Node) []GitLabInclude { + if node == nil || node.Kind == 0 { + return nil + } + + var includes []GitLabInclude + + switch node.Kind { + case yaml.ScalarNode: + includes = append(includes, GitLabInclude{Local: node.Value}) + case yaml.SequenceNode: + for _, item := range node.Content { + inc := parseGitLabIncludeItem(item) + includes = append(includes, inc) + } + case yaml.MappingNode: + inc := parseGitLabIncludeItem(node) + includes = 
append(includes, inc) + } + + return includes +} + +func parseGitLabIncludeItem(node *yaml.Node) GitLabInclude { + inc := GitLabInclude{} + if node.Kind == yaml.ScalarNode { + if strings.HasPrefix(node.Value, "http") { + inc.Remote = node.Value + } else { + inc.Local = node.Value + } + return inc + } + + if node.Kind == yaml.MappingNode { + for i := 0; i < len(node.Content)-1; i += 2 { + key := node.Content[i].Value + val := node.Content[i+1].Value + switch key { + case "local": + inc.Local = val + case "remote": + inc.Remote = val + case "project": + inc.Project = val + case "file": + inc.File = val + case "ref": + inc.Ref = val + } + } + } + return inc +} diff --git a/internal/cicd/gitlab_rules.go b/internal/cicd/gitlab_rules.go new file mode 100644 index 0000000..a5db660 --- /dev/null +++ b/internal/cicd/gitlab_rules.go @@ -0,0 +1,248 @@ +package cicd + +import ( + "fmt" + "strings" +) + +// GitLabFinding represents a security finding in a GitLab CI pipeline. +type GitLabFinding struct { + RuleID string + Severity string + File string + Line int + Message string + Details string +} + +// Severity constants (matching scanner package). +const ( + severityCritical = "critical" + severityHigh = "high" + severityMedium = "medium" + severityLow = "low" + severityInfo = "info" +) + +// ScanGitLabPipeline runs all GitLab CI security rules against a parsed pipeline. +func ScanGitLabPipeline(pipeline *GitLabPipeline) []GitLabFinding { + var findings []GitLabFinding + findings = append(findings, checkGitLabMRSecrets(pipeline)...) + findings = append(findings, checkGitLabScriptInjection(pipeline)...) + findings = append(findings, checkGitLabUnsafeIncludes(pipeline)...) + findings = append(findings, checkGitLabSelfHostedRunner(pipeline)...) + return findings +} + +// checkGitLabMRSecrets detects merge request pipelines that may expose +// parent project secrets to fork MR authors (GitLab equivalent of FG-001). 
+func checkGitLabMRSecrets(pipeline *GitLabPipeline) []GitLabFinding { + var findings []GitLabFinding + + for _, job := range pipeline.Jobs() { + // Check if job runs on MR pipelines + runsOnMR := false + for _, cond := range job.Conditions { + lower := strings.ToLower(cond) + if strings.Contains(lower, "ci_pipeline_source") && + strings.Contains(lower, "merge_request_event") { + runsOnMR = true + break + } + } + + if !runsOnMR { + continue + } + + // Check for secrets in variables or script + hasSecrets := false + for _, secret := range job.Secrets { + if strings.Contains(secret, "CI_JOB_TOKEN") || + strings.HasPrefix(secret, "$") { + hasSecrets = true + break + } + } + + // Check for dangerous commands in scripts + hasDangerousExec := false + for _, step := range job.Steps { + if step.Type != StepScript { + continue + } + cmd := strings.ToLower(step.Command) + dangerousCmds := []string{ + "pip install", "npm install", "npm ci", "yarn install", + "make", "cargo build", "go build", "mvn", "gradle", + "./", "bash ", "sh ", "python ", "node ", + } + for _, dc := range dangerousCmds { + if strings.Contains(cmd, dc) { + hasDangerousExec = true + break + } + } + } + + // Check for checkout of MR source branch + checksOutMRCode := false + for _, step := range job.Steps { + cmd := strings.ToLower(step.Command) + if strings.Contains(cmd, "git checkout") && + (strings.Contains(cmd, "ci_merge_request_source_branch") || + strings.Contains(cmd, "merge_request_source_branch_name")) { + checksOutMRCode = true + break + } + } + + if hasDangerousExec || checksOutMRCode { + severity := severityMedium + msg := fmt.Sprintf("GitLab MR Pipeline: job '%s' runs on merge_request_event", job.Name) + + if checksOutMRCode { + severity = severityHigh + msg += " with explicit checkout of MR source branch" + } + if hasSecrets { + severity = severityHigh + msg += " with secret access" + } + if hasDangerousExec { + msg += " — executes build commands that may run fork code" + } + + findings = 
append(findings, GitLabFinding{ + RuleID: "GL-001", + Severity: severity, + File: pipeline.FilePath(), + Message: msg, + Details: "GitLab CI pipelines triggered by merge_request_event may expose parent project CI/CD variables to fork MR authors if ci_allow_fork_pipelines_to_run_in_parent_project is enabled.", + }) + } + } + return findings +} + +// checkGitLabScriptInjection detects user-controllable CI variables used +// in script blocks (GitLab equivalent of FG-002). +func checkGitLabScriptInjection(pipeline *GitLabPipeline) []GitLabFinding { + dangerousVars := []string{ + "$CI_MERGE_REQUEST_TITLE", + "$CI_MERGE_REQUEST_DESCRIPTION", + "$CI_COMMIT_MESSAGE", + "$CI_COMMIT_TITLE", + "$CI_MERGE_REQUEST_SOURCE_BRANCH_NAME", + "${CI_MERGE_REQUEST_TITLE}", + "${CI_MERGE_REQUEST_DESCRIPTION}", + "${CI_COMMIT_MESSAGE}", + } + + var findings []GitLabFinding + for _, job := range pipeline.Jobs() { + for _, step := range job.Steps { + if step.Type != StepScript { + continue + } + for _, dv := range dangerousVars { + if strings.Contains(step.Command, dv) { + findings = append(findings, GitLabFinding{ + RuleID: "GL-002", + Severity: severityHigh, + File: pipeline.FilePath(), + Line: step.Line, + Message: fmt.Sprintf( + "GitLab Script Injection: %s used in script block of job '%s'", + dv, job.Name), + Details: "User-controllable CI variables in script blocks can be exploited for command injection via crafted branch names, commit messages, or MR titles.", + }) + break + } + } + } + } + return findings +} + +// checkGitLabUnsafeIncludes detects external includes from unversioned or +// untrusted sources (GitLab equivalent of FG-003). 
+func checkGitLabUnsafeIncludes(pipeline *GitLabPipeline) []GitLabFinding { + var findings []GitLabFinding + + for _, inc := range pipeline.Includes() { + if inc.Remote != "" { + severity := severityMedium + msg := fmt.Sprintf("GitLab Unsafe Include: remote include from %s", inc.Remote) + + // HTTP (not HTTPS) is worse + if strings.HasPrefix(inc.Remote, "http://") { + severity = severityHigh + msg += " (unencrypted HTTP)" + } + + findings = append(findings, GitLabFinding{ + RuleID: "GL-003", + Severity: severity, + File: pipeline.FilePath(), + Message: msg, + Details: "Remote includes fetch pipeline configuration from external URLs. If the URL is compromised or intercepted, arbitrary pipeline code can be injected.", + }) + } + + if inc.Project != "" && inc.Ref == "" { + findings = append(findings, GitLabFinding{ + RuleID: "GL-003", + Severity: severityLow, + File: pipeline.FilePath(), + Message: fmt.Sprintf( + "GitLab Unpinned Include: project include from %s without ref pin", + inc.Project), + Details: "Project includes without a ref pin use the default branch, which can change. Pin to a specific tag or SHA.", + }) + } + } + return findings +} + +// checkGitLabSelfHostedRunner detects jobs using self-hosted runners +// (shell executor, custom tags) on MR pipelines (GitLab equivalent of FG-009). 
+func checkGitLabSelfHostedRunner(pipeline *GitLabPipeline) []GitLabFinding { + var findings []GitLabFinding + + for _, job := range pipeline.Jobs() { + if job.RunnerType == "" || job.RunnerType == "docker" { + continue + } + + isSelfHosted := job.RunnerType == "self-hosted" || + job.RunnerType == "shell" || + strings.Contains(job.RunnerType, "shell") + + if !isSelfHosted { + continue + } + + // Check if this job runs on MR events + runsOnMR := false + for _, cond := range job.Conditions { + if strings.Contains(strings.ToLower(cond), "merge_request") { + runsOnMR = true + break + } + } + + if runsOnMR { + findings = append(findings, GitLabFinding{ + RuleID: "GL-009", + Severity: severityHigh, + File: pipeline.FilePath(), + Message: fmt.Sprintf( + "GitLab Self-Hosted Runner: job '%s' uses shell executor on MR pipeline", + job.Name), + Details: "Shell executors on self-hosted runners execute commands directly on the host. Fork MR authors can execute arbitrary commands on the runner machine, risking credential theft and lateral movement.", + }) + } + } + return findings +} diff --git a/internal/cicd/gitlab_test.go b/internal/cicd/gitlab_test.go new file mode 100644 index 0000000..4188b76 --- /dev/null +++ b/internal/cicd/gitlab_test.go @@ -0,0 +1,148 @@ +package cicd + +import ( + "os" + "path/filepath" + "runtime" + "strings" + "testing" +) + +func fixturesDir() string { + _, filename, _, _ := runtime.Caller(0) + return filepath.Join(filepath.Dir(filename), "..", "..", "test", "fixtures") +} + +func loadGitLabFixture(t *testing.T, name string) *GitLabPipeline { + t.Helper() + path := filepath.Join(fixturesDir(), name) + data, err := os.ReadFile(path) + if err != nil { + t.Fatalf("reading fixture %s: %v", name, err) + } + pipeline, err := ParseGitLabCI(data, name) + if err != nil { + t.Fatalf("parsing fixture %s: %v", name, err) + } + return pipeline +} + +func TestParseGitLabCI_Basic(t *testing.T) { + pipeline := loadGitLabFixture(t, "gitlab-safe.yml") + + if 
pipeline.Platform() != "gitlab" { + t.Errorf("expected platform 'gitlab', got %s", pipeline.Platform()) + } + + jobs := pipeline.Jobs() + if len(jobs) != 1 { + t.Fatalf("expected 1 job, got %d", len(jobs)) + } + if jobs[0].Name != "test" { + t.Errorf("expected job name 'test', got %s", jobs[0].Name) + } +} + +func TestGitLabMRSecrets(t *testing.T) { + pipeline := loadGitLabFixture(t, "gitlab-mr-secrets.yml") + findings := ScanGitLabPipeline(pipeline) + + var mrFindings []GitLabFinding + for _, f := range findings { + if f.RuleID == "GL-001" { + mrFindings = append(mrFindings, f) + } + } + + if len(mrFindings) != 1 { + t.Fatalf("expected 1 GL-001 finding, got %d", len(mrFindings)) + } + f := mrFindings[0] + if f.Severity != severityMedium { + t.Errorf("expected medium severity (MR pipeline with pip install), got %s", f.Severity) + } + if !strings.Contains(f.Message, "test") { + t.Error("expected message to reference job name 'test'") + } +} + +func TestGitLabScriptInjection(t *testing.T) { + pipeline := loadGitLabFixture(t, "gitlab-script-injection.yml") + findings := ScanGitLabPipeline(pipeline) + + var injectionFindings []GitLabFinding + for _, f := range findings { + if f.RuleID == "GL-002" { + injectionFindings = append(injectionFindings, f) + } + } + + if len(injectionFindings) == 0 { + t.Fatal("expected at least 1 GL-002 finding for $CI_MERGE_REQUEST_TITLE in script") + } + for _, f := range injectionFindings { + if f.Severity != severityHigh { + t.Errorf("expected high severity, got %s", f.Severity) + } + } +} + +func TestGitLabUnsafeIncludes(t *testing.T) { + pipeline := loadGitLabFixture(t, "gitlab-unsafe-include.yml") + findings := ScanGitLabPipeline(pipeline) + + var includeFindings []GitLabFinding + for _, f := range findings { + if f.RuleID == "GL-003" { + includeFindings = append(includeFindings, f) + } + } + + if len(includeFindings) < 2 { + t.Fatalf("expected at least 2 GL-003 findings (remote + unpinned project), got %d", len(includeFindings)) + } + 
+ hasRemote := false + hasUnpinned := false + for _, f := range includeFindings { + if strings.Contains(f.Message, "remote include") { + hasRemote = true + } + if strings.Contains(f.Message, "Unpinned Include") { + hasUnpinned = true + } + } + if !hasRemote { + t.Error("expected a finding for remote include") + } + if !hasUnpinned { + t.Error("expected a finding for unpinned project include") + } +} + +func TestGitLabSafe_NoFindings(t *testing.T) { + pipeline := loadGitLabFixture(t, "gitlab-safe.yml") + findings := ScanGitLabPipeline(pipeline) + + // Safe pipeline should have no critical/high findings + for _, f := range findings { + if f.Severity == severityCritical || f.Severity == severityHigh { + t.Errorf("expected no critical/high findings for safe pipeline, got %s: %s", f.Severity, f.Message) + } + } +} + +func TestGitLabIncludes_Parsed(t *testing.T) { + pipeline := loadGitLabFixture(t, "gitlab-unsafe-include.yml") + + includes := pipeline.Includes() + if len(includes) != 2 { + t.Fatalf("expected 2 includes, got %d", len(includes)) + } + if includes[0].Remote != "https://example.com/ci-templates/lint.yml" { + t.Errorf("expected remote include URL, got %s", includes[0].Remote) + } + if includes[1].Project != "my-group/ci-templates" { + t.Errorf("expected project include, got %s", includes[1].Project) + } +} diff --git a/internal/cicd/platform.go b/internal/cicd/platform.go new file mode 100644 index 0000000..fbc1cde --- /dev/null +++ b/internal/cicd/platform.go @@ -0,0 +1,82 @@ +// Package cicd provides a platform-agnostic interface for CI/CD pipeline +// security analysis. Each CI/CD platform (GitHub Actions, GitLab CI, etc.) +// implements the Pipeline interface, enabling cross-platform vulnerability +// detection with shared rule logic. +package cicd + +// TriggerType categorizes CI/CD pipeline triggers by trust level. +type TriggerType string + +const ( + // TriggerExternalPR is an external/fork PR that runs in privileged context. 
+ // GitHub: pull_request_target, GitLab: merge_request_event (parent context) + TriggerExternalPR TriggerType = "external_pr" + + // TriggerInternalPR is an internal PR with limited privileges. + // GitHub: pull_request, GitLab: merge_request_event (fork context) + TriggerInternalPR TriggerType = "internal_pr" + + // TriggerPush is a push to a branch (trusted). + TriggerPush TriggerType = "push" + + // TriggerSchedule is a scheduled/cron trigger (trusted). + TriggerSchedule TriggerType = "schedule" + + // TriggerManual is a manual trigger (trusted). + TriggerManual TriggerType = "manual" + + // TriggerComment is triggered by a comment (partially trusted). + TriggerComment TriggerType = "comment" + + // TriggerAPI is triggered by API/webhook (trust depends on config). + TriggerAPI TriggerType = "api" +) + +// Pipeline is the platform-agnostic representation of a CI/CD pipeline config. +type Pipeline interface { + // Platform returns the CI/CD platform name (e.g., "github", "gitlab"). + Platform() string + + // FilePath returns the path to the pipeline config file. + FilePath() string + + // Triggers returns all trigger types that activate this pipeline. + Triggers() []TriggerType + + // HasExternalTrigger returns true if the pipeline can be triggered by + // untrusted external input (fork PRs, comments, etc.). + HasExternalTrigger() bool + + // Jobs returns all jobs/stages in the pipeline. + Jobs() []PipelineJob +} + +// PipelineJob represents a single job/stage in a CI/CD pipeline. +type PipelineJob struct { + Name string + Conditions []string // Conditional execution rules + Environment string // Environment/deployment protection + RunnerType string // Runner type (e.g., "hosted", "self-hosted", tag) + Steps []PipelineStep + DependsOn []string // Job dependencies + Secrets []string // Secret references + Permissions map[string]string +} + +// PipelineStep represents a single step/script block in a job. 
+type PipelineStep struct { + Name string + Type StepType + Command string // Script content or action reference + Line int + Env map[string]string +} + +// StepType categorizes pipeline steps. +type StepType string + +const ( + StepScript StepType = "script" // Shell script execution + StepAction StepType = "action" // Reusable action/template + StepInclude StepType = "include" // External config include +) diff --git a/internal/github/batch.go b/internal/github/batch.go index ebb0618..b9b6bcd 100644 --- a/internal/github/batch.go +++ b/internal/github/batch.go @@ -138,7 +138,7 @@ func (c *Client) BatchScan(ctx context.Context, repos []RepoInfo, opts BatchOpti // Store the repo as scanned but with 0 findings so we don't retry emptyResult := &scanner.ScanResult{Path: fmt.Sprintf("%s/%s", repo.Owner, repo.Name)} if saveErr := opts.DB.SaveResult(repo.Owner, repo.Name, repo.Stars, repo.Language, emptyResult); saveErr != nil { - fmt.Printf(" Warning: could not save error state: %v\n", saveErr) + fmt.Fprintf(os.Stderr, " Warning: could not save error state: %v\n", saveErr) } continue } diff --git a/internal/github/client.go b/internal/github/client.go index 3eba055..ef77c45 100644 --- a/internal/github/client.go +++ b/internal/github/client.go @@ -3,6 +3,7 @@ package github import ( "context" "fmt" + "os" "strings" gh "github.com/google/go-github/v60/github" @@ -54,7 +55,7 @@ func (c *Client) FetchWorkflows(ctx context.Context, owner, repo string) ([]Work content, err := c.fetchFileContent(ctx, owner, repo, entry.GetPath()) if err != nil { - fmt.Printf(" Warning: could not fetch %s: %v\n", entry.GetPath(), err) + fmt.Fprintf(os.Stderr, " Warning: could not fetch %s: %v\n", entry.GetPath(), err) continue } @@ -99,7 +100,7 @@ func (c *Client) ScanRemote(ctx context.Context, owner, repo string, opts scanne for _, wf := range workflows { findings, err := scanner.ScanWorkflowBytes(wf.Content, wf.Path, opts) if err != nil { - fmt.Printf(" Warning: could not parse %s: %v\n", 
wf.Path, err) + fmt.Fprintf(os.Stderr, " Warning: could not parse %s: %v\n", wf.Path, err) continue } result.Findings = append(result.Findings, findings...) diff --git a/internal/github/discover.go b/internal/github/discover.go index db01a78..e1edbe4 100644 --- a/internal/github/discover.go +++ b/internal/github/discover.go @@ -3,6 +3,7 @@ package github import ( "context" "fmt" + "os" "time" gh "github.com/google/go-github/v60/github" @@ -76,7 +77,7 @@ func (c *Client) DiscoverRepos(ctx context.Context, opts DiscoverOptions) ([]Rep return r, resp, err }) if err != nil { - fmt.Printf(" Warning: could not fetch repo info for %s: %v\n", key, err) + fmt.Fprintf(os.Stderr, " Warning: could not fetch repo info for %s: %v\n", key, err) continue } stars = fullRepo.GetStargazersCount() diff --git a/internal/report/table.go b/internal/report/table.go index f3f17ba..d2e0f37 100644 --- a/internal/report/table.go +++ b/internal/report/table.go @@ -11,7 +11,7 @@ import ( // WriteTable writes findings as a human-readable table. 
func WriteTable(w io.Writer, result *scanner.ScanResult) { - fmt.Fprintln(w, "fluxgate v0.1.0 — CI/CD Pipeline Security Gate") + fmt.Fprintln(w, "fluxgate v0.5.0 — CI/CD Pipeline Security Gate") fmt.Fprintln(w) fmt.Fprintf(w, "Scanning: %s\n\n", result.Path) diff --git a/internal/scanner/rules.go b/internal/scanner/rules.go index 15879d8..7af2155 100644 --- a/internal/scanner/rules.go +++ b/internal/scanner/rules.go @@ -19,6 +19,9 @@ func AllRules() map[string]Rule { "FG-005": CheckSecretsInLogs, "FG-006": CheckForkPRCodeExec, "FG-007": CheckTokenExposure, + "FG-008": CheckOIDCMisconfiguration, + "FG-009": CheckSelfHostedRunner, + "FG-010": CheckCachePoisoning, } } @@ -31,6 +34,9 @@ var RuleDescriptions = map[string]string{ "FG-005": "Secrets in Logs", "FG-006": "Fork PR Code Execution", "FG-007": "Token Exposure in Build Steps", + "FG-008": "OIDC Misconfiguration", + "FG-009": "Self-Hosted Runner", + "FG-010": "Cache Poisoning", } // ExecutionAnalysis captures the result of post-checkout step analysis. @@ -1001,6 +1007,328 @@ func containsExpression(run, expr string) bool { strings.Contains(run, "${{"+expr) } +// githubHostedPrefixes lists runner labels that indicate GitHub-hosted runners. +var githubHostedPrefixes = []string{ + "ubuntu-", "windows-", "macos-", +} + +// githubHostedExact lists exact runner labels that indicate GitHub-hosted runners. +var githubHostedExact = []string{ + "ubuntu-latest", "windows-latest", "macos-latest", + "macos-13", "macos-14", "macos-15", +} + +// isSelfHostedRunner checks if the runs-on labels indicate a self-hosted runner. 
// isSelfHostedRunner reports whether the runs-on labels explicitly request a
// self-hosted runner.
//
// Policy: only the literal "self-hosted" label (case-insensitive) counts.
// Labels that merely fail to match known GitHub-hosted patterns are NOT
// flagged, because custom runner-group names would otherwise cause false
// positives. The previous implementation additionally classified each label
// against githubHostedExact/githubHostedPrefixes, but every path through
// that pass returned false anyway, so the dead classification loop is
// removed here with identical behavior.
//
// NOTE(review): this now matches isSelfHostedExplicit exactly — consider
// consolidating the two helpers.
func isSelfHostedRunner(labels []string) bool {
	for _, label := range labels {
		if strings.EqualFold(label, "self-hosted") {
			return true
		}
	}
	return false
}
+func CheckOIDCMisconfiguration(wf *Workflow) []Finding { + var findings []Finding + + isExternalTrigger := wf.On.PullRequestTarget || wf.On.IssueComment + + for jobName, job := range wf.Jobs { + hasIDTokenWrite := false + + // Check job-level permissions first, then workflow-level + if job.Permissions.Scopes["id-token"] == "write" { + hasIDTokenWrite = true + } else if !job.Permissions.Set && wf.Permissions.Scopes["id-token"] == "write" { + hasIDTokenWrite = true + } + + if !hasIDTokenWrite { + continue + } + + // Check for cloud credential actions + var cloudActions []string + for _, step := range job.Steps { + if step.Uses == "" { + continue + } + actionName := step.Uses + if idx := strings.Index(actionName, "@"); idx != -1 { + actionName = actionName[:idx] + } + switch actionName { + case "aws-actions/configure-aws-credentials", + "google-github-actions/auth", + "azure/login", + "hashicorp/vault-action": + cloudActions = append(cloudActions, actionName) + } + } + + if wf.On.PullRequestTarget { + // Check if this job also checks out fork code + checkoutsFork := false + for _, step := range job.Steps { + if isCheckoutAction(step.Uses) && refPointsToPRHead(step.With["ref"]) { + checkoutsFork = true + break + } + } + + severity := SeverityHigh + if checkoutsFork { + severity = SeverityCritical + } + + msg := fmt.Sprintf("OIDC Misconfiguration: id-token:write on pull_request_target workflow (job '%s')", jobName) + if len(cloudActions) > 0 { + msg += fmt.Sprintf(" with cloud auth (%s)", strings.Join(cloudActions, ", ")) + } + if checkoutsFork { + msg += " — attacker can mint cloud credentials via fork PR" + } + + findings = append(findings, Finding{ + RuleID: "FG-008", + Severity: severity, + File: wf.Path, + Line: 1, + Message: msg, + Details: "id-token:write on pull_request_target allows fork PR authors to request OIDC tokens. 
If cloud providers have overly broad sub claims, attacker can assume cloud roles.", + }) + } else if isExternalTrigger { + findings = append(findings, Finding{ + RuleID: "FG-008", + Severity: SeverityMedium, + File: wf.Path, + Line: 1, + Message: fmt.Sprintf("OIDC Misconfiguration: id-token:write on externally-triggered workflow (job '%s')", jobName), + Details: "id-token:write on external triggers may allow token minting by untrusted actors depending on trigger type.", + }) + } else if len(cloudActions) > 0 { + // Informational: OIDC with cloud actions on non-external triggers + findings = append(findings, Finding{ + RuleID: "FG-008", + Severity: SeverityInfo, + File: wf.Path, + Line: 1, + Message: fmt.Sprintf("OIDC Configuration: id-token:write with cloud auth in job '%s' (%s)", jobName, strings.Join(cloudActions, ", ")), + }) + } + } + return findings +} + +// CheckSelfHostedRunner detects self-hosted runners on workflows that accept +// external input, which risks runner persistence and lateral movement (FG-009). 
+func CheckSelfHostedRunner(wf *Workflow) []Finding { + var findings []Finding + + acceptsExternalPRs := wf.On.PullRequest || wf.On.PullRequestTarget + + for jobName, job := range wf.Jobs { + if !isSelfHostedExplicit(job.RunsOn) { + continue + } + + // Self-hosted runner on a workflow that accepts external PRs + if acceptsExternalPRs { + severity := SeverityHigh + msg := fmt.Sprintf( + "Self-Hosted Runner: job '%s' runs on self-hosted runner with external PR trigger", + jobName) + + if wf.On.PullRequestTarget { + // Check if it also checks out fork code + for _, step := range job.Steps { + if isCheckoutAction(step.Uses) && refPointsToPRHead(step.With["ref"]) { + severity = SeverityCritical + msg += " — fork code executes on self-hosted runner (persistence risk)" + break + } + } + } + + // Check for fork guard mitigation + if job.If != "" && containsForkGuard(job.If) { + severity = SeverityInfo + msg += " (mitigated: fork guard)" + } + + findings = append(findings, Finding{ + RuleID: "FG-009", + Severity: severity, + File: wf.Path, + Line: 1, + Message: msg, + Details: "Self-hosted runners on public repos accepting PRs allow fork authors to execute code on persistent infrastructure. Non-ephemeral runners risk credential theft, backdoor installation, and lateral movement.", + }) + } else if wf.On.IssueComment || wf.On.WorkflowRun { + findings = append(findings, Finding{ + RuleID: "FG-009", + Severity: SeverityMedium, + File: wf.Path, + Line: 1, + Message: fmt.Sprintf("Self-Hosted Runner: job '%s' runs on self-hosted runner with external trigger", jobName), + Details: "Self-hosted runners on externally-triggered workflows may allow untrusted code execution on persistent infrastructure.", + }) + } + } + return findings +} + +// isSelfHostedExplicit checks if runs-on labels explicitly include "self-hosted". 
+func isSelfHostedExplicit(labels []string) bool { + for _, label := range labels { + if strings.ToLower(label) == "self-hosted" { + return true + } + } + return false +} + +// CheckCachePoisoning detects shared cache usage across trust boundaries that +// could enable cache poisoning attacks (FG-010). +func CheckCachePoisoning(wf *Workflow) []Finding { + var findings []Finding + + isExternalTrigger := wf.On.PullRequestTarget || wf.On.PullRequest + + if !isExternalTrigger { + return nil + } + + for jobName, job := range wf.Jobs { + for _, step := range job.Steps { + if step.Uses == "" { + continue + } + actionName := step.Uses + if idx := strings.Index(actionName, "@"); idx != -1 { + actionName = actionName[:idx] + } + + // Detect actions/cache with save on PR workflows + if actionName == "actions/cache" || actionName == "actions/cache/save" { + // Check if cache key uses attacker-controllable inputs + cacheKey := step.With["key"] + hasAttackerInput := false + attackerInputs := []string{ + "github.head_ref", + "github.event.pull_request", + "hashFiles(", + } + for _, input := range attackerInputs { + if strings.Contains(cacheKey, input) { + hasAttackerInput = true + break + } + } + + severity := SeverityMedium + if wf.On.PullRequestTarget { + severity = SeverityHigh + } + + msg := fmt.Sprintf( + "Cache Poisoning: actions/cache in job '%s' on %s workflow", + jobName, describeTrigger(wf)) + if hasAttackerInput { + msg += " with attacker-controllable cache key" + } + + // Check if the job also checks out fork code and executes it + hasExec := false + if wf.On.PullRequestTarget { + for _, s := range job.Steps { + if isCheckoutAction(s.Uses) && refPointsToPRHead(s.With["ref"]) { + postSteps := job.Steps[0:] // simplified — flag the pattern + execResult := analyzePostCheckoutExecution(postSteps) + if execResult.Confirmed || execResult.Likely { + hasExec = true + } + break + } + } + } + + if hasExec { + severity = SeverityHigh + msg += " — fork code execution can poison 
cache for subsequent runs" + } + + findings = append(findings, Finding{ + RuleID: "FG-010", + Severity: severity, + File: wf.Path, + Line: step.Line, + Message: msg, + Details: "Shared caches on PR workflows allow fork authors to poison cache entries. Poisoned caches persist and affect subsequent builds on the default branch, enabling code execution via dependency or build artifact replacement.", + }) + } + + // Detect setup-* actions with built-in caching + if strings.HasPrefix(actionName, "actions/setup-") { + if step.With["cache"] != "" && step.With["cache"] != "false" { + findings = append(findings, Finding{ + RuleID: "FG-010", + Severity: SeverityLow, + File: wf.Path, + Line: step.Line, + Message: fmt.Sprintf( + "Cache Poisoning: %s with cache enabled in job '%s' on %s workflow", + actionName, jobName, describeTrigger(wf)), + Details: "Setup actions with built-in caching on PR workflows may share cache with default branch builds.", + }) + } + } + } + } + return findings +} + +// describeTrigger returns a human-readable description of the workflow trigger. 
+func describeTrigger(wf *Workflow) string { + if wf.On.PullRequestTarget { + return "pull_request_target" + } + if wf.On.PullRequest { + return "pull_request" + } + return "external" +} + func describePermissions(wfPerms, jobPerms PermissionsConfig) string { if wfPerms.WriteAll || jobPerms.WriteAll { return "write-all" diff --git a/internal/scanner/rules_test.go b/internal/scanner/rules_test.go index be97aed..deafa08 100644 --- a/internal/scanner/rules_test.go +++ b/internal/scanner/rules_test.go @@ -459,6 +459,136 @@ func TestCheckPwnRequest_PathExec(t *testing.T) { } } +// --- FG-008 OIDC tests --- + +func TestCheckOIDC_PRTWithForkCheckout(t *testing.T) { + wf := loadFixture(t, "oidc-prt-fork-checkout.yaml") + findings := CheckOIDCMisconfiguration(wf) + + if len(findings) != 1 { + t.Fatalf("expected 1 finding, got %d", len(findings)) + } + f := findings[0] + if f.RuleID != "FG-008" { + t.Errorf("expected FG-008, got %s", f.RuleID) + } + if f.Severity != SeverityCritical { + t.Errorf("expected critical (PRT + fork checkout + cloud auth), got %s", f.Severity) + } + if !strings.Contains(f.Message, "aws-actions/configure-aws-credentials") { + t.Error("expected message to mention AWS auth action") + } +} + +func TestCheckOIDC_PRTNoFork(t *testing.T) { + wf := loadFixture(t, "oidc-prt-no-fork.yaml") + findings := CheckOIDCMisconfiguration(wf) + + if len(findings) != 1 { + t.Fatalf("expected 1 finding, got %d", len(findings)) + } + f := findings[0] + if f.Severity != SeverityHigh { + t.Errorf("expected high (PRT but no fork checkout), got %s", f.Severity) + } +} + +func TestCheckOIDC_PushOnly(t *testing.T) { + wf := loadFixture(t, "oidc-push-only.yaml") + findings := CheckOIDCMisconfiguration(wf) + + if len(findings) != 1 { + t.Fatalf("expected 1 info finding for push-only OIDC, got %d", len(findings)) + } + if findings[0].Severity != SeverityInfo { + t.Errorf("expected info for push-only OIDC, got %s", findings[0].Severity) + } +} + +// --- FG-009 Self-Hosted Runner 
tests --- + +func TestCheckSelfHosted_PR(t *testing.T) { + wf := loadFixture(t, "self-hosted-pr.yaml") + findings := CheckSelfHostedRunner(wf) + + if len(findings) != 1 { + t.Fatalf("expected 1 finding, got %d", len(findings)) + } + f := findings[0] + if f.RuleID != "FG-009" { + t.Errorf("expected FG-009, got %s", f.RuleID) + } + if f.Severity != SeverityHigh { + t.Errorf("expected high severity for self-hosted on PR, got %s", f.Severity) + } +} + +func TestCheckSelfHosted_PRTFork(t *testing.T) { + wf := loadFixture(t, "self-hosted-prt-fork.yaml") + findings := CheckSelfHostedRunner(wf) + + if len(findings) != 1 { + t.Fatalf("expected 1 finding, got %d", len(findings)) + } + f := findings[0] + if f.Severity != SeverityCritical { + t.Errorf("expected critical (PRT + fork checkout on self-hosted), got %s", f.Severity) + } +} + +func TestCheckSelfHosted_PushOnly(t *testing.T) { + wf := loadFixture(t, "self-hosted-push-only.yaml") + findings := CheckSelfHostedRunner(wf) + + if len(findings) != 0 { + t.Fatalf("expected 0 findings for push-only self-hosted, got %d", len(findings)) + } +} + +// --- FG-010 Cache Poisoning tests --- + +func TestCheckCachePoisoning_PRT(t *testing.T) { + wf := loadFixture(t, "cache-poisoning-prt.yaml") + findings := CheckCachePoisoning(wf) + + if len(findings) != 1 { + t.Fatalf("expected 1 finding, got %d", len(findings)) + } + f := findings[0] + if f.RuleID != "FG-010" { + t.Errorf("expected FG-010, got %s", f.RuleID) + } + if f.Severity != SeverityHigh { + t.Errorf("expected high for cache on PRT with fork exec, got %s", f.Severity) + } +} + +func TestCheckCachePoisoning_PR(t *testing.T) { + wf := loadFixture(t, "cache-poisoning-pr.yaml") + findings := CheckCachePoisoning(wf) + + if len(findings) != 1 { + t.Fatalf("expected 1 finding, got %d", len(findings)) + } + f := findings[0] + if f.Severity != SeverityMedium { + t.Errorf("expected medium for cache on PR, got %s", f.Severity) + } +} + +func TestCheckCachePoisoning_SetupAction(t 
*testing.T) { + wf := loadFixture(t, "cache-setup-action.yaml") + findings := CheckCachePoisoning(wf) + + if len(findings) != 1 { + t.Fatalf("expected 1 finding for setup-node with cache, got %d", len(findings)) + } + f := findings[0] + if f.Severity != SeverityLow { + t.Errorf("expected low for setup-action caching, got %s", f.Severity) + } +} + // --- Filter tests --- func TestSeverityFilter(t *testing.T) { diff --git a/internal/scanner/scanner.go b/internal/scanner/scanner.go index aa5d98b..ea8d8af 100644 --- a/internal/scanner/scanner.go +++ b/internal/scanner/scanner.go @@ -5,6 +5,8 @@ import ( "path/filepath" "sort" "strings" + + "github.com/north-echo/fluxgate/internal/cicd" ) // ScanOptions configures a scan run. @@ -20,39 +22,48 @@ type ScanResult struct { Findings []Finding } -// ScanDirectory scans all workflow files in a directory's .github/workflows/ folder. +// ScanDirectory scans all workflow files in a directory's .github/workflows/ folder, +// and optionally .gitlab-ci.yml if present. 
func ScanDirectory(dir string, opts ScanOptions) (*ScanResult, error) { + result := &ScanResult{Path: dir} + + // Scan GitHub Actions workflows workflowDir := filepath.Join(dir, ".github", "workflows") - info, err := os.Stat(workflowDir) - if err != nil { - return nil, err - } - if !info.IsDir() { - return nil, os.ErrNotExist - } + if info, err := os.Stat(workflowDir); err == nil && info.IsDir() { + entries, err := os.ReadDir(workflowDir) + if err != nil { + return nil, err + } - entries, err := os.ReadDir(workflowDir) - if err != nil { - return nil, err - } + for _, entry := range entries { + if entry.IsDir() { + continue + } + name := entry.Name() + if !strings.HasSuffix(name, ".yaml") && !strings.HasSuffix(name, ".yml") { + continue + } - result := &ScanResult{Path: workflowDir} - for _, entry := range entries { - if entry.IsDir() { - continue - } - name := entry.Name() - if !strings.HasSuffix(name, ".yaml") && !strings.HasSuffix(name, ".yml") { - continue + path := filepath.Join(workflowDir, name) + findings, err := ScanFile(path, opts) + if err != nil { + continue // skip unparseable files + } + result.Workflows++ + result.Findings = append(result.Findings, findings...) } + } - path := filepath.Join(workflowDir, name) - findings, err := ScanFile(path, opts) - if err != nil { - continue // skip unparseable files - } + // Scan GitLab CI if .gitlab-ci.yml exists + gitlabPath := filepath.Join(dir, ".gitlab-ci.yml") + if data, err := os.ReadFile(gitlabPath); err == nil { + glFindings := ScanGitLabCI(data, gitlabPath, opts) result.Workflows++ - result.Findings = append(result.Findings, findings...) + result.Findings = append(result.Findings, glFindings...) 
+ } + + if result.Workflows == 0 { + return nil, os.ErrNotExist } sortFindings(result.Findings) @@ -116,6 +127,63 @@ func ScanWorkflowBytes(data []byte, path string, opts ScanOptions) ([]Finding, e return ScanWorkflow(wf, opts), nil } +// ScanGitLabCI parses and scans a .gitlab-ci.yml file, returning findings +// in the common Finding format. +func ScanGitLabCI(data []byte, path string, opts ScanOptions) []Finding { + pipeline, err := cicd.ParseGitLabCI(data, path) + if err != nil { + return nil + } + + glFindings := cicd.ScanGitLabPipeline(pipeline) + + // Convert GitLab findings to common Finding type + var findings []Finding + for _, glf := range glFindings { + f := Finding{ + RuleID: glf.RuleID, + Severity: glf.Severity, + File: glf.File, + Line: glf.Line, + Message: glf.Message, + Details: glf.Details, + } + findings = append(findings, f) + } + + // Apply severity filter + if len(opts.Severities) > 0 { + sevSet := make(map[string]bool) + for _, s := range opts.Severities { + sevSet[strings.ToLower(s)] = true + } + var filtered []Finding + for _, f := range findings { + if sevSet[f.Severity] { + filtered = append(filtered, f) + } + } + findings = filtered + } + + // Apply rule filter + if len(opts.Rules) > 0 { + ruleSet := make(map[string]bool) + for _, r := range opts.Rules { + ruleSet[r] = true + } + var filtered []Finding + for _, f := range findings { + if ruleSet[f.RuleID] { + filtered = append(filtered, f) + } + } + findings = filtered + } + + return findings +} + func sortFindings(findings []Finding) { sort.Slice(findings, func(i, j int) bool { ri := SeverityRank(findings[i].Severity) diff --git a/internal/scanner/workflow.go b/internal/scanner/workflow.go index eb04b00..8f38c52 100644 --- a/internal/scanner/workflow.go +++ b/internal/scanner/workflow.go @@ -45,6 +45,7 @@ type Job struct { Steps []Step Secrets string // "inherit" or empty Needs []string // job IDs this job depends on + RunsOn []string // runner labels (e.g. 
["ubuntu-latest"], ["self-hosted", "linux"]) } // Step represents a single step in a job. @@ -74,6 +75,7 @@ type rawJob struct { Steps []rawStep `yaml:"steps"` Secrets string `yaml:"secrets"` Needs yaml.Node `yaml:"needs"` + RunsOn yaml.Node `yaml:"runs-on"` } type rawStep struct { @@ -128,6 +130,7 @@ func ParseWorkflow(data []byte, path string) (*Workflow, error) { Permissions: parsePermissions(&rawJ.Permissions), Secrets: rawJ.Secrets, Needs: parseNeeds(&rawJ.Needs), + RunsOn: parseRunsOn(&rawJ.RunsOn), } for i, rawS := range rawJ.Steps { step := Step{ @@ -318,6 +321,45 @@ func parseNeeds(node *yaml.Node) []string { return nil } +// parseRunsOn extracts runner labels from the runs-on: field. +// Handles string ("runs-on: ubuntu-latest"), list ("runs-on: [self-hosted, linux]"), +// and group ("runs-on: {group: ...}") forms. +func parseRunsOn(node *yaml.Node) []string { + if node == nil || node.Kind == 0 { + return nil + } + switch node.Kind { + case yaml.ScalarNode: + if node.Value != "" { + return []string{node.Value} + } + case yaml.SequenceNode: + var labels []string + for _, n := range node.Content { + if n.Value != "" { + labels = append(labels, n.Value) + } + } + return labels + case yaml.MappingNode: + // Handle {group: ..., labels: [...]} form + // Prefer labels over group name when both are present + var group string + for i := 0; i < len(node.Content)-1; i += 2 { + if node.Content[i].Value == "labels" { + return parseRunsOn(node.Content[i+1]) + } + if node.Content[i].Value == "group" { + group = node.Content[i+1].Value + } + } + if group != "" { + return []string{group} + } + } + return nil +} + // HasElevatedPermissions checks if the workflow/job has write permissions. 
func HasElevatedPermissions(wfPerms, jobPerms PermissionsConfig) bool { if wfPerms.WriteAll || jobPerms.WriteAll { diff --git a/internal/scanner/workflow_test.go b/internal/scanner/workflow_test.go new file mode 100644 index 0000000..5f2a658 --- /dev/null +++ b/internal/scanner/workflow_test.go @@ -0,0 +1,124 @@ +package scanner + +import ( + "testing" +) + +func TestParseRunsOn_Scalar(t *testing.T) { + yaml := ` +name: CI +on: push +jobs: + build: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 +` + wf, err := ParseWorkflow([]byte(yaml), "test.yml") + if err != nil { + t.Fatal(err) + } + job := wf.Jobs["build"] + if len(job.RunsOn) != 1 || job.RunsOn[0] != "ubuntu-latest" { + t.Errorf("expected [ubuntu-latest], got %v", job.RunsOn) + } +} + +func TestParseRunsOn_Sequence(t *testing.T) { + yaml := ` +name: CI +on: push +jobs: + build: + runs-on: [self-hosted, linux, gpu] + steps: + - uses: actions/checkout@v4 +` + wf, err := ParseWorkflow([]byte(yaml), "test.yml") + if err != nil { + t.Fatal(err) + } + job := wf.Jobs["build"] + if len(job.RunsOn) != 3 { + t.Fatalf("expected 3 labels, got %d: %v", len(job.RunsOn), job.RunsOn) + } + if job.RunsOn[0] != "self-hosted" { + t.Errorf("expected first label 'self-hosted', got %s", job.RunsOn[0]) + } +} + +func TestParseRunsOn_Group(t *testing.T) { + yaml := ` +name: CI +on: push +jobs: + build: + runs-on: + group: large-runners + labels: [linux, x64] + steps: + - uses: actions/checkout@v4 +` + wf, err := ParseWorkflow([]byte(yaml), "test.yml") + if err != nil { + t.Fatal(err) + } + job := wf.Jobs["build"] + if len(job.RunsOn) != 2 { + t.Fatalf("expected 2 labels from group, got %d: %v", len(job.RunsOn), job.RunsOn) + } +} + +func TestParseNeeds_String(t *testing.T) { + yaml := ` +name: CI +on: push +jobs: + build: + runs-on: ubuntu-latest + steps: + - run: echo build + test: + needs: build + runs-on: ubuntu-latest + steps: + - run: echo test +` + wf, err := ParseWorkflow([]byte(yaml), "test.yml") + if err != 
nil { + t.Fatal(err) + } + job := wf.Jobs["test"] + if len(job.Needs) != 1 || job.Needs[0] != "build" { + t.Errorf("expected [build], got %v", job.Needs) + } +} + +func TestParseNeeds_Sequence(t *testing.T) { + yaml := ` +name: CI +on: push +jobs: + lint: + runs-on: ubuntu-latest + steps: + - run: echo lint + build: + runs-on: ubuntu-latest + steps: + - run: echo build + test: + needs: [lint, build] + runs-on: ubuntu-latest + steps: + - run: echo test +` + wf, err := ParseWorkflow([]byte(yaml), "test.yml") + if err != nil { + t.Fatal(err) + } + job := wf.Jobs["test"] + if len(job.Needs) != 2 { + t.Fatalf("expected 2 needs, got %d: %v", len(job.Needs), job.Needs) + } +} diff --git a/scripts/analysis-queries.sql b/scripts/analysis-queries.sql new file mode 100644 index 0000000..eb63757 --- /dev/null +++ b/scripts/analysis-queries.sql @@ -0,0 +1,160 @@ +-- Fluxgate Analysis Queries +-- Run against any scan SQLite database: sqlite3 < analysis-queries.sql +-- Or use individual queries interactively. 
+ +-- ============================================================================= +-- OVERVIEW +-- ============================================================================= + +-- Total repos scanned and findings +SELECT 'Repos scanned' AS metric, COUNT(*) AS value FROM repos +UNION ALL +SELECT 'Repos with findings', COUNT(*) FROM repos WHERE findings_count > 0 +UNION ALL +SELECT 'Total findings', COUNT(*) FROM findings; + +-- ============================================================================= +-- FINDINGS BY RULE AND SEVERITY +-- ============================================================================= + +SELECT rule_id, severity, COUNT(*) AS count +FROM findings +GROUP BY rule_id, severity +ORDER BY rule_id, + CASE severity + WHEN 'critical' THEN 1 + WHEN 'high' THEN 2 + WHEN 'medium' THEN 3 + WHEN 'low' THEN 4 + WHEN 'info' THEN 5 + END; + +-- ============================================================================= +-- FG-001 (PWN REQUEST) DETAILS +-- ============================================================================= + +-- All FG-001 critical and high findings with repo context +SELECT r.owner || '/' || r.name AS repo, + r.stars, + f.severity, + f.workflow_path, + f.description +FROM findings f +JOIN repos r ON r.id = f.repo_id +WHERE f.rule_id = 'FG-001' + AND f.severity IN ('critical', 'high') +ORDER BY + CASE f.severity WHEN 'critical' THEN 1 WHEN 'high' THEN 2 END, + r.stars DESC; + +-- FG-001 mitigated vs unmitigated breakdown +SELECT + CASE + WHEN f.description LIKE '%mitigated%' THEN 'mitigated' + ELSE 'unmitigated' + END AS status, + f.severity, + COUNT(*) AS count +FROM findings f +WHERE f.rule_id = 'FG-001' +GROUP BY status, f.severity +ORDER BY status, + CASE f.severity WHEN 'critical' THEN 1 WHEN 'high' THEN 2 WHEN 'medium' THEN 3 END; + +-- ============================================================================= +-- NEW RULES (FG-008, FG-009, FG-010) PREVALENCE +-- 
============================================================================= + +-- OIDC misconfiguration findings +SELECT r.owner || '/' || r.name AS repo, f.severity, f.description +FROM findings f +JOIN repos r ON r.id = f.repo_id +WHERE f.rule_id = 'FG-008' +ORDER BY CASE f.severity WHEN 'critical' THEN 1 WHEN 'high' THEN 2 ELSE 3 END; + +-- Self-hosted runner findings +SELECT r.owner || '/' || r.name AS repo, f.severity, f.description +FROM findings f +JOIN repos r ON r.id = f.repo_id +WHERE f.rule_id = 'FG-009' +ORDER BY CASE f.severity WHEN 'critical' THEN 1 WHEN 'high' THEN 2 ELSE 3 END; + +-- Cache poisoning findings +SELECT r.owner || '/' || r.name AS repo, f.severity, f.description +FROM findings f +JOIN repos r ON r.id = f.repo_id +WHERE f.rule_id = 'FG-010' +ORDER BY CASE f.severity WHEN 'critical' THEN 1 WHEN 'high' THEN 2 ELSE 3 END; + +-- ============================================================================= +-- ECOSYSTEM COMPARISON (for cross-database analysis) +-- ============================================================================= + +-- Prevalence rate: % of repos with each rule +SELECT + f.rule_id, + COUNT(DISTINCT f.repo_id) AS repos_affected, + (SELECT COUNT(*) FROM repos) AS total_repos, + ROUND(100.0 * COUNT(DISTINCT f.repo_id) / (SELECT COUNT(*) FROM repos), 1) AS pct +FROM findings f +GROUP BY f.rule_id +ORDER BY repos_affected DESC; + +-- Star-weighted severity: high-star repos with critical findings +SELECT r.owner || '/' || r.name AS repo, + r.stars, + COUNT(*) AS critical_findings +FROM findings f +JOIN repos r ON r.id = f.repo_id +WHERE f.severity = 'critical' +GROUP BY r.id +ORDER BY r.stars DESC +LIMIT 20; + +-- ============================================================================= +-- PERMISSIONS POSTURE +-- ============================================================================= + +-- Repos with no permissions block (FG-004/FG-006) +SELECT r.owner || '/' || r.name AS repo, r.stars, f.workflow_path, 
f.description +FROM findings f +JOIN repos r ON r.id = f.repo_id +WHERE f.rule_id IN ('FG-004', 'FG-006') + AND f.severity IN ('high', 'medium') +ORDER BY r.stars DESC +LIMIT 30; + +-- ============================================================================= +-- TAG PINNING (FG-003) +-- ============================================================================= + +-- Unpinned actions by severity +SELECT f.severity, COUNT(*) AS count +FROM findings f +WHERE f.rule_id = 'FG-003' +GROUP BY f.severity +ORDER BY CASE f.severity WHEN 'high' THEN 1 WHEN 'medium' THEN 2 WHEN 'info' THEN 3 END; + +-- High-severity unpinned actions (branch-pinned: @main/@master) +SELECT r.owner || '/' || r.name AS repo, f.workflow_path, f.description +FROM findings f +JOIN repos r ON r.id = f.repo_id +WHERE f.rule_id = 'FG-003' AND f.severity = 'high' +ORDER BY r.stars DESC +LIMIT 20; + +-- ============================================================================= +-- AGGREGATE STATS FOR BLOG POST / CFP +-- ============================================================================= + +-- Summary statistics +SELECT + (SELECT COUNT(*) FROM repos) AS total_repos, + (SELECT COUNT(*) FROM findings) AS total_findings, + (SELECT COUNT(*) FROM findings WHERE severity = 'critical') AS critical, + (SELECT COUNT(*) FROM findings WHERE severity = 'high') AS high, + (SELECT COUNT(*) FROM findings WHERE rule_id = 'FG-001') AS pwn_requests, + (SELECT COUNT(*) FROM findings WHERE rule_id = 'FG-001' AND severity = 'critical') AS unmitigated_pwn, + (SELECT COUNT(DISTINCT repo_id) FROM findings WHERE rule_id = 'FG-001') AS repos_with_pwn, + ROUND(100.0 * (SELECT COUNT(DISTINCT repo_id) FROM findings WHERE rule_id = 'FG-001') + / (SELECT COUNT(*) FROM repos), 1) AS pwn_pct; diff --git a/scripts/bigquery-prt-workflows.sql b/scripts/bigquery-prt-workflows.sql new file mode 100644 index 0000000..e07a3d9 --- /dev/null +++ b/scripts/bigquery-prt-workflows.sql @@ -0,0 +1,43 @@ +-- BigQuery: Extract all 
GitHub Actions workflows containing pull_request_target +-- Target table: bigquery-public-data.github_repos.contents + files +-- Free tier: 1 TiB/month processing +-- +-- Usage: +-- 1. Run this query in BigQuery console (console.cloud.google.com/bigquery) +-- 2. Export results as JSONL to GCS or download directly +-- 3. Feed into: fluxgate ingest exported.jsonl --db bigquery.db +-- +-- Estimated cost: ~$2.50 (scans ~500GB of content table) +-- Estimated rows: 50K-200K workflows with pull_request_target + +SELECT + f.repo_name AS repo, + f.path AS path, + c.content AS content +FROM + `bigquery-public-data.github_repos.files` f +JOIN + `bigquery-public-data.github_repos.contents` c +ON + f.id = c.id +WHERE + f.path LIKE '.github/workflows/%.yml' + AND c.content LIKE '%pull_request_target%' + AND c.size < 1048576 -- Skip files > 1MB (likely generated) + +UNION ALL + +SELECT + f.repo_name AS repo, + f.path AS path, + c.content AS content +FROM + `bigquery-public-data.github_repos.files` f +JOIN + `bigquery-public-data.github_repos.contents` c +ON + f.id = c.id +WHERE + f.path LIKE '.github/workflows/%.yaml' + AND c.content LIKE '%pull_request_target%' + AND c.size < 1048576 diff --git a/test/fixtures/cache-poisoning-pr.yaml b/test/fixtures/cache-poisoning-pr.yaml new file mode 100644 index 0000000..359fc32 --- /dev/null +++ b/test/fixtures/cache-poisoning-pr.yaml @@ -0,0 +1,15 @@ +name: CI +on: + pull_request: + branches: [main] + +jobs: + test: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - uses: actions/cache@v4 + with: + path: node_modules + key: deps-${{ github.head_ref }}-${{ hashFiles('package-lock.json') }} + - run: npm ci && npm test diff --git a/test/fixtures/cache-poisoning-prt.yaml b/test/fixtures/cache-poisoning-prt.yaml new file mode 100644 index 0000000..06d8ffc --- /dev/null +++ b/test/fixtures/cache-poisoning-prt.yaml @@ -0,0 +1,17 @@ +name: Build and Test +on: + pull_request_target: + types: [opened, synchronize] + +jobs: + build: 
+ runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + with: + ref: ${{ github.event.pull_request.head.sha }} + - uses: actions/cache@v4 + with: + path: ~/.npm + key: npm-${{ hashFiles('package-lock.json') }} + - run: npm ci && npm run build diff --git a/test/fixtures/cache-setup-action.yaml b/test/fixtures/cache-setup-action.yaml new file mode 100644 index 0000000..937c96e --- /dev/null +++ b/test/fixtures/cache-setup-action.yaml @@ -0,0 +1,15 @@ +name: CI +on: + pull_request: + branches: [main] + +jobs: + test: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - uses: actions/setup-node@v4 + with: + node-version: 20 + cache: npm + - run: npm ci && npm test diff --git a/test/fixtures/gitlab-mr-secrets.yml b/test/fixtures/gitlab-mr-secrets.yml new file mode 100644 index 0000000..788e6ac --- /dev/null +++ b/test/fixtures/gitlab-mr-secrets.yml @@ -0,0 +1,16 @@ +stages: + - test + - deploy + +variables: + DEPLOY_TOKEN: $CI_JOB_TOKEN + +test: + stage: test + tags: + - docker + rules: + - if: '$CI_PIPELINE_SOURCE == "merge_request_event"' + script: + - pip install -r requirements.txt + - pytest tests/ diff --git a/test/fixtures/gitlab-safe.yml b/test/fixtures/gitlab-safe.yml new file mode 100644 index 0000000..fa4cc5c --- /dev/null +++ b/test/fixtures/gitlab-safe.yml @@ -0,0 +1,9 @@ +stages: + - test + +test: + stage: test + rules: + - if: '$CI_PIPELINE_SOURCE == "push"' + script: + - go test ./... 
diff --git a/test/fixtures/gitlab-script-injection.yml b/test/fixtures/gitlab-script-injection.yml new file mode 100644 index 0000000..7d5d3ba --- /dev/null +++ b/test/fixtures/gitlab-script-injection.yml @@ -0,0 +1,12 @@ +stages: + - lint + +check_title: + stage: lint + script: + - echo "Checking MR title: $CI_MERGE_REQUEST_TITLE" + - | + if echo "$CI_MERGE_REQUEST_TITLE" | grep -q "WIP"; then + echo "MR is WIP, skipping" + exit 0 + fi diff --git a/test/fixtures/gitlab-unsafe-include.yml b/test/fixtures/gitlab-unsafe-include.yml new file mode 100644 index 0000000..38fb155 --- /dev/null +++ b/test/fixtures/gitlab-unsafe-include.yml @@ -0,0 +1,13 @@ +include: + - remote: 'https://example.com/ci-templates/lint.yml' + - project: 'my-group/ci-templates' + file: '/templates/deploy.yml' + +stages: + - lint + - deploy + +lint: + stage: lint + script: + - eslint src/ diff --git a/test/fixtures/oidc-prt-fork-checkout.yaml b/test/fixtures/oidc-prt-fork-checkout.yaml new file mode 100644 index 0000000..e993caa --- /dev/null +++ b/test/fixtures/oidc-prt-fork-checkout.yaml @@ -0,0 +1,20 @@ +name: Deploy Preview +on: + pull_request_target: + types: [opened, synchronize] + +jobs: + deploy: + runs-on: ubuntu-latest + permissions: + id-token: write + contents: read + steps: + - uses: actions/checkout@v4 + with: + ref: ${{ github.event.pull_request.head.sha }} + - uses: aws-actions/configure-aws-credentials@v4 + with: + role-to-assume: arn:aws:iam::123456789:role/deploy + aws-region: us-east-1 + - run: npm ci && npm run build diff --git a/test/fixtures/oidc-prt-no-fork.yaml b/test/fixtures/oidc-prt-no-fork.yaml new file mode 100644 index 0000000..7e2603f --- /dev/null +++ b/test/fixtures/oidc-prt-no-fork.yaml @@ -0,0 +1,17 @@ +name: Label PR +on: + pull_request_target: + types: [opened] + +jobs: + label: + runs-on: ubuntu-latest + permissions: + id-token: write + pull-requests: write + steps: + - uses: actions/checkout@v4 + - uses: google-github-actions/auth@v2 + with: + 
workload_identity_provider: projects/123/locations/global/workloadIdentityPools/pool/providers/provider + - run: echo "labeling" diff --git a/test/fixtures/oidc-push-only.yaml b/test/fixtures/oidc-push-only.yaml new file mode 100644 index 0000000..7479e72 --- /dev/null +++ b/test/fixtures/oidc-push-only.yaml @@ -0,0 +1,18 @@ +name: Deploy Production +on: + push: + branches: [main] + +jobs: + deploy: + runs-on: ubuntu-latest + permissions: + id-token: write + contents: read + steps: + - uses: actions/checkout@v4 + - uses: aws-actions/configure-aws-credentials@v4 + with: + role-to-assume: arn:aws:iam::123456789:role/deploy + aws-region: us-east-1 + - run: npm run deploy diff --git a/test/fixtures/self-hosted-pr.yaml b/test/fixtures/self-hosted-pr.yaml new file mode 100644 index 0000000..a3187aa --- /dev/null +++ b/test/fixtures/self-hosted-pr.yaml @@ -0,0 +1,12 @@ +name: GPU Tests +on: + pull_request: + branches: [main] + +jobs: + gpu-test: + runs-on: [self-hosted, linux, gpu] + steps: + - uses: actions/checkout@v4 + - run: pip install -e ".[test]" + - run: pytest tests/gpu/ diff --git a/test/fixtures/self-hosted-prt-fork.yaml b/test/fixtures/self-hosted-prt-fork.yaml new file mode 100644 index 0000000..738eb99 --- /dev/null +++ b/test/fixtures/self-hosted-prt-fork.yaml @@ -0,0 +1,13 @@ +name: Integration Tests +on: + pull_request_target: + types: [opened, synchronize] + +jobs: + test: + runs-on: [self-hosted, linux] + steps: + - uses: actions/checkout@v4 + with: + ref: ${{ github.event.pull_request.head.sha }} + - run: make test diff --git a/test/fixtures/self-hosted-push-only.yaml b/test/fixtures/self-hosted-push-only.yaml new file mode 100644 index 0000000..e286ba5 --- /dev/null +++ b/test/fixtures/self-hosted-push-only.yaml @@ -0,0 +1,11 @@ +name: Release Build +on: + push: + tags: ['v*'] + +jobs: + build: + runs-on: [self-hosted, linux] + steps: + - uses: actions/checkout@v4 + - run: make release From cf8a405a26c5613c33720297d725c91b6615f8c8 Mon Sep 17 00:00:00 
2001 From: Christopher Lusk Date: Sun, 22 Mar 2026 21:01:53 -0400 Subject: [PATCH 03/10] feat: Azure Pipelines security analysis with 4 rules Add cross-platform Azure DevOps Pipelines support: - AZ-001: PR builds with secret/variable group exposure to forks - AZ-002: Script injection via predefined variables (Build.SourceBranchName, etc.) - AZ-003: Unpinned template extends and repository resources - AZ-009: Self-hosted agent pools on PR-triggered pipelines Parser handles all Azure Pipelines YAML structures: stages, jobs, single-job (root steps), deployment jobs, pool inheritance, resources, and extends templates. Environment protection reduces AZ-009 severity from high to medium. Wired into ScanDirectory for automatic detection of azure-pipelines.yml alongside GitHub Actions and GitLab CI. 8 tests, 5 fixtures. Co-Authored-By: Claude Opus 4.6 --- internal/cicd/azure.go | 497 ++++++++++++++++++++++ internal/cicd/azure_rules.go | 261 ++++++++++++ internal/cicd/azure_test.go | 275 ++++++++++++ internal/scanner/scanner.go | 67 +++ test/fixtures/azure-pr-build.yml | 22 + test/fixtures/azure-push-only.yml | 18 + test/fixtures/azure-script-injection.yml | 16 + test/fixtures/azure-self-hosted.yml | 14 + test/fixtures/azure-unpinned-resource.yml | 22 + 9 files changed, 1192 insertions(+) create mode 100644 internal/cicd/azure.go create mode 100644 internal/cicd/azure_rules.go create mode 100644 internal/cicd/azure_test.go create mode 100644 test/fixtures/azure-pr-build.yml create mode 100644 test/fixtures/azure-push-only.yml create mode 100644 test/fixtures/azure-script-injection.yml create mode 100644 test/fixtures/azure-self-hosted.yml create mode 100644 test/fixtures/azure-unpinned-resource.yml diff --git a/internal/cicd/azure.go b/internal/cicd/azure.go new file mode 100644 index 0000000..ead4093 --- /dev/null +++ b/internal/cicd/azure.go @@ -0,0 +1,497 @@ +package cicd + +import ( + "fmt" + + "gopkg.in/yaml.v3" +) + +// AzurePipeline represents a parsed 
azure-pipelines.yml file. +type AzurePipeline struct { + path string + triggers []TriggerType + jobs []PipelineJob + resources []AzureResource + variables []AzureVariable + extends string +} + +// AzureResource represents a pipeline resource reference. +type AzureResource struct { + Type string // "repositories", "pipelines", "containers" + Repository string + Ref string + Endpoint string +} + +// AzureVariable represents a pipeline variable. +type AzureVariable struct { + Name string + Value string + IsSecret bool + Group string +} + +// rawAzurePipeline is the intermediate representation for YAML unmarshalling. +type rawAzurePipeline struct { + Trigger yaml.Node `yaml:"trigger"` + PR yaml.Node `yaml:"pr"` + Schedules []rawAzureSchedule `yaml:"schedules"` + Pool yaml.Node `yaml:"pool"` + Variables yaml.Node `yaml:"variables"` + Resources yaml.Node `yaml:"resources"` + Extends *rawAzureExtends `yaml:"extends"` + Stages []rawAzureStage `yaml:"stages"` + Jobs []rawAzureJob `yaml:"jobs"` + Steps []rawAzureStep `yaml:"steps"` +} + +type rawAzureSchedule struct { + Cron string `yaml:"cron"` +} + +type rawAzureExtends struct { + Template string `yaml:"template"` +} + +type rawAzureStage struct { + Stage string `yaml:"stage"` + DisplayName string `yaml:"displayName"` + DependsOn yaml.Node `yaml:"dependsOn"` + Condition string `yaml:"condition"` + Pool yaml.Node `yaml:"pool"` + Variables yaml.Node `yaml:"variables"` + Jobs []rawAzureJob `yaml:"jobs"` +} + +type rawAzureJob struct { + Job string `yaml:"job"` + Deployment string `yaml:"deployment"` + DisplayName string `yaml:"displayName"` + DependsOn yaml.Node `yaml:"dependsOn"` + Condition string `yaml:"condition"` + Pool yaml.Node `yaml:"pool"` + Variables yaml.Node `yaml:"variables"` + Environment yaml.Node `yaml:"environment"` + Steps []rawAzureStep `yaml:"steps"` + Template string `yaml:"template"` +} + +type rawAzureStep struct { + Script string `yaml:"script"` + Bash string `yaml:"bash"` + Powershell string 
`yaml:"powershell"` + Pwsh string `yaml:"pwsh"` + Task string `yaml:"task"` + Template string `yaml:"template"` + Checkout string `yaml:"checkout"` + DisplayName string `yaml:"displayName"` + Env map[string]string `yaml:"env"` + Inputs map[string]string `yaml:"inputs"` +} + +// ParseAzurePipeline parses an azure-pipelines.yml file. +func ParseAzurePipeline(data []byte, path string) (*AzurePipeline, error) { + var raw rawAzurePipeline + if err := yaml.Unmarshal(data, &raw); err != nil { + return nil, fmt.Errorf("parsing %s: %w", path, err) + } + + pipeline := &AzurePipeline{ + path: path, + } + + // Parse triggers + pipeline.triggers = inferAzureTriggers(&raw) + + // Parse variables + pipeline.variables = parseAzureVariables(&raw.Variables) + + // Parse resources + pipeline.resources = parseAzureResources(&raw.Resources) + + // Parse extends + if raw.Extends != nil { + pipeline.extends = raw.Extends.Template + } + + // Parse jobs from stages, jobs, or steps + defaultPool := parseAzurePool(&raw.Pool) + + if len(raw.Stages) > 0 { + for _, stage := range raw.Stages { + stagePool := parseAzurePool(&stage.Pool) + if stagePool == "" { + stagePool = defaultPool + } + for _, rawJob := range stage.Jobs { + job := convertAzureJob(&rawJob, stagePool) + if stage.Condition != "" { + job.Conditions = append(job.Conditions, stage.Condition) + } + pipeline.jobs = append(pipeline.jobs, job) + } + } + } else if len(raw.Jobs) > 0 { + for _, rawJob := range raw.Jobs { + pipeline.jobs = append(pipeline.jobs, convertAzureJob(&rawJob, defaultPool)) + } + } else if len(raw.Steps) > 0 { + // Single-job pipeline (steps at root level) + job := PipelineJob{ + Name: "__root__", + RunnerType: defaultPool, + } + for _, step := range raw.Steps { + job.Steps = append(job.Steps, convertAzureStep(&step)...) + } + pipeline.jobs = append(pipeline.jobs, job) + } + + return pipeline, nil +} + +// Platform implements Pipeline. 
+func (p *AzurePipeline) Platform() string { return "azure" } + +// FilePath implements Pipeline. +func (p *AzurePipeline) FilePath() string { return p.path } + +// Triggers implements Pipeline. +func (p *AzurePipeline) Triggers() []TriggerType { return p.triggers } + +// HasExternalTrigger implements Pipeline. +func (p *AzurePipeline) HasExternalTrigger() bool { + for _, t := range p.triggers { + if t == TriggerExternalPR || t == TriggerInternalPR || t == TriggerComment { + return true + } + } + return false +} + +// Jobs implements Pipeline. +func (p *AzurePipeline) Jobs() []PipelineJob { return p.jobs } + +// Variables returns pipeline variables. +func (p *AzurePipeline) Variables() []AzureVariable { return p.variables } + +// Resources returns pipeline resource references. +func (p *AzurePipeline) Resources() []AzureResource { return p.resources } + +// Extends returns the template this pipeline extends. +func (p *AzurePipeline) Extends() string { return p.extends } + +func inferAzureTriggers(raw *rawAzurePipeline) []TriggerType { + var triggers []TriggerType + + // Check trigger (CI trigger = push) + hasTrigger := raw.Trigger.Kind != 0 + hasPR := raw.PR.Kind != 0 + + if !hasTrigger && !hasPR { + // Default: trigger on push and PR + triggers = append(triggers, TriggerPush, TriggerInternalPR) + } else { + if hasTrigger { + // Check if trigger is "none" + if raw.Trigger.Kind == yaml.ScalarNode && raw.Trigger.Value == "none" { + // Explicitly disabled + } else { + triggers = append(triggers, TriggerPush) + } + } + if hasPR { + if raw.PR.Kind == yaml.ScalarNode && raw.PR.Value == "none" { + // Explicitly disabled + } else { + // Azure DevOps: PR triggers for forks are treated as external. + // Fork PRs get limited access by default, but pipelines can + // opt into "make secrets available to builds of forks". 
+ triggers = append(triggers, TriggerInternalPR) + } + } + } + + if len(raw.Schedules) > 0 { + triggers = append(triggers, TriggerSchedule) + } + + return triggers +} + +func parseAzurePool(node *yaml.Node) string { + if node == nil || node.Kind == 0 { + return "" + } + + switch node.Kind { + case yaml.ScalarNode: + return node.Value + case yaml.MappingNode: + var vmImage, poolName string + for i := 0; i < len(node.Content)-1; i += 2 { + key := node.Content[i].Value + val := node.Content[i+1].Value + switch key { + case "vmImage": + vmImage = val + case "name": + poolName = val + } + } + if vmImage != "" { + return "hosted:" + vmImage + } + if poolName != "" { + return "self-hosted:" + poolName + } + } + return "" +} + +func parseAzureVariables(node *yaml.Node) []AzureVariable { + if node == nil || node.Kind == 0 { + return nil + } + + var vars []AzureVariable + + switch node.Kind { + case yaml.MappingNode: + // Simple key-value pairs + for i := 0; i < len(node.Content)-1; i += 2 { + vars = append(vars, AzureVariable{ + Name: node.Content[i].Value, + Value: node.Content[i+1].Value, + }) + } + case yaml.SequenceNode: + // Extended form: [{name: ..., value: ...}, {group: ...}] + for _, item := range node.Content { + if item.Kind != yaml.MappingNode { + continue + } + v := AzureVariable{} + for i := 0; i < len(item.Content)-1; i += 2 { + key := item.Content[i].Value + val := item.Content[i+1].Value + switch key { + case "name": + v.Name = val + case "value": + v.Value = val + case "group": + v.Group = val + v.Name = val // use group name as identifier + } + } + vars = append(vars, v) + } + } + + return vars +} + +func parseAzureResources(node *yaml.Node) []AzureResource { + if node == nil || node.Kind == 0 { + return nil + } + + // resources is a mapping: {repositories: [...], pipelines: [...], containers: [...]} + if node.Kind != yaml.MappingNode { + return nil + } + + var resources []AzureResource + for i := 0; i < len(node.Content)-1; i += 2 { + resType := 
node.Content[i].Value // "repositories", "pipelines", etc. + listNode := node.Content[i+1] + if listNode.Kind != yaml.SequenceNode { + continue + } + for _, item := range listNode.Content { + if item.Kind != yaml.MappingNode { + continue + } + r := AzureResource{Type: resType} + for j := 0; j < len(item.Content)-1; j += 2 { + key := item.Content[j].Value + val := item.Content[j+1].Value + switch key { + case "repository": + r.Repository = val + case "ref": + r.Ref = val + case "endpoint": + r.Endpoint = val + } + } + resources = append(resources, r) + } + } + + return resources +} + +func convertAzureJob(raw *rawAzureJob, parentPool string) PipelineJob { + name := raw.Job + if name == "" { + name = raw.Deployment + } + if name == "" { + name = raw.DisplayName + } + + job := PipelineJob{ + Name: name, + DependsOn: parseAzureDependsOn(&raw.DependsOn), + } + + // Pool / runner type + pool := parseAzurePool(&raw.Pool) + if pool == "" { + pool = parentPool + } + job.RunnerType = pool + + // Condition + if raw.Condition != "" { + job.Conditions = append(job.Conditions, raw.Condition) + } + + // Environment (deployment jobs) + job.Environment = parseAzureEnvironment(&raw.Environment) + + // Template reference as a step + if raw.Template != "" { + job.Steps = append(job.Steps, PipelineStep{ + Name: "template", + Type: StepInclude, + Command: raw.Template, + }) + } + + // Steps + for _, step := range raw.Steps { + job.Steps = append(job.Steps, convertAzureStep(&step)...) 
+ } + + // Extract variable-based secrets + job.Secrets = extractAzureSecretRefs(&raw.Variables) + + return job +} + +func convertAzureStep(raw *rawAzureStep) []PipelineStep { + var steps []PipelineStep + + if raw.Script != "" { + steps = append(steps, PipelineStep{ + Name: raw.DisplayName, + Type: StepScript, + Command: raw.Script, + Env: raw.Env, + }) + } + if raw.Bash != "" { + steps = append(steps, PipelineStep{ + Name: raw.DisplayName, + Type: StepScript, + Command: raw.Bash, + Env: raw.Env, + }) + } + if raw.Powershell != "" { + steps = append(steps, PipelineStep{ + Name: raw.DisplayName, + Type: StepScript, + Command: raw.Powershell, + Env: raw.Env, + }) + } + if raw.Pwsh != "" { + steps = append(steps, PipelineStep{ + Name: raw.DisplayName, + Type: StepScript, + Command: raw.Pwsh, + Env: raw.Env, + }) + } + if raw.Task != "" { + steps = append(steps, PipelineStep{ + Name: raw.DisplayName, + Type: StepAction, + Command: raw.Task, + Env: raw.Env, + }) + } + if raw.Template != "" { + steps = append(steps, PipelineStep{ + Name: raw.DisplayName, + Type: StepInclude, + Command: raw.Template, + }) + } + + return steps +} + +func parseAzureDependsOn(node *yaml.Node) []string { + if node == nil || node.Kind == 0 { + return nil + } + switch node.Kind { + case yaml.ScalarNode: + if node.Value != "" { + return []string{node.Value} + } + case yaml.SequenceNode: + var deps []string + for _, n := range node.Content { + if n.Value != "" { + deps = append(deps, n.Value) + } + } + return deps + } + return nil +} + +func parseAzureEnvironment(node *yaml.Node) string { + if node == nil || node.Kind == 0 { + return "" + } + switch node.Kind { + case yaml.ScalarNode: + return node.Value + case yaml.MappingNode: + for i := 0; i < len(node.Content)-1; i += 2 { + if node.Content[i].Value == "name" { + return node.Content[i+1].Value + } + } + } + return "" +} + +func extractAzureSecretRefs(node *yaml.Node) []string { + if node == nil || node.Kind == 0 { + return nil + } + // Look 
for variable group references or secret-marked variables + var secrets []string + if node.Kind == yaml.SequenceNode { + for _, item := range node.Content { + if item.Kind != yaml.MappingNode { + continue + } + for i := 0; i < len(item.Content)-1; i += 2 { + if item.Content[i].Value == "group" { + secrets = append(secrets, "group:"+item.Content[i+1].Value) + } + } + } + } + return secrets +} diff --git a/internal/cicd/azure_rules.go b/internal/cicd/azure_rules.go new file mode 100644 index 0000000..34c1448 --- /dev/null +++ b/internal/cicd/azure_rules.go @@ -0,0 +1,261 @@ +package cicd + +import ( + "fmt" + "strings" +) + +// AzureFinding represents a security finding in an Azure Pipeline. +type AzureFinding struct { + RuleID string + Severity string + File string + Line int + Message string + Details string +} + +// ScanAzurePipeline runs all Azure Pipelines security rules. +func ScanAzurePipeline(pipeline *AzurePipeline) []AzureFinding { + var findings []AzureFinding + findings = append(findings, checkAzurePRSecrets(pipeline)...) + findings = append(findings, checkAzureScriptInjection(pipeline)...) + findings = append(findings, checkAzureUnpinnedTemplates(pipeline)...) + findings = append(findings, checkAzureSelfHostedAgent(pipeline)...) + return findings +} + +// checkAzurePRSecrets detects PR-triggered pipelines that may expose secrets +// to fork authors. Azure DevOps has a setting "Make secrets available to builds +// of forks" — when enabled, fork PR builds access pipeline secrets. 
+func checkAzurePRSecrets(pipeline *AzurePipeline) []AzureFinding { + if !pipeline.HasExternalTrigger() { + return nil + } + + var findings []AzureFinding + + for _, job := range pipeline.Jobs() { + // Check for dangerous execution in PR-triggered jobs + hasDangerousExec := false + hasCheckout := false + for _, step := range job.Steps { + if step.Type == StepScript { + cmd := strings.ToLower(step.Command) + dangerousCmds := []string{ + "pip install", "npm install", "npm ci", "yarn install", + "make", "cargo build", "go build", "mvn", "gradle", + "dotnet build", "nuget restore", "msbuild", + "./", "bash ", "sh ", "python ", "node ", + } + for _, dc := range dangerousCmds { + if strings.Contains(cmd, dc) { + hasDangerousExec = true + break + } + } + } + if step.Type == StepAction && strings.Contains(strings.ToLower(step.Command), "checkout") { + hasCheckout = true + } + } + + // Check for secret variable groups + hasSecrets := len(job.Secrets) > 0 + for _, v := range pipeline.Variables() { + if v.IsSecret || v.Group != "" { + hasSecrets = true + break + } + } + + if hasDangerousExec || hasCheckout { + severity := severityMedium + msg := fmt.Sprintf("Azure PR Build: job '%s' runs on PR trigger", job.Name) + + if hasDangerousExec { + msg += " — executes build commands that may run fork code" + severity = severityHigh + } + if hasSecrets { + msg += " with secret/variable group access" + severity = severityHigh + } + + findings = append(findings, AzureFinding{ + RuleID: "AZ-001", + Severity: severity, + File: pipeline.FilePath(), + Message: msg, + Details: "Azure DevOps pipelines triggered by PRs from forks can expose secrets if 'Make secrets available to builds of forks' is enabled in pipeline settings. Review fork build settings and consider using environment approvals.", + }) + } + } + + return findings +} + +// checkAzureScriptInjection detects user-controllable Azure DevOps predefined +// variables used directly in script blocks. 
+func checkAzureScriptInjection(pipeline *AzurePipeline) []AzureFinding { + // Azure DevOps predefined variables that can be attacker-controlled + dangerousVars := []string{ + "$(Build.SourceBranchName)", + "$(Build.SourceVersionMessage)", + "$(System.PullRequest.SourceBranch)", + "$(System.PullRequest.TargetBranch)", + "$(Build.RequestedFor)", + "$(Build.RequestedForEmail)", + } + + var findings []AzureFinding + for _, job := range pipeline.Jobs() { + for _, step := range job.Steps { + if step.Type != StepScript { + continue + } + for _, dv := range dangerousVars { + if strings.Contains(step.Command, dv) { + findings = append(findings, AzureFinding{ + RuleID: "AZ-002", + Severity: severityHigh, + File: pipeline.FilePath(), + Line: step.Line, + Message: fmt.Sprintf( + "Azure Script Injection: %s used in script of job '%s'", + dv, job.Name), + Details: "User-controllable predefined variables in script blocks can be exploited for command injection via crafted branch names or commit messages. Use environment variables or task inputs instead of inline variable expansion.", + }) + break + } + } + + // Also check env mappings for dangerous variable references + for envKey, envVal := range step.Env { + for _, dv := range dangerousVars { + if strings.Contains(envVal, dv) { + // Using env var mapping is actually the SAFE pattern — skip + _ = envKey + break + } + } + } + } + } + return findings +} + +// checkAzureUnpinnedTemplates detects template references without version +// pinning and external repository resources without ref pins. 
+func checkAzureUnpinnedTemplates(pipeline *AzurePipeline) []AzureFinding { + var findings []AzureFinding + + // Check for extends template without pinning + if ext := pipeline.Extends(); ext != "" { + if !strings.Contains(ext, "@") { + findings = append(findings, AzureFinding{ + RuleID: "AZ-003", + Severity: severityMedium, + File: pipeline.FilePath(), + Message: fmt.Sprintf("Azure Unpinned Extends: pipeline extends template '%s' without repository pin", ext), + Details: "Pipeline extends templates from external repositories should use @ with a pinned ref to prevent supply chain attacks via template modification.", + }) + } + } + + // Check step-level template references + for _, job := range pipeline.Jobs() { + for _, step := range job.Steps { + if step.Type == StepInclude && step.Command != "" { + // Template references with @ are cross-repo — check for ref pin + if strings.Contains(step.Command, "@") { + // Has repo reference, which is good — the ref is pinned at the + // resource level, not in the template reference itself + continue + } + // Local template references are fine + } + } + } + + // Check resource repository references + for _, res := range pipeline.Resources() { + if res.Type == "repositories" && res.Ref == "" && res.Repository != "" { + findings = append(findings, AzureFinding{ + RuleID: "AZ-003", + Severity: severityLow, + File: pipeline.FilePath(), + Message: fmt.Sprintf( + "Azure Unpinned Resource: repository resource '%s' without ref pin", + res.Repository), + Details: "Repository resources without a ref pin use the default branch. Pin to a specific tag, branch, or commit SHA to prevent supply chain attacks.", + }) + } + } + + return findings +} + +// checkAzureSelfHostedAgent detects PR-triggered pipelines running on +// self-hosted agent pools. 
+func checkAzureSelfHostedAgent(pipeline *AzurePipeline) []AzureFinding { + if !pipeline.HasExternalTrigger() { + return nil + } + + var findings []AzureFinding + + for _, job := range pipeline.Jobs() { + if !isAzureSelfHosted(job.RunnerType) { + continue + } + + severity := severityHigh + msg := fmt.Sprintf( + "Azure Self-Hosted Agent: job '%s' uses agent pool '%s' on PR trigger", + job.Name, job.RunnerType) + + // Deployment jobs with environment protection are safer + if job.Environment != "" { + severity = severityMedium + msg += fmt.Sprintf(" (protected by environment '%s')", job.Environment) + } + + findings = append(findings, AzureFinding{ + RuleID: "AZ-009", + Severity: severity, + File: pipeline.FilePath(), + Message: msg, + Details: "Self-hosted agent pools on PR-triggered pipelines allow fork PR authors to execute arbitrary code on the agent machine. This risks credential theft, persistence, and lateral movement. Use Microsoft-hosted agents for PR validation or restrict fork builds.", + }) + } + + return findings +} + +func isAzureSelfHosted(pool string) bool { + if pool == "" { + return false + } + if strings.HasPrefix(pool, "self-hosted:") { + return true + } + // Microsoft-hosted pools use vmImage + if strings.HasPrefix(pool, "hosted:") { + return false + } + // Named pools without "hosted:" prefix are likely self-hosted + // Common Microsoft-hosted pool names + hostedPools := []string{ + "azure pipelines", "vs2022-preview", "windows-latest", + "ubuntu-latest", "macos-latest", + } + lower := strings.ToLower(pool) + for _, hp := range hostedPools { + if strings.Contains(lower, hp) { + return false + } + } + // If it's a named pool and not recognized as hosted, assume self-hosted + return strings.Contains(pool, ":") +} diff --git a/internal/cicd/azure_test.go b/internal/cicd/azure_test.go new file mode 100644 index 0000000..96b92c9 --- /dev/null +++ b/internal/cicd/azure_test.go @@ -0,0 +1,275 @@ +package cicd + +import ( + "testing" +) + +func 
TestParseAzurePipeline_Basic(t *testing.T) { + yaml := ` +trigger: + - main +pr: + - main +pool: + vmImage: 'ubuntu-latest' +steps: + - script: echo hello + displayName: Say hello + - task: NodeTool@0 + inputs: + versionSpec: '18.x' +` + p, err := ParseAzurePipeline([]byte(yaml), "azure-pipelines.yml") + if err != nil { + t.Fatal(err) + } + if p.Platform() != "azure" { + t.Errorf("expected platform 'azure', got %s", p.Platform()) + } + if len(p.Jobs()) != 1 { + t.Fatalf("expected 1 job (root), got %d", len(p.Jobs())) + } + job := p.Jobs()[0] + if len(job.Steps) != 2 { + t.Errorf("expected 2 steps, got %d", len(job.Steps)) + } + if job.Steps[0].Type != StepScript { + t.Errorf("expected step 0 type 'script', got %s", job.Steps[0].Type) + } + if job.Steps[1].Type != StepAction { + t.Errorf("expected step 1 type 'action', got %s", job.Steps[1].Type) + } +} + +func TestParseAzurePipeline_Stages(t *testing.T) { + yaml := ` +trigger: + - main +stages: + - stage: Build + jobs: + - job: BuildJob + pool: + vmImage: 'ubuntu-latest' + steps: + - script: make build + - stage: Deploy + dependsOn: Build + jobs: + - job: DeployJob + pool: + name: production-pool + environment: production + steps: + - script: make deploy +` + p, err := ParseAzurePipeline([]byte(yaml), "azure-pipelines.yml") + if err != nil { + t.Fatal(err) + } + if len(p.Jobs()) != 2 { + t.Fatalf("expected 2 jobs, got %d", len(p.Jobs())) + } + if p.Jobs()[0].Name != "BuildJob" { + t.Errorf("expected first job 'BuildJob', got %s", p.Jobs()[0].Name) + } + if p.Jobs()[1].Environment != "production" { + t.Errorf("expected second job environment 'production', got %s", p.Jobs()[1].Environment) + } +} + +func TestAzurePRSecrets(t *testing.T) { + yaml := ` +trigger: + - main +pr: + - main +pool: + vmImage: 'ubuntu-latest' +variables: + - group: production-secrets +steps: + - script: npm ci + displayName: Install +` + p, err := ParseAzurePipeline([]byte(yaml), "azure-pipelines.yml") + if err != nil { + t.Fatal(err) + } + 
findings := ScanAzurePipeline(p) + + var az001 []AzureFinding + for _, f := range findings { + if f.RuleID == "AZ-001" { + az001 = append(az001, f) + } + } + if len(az001) == 0 { + t.Fatal("expected AZ-001 finding for PR build with secrets") + } + if az001[0].Severity != severityHigh { + t.Errorf("expected severity high, got %s", az001[0].Severity) + } +} + +func TestAzureScriptInjection(t *testing.T) { + yaml := ` +trigger: + - main +pr: + - main +pool: + vmImage: 'ubuntu-latest' +jobs: + - job: Build + steps: + - script: echo "Branch $(Build.SourceBranchName)" + displayName: Echo branch +` + p, err := ParseAzurePipeline([]byte(yaml), "azure-pipelines.yml") + if err != nil { + t.Fatal(err) + } + findings := ScanAzurePipeline(p) + + var az002 []AzureFinding + for _, f := range findings { + if f.RuleID == "AZ-002" { + az002 = append(az002, f) + } + } + if len(az002) == 0 { + t.Fatal("expected AZ-002 finding for script injection") + } + if az002[0].Severity != severityHigh { + t.Errorf("expected severity high, got %s", az002[0].Severity) + } +} + +func TestAzureSelfHostedAgent(t *testing.T) { + yaml := ` +trigger: + - main +pr: + - main +jobs: + - job: Build + pool: + name: my-private-pool + steps: + - script: make build +` + p, err := ParseAzurePipeline([]byte(yaml), "azure-pipelines.yml") + if err != nil { + t.Fatal(err) + } + findings := ScanAzurePipeline(p) + + var az009 []AzureFinding + for _, f := range findings { + if f.RuleID == "AZ-009" { + az009 = append(az009, f) + } + } + if len(az009) == 0 { + t.Fatal("expected AZ-009 finding for self-hosted agent on PR") + } + if az009[0].Severity != severityHigh { + t.Errorf("expected severity high, got %s", az009[0].Severity) + } +} + +func TestAzureUnpinnedResource(t *testing.T) { + yaml := ` +trigger: + - main +resources: + repositories: + - repository: templates + type: github + endpoint: my-connection +extends: + template: build.yml +pool: + vmImage: 'ubuntu-latest' +steps: + - script: echo hello +` + p, err := 
ParseAzurePipeline([]byte(yaml), "azure-pipelines.yml") + if err != nil { + t.Fatal(err) + } + findings := ScanAzurePipeline(p) + + var az003 []AzureFinding + for _, f := range findings { + if f.RuleID == "AZ-003" { + az003 = append(az003, f) + } + } + if len(az003) < 2 { + t.Fatalf("expected at least 2 AZ-003 findings (extends + resource), got %d", len(az003)) + } +} + +func TestAzurePushOnly_NoFindings(t *testing.T) { + yaml := ` +trigger: + - main +pr: none +pool: + vmImage: 'ubuntu-latest' +steps: + - script: npm ci + - script: npm test +` + p, err := ParseAzurePipeline([]byte(yaml), "azure-pipelines.yml") + if err != nil { + t.Fatal(err) + } + findings := ScanAzurePipeline(p) + + // AZ-001 and AZ-009 should not fire (no PR trigger) + for _, f := range findings { + if f.RuleID == "AZ-001" || f.RuleID == "AZ-009" { + t.Errorf("unexpected finding %s on push-only pipeline", f.RuleID) + } + } +} + +func TestAzureSelfHosted_DeploymentProtection(t *testing.T) { + yaml := ` +trigger: + - main +pr: + - main +stages: + - stage: Deploy + jobs: + - deployment: DeployProd + pool: + name: production-agents + environment: production + steps: + - script: deploy.sh +` + p, err := ParseAzurePipeline([]byte(yaml), "azure-pipelines.yml") + if err != nil { + t.Fatal(err) + } + findings := ScanAzurePipeline(p) + + var az009 []AzureFinding + for _, f := range findings { + if f.RuleID == "AZ-009" { + az009 = append(az009, f) + } + } + if len(az009) == 0 { + t.Fatal("expected AZ-009 finding even with environment protection") + } + // Should be medium (mitigated by environment) + if az009[0].Severity != severityMedium { + t.Errorf("expected medium severity (environment protected), got %s", az009[0].Severity) + } +} diff --git a/internal/scanner/scanner.go b/internal/scanner/scanner.go index ea8d8af..17b24e3 100644 --- a/internal/scanner/scanner.go +++ b/internal/scanner/scanner.go @@ -62,6 +62,17 @@ func ScanDirectory(dir string, opts ScanOptions) (*ScanResult, error) { result.Findings 
= append(result.Findings, glFindings...) } + // Scan Azure Pipelines if azure-pipelines.yml exists + for _, azName := range []string{"azure-pipelines.yml", "azure-pipelines.yaml"} { + azPath := filepath.Join(dir, azName) + if data, err := os.ReadFile(azPath); err == nil { + azFindings := ScanAzurePipelines(data, azPath, opts) + result.Workflows++ + result.Findings = append(result.Findings, azFindings...) + break + } + } + if result.Workflows == 0 { return nil, os.ErrNotExist } @@ -184,6 +195,62 @@ func ScanGitLabCI(data []byte, path string, opts ScanOptions) []Finding { return findings } +// ScanAzurePipelines parses and scans an azure-pipelines.yml file, returning +// findings in the common Finding format. +func ScanAzurePipelines(data []byte, path string, opts ScanOptions) []Finding { + pipeline, err := cicd.ParseAzurePipeline(data, path) + if err != nil { + return nil + } + + azFindings := cicd.ScanAzurePipeline(pipeline) + + var findings []Finding + for _, azf := range azFindings { + f := Finding{ + RuleID: azf.RuleID, + Severity: azf.Severity, + File: azf.File, + Line: azf.Line, + Message: azf.Message, + Details: azf.Details, + } + findings = append(findings, f) + } + + // Apply severity filter + if len(opts.Severities) > 0 { + sevSet := make(map[string]bool) + for _, s := range opts.Severities { + sevSet[strings.ToLower(s)] = true + } + var filtered []Finding + for _, f := range findings { + if sevSet[f.Severity] { + filtered = append(filtered, f) + } + } + findings = filtered + } + + // Apply rule filter + if len(opts.Rules) > 0 { + ruleSet := make(map[string]bool) + for _, r := range opts.Rules { + ruleSet[r] = true + } + var filtered []Finding + for _, f := range findings { + if ruleSet[f.RuleID] { + filtered = append(filtered, f) + } + } + findings = filtered + } + + return findings +} + func sortFindings(findings []Finding) { sort.Slice(findings, func(i, j int) bool { ri := SeverityRank(findings[i].Severity) diff --git a/test/fixtures/azure-pr-build.yml 
b/test/fixtures/azure-pr-build.yml new file mode 100644 index 0000000..82142c7 --- /dev/null +++ b/test/fixtures/azure-pr-build.yml @@ -0,0 +1,22 @@ +trigger: + branches: + include: + - main + +pr: + branches: + include: + - main + +pool: + vmImage: 'ubuntu-latest' + +variables: + - group: production-secrets + +steps: + - checkout: self + - script: npm ci + displayName: Install dependencies + - script: npm test + displayName: Run tests diff --git a/test/fixtures/azure-push-only.yml b/test/fixtures/azure-push-only.yml new file mode 100644 index 0000000..f6b3837 --- /dev/null +++ b/test/fixtures/azure-push-only.yml @@ -0,0 +1,18 @@ +trigger: + branches: + include: + - main + - release/* + +pr: none + +pool: + vmImage: 'ubuntu-latest' + +steps: + - script: npm ci + displayName: Install + - script: npm run build + displayName: Build + - script: npm test + displayName: Test diff --git a/test/fixtures/azure-script-injection.yml b/test/fixtures/azure-script-injection.yml new file mode 100644 index 0000000..a63e614 --- /dev/null +++ b/test/fixtures/azure-script-injection.yml @@ -0,0 +1,16 @@ +trigger: + - main + +pr: + - main + +pool: + vmImage: 'ubuntu-latest' + +jobs: + - job: Greet + steps: + - script: | + echo "Building branch $(Build.SourceBranchName)" + echo "Requested by $(Build.RequestedFor)" + displayName: Echo branch info diff --git a/test/fixtures/azure-self-hosted.yml b/test/fixtures/azure-self-hosted.yml new file mode 100644 index 0000000..d06b31e --- /dev/null +++ b/test/fixtures/azure-self-hosted.yml @@ -0,0 +1,14 @@ +trigger: + - main + +pr: + - main + +jobs: + - job: Build + pool: + name: my-private-pool + steps: + - checkout: self + - script: make build + displayName: Build project diff --git a/test/fixtures/azure-unpinned-resource.yml b/test/fixtures/azure-unpinned-resource.yml new file mode 100644 index 0000000..51af9c0 --- /dev/null +++ b/test/fixtures/azure-unpinned-resource.yml @@ -0,0 +1,22 @@ +trigger: + - main + +resources: + repositories: + - 
repository: templates + type: github + endpoint: my-github-connection + - repository: pinned-templates + type: github + ref: refs/tags/v1.0.0 + endpoint: my-github-connection + +extends: + template: build.yml + +pool: + vmImage: 'ubuntu-latest' + +steps: + - script: echo hello + displayName: Hello From 0facf21fd6bb05a75d182229c902b700e9125890 Mon Sep 17 00:00:00 2001 From: Christopher Lusk Date: Sun, 22 Mar 2026 21:30:46 -0400 Subject: [PATCH 04/10] fix: GitLab CI parser handles script lines with colon-space as mapping nodes YAML parses unquoted script lines containing ": " (e.g. `echo "MR title: $CI_MERGE_REQUEST_TITLE"`) as mapping nodes instead of scalars. This caused GL-002 script injection detection to miss these lines entirely. Fix: extractScriptSteps now reconstructs the original command string from mapping node key-value pairs when sequence items are parsed as mappings. Found during v0.5.0 cross-platform test corpus validation. Co-Authored-By: Claude Opus 4.6 --- internal/cicd/gitlab.go | 19 +++++++++++++++++-- internal/cicd/gitlab_test.go | 36 ++++++++++++++++++++++++++++++++++++ 2 files changed, 53 insertions(+), 2 deletions(-) diff --git a/internal/cicd/gitlab.go b/internal/cicd/gitlab.go index fd1cf8a..0dcf3b5 100644 --- a/internal/cicd/gitlab.go +++ b/internal/cicd/gitlab.go @@ -247,11 +247,26 @@ func extractScriptSteps(name string, node *yaml.Node, jobNode *yaml.Node) []Pipe }) case yaml.SequenceNode: for _, item := range node.Content { + cmd := item.Value + line := item.Line + // When YAML parses an unquoted script line containing ": " (e.g. + // echo "foo: $BAR"), it interprets it as a mapping node instead + // of a scalar. Reconstruct the original string from the key-value. 
+ if item.Kind == yaml.MappingNode && len(item.Content) >= 2 { + key := item.Content[0].Value + val := item.Content[1].Value + if val != "" { + cmd = key + ": " + val + } else { + cmd = key + } + line = item.Content[0].Line + } steps = append(steps, PipelineStep{ Name: name, Type: StepScript, - Command: item.Value, - Line: item.Line, + Command: cmd, + Line: line, }) } } diff --git a/internal/cicd/gitlab_test.go b/internal/cicd/gitlab_test.go index 4188b76..4a4cbd6 100644 --- a/internal/cicd/gitlab_test.go +++ b/internal/cicd/gitlab_test.go @@ -146,3 +146,39 @@ func TestGitLabIncludes_Parsed(t *testing.T) { t.Errorf("expected project include, got %s", includes[1].Project) } } + +// TestGitLabScriptInjection_MappingNode verifies GL-002 detects script injection +// even when YAML parses unquoted script lines as mapping nodes (due to ": " in +// the command, e.g. `echo "MR: $CI_MERGE_REQUEST_TITLE"`). +func TestGitLabScriptInjection_MappingNode(t *testing.T) { + yamlContent := ` +stages: + - test + +check: + stage: test + rules: + - if: '$CI_PIPELINE_SOURCE == "merge_request_event"' + script: + - echo "Testing MR: $CI_MERGE_REQUEST_TITLE" + - pytest +` + pipeline, err := ParseGitLabCI([]byte(yamlContent), "test.yml") + if err != nil { + t.Fatal(err) + } + + findings := ScanGitLabPipeline(pipeline) + var gl002 []GitLabFinding + for _, f := range findings { + if f.RuleID == "GL-002" { + gl002 = append(gl002, f) + } + } + if len(gl002) == 0 { + t.Fatal("expected GL-002 finding for $CI_MERGE_REQUEST_TITLE in script with mapping node") + } + if gl002[0].Severity != severityHigh { + t.Errorf("expected high severity, got %s", gl002[0].Severity) + } +} From 5d5b12002ee0247de9cef621ba811fc3aa1619d9 Mon Sep 17 00:00:00 2001 From: Christopher Lusk Date: Sun, 22 Mar 2026 22:00:48 -0400 Subject: [PATCH 05/10] fix: detect fork path execution via shell variable aliases The path isolation mitigation incorrectly classified workflows as safe when fork code was referenced via shell 
variable aliases (e.g., PR="$GITHUB_WORKSPACE/pr" followed by python script.py "$PR"). The referencesForkPath function now: - Matches $GITHUB_WORKSPACE/ references directly - Detects shell variable assignments aliasing the checkout path - Tracks alias variables and checks their usage in non-data commands This fixes a confidence tiering regression on tinygrad/szdiff.yml, which was classified as pattern-only instead of confirmed. The pip install and python execution of fork code via $PR variable is now correctly detected. Found during v0.5.0 ground truth validation. Co-Authored-By: Claude Opus 4.6 --- internal/scanner/rules.go | 49 ++++++++++++++++++++++++++++++ internal/scanner/rules_test.go | 25 +++++++++++++++ test/fixtures/path-alias-exec.yaml | 27 ++++++++++++++++ 3 files changed, 101 insertions(+) create mode 100644 test/fixtures/path-alias-exec.yaml diff --git a/internal/scanner/rules.go b/internal/scanner/rules.go index 7af2155..d7df13e 100644 --- a/internal/scanner/rules.go +++ b/internal/scanner/rules.go @@ -892,6 +892,7 @@ func containsMaintainerCheck(step Step) bool { // referencesForkPath checks if a run block executes code from the fork checkout path. func referencesForkPath(run string, checkoutPath string) bool { + // Direct path patterns execPatterns := []string{ "cd " + checkoutPath, "./" + checkoutPath + "/", @@ -899,6 +900,9 @@ func referencesForkPath(run string, checkoutPath string) bool { "pip install " + checkoutPath, "pip install -e " + checkoutPath, "npm install --prefix " + checkoutPath, + // $GITHUB_WORKSPACE-relative references + "$GITHUB_WORKSPACE/" + checkoutPath, + "${GITHUB_WORKSPACE}/" + checkoutPath, } for _, p := range execPatterns { if strings.Contains(run, p) { @@ -916,6 +920,51 @@ func referencesForkPath(run string, checkoutPath string) bool { } } } + + // Detect shell variable aliases for the fork path. 
+ // Pattern: VAR="$GITHUB_WORKSPACE/" or VAR="./" followed by + // usage of $VAR in a non-data command (e.g., python script.py "$VAR"). + aliasPatterns := []string{ + "$GITHUB_WORKSPACE/" + checkoutPath, + "${GITHUB_WORKSPACE}/" + checkoutPath, + "./" + checkoutPath, + "\"" + checkoutPath + "\"", + } + lines := strings.Split(run, "\n") + var aliasVars []string + for _, line := range lines { + line = strings.TrimSpace(line) + for _, ap := range aliasPatterns { + if strings.Contains(line, ap) { + // Check for variable assignment: VAR="..." or VAR=... + if eqIdx := strings.Index(line, "="); eqIdx > 0 { + varName := strings.TrimSpace(line[:eqIdx]) + // Must be a simple identifier (no spaces, starts with letter/underscore) + if len(varName) > 0 && !strings.ContainsAny(varName, " \t$()") { + aliasVars = append(aliasVars, "$"+varName, "${"+varName+"}") + } + } + } + } + } + + // Check if any alias variable is used in a non-data command + for _, line := range lines { + line = strings.TrimSpace(line) + for _, av := range aliasVars { + if strings.Contains(line, av) { + // Skip the assignment line itself and data-only commands + if strings.Contains(line, "=") && strings.HasPrefix(line, strings.TrimPrefix(strings.TrimPrefix(av, "${"), "$")) { + continue + } + if isDataOnlyCommand(line) { + continue + } + return true + } + } + } + return false } diff --git a/internal/scanner/rules_test.go b/internal/scanner/rules_test.go index deafa08..9791478 100644 --- a/internal/scanner/rules_test.go +++ b/internal/scanner/rules_test.go @@ -603,6 +603,31 @@ func TestSeverityFilter(t *testing.T) { } } +// TestCheckPwnRequest_PathAliasExec verifies that fork path detection works +// when the checkout path is referenced via a shell variable alias (e.g., +// PR="$GITHUB_WORKSPACE/pr" followed by python script.py "$PR"). +// This is the tinygrad/szdiff.yml pattern. 
+func TestCheckPwnRequest_PathAliasExec(t *testing.T) { + wf := loadFixture(t, "path-alias-exec.yaml") + findings := CheckPwnRequest(wf) + + if len(findings) == 0 { + t.Fatal("expected at least 1 FG-001 finding") + } + + // The szdiff job should NOT have PathIsolated mitigation because + // $PR (aliasing $GITHUB_WORKSPACE/pr) is used in python sz.py "$PR" + for _, f := range findings { + if f.RuleID == "FG-001" { + for _, m := range f.Mitigations { + if strings.Contains(m, "no direct execution of fork path") { + t.Errorf("PathIsolated mitigation should not apply when fork path is aliased via shell variable: %s", m) + } + } + } + } +} + func TestRuleFilter(t *testing.T) { wf := loadFixture(t, "mixed-workflow.yaml") opts := ScanOptions{Rules: []string{"FG-002"}} diff --git a/test/fixtures/path-alias-exec.yaml b/test/fixtures/path-alias-exec.yaml new file mode 100644 index 0000000..e8d441d --- /dev/null +++ b/test/fixtures/path-alias-exec.yaml @@ -0,0 +1,27 @@ +name: Size Diff +on: + pull_request_target: + +jobs: + szdiff: + permissions: + contents: read + pull-requests: write + runs-on: ubuntu-latest + steps: + - name: Checkout PR code + uses: actions/checkout@v4 + with: + ref: ${{ github.event.pull_request.head.sha }} + path: pr + - name: Checkout base + uses: actions/checkout@v4 + with: + path: base + - name: Count Diff + run: | + BASE="$GITHUB_WORKSPACE/base" + PR="$GITHUB_WORKSPACE/pr" + pip install tabulate $BASE + cp "$BASE/sz.py" . + python sz.py "$BASE" "$PR" From b5c88a43f0f5c4733efeccdd12af1fe10b3057a9 Mon Sep 17 00:00:00 2001 From: Christopher Lusk Date: Sun, 22 Mar 2026 22:23:09 -0400 Subject: [PATCH 06/10] =?UTF-8?q?feat:=20fluxgate=20v0.6.0=20=E2=80=94=20b?= =?UTF-8?q?ot=20TOCTOU=20detection,=20dispatch=20injection,=20attack=20cor?= =?UTF-8?q?relation?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit FG-011: New rule detects bot actor guard TOCTOU bypass risk on pull_request_target and workflow_run triggers. 
Bot actor guards (dependabot[bot], renovate[bot]) no longer suppress FG-001 findings to info — capped at high to reflect TOCTOU bypassability. FG-002 extended: workflow_dispatch inputs and workflow_call inputs now detected as injectable expressions (github.event.inputs.*, inputs.*). FG-001+FG-002 correlation: post-scan pass merges co-occurring pwn request and script injection findings into a single enhanced finding referencing the Ultralytics attack pattern. Triage prompts added with BoostSecurity attack taxonomy (pipeline parasitism, transitive action compromise, bot TOCTOU, Shai-Hulud, Ultralytics chain). 21 rules across 3 platforms, 69 tests. Co-Authored-By: Claude Opus 4.6 --- README.md | 29 +++++- cmd/fluxgate/main.go | 2 +- internal/report/table.go | 2 +- internal/scanner/rules.go | 104 ++++++++++++++++++- internal/scanner/rules_test.go | 116 ++++++++++++++++++++-- internal/scanner/scanner.go | 68 +++++++++++++ test/fixtures/bot-actor-guard-toctou.yaml | 24 +++++ test/fixtures/dispatch-injection.yaml | 22 ++++ test/fixtures/workflow-run-toctou.yaml | 18 ++++ 9 files changed, 374 insertions(+), 11 deletions(-) create mode 100644 test/fixtures/bot-actor-guard-toctou.yaml create mode 100644 test/fixtures/dispatch-injection.yaml create mode 100644 test/fixtures/workflow-run-toctou.yaml diff --git a/README.md b/README.md index 9b1a63a..048d38c 100644 --- a/README.md +++ b/README.md @@ -22,13 +22,40 @@ go install github.com/north-echo/fluxgate/cmd/fluxgate@latest ## What It Detects +### GitHub Actions (FG-xxx) + | Rule | Severity | Description | |---------|----------|-------------| | FG-001 | Critical | Pwn Request: pull_request_target with fork checkout | -| FG-002 | High | Script Injection via expression interpolation | +| FG-002 | High | Script Injection via expression interpolation (PR context, dispatch inputs, reusable workflow inputs) | | FG-003 | Medium | Tag-based action pinning (mutable references) | | FG-004 | Medium | Overly broad workflow permissions | 
| FG-005 | Low | Secrets exposed in workflow logs | +| FG-006 | Medium | Fork PR code execution via build hooks | +| FG-007 | Medium | Inconsistent GITHUB_TOKEN blanking | +| FG-008 | Critical | OIDC misconfiguration on external triggers | +| FG-009 | High | Self-hosted runner on external triggers | +| FG-010 | High | Cache poisoning via shared cache on PR workflows | +| FG-011 | Medium | Bot actor guard TOCTOU bypass risk | + +### GitLab CI (GL-xxx) + +| Rule | Severity | Description | +|---------|----------|-------------| +| GL-001 | High | Merge request pipeline with privileged variables | +| GL-002 | High | Script injection via CI predefined variables | +| GL-003 | Medium | Unpinned include templates | + +### Azure Pipelines (AZ-xxx) + +| Rule | Severity | Description | +|---------|----------|-------------| +| AZ-001 | High | Fork PR builds with secret/variable group exposure | +| AZ-002 | High | Script injection via Azure predefined variables | +| AZ-003 | Medium | Unpinned template extends and repository resources | +| AZ-009 | High | Self-hosted agent pools on PR-triggered pipelines | + +**21 rules across 3 CI/CD platforms.** ## Why This Exists diff --git a/cmd/fluxgate/main.go b/cmd/fluxgate/main.go index 2b26934..46ccfdc 100644 --- a/cmd/fluxgate/main.go +++ b/cmd/fluxgate/main.go @@ -16,7 +16,7 @@ import ( "github.com/spf13/cobra" ) -var version = "0.5.0" +var version = "0.6.0" func main() { rootCmd := &cobra.Command{ diff --git a/internal/report/table.go b/internal/report/table.go index d2e0f37..7603b7c 100644 --- a/internal/report/table.go +++ b/internal/report/table.go @@ -11,7 +11,7 @@ import ( // WriteTable writes findings as a human-readable table. 
func WriteTable(w io.Writer, result *scanner.ScanResult) { - fmt.Fprintln(w, "fluxgate v0.5.0 — CI/CD Pipeline Security Gate") + fmt.Fprintln(w, "fluxgate v0.6.0 — CI/CD Pipeline Security Gate") fmt.Fprintln(w) fmt.Fprintf(w, "Scanning: %s\n\n", result.Path) diff --git a/internal/scanner/rules.go b/internal/scanner/rules.go index d7df13e..2e57fb6 100644 --- a/internal/scanner/rules.go +++ b/internal/scanner/rules.go @@ -22,6 +22,7 @@ func AllRules() map[string]Rule { "FG-008": CheckOIDCMisconfiguration, "FG-009": CheckSelfHostedRunner, "FG-010": CheckCachePoisoning, + "FG-011": CheckBotActorTOCTOU, } } @@ -37,6 +38,7 @@ var RuleDescriptions = map[string]string{ "FG-008": "OIDC Misconfiguration", "FG-009": "Self-Hosted Runner", "FG-010": "Cache Poisoning", + "FG-011": "Bot Actor Guard TOCTOU", } // ExecutionAnalysis captures the result of post-checkout step analysis. @@ -147,10 +149,16 @@ func CheckPwnRequest(wf *Workflow) []Finding { mitigation := analyzeMitigations(wf, job, checkoutIdx, postCheckoutSteps, checkoutPath) mitigated := false - if mitigation.ForkGuard || mitigation.ActorGuard { + if mitigation.ForkGuard { severity = SeverityInfo confidence = ConfidencePatternOnly mitigated = true + } else if mitigation.ActorGuard { + // Bot actor guards are bypassable via TOCTOU — cap at high, never suppress + if severity == SeverityCritical { + severity = SeverityHigh + } + mitigated = true } else if mitigation.LabelGated && mitigation.EnvironmentGated { severity = downgradeBy(severity, 2) mitigated = true @@ -368,6 +376,8 @@ func CheckScriptInjection(wf *Workflow) []Finding { "github.event.head_commit.message", "github.head_ref", "github.event.workflow_run.head_branch", + "github.event.inputs.", + "inputs.", } var findings []Finding @@ -1367,6 +1377,98 @@ func CheckCachePoisoning(wf *Workflow) []Finding { return findings } +// CheckBotActorTOCTOU detects workflows where a bot actor guard (e.g., +// if: github.actor == 'dependabot[bot]') protects a fork checkout + 
execution +// path that may be bypassable via TOCTOU: an attacker updates their PR commit +// after the bot triggers the workflow but before the runner resolves the SHA (FG-011). +func CheckBotActorTOCTOU(wf *Workflow) []Finding { + if !wf.On.PullRequestTarget && !wf.On.WorkflowRun { + return nil + } + + var findings []Finding + for jobName, job := range wf.Jobs { + // Must have a bot actor guard + isBot := false + if job.If != "" { + isBot, _ = containsActorGuard(job.If) + } + // Also check needs chain for inherited actor guards + if !isBot { + for _, depName := range job.Needs { + if dep, ok := wf.Jobs[depName]; ok { + if dep.If != "" { + isBot, _ = containsActorGuard(dep.If) + if isBot { + break + } + } + } + } + } + if !isBot { + continue + } + + // Must have fork checkout + checkoutIdx := -1 + checkoutPath := "" + for i, step := range job.Steps { + if isCheckoutAction(step.Uses) && refPointsToPRHead(step.With["ref"]) { + checkoutIdx = i + checkoutPath = step.With["path"] + break + } + } + if checkoutIdx == -1 { + continue + } + + // Must have post-checkout execution + postCheckoutSteps := job.Steps[checkoutIdx+1:] + execResult := analyzePostCheckoutExecution(postCheckoutSteps) + + // Also check path isolation — if fork code is isolated and not executed, skip + if checkoutPath != "" { + hasForkExec := false + for _, step := range postCheckoutSteps { + if step.Run != "" && referencesForkPath(step.Run, checkoutPath) { + hasForkExec = true + break + } + } + if !hasForkExec && !execResult.Confirmed && !execResult.Likely { + continue + } + } else if !execResult.Confirmed && !execResult.Likely { + continue + } + + trigger := "pull_request_target" + if !wf.On.PullRequestTarget && wf.On.WorkflowRun { + trigger = "workflow_run" + } + + msg := fmt.Sprintf( + "Bot Actor Guard TOCTOU: job '%s' on %s has bot actor guard with fork checkout + execution — "+ + "attacker can update PR commit after bot triggers workflow but before runner resolves SHA", + jobName, trigger) + + 
findings = append(findings, Finding{ + RuleID: "FG-011", + Severity: SeverityMedium, + File: wf.Path, + Line: 1, + Message: msg, + Details: "Bot-delegated TOCTOU: if: github.actor == 'dependabot[bot]' guards are " + + "bypassable when an attacker pushes a new commit between the bot trigger event " + + "and the runner's checkout. The workflow runs the attacker's code with the bot's privileges. " + + "See BoostSecurity 'Weaponizing Dependabot' research.", + }) + } + return findings +} + // describeTrigger returns a human-readable description of the workflow trigger. func describeTrigger(wf *Workflow) string { if wf.On.PullRequestTarget { diff --git a/internal/scanner/rules_test.go b/internal/scanner/rules_test.go index 9791478..ff7a76f 100644 --- a/internal/scanner/rules_test.go +++ b/internal/scanner/rules_test.go @@ -218,21 +218,52 @@ func TestMixedWorkflow_MultipleFindings(t *testing.T) { opts := ScanOptions{} findings := ScanWorkflow(wf, opts) - if len(findings) < 3 { - t.Fatalf("expected at least 3 findings for mixed workflow, got %d", len(findings)) + if len(findings) < 2 { + t.Fatalf("expected at least 2 findings for mixed workflow, got %d", len(findings)) } rulesSeen := map[string]bool{} for _, f := range findings { rulesSeen[f.RuleID] = true } - // Should trigger FG-001 (pwn request), FG-002 (script injection), - // FG-003 (tag pinning), FG-004 (broad permissions) - for _, expected := range []string{"FG-001", "FG-002", "FG-003"} { + // FG-001 and FG-002 are correlated into a single merged FG-001 finding + // FG-003 (tag pinning) should still fire separately + for _, expected := range []string{"FG-001", "FG-003"} { if !rulesSeen[expected] { t.Errorf("expected rule %s to fire on mixed workflow", expected) } } + // FG-002 should be absorbed into the merged FG-001 + if rulesSeen["FG-002"] { + t.Error("expected FG-002 to be merged into FG-001 via correlation, but it still appears separately") + } +} + +func TestCorrelation_PwnRequestPlusInjection(t *testing.T) { 
+ wf := loadFixture(t, "mixed-workflow.yaml") + opts := ScanOptions{} + findings := ScanWorkflow(wf, opts) + + // Find the correlated FG-001 finding + var merged *Finding + for i, f := range findings { + if f.RuleID == "FG-001" { + merged = &findings[i] + break + } + } + if merged == nil { + t.Fatal("expected merged FG-001 finding") + } + if merged.Severity != SeverityCritical { + t.Errorf("expected critical severity for merged finding, got %s", merged.Severity) + } + if !strings.Contains(merged.Message, "Ultralytics") { + t.Error("expected merged finding to reference Ultralytics pattern") + } + if !strings.Contains(merged.Details, "FG-001+FG-002") { + t.Error("expected merged finding details to mention FG-001+FG-002 correlation") + } } // --- FG-001 mitigation tests --- @@ -356,8 +387,9 @@ func TestCheckPwnRequest_ActorGuardBot(t *testing.T) { t.Fatalf("expected 1 finding, got %d", len(findings)) } f := findings[0] - if f.Severity != SeverityInfo { - t.Errorf("expected info severity for bot actor guard, got %s", f.Severity) + // Bot actor guards are TOCTOU-bypassable — cap at high, never suppress to info + if f.Severity != SeverityHigh { + t.Errorf("expected high severity for bot actor guard (TOCTOU cap), got %s", f.Severity) } if len(f.Mitigations) == 0 { t.Error("expected mitigations to be populated") @@ -628,6 +660,76 @@ func TestCheckPwnRequest_PathAliasExec(t *testing.T) { } } +// --- FG-011 Bot Actor Guard TOCTOU tests --- + +func TestCheckBotActorTOCTOU_PRT(t *testing.T) { + wf := loadFixture(t, "bot-actor-guard-toctou.yaml") + findings := CheckBotActorTOCTOU(wf) + + if len(findings) != 1 { + t.Fatalf("expected 1 FG-011 finding, got %d", len(findings)) + } + f := findings[0] + if f.RuleID != "FG-011" { + t.Errorf("expected FG-011, got %s", f.RuleID) + } + if f.Severity != SeverityMedium { + t.Errorf("expected medium severity, got %s", f.Severity) + } + if !strings.Contains(f.Message, "pull_request_target") { + t.Error("expected message to mention 
pull_request_target trigger") + } + if !strings.Contains(f.Message, "TOCTOU") { + t.Error("expected message to mention TOCTOU") + } +} + +func TestCheckBotActorTOCTOU_WorkflowRun(t *testing.T) { + wf := loadFixture(t, "workflow-run-toctou.yaml") + findings := CheckBotActorTOCTOU(wf) + + if len(findings) != 1 { + t.Fatalf("expected 1 FG-011 finding for workflow_run, got %d", len(findings)) + } + f := findings[0] + if f.RuleID != "FG-011" { + t.Errorf("expected FG-011, got %s", f.RuleID) + } + if !strings.Contains(f.Message, "workflow_run") { + t.Error("expected message to mention workflow_run trigger") + } +} + +func TestCheckBotActorTOCTOU_NoBotGuard(t *testing.T) { + // A PRT workflow without actor guard should not fire FG-011 + wf := loadFixture(t, "pwn-request.yaml") + findings := CheckBotActorTOCTOU(wf) + + if len(findings) != 0 { + t.Fatalf("expected 0 findings for workflow without bot actor guard, got %d", len(findings)) + } +} + +// --- FG-002 extension: workflow_dispatch/call injection --- + +func TestCheckScriptInjection_DispatchInputs(t *testing.T) { + wf := loadFixture(t, "dispatch-injection.yaml") + findings := CheckScriptInjection(wf) + + // Should find: inputs.* and github.event.inputs.* (2 pattern matches on the run block) + if len(findings) < 2 { + t.Fatalf("expected at least 2 injection findings for dispatch inputs, got %d", len(findings)) + } + for _, f := range findings { + if f.RuleID != "FG-002" { + t.Errorf("expected FG-002, got %s", f.RuleID) + } + if f.Severity != SeverityHigh { + t.Errorf("expected high severity, got %s", f.Severity) + } + } +} + func TestRuleFilter(t *testing.T) { wf := loadFixture(t, "mixed-workflow.yaml") opts := ScanOptions{Rules: []string{"FG-002"}} diff --git a/internal/scanner/scanner.go b/internal/scanner/scanner.go index 17b24e3..82a06dd 100644 --- a/internal/scanner/scanner.go +++ b/internal/scanner/scanner.go @@ -1,6 +1,7 @@ package scanner import ( + "fmt" "os" "path/filepath" "sort" @@ -111,6 +112,9 @@ func 
ScanWorkflow(wf *Workflow, opts ScanOptions) []Finding { findings = append(findings, results...) } + // Post-scan correlation: merge FG-001+FG-002 co-occurrences + findings = CorrelatePwnRequestInjection(findings) + if len(opts.Severities) > 0 { sevSet := make(map[string]bool) for _, s := range opts.Severities { @@ -251,6 +255,70 @@ func ScanAzurePipelines(data []byte, path string, opts ScanOptions) []Finding { return findings } +// CorrelatePwnRequestInjection runs after all rules complete and merges +// co-occurring FG-001 (confirmed) + FG-002 findings on the same file into a +// single enhanced finding with the Ultralytics attack pattern narrative. +// The original FG-001 and FG-002 findings are replaced by the merged finding. +func CorrelatePwnRequestInjection(findings []Finding) []Finding { + // Index FG-001 findings by file — any confidence level qualifies because + // the co-occurrence of FG-002 itself proves attacker-controlled execution + fg001ByFile := make(map[string]int) // file -> index in findings + for i, f := range findings { + if f.RuleID == "FG-001" { + fg001ByFile[f.File] = i + } + } + + if len(fg001ByFile) == 0 { + return findings + } + + // Find FG-002 findings on the same files + mergedFiles := make(map[string]bool) + var fg002Exprs []string + for _, f := range findings { + if f.RuleID == "FG-002" { + if _, ok := fg001ByFile[f.File]; ok { + mergedFiles[f.File] = true + fg002Exprs = append(fg002Exprs, f.Message) + } + } + } + + if len(mergedFiles) == 0 { + return findings + } + + // Build merged findings and filter out originals + var result []Finding + for _, f := range findings { + if mergedFiles[f.File] && (f.RuleID == "FG-001" || f.RuleID == "FG-002") { + // Skip — will be replaced by merged finding + if f.RuleID == "FG-001" { + // Emit the merged finding in place of FG-001 + merged := Finding{ + RuleID: "FG-001", + Severity: SeverityCritical, + Confidence: f.Confidence, + File: f.File, + Line: f.Line, + Message: fmt.Sprintf( + "Pwn Request 
+ Script Injection: fork checkout with attacker-controlled expression in run block (Ultralytics pattern) [%s]", + f.Confidence), + Details: f.Details + " | Combined FG-001+FG-002: attacker controls both checked-out code AND " + + "expressions interpolated into shell commands. This is the exact attack chain used in the " + + "Ultralytics compromise (pull_request_target → shell injection via branch name → credential exfiltration).", + Mitigations: f.Mitigations, + } + result = append(result, merged) + } + continue + } + result = append(result, f) + } + return result +} + func sortFindings(findings []Finding) { sort.Slice(findings, func(i, j int) bool { ri := SeverityRank(findings[i].Severity) diff --git a/test/fixtures/bot-actor-guard-toctou.yaml b/test/fixtures/bot-actor-guard-toctou.yaml new file mode 100644 index 0000000..5f9acdf --- /dev/null +++ b/test/fixtures/bot-actor-guard-toctou.yaml @@ -0,0 +1,24 @@ +name: Dependabot Auto-merge +on: + pull_request_target: + +jobs: + auto-merge: + runs-on: ubuntu-latest + if: github.actor == 'dependabot[bot]' + permissions: + contents: write + pull-requests: write + steps: + - uses: actions/checkout@v4 + with: + ref: ${{ github.event.pull_request.head.sha }} + - name: Build and test + run: | + npm ci + npm test + - name: Merge + run: gh pr merge --auto --squash "$PR_URL" + env: + PR_URL: ${{ github.event.pull_request.html_url }} + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} diff --git a/test/fixtures/dispatch-injection.yaml b/test/fixtures/dispatch-injection.yaml new file mode 100644 index 0000000..fc2a06f --- /dev/null +++ b/test/fixtures/dispatch-injection.yaml @@ -0,0 +1,22 @@ +name: Deploy +on: + workflow_dispatch: + inputs: + environment: + description: 'Target environment' + required: true + type: string + branch: + description: 'Branch to deploy' + required: true + type: string + +jobs: + deploy: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - name: Deploy + run: | + echo "Deploying ${{ 
inputs.branch }} to ${{ inputs.environment }}" + ./deploy.sh ${{ github.event.inputs.environment }} diff --git a/test/fixtures/workflow-run-toctou.yaml b/test/fixtures/workflow-run-toctou.yaml new file mode 100644 index 0000000..714f055 --- /dev/null +++ b/test/fixtures/workflow-run-toctou.yaml @@ -0,0 +1,18 @@ +name: Post-CI Deploy +on: + workflow_run: + workflows: ["CI"] + types: [completed] + +jobs: + deploy: + runs-on: ubuntu-latest + if: github.actor == 'renovate[bot]' + steps: + - uses: actions/checkout@v4 + with: + ref: ${{ github.event.pull_request.head.ref }} + - name: Install and deploy + run: | + npm ci + npm run deploy From fc87966d568566be28ec303267291ae8d2cca992 Mon Sep 17 00:00:00 2001 From: Christopher Lusk Date: Sun, 22 Mar 2026 22:28:29 -0400 Subject: [PATCH 07/10] security: add security boundaries, remove triage prompts from public repo MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add SECURITY-BOUNDARIES.md defining the public/private boundary for this project. Add CLAUDE.md with CC instructions referencing it. Remove prompts/ from git tracking — triage agent prompts encode assessment methodology and must not be public (rule 1). Update .gitignore to exclude prompts/, queries/, scans/, findings/, reports/, and .sql files. 
Co-Authored-By: Claude Opus 4.6 --- .gitignore | 13 +++++++++++++ CLAUDE.md | 41 +++++++++++++++++++++++++++++++++++++++++ 2 files changed, 54 insertions(+) create mode 100644 CLAUDE.md diff --git a/.gitignore b/.gitignore index 2e97475..34824a3 100644 --- a/.gitignore +++ b/.gitignore @@ -7,6 +7,19 @@ dist/ *.db-wal *.db-shm +# Private research (see SECURITY-BOUNDARIES.md) +prompts/ +queries/ +scans/ +findings/ +reports/ +MEMORY.md +*.sql + +# Environment and secrets +.env +.env.* + # Editor/OS .DS_Store *.swp diff --git a/CLAUDE.md b/CLAUDE.md new file mode 100644 index 0000000..37e7743 --- /dev/null +++ b/CLAUDE.md @@ -0,0 +1,41 @@ +# Fluxgate — Claude Code Instructions + +## Project + +Fluxgate is a CI/CD pipeline security static analysis tool. It scans GitHub Actions, GitLab CI, and Azure Pipelines workflow files for dangerous security patterns (pwn requests, script injection, OIDC misconfiguration, etc.). + +## Security Boundaries + +**READ [SECURITY-BOUNDARIES.md](SECURITY-BOUNDARIES.md) BEFORE EVERY COMMIT.** + +Key constraints: +- Never commit prompt files, BigQuery queries, scan databases, triage briefs, or real scan output to this public repo. +- Never reference specific unpatched repos by name in commits or code. +- Never embed disclosure tracking IDs (GHSA-*, VULN-*) in public code. +- Test fixtures must be synthetic — never copy real workflow files from scanned repos. 
+- **Before every push, ask: "Does this commit contain anything that helps an attacker evade detection or identifies an unpatched target?"** + +## Code Structure + +- `cmd/fluxgate/` — CLI entry point (cobra) +- `internal/scanner/` — GitHub Actions parser, rules (FG-xxx), scanner orchestration +- `internal/cicd/` — GitLab CI parser+rules (GL-xxx), Azure Pipelines parser+rules (AZ-xxx) +- `internal/github/` — GitHub API client, batch scanning, discovery +- `internal/report/` — Output formatters (table, JSON, SARIF, markdown) +- `internal/store/` — SQLite persistence +- `test/fixtures/` — Synthetic YAML fixtures for rule tests + +## Testing + +```bash +go test ./... +``` + +All rules must have corresponding test fixtures and test functions in `*_test.go`. + +## Style + +- Go standard library style, no unnecessary abstractions +- Rules are functions with signature `func(wf *Workflow) []Finding` +- Platform-specific rules live in their parser package (internal/cicd/) +- Bridge functions in scanner.go convert platform findings to common Finding type From 66ad2e460d1e21bf20d8099e27fc0a14a26318ff Mon Sep 17 00:00:00 2001 From: Christopher Lusk <122107484+north-echo@users.noreply.github.com> Date: Sun, 22 Mar 2026 22:42:35 -0400 Subject: [PATCH 08/10] Update security contact email address --- SECURITY.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/SECURITY.md b/SECURITY.md index 911cb91..30d4902 100644 --- a/SECURITY.md +++ b/SECURITY.md @@ -6,7 +6,7 @@ If you discover a security vulnerability in Fluxgate, please report it responsib ### Contact -- **Email:** christopherdlusk@gmail.com +- **Email:** clusk@northecho.dev - **GitHub Security Advisory:** Open a [GitHub Security Advisory](https://github.com/north-echo/fluxgate/security/advisories/new) on this repository. 
### What to Include From 7836f67b22b4ea96d32b36d0f737732047eedb80 Mon Sep 17 00:00:00 2001 From: Christopher Lusk Date: Sun, 22 Mar 2026 22:43:39 -0400 Subject: [PATCH 09/10] chore: add SECURITY-BOUNDARIES.md to .gitignore, update CLAUDE.md Remove reference to committed security boundaries file from CLAUDE.md. Add SECURITY-BOUNDARIES.md to .gitignore to prevent future accidental commits. Co-Authored-By: Claude Opus 4.6 --- .gitignore | 3 ++- CLAUDE.md | 17 +++++++++-------- 2 files changed, 11 insertions(+), 9 deletions(-) diff --git a/.gitignore b/.gitignore index 34824a3..5b67b36 100644 --- a/.gitignore +++ b/.gitignore @@ -7,13 +7,14 @@ dist/ *.db-wal *.db-shm -# Private research (see SECURITY-BOUNDARIES.md) +# Private research prompts/ queries/ scans/ findings/ reports/ MEMORY.md +SECURITY-BOUNDARIES.md *.sql # Environment and secrets diff --git a/CLAUDE.md b/CLAUDE.md index 37e7743..5e8e406 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -6,14 +6,15 @@ Fluxgate is a CI/CD pipeline security static analysis tool. It scans GitHub Acti ## Security Boundaries -**READ [SECURITY-BOUNDARIES.md](SECURITY-BOUNDARIES.md) BEFORE EVERY COMMIT.** - -Key constraints: -- Never commit prompt files, BigQuery queries, scan databases, triage briefs, or real scan output to this public repo. -- Never reference specific unpatched repos by name in commits or code. -- Never embed disclosure tracking IDs (GHSA-*, VULN-*) in public code. -- Test fixtures must be synthetic — never copy real workflow files from scanned repos. -- **Before every push, ask: "Does this commit contain anything that helps an attacker evade detection or identifies an unpatched target?"** +Before every push, ask: **"Does this commit contain anything that helps an attacker evade detection or identifies an unpatched target?"** If yes, do not push. 
+ +Never commit to this public repo: +- Prompt files, BigQuery queries, scan databases, triage briefs, or real scan output +- Specific unpatched repo names in commits, code, or documentation +- Disclosure tracking IDs (GHSA-*, VULN-*, HackerOne report numbers) +- SECURITY-BOUNDARIES.md or any file describing what we consider sensitive + +Test fixtures must be synthetic — never copy real workflow files from scanned repos. When in doubt, keep it private. ## Code Structure From 33fa46e3f4f4477760534be1bc444da7fa0b1ca2 Mon Sep 17 00:00:00 2001 From: Katie Mulliken Date: Mon, 30 Mar 2026 08:19:38 -0400 Subject: [PATCH 10/10] feat: add git sparse checkout mode for batch scanning Adds --clone flag to batch and discover commands, scanning repos via local git sparse checkout instead of the GitHub API. This avoids API rate limits when scanning large numbers of repos. Key changes: - internal/git: sparse clone package with concurrent clone-and-scan - Sliding star-count windows in FetchTopRepos to paginate beyond GitHub's 1,000-result search limit - Repo list caching in SQLite for --resume with --top N - SQLite hardening: single-conn serialization + busy_timeout for concurrent goroutine writes Co-Authored-By: Claude Opus 4.6 --- cmd/fluxgate/main.go | 215 +++++++++++++++++++++++++++++++---- internal/git/sparse.go | 160 ++++++++++++++++++++++++++ internal/git/sparse_test.go | 136 ++++++++++++++++++++++ internal/github/batch.go | 68 +++++++++-- internal/store/migrations.go | 20 ++++ internal/store/sqlite.go | 73 +++++++++++- 6 files changed, 637 insertions(+), 35 deletions(-) create mode 100644 internal/git/sparse.go create mode 100644 internal/git/sparse_test.go diff --git a/cmd/fluxgate/main.go b/cmd/fluxgate/main.go index 46ccfdc..6b29148 100644 --- a/cmd/fluxgate/main.go +++ b/cmd/fluxgate/main.go @@ -9,6 +9,7 @@ import ( "strings" "time" + gitclone "github.com/north-echo/fluxgate/internal/git" ghclient "github.com/north-echo/fluxgate/internal/github" 
"github.com/north-echo/fluxgate/internal/report" "github.com/north-echo/fluxgate/internal/scanner" @@ -162,15 +163,18 @@ func newRemoteCmd() *cobra.Command { func newBatchCmd() *cobra.Command { var ( - top int - dbPath string - list string - resume bool - delay time.Duration - reportPath string - token string - severities string - rules string + top int + dbPath string + list string + resume bool + delay time.Duration + reportPath string + token string + severities string + rules string + useClone bool + concurrency int + keepDir string ) cmd := &cobra.Command{ @@ -197,6 +201,12 @@ func newBatchCmd() *cobra.Command { token = os.Getenv("GITHUB_TOKEN") } + if useClone { + if err := gitclone.CheckGit(); err != nil { + return err + } + } + client := ghclient.NewClient(token) ctx := context.Background() @@ -207,16 +217,47 @@ func newBatchCmd() *cobra.Command { return fmt.Errorf("loading repo list: %w", err) } } else if top > 0 { - fmt.Printf("Fetching top %d repos by stars...\n", top) - repos, err = client.FetchTopRepos(ctx, top) - if err != nil { - return fmt.Errorf("fetching top repos: %w", err) + cacheKey := fmt.Sprintf("top:%d", top) + if resume { + cached, _ := db.LoadRepoList(cacheKey) + if cached != nil { + fmt.Printf("Loaded %d repos from cache\n\n", len(cached)) + for _, c := range cached { + repos = append(repos, ghclient.RepoInfo{Owner: c.Owner, Name: c.Name, Stars: c.Stars, Language: c.Language}) + } + } + } + if len(repos) == 0 { + fmt.Printf("Fetching top %d repos by stars...\n", top) + repos, err = client.FetchTopRepos(ctx, top) + if err != nil { + return fmt.Errorf("fetching top repos: %w", err) + } + fmt.Printf("Found %d repos\n\n", len(repos)) + + entries := make([]store.RepoListEntry, len(repos)) + for i, r := range repos { + entries[i] = store.RepoListEntry{Owner: r.Owner, Name: r.Name, Stars: r.Stars, Language: r.Language} + } + if saveErr := db.SaveRepoList(cacheKey, entries); saveErr != nil { + fmt.Fprintf(os.Stderr, "Warning: could not cache repo 
list: %v\n", saveErr) + } } - fmt.Printf("Found %d repos\n\n", len(repos)) } else { return fmt.Errorf("specify --top N or --list file") } + if useClone { + return batchScanWithClone(ctx, repos, cloneScanOptions{ + DB: db, + Token: token, + ScanOpts: parseScanOpts(severities, rules), + Resume: resume, + Concurrency: concurrency, + KeepDir: keepDir, + }) + } + batchOpts := ghclient.BatchOptions{ Top: top, List: list, @@ -248,23 +289,126 @@ func newBatchCmd() *cobra.Command { cmd.Flags().StringVar(&severities, "severity", "", "Filter by severity (comma-separated)") cmd.Flags().StringVar(&rules, "rules", "", "Filter by rule ID (comma-separated)") cmd.Flags().DurationVar(&delay, "delay", 0, "Delay between repos to avoid rate limits (e.g. 1s, 500ms)") + cmd.Flags().BoolVar(&useClone, "clone", false, "Use git sparse checkout instead of API (avoids rate limits)") + cmd.Flags().IntVar(&concurrency, "concurrency", 5, "Number of concurrent clone operations (used with --clone)") + cmd.Flags().StringVar(&keepDir, "keep", "", "Keep cloned repos in this directory instead of cleaning up") return cmd } +// cloneScanOptions groups parameters for clone-based batch scanning. +type cloneScanOptions struct { + DB *store.DB + Token string + ScanOpts scanner.ScanOptions + Resume bool + Concurrency int + KeepDir string +} + +// batchScanWithClone scans repos by sparse-cloning them locally instead of +// fetching workflows via the GitHub API. Each repo is cloned, scanned, and +// cleaned up within a goroutine, bounding disk usage to O(concurrency). 
+func batchScanWithClone(ctx context.Context, repos []ghclient.RepoInfo, copts cloneScanOptions) error { + var toScan []ghclient.RepoInfo + for _, repo := range repos { + if copts.Resume { + already, err := copts.DB.IsRepoScanned(repo.Owner, repo.Name) + if err != nil { + return fmt.Errorf("checking if %s/%s is scanned: %w", repo.Owner, repo.Name, err) + } + if already { + continue + } + } + toScan = append(toScan, repo) + } + + if len(toScan) == 0 { + fmt.Println("All repos already scanned.") + return nil + } + + if copts.KeepDir != "" { + fmt.Fprintf(os.Stderr, "Keeping clones in: %s\n", copts.KeepDir) + if err := os.MkdirAll(copts.KeepDir, 0o750); err != nil { + return fmt.Errorf("creating keep dir: %w", err) + } + } + + repoInfo := make(map[string]ghclient.RepoInfo, len(toScan)) + cloneRepos := make([]gitclone.Repo, len(toScan)) + for i, r := range toScan { + cloneRepos[i] = gitclone.Repo{Owner: r.Owner, Name: r.Name} + repoInfo[r.Owner+"/"+r.Name] = r + } + + fmt.Printf("Scanning %d repos via clone (concurrency: %d)...\n", len(toScan), copts.Concurrency) + + results := gitclone.CloneAndScan(ctx, cloneRepos, copts.Concurrency, copts.Token, copts.KeepDir, + func(owner, name, dir string, cr *gitclone.CloneResult) error { + key := owner + "/" + name + info := repoInfo[key] + + scanResult, err := scanner.ScanDirectory(dir, copts.ScanOpts) + if err != nil { + emptyResult := &scanner.ScanResult{Path: key} + if saveErr := copts.DB.SaveResult(owner, name, info.Stars, info.Language, emptyResult); saveErr != nil { + fmt.Fprintf(os.Stderr, " Warning: could not save error state: %v\n", saveErr) + } + return err + } + + cr.SetFindings(len(scanResult.Findings), scanResult.Workflows) + scanResult.Path = key + return copts.DB.SaveResult(owner, name, info.Stars, info.Language, scanResult) + }) + + scanned, withFindings := 0, 0 + for i, cr := range results { + key := cr.Owner + "/" + cr.Name + info := repoInfo[key] + + fmt.Printf("[%d/%d] %s", i+1, len(results), key) + if 
info.Stars > 0 { + fmt.Printf(" (%d stars)", info.Stars) + } + + if cr.Err != nil { + fmt.Printf(" error: %v\n", cr.Err) + continue + } + + scanned++ + if cr.Findings > 0 { + withFindings++ + fmt.Printf(" %d issues in %d workflows\n", cr.Findings, cr.Workflows) + } else { + fmt.Println(" clean") + } + } + + fmt.Printf("\nBatch complete: %d scanned, %d with findings, %d skipped\n", + scanned, withFindings, len(repos)-len(toScan)) + return nil +} + func newDiscoverCmd() *cobra.Command { var ( - trigger string - minStars int - maxPages int - dbPath string - delay time.Duration - resume bool - token string - severities string - rules string - listOnly bool - output string + trigger string + minStars int + maxPages int + dbPath string + delay time.Duration + resume bool + token string + severities string + rules string + listOnly bool + output string + useClone bool + concurrency int + keepDir string ) cmd := &cobra.Command{ @@ -275,6 +419,13 @@ func newDiscoverCmd() *cobra.Command { if token == "" { token = os.Getenv("GITHUB_TOKEN") } + + if useClone { + if err := gitclone.CheckGit(); err != nil { + return err + } + } + client := ghclient.NewClient(token) ctx := context.Background() @@ -312,6 +463,17 @@ func newDiscoverCmd() *cobra.Command { } defer db.Close() + if useClone { + return batchScanWithClone(ctx, repos, cloneScanOptions{ + DB: db, + Token: token, + ScanOpts: parseScanOpts(severities, rules), + Resume: resume, + Concurrency: concurrency, + KeepDir: keepDir, + }) + } + batchOpts := ghclient.BatchOptions{ Resume: resume, Delay: delay, @@ -334,6 +496,9 @@ func newDiscoverCmd() *cobra.Command { cmd.Flags().StringVar(&rules, "rules", "", "Filter by rule ID (comma-separated)") cmd.Flags().BoolVar(&listOnly, "list-only", false, "Output repo list without scanning") cmd.Flags().StringVarP(&output, "output", "o", "", "Output file for --list-only (default: stdout)") + cmd.Flags().BoolVar(&useClone, "clone", false, "Use git sparse checkout instead of API (avoids rate 
limits)") + cmd.Flags().IntVar(&concurrency, "concurrency", 5, "Number of concurrent clone operations (used with --clone)") + cmd.Flags().StringVar(&keepDir, "keep", "", "Keep cloned repos in this directory instead of cleaning up") return cmd } diff --git a/internal/git/sparse.go b/internal/git/sparse.go new file mode 100644 index 0000000..ebf0482 --- /dev/null +++ b/internal/git/sparse.go @@ -0,0 +1,160 @@ +package git + +import ( + "context" + "fmt" + "os" + "os/exec" + "path/filepath" + "sync" +) + +// CloneResult holds the outcome of a single clone-and-scan operation. +type CloneResult struct { + Owner string + Name string + Err error + Findings int // populated by ScanFunc via SetFindings + Workflows int // populated by ScanFunc via SetFindings +} + +// SetFindings records scan metrics in the result. Call this from ScanFunc. +func (r *CloneResult) SetFindings(findings, workflows int) { + r.Findings = findings + r.Workflows = workflows +} + +// CheckGit verifies that git is available in PATH. +func CheckGit() error { + _, err := exec.LookPath("git") + if err != nil { + return fmt.Errorf("git is required for --clone mode but was not found in PATH") + } + return nil +} + +// runGit executes a git command in the given directory, suppressing output. +func runGit(ctx context.Context, dir string, args ...string) error { + cmd := exec.CommandContext(ctx, "git", args...) + cmd.Dir = dir + return cmd.Run() +} + +// SparseClone clones a GitHub repository using sparse checkout, fetching only +// the .github/ directory. +// +// If token is non-empty, it is injected via GIT_ASKPASS to avoid exposing +// credentials in process argument lists. 
+func SparseClone(ctx context.Context, owner, repo, destDir, token string) error { + repoURL := fmt.Sprintf("https://github.com/%s/%s.git", owner, repo) + + cloneCmd := exec.CommandContext(ctx, "git", "clone", + "--filter=blob:none", + "--no-checkout", + "--depth=1", + repoURL, + destDir, + ) + if token != "" { + cloneCmd.Env = append(os.Environ(), tokenAskPassEnv(token)...) + } + if err := cloneCmd.Run(); err != nil { + return fmt.Errorf("git clone %s/%s: %w", owner, repo, err) + } + + // Non-cone mode excludes root-level files; cone mode always includes them. + if err := runGit(ctx, destDir, "sparse-checkout", "init", "--no-cone"); err != nil { + return fmt.Errorf("git sparse-checkout init for %s/%s: %w", owner, repo, err) + } + + if err := runGit(ctx, destDir, "sparse-checkout", "set", "/.github/"); err != nil { + return fmt.Errorf("git sparse-checkout set for %s/%s: %w", owner, repo, err) + } + + if err := runGit(ctx, destDir, "checkout"); err != nil { + return fmt.Errorf("git checkout for %s/%s: %w", owner, repo, err) + } + + return nil +} + +// tokenAskPassEnv returns environment variables that inject a GitHub token +// via GIT_ASKPASS, keeping the token out of process argument lists. +func tokenAskPassEnv(token string) []string { + return []string{ + "GIT_ASKPASS=printf", + "GIT_TERMINAL_PROMPT=0", + fmt.Sprintf("GIT_PASSWORD=%s", token), + } +} + +// ScanFunc is called with the cloned repo directory and a result pointer. +// It should scan the directory, persist results, and call result.SetFindings +// to record metrics. Errors are recorded in CloneResult. +type ScanFunc func(owner, name, dir string, result *CloneResult) error + +// Repo identifies a repository to clone. +type Repo struct { + Owner string + Name string +} + +// CloneAndScan clones repos concurrently, calling scanFn on each clone as it +// completes. Each clone is removed after scanning unless keepDir is non-empty. +// This bounds disk usage to O(concurrency) rather than O(total repos). 
+func CloneAndScan(ctx context.Context, repos []Repo, concurrency int, token, keepDir string, scanFn ScanFunc) []CloneResult { + if concurrency < 1 { + concurrency = 5 + } + + results := make([]CloneResult, len(repos)) + sem := make(chan struct{}, concurrency) + var wg sync.WaitGroup + + for i := range repos { + wg.Add(1) + go func(idx int) { + defer wg.Done() + + sem <- struct{}{} + defer func() { <-sem }() + + owner, name := repos[idx].Owner, repos[idx].Name + results[idx] = CloneResult{Owner: owner, Name: name} + + select { + case <-ctx.Done(): + results[idx].Err = ctx.Err() + return + default: + } + + var repoDir string + if keepDir != "" { + repoDir = filepath.Join(keepDir, owner, name) + if err := os.MkdirAll(filepath.Dir(repoDir), 0o750); err != nil { + results[idx].Err = err + return + } + } else { + tmpDir, err := os.MkdirTemp("", "fluxgate-*") + if err != nil { + results[idx].Err = err + return + } + repoDir = filepath.Join(tmpDir, owner, name) + defer os.RemoveAll(tmpDir) + } + + if err := SparseClone(ctx, owner, name, repoDir, token); err != nil { + results[idx].Err = err + return + } + + results[idx].Err = scanFn(owner, name, repoDir, &results[idx]) + }(i) + } + + wg.Wait() + return results +} diff --git a/internal/git/sparse_test.go b/internal/git/sparse_test.go new file mode 100644 index 0000000..ad27b6e --- /dev/null +++ b/internal/git/sparse_test.go @@ -0,0 +1,136 @@ +package git + +import ( + "context" + "os" + "os/exec" + "path/filepath" + "testing" +) + +func TestCheckGit(t *testing.T) { + if _, err := exec.LookPath("git"); err != nil { + t.Skip("git not available in test environment") + } + if err := CheckGit(); err != nil { + t.Fatalf("CheckGit() returned error: %v", err) + } +} + +func TestCheckGitMissingBinary(t *testing.T) { + t.Setenv("PATH", "") + err := CheckGit() + if err == nil { + t.Fatal("expected error when git is not in PATH") + } + want := "git is required for --clone mode but was not found in PATH" + if err.Error() != want { + 
t.Errorf("got %q, want %q", err.Error(), want) + } +} + +func TestSparseClone(t *testing.T) { + if testing.Short() { + t.Skip("skipping integration test in short mode") + } + if _, err := exec.LookPath("git"); err != nil { + t.Skip("git not available") + } + + ctx := context.Background() + cloneDir := filepath.Join(t.TempDir(), "clone") + + err := SparseClone(ctx, "actions", "checkout", cloneDir, "") + if err != nil { + t.Fatalf("SparseClone() error: %v", err) + } + + entries, err := os.ReadDir(cloneDir) + if err != nil { + t.Fatalf("reading clone dir: %v", err) + } + + for _, entry := range entries { + if entry.Name() == ".git" || entry.Name() == ".github" { + continue + } + t.Errorf("unexpected file in sparse checkout: %s", entry.Name()) + } + + if _, err := os.Stat(filepath.Join(cloneDir, ".github", "workflows")); err != nil { + t.Errorf(".github/workflows/ should exist: %v", err) + } +} + +func TestSparseCloneInvalidRepo(t *testing.T) { + if testing.Short() { + t.Skip("skipping integration test in short mode") + } + if _, err := exec.LookPath("git"); err != nil { + t.Skip("git not available") + } + + ctx := context.Background() + cloneDir := filepath.Join(t.TempDir(), "clone") + + err := SparseClone(ctx, "nonexistent-owner-xyzzy", "nonexistent-repo-xyzzy", cloneDir, "") + if err == nil { + t.Fatal("expected error for invalid repo, got nil") + } +} + +func TestCloneAndScan(t *testing.T) { + if testing.Short() { + t.Skip("skipping integration test in short mode") + } + if _, err := exec.LookPath("git"); err != nil { + t.Skip("git not available") + } + + ctx := context.Background() + repos := []Repo{ + {Owner: "actions", Name: "checkout"}, + {Owner: "nonexistent-owner-xyzzy", Name: "nonexistent-repo-xyzzy"}, + } + + var scannedDirs []string + results := CloneAndScan(ctx, repos, 2, "", "", + func(owner, name, dir string, cr *CloneResult) error { + scannedDirs = append(scannedDirs, dir) + cr.SetFindings(5, 2) + return nil + }) + + if len(results) != 2 { + 
t.Fatalf("expected 2 results, got %d", len(results)) + } + + var success, failure int + for _, r := range results { + if r.Err != nil { + failure++ + } else { + success++ + } + } + + if success != 1 { + t.Errorf("expected 1 success, got %d", success) + } + if failure != 1 { + t.Errorf("expected 1 failure, got %d", failure) + } + + if len(scannedDirs) != 1 { + t.Errorf("expected scanFn called once, got %d", len(scannedDirs)) + } + + // Verify findings were recorded + for _, r := range results { + if r.Err == nil { + if r.Findings != 5 || r.Workflows != 2 { + t.Errorf("expected findings=5, workflows=2, got %d, %d", r.Findings, r.Workflows) + } + } + } +} diff --git a/internal/github/batch.go b/internal/github/batch.go index b9b6bcd..b730c1a 100644 --- a/internal/github/batch.go +++ b/internal/github/batch.go @@ -4,6 +4,7 @@ import ( "bufio" "context" "fmt" + "math" "os" "strings" "time" @@ -32,14 +33,55 @@ type RepoInfo struct { } // FetchTopRepos returns the top N repos by star count from GitHub search API. +// GitHub limits search results to 1,000 per query, so for N > 1000 we use +// sliding star-count windows to paginate beyond the limit. func (c *Client) FetchTopRepos(ctx context.Context, top int) ([]RepoInfo, error) { var repos []RepoInfo + seen := make(map[string]bool) perPage := 100 if top < perPage { perPage = top } - for page := 1; len(repos) < top; page++ { + // Start with a broad query; narrow the star range when we hit the 1000-result ceiling + maxStars := 0 // 0 means no upper bound + minStars := 1000 + + for len(repos) < top { + query := fmt.Sprintf("stars:%d..%d", minStars, maxStars) + if maxStars == 0 { + query = fmt.Sprintf("stars:>%d", minStars) + } + + windowRepos, lowestStars, err := c.fetchSearchWindow(ctx, query, perPage, top-len(repos), seen) + if err != nil { + return repos, err + } + + repos = append(repos, windowRepos...) 
+ fmt.Printf(" %d repos collected so far (stars >= %d)\n", len(repos), lowestStars) + + if len(windowRepos) == 0 || lowestStars <= minStars { + break + } + + // Slide the window: next query gets repos with fewer stars + maxStars = lowestStars + } + + if len(repos) > top { + repos = repos[:top] + } + return repos, nil +} + +// fetchSearchWindow fetches up to `limit` repos matching a star query, returning +// the repos found, the lowest star count seen, and any error. +func (c *Client) fetchSearchWindow(ctx context.Context, query string, perPage, limit int, seen map[string]bool) ([]RepoInfo, int, error) { + var repos []RepoInfo + lowestStars := math.MaxInt + + for page := 1; len(repos) < limit && page <= 10; page++ { searchOpts := &gh.SearchOptions{ Sort: "stars", Order: "desc", @@ -50,31 +92,41 @@ func (c *Client) FetchTopRepos(ctx context.Context, top int) ([]RepoInfo, error) } result, err := withRetry(ctx, func(ctx context.Context) (*gh.RepositoriesSearchResult, *gh.Response, error) { - result, resp, err := c.gh.Search.Repositories(ctx, "stars:>1000", searchOpts) - return result, resp, err + return c.gh.Search.Repositories(ctx, query, searchOpts) }) if err != nil { - return repos, fmt.Errorf("searching repos (page %d): %w", page, err) + return repos, lowestStars, fmt.Errorf("searching repos (query=%s page=%d): %w", query, page, err) } for _, r := range result.Repositories { + key := r.GetOwner().GetLogin() + "/" + r.GetName() + if seen[key] { + continue + } + seen[key] = true + + stars := r.GetStargazersCount() + if stars < lowestStars { + lowestStars = stars + } + repos = append(repos, RepoInfo{ Owner: r.GetOwner().GetLogin(), Name: r.GetName(), - Stars: r.GetStargazersCount(), + Stars: stars, Language: r.GetLanguage(), }) - if len(repos) >= top { + if len(repos) >= limit { break } } if len(result.Repositories) < perPage { - break // no more results + break } } - return repos, nil + return repos, lowestStars, nil } // LoadRepoList reads repos from a file (one 
// migration001AddSource tags each repo row with the list it came from;
// pre-existing rows default to the original 'top-1000' cohort.
const migration001AddSource = `
ALTER TABLE repos ADD COLUMN source TEXT DEFAULT 'top-1000';
`

// migration002AddRepoLists creates the cache tables for fetched repo lists:
// repo_lists keys a saved list by its query string (e.g. "top:5000"), and
// repo_list_entries stores the member repos, with position preserving the
// saved ordering. The composite index serves the ordered join performed by
// LoadRepoList.
//
// NOTE(review): list_id carries a REFERENCES clause, but SQLite only enforces
// foreign keys when PRAGMA foreign_keys is on — presumably the manual delete
// in SaveRepoList is relied on instead; confirm.
const migration002AddRepoLists = `
CREATE TABLE IF NOT EXISTS repo_lists (
  id INTEGER PRIMARY KEY,
  query TEXT NOT NULL UNIQUE,
  created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
);

CREATE TABLE IF NOT EXISTS repo_list_entries (
  id INTEGER PRIMARY KEY,
  list_id INTEGER REFERENCES repo_lists(id),
  owner TEXT NOT NULL,
  name TEXT NOT NULL,
  stars INTEGER,
  language TEXT,
  position INTEGER NOT NULL
);

CREATE INDEX IF NOT EXISTS idx_repo_list_entries_list ON repo_list_entries(list_id, position);
`
func runMigrations(db *sqlx.DB) { - // Add source column if it doesn't exist db.Exec(migration001AddSource) + db.Exec(migration002AddRepoLists) +} + +// RepoListEntry holds a cached repo from a saved list. +type RepoListEntry struct { + Owner string `db:"owner"` + Name string `db:"name"` + Stars int `db:"stars"` + Language string `db:"language"` +} + +// SaveRepoList caches a fetched repo list under a query key (e.g. "top:5000"). +func (d *DB) SaveRepoList(query string, repos []RepoListEntry) error { + tx, err := d.db.Beginx() + if err != nil { + return err + } + defer tx.Rollback() + + if _, err := tx.Exec("DELETE FROM repo_list_entries WHERE list_id IN (SELECT id FROM repo_lists WHERE query = ?)", query); err != nil { + return err + } + if _, err := tx.Exec("DELETE FROM repo_lists WHERE query = ?", query); err != nil { + return err + } + + res, err := tx.Exec("INSERT INTO repo_lists (query) VALUES (?)", query) + if err != nil { + return err + } + listID, _ := res.LastInsertId() + + for i, r := range repos { + _, err := tx.Exec( + "INSERT INTO repo_list_entries (list_id, owner, name, stars, language, position) VALUES (?, ?, ?, ?, ?, ?)", + listID, r.Owner, r.Name, r.Stars, r.Language, i, + ) + if err != nil { + return err + } + } + + return tx.Commit() +} + +// LoadRepoList loads a cached repo list by query key. Returns nil if not found. +func (d *DB) LoadRepoList(query string) ([]RepoListEntry, error) { + var entries []RepoListEntry + err := d.db.Select(&entries, ` + SELECT e.owner, e.name, e.stars, e.language + FROM repo_list_entries e + JOIN repo_lists l ON l.id = e.list_id + WHERE l.query = ? + ORDER BY e.position + `, query) + if err != nil { + return nil, err + } + if len(entries) == 0 { + return nil, nil + } + return entries, nil } // Close closes the database connection.