diff --git a/commitchecker/.gitignore b/commitchecker/.gitignore index 5da6a5e..e3ff3eb 100644 --- a/commitchecker/.gitignore +++ b/commitchecker/.gitignore @@ -1 +1 @@ -commitchecker \ No newline at end of file +/commitchecker \ No newline at end of file diff --git a/commitchecker/README.md b/commitchecker/README.md index 0f5758a..13eff64 100644 --- a/commitchecker/README.md +++ b/commitchecker/README.md @@ -57,3 +57,35 @@ tests: ``` There is no code or `` patch needed in your repository! + +# Limitations + +## Stale PR branches and `--ancestry-path` + +The commitchecker uses `git log --ancestry-path` to validate only the commits on the +direct descent path from the upstream merge-base. This is necessary to correctly +handle the complex DAG created by OpenShift's downstream rebase process (see +[ancestry-path.md](ancestry-path.md) for details). + +When a PR branch has fallen behind `main`, the PR commits may not be direct +descendants of the start ref, so `--ancestry-path` excludes them. The commitchecker +detects this by also running without `--ancestry-path` to find all reachable commits. +If commits exist but none are on the direct ancestry path, the checker fails with: + +``` +ERROR: found N commits between X..Y but none are on the direct ancestry path from Z; the PR branch may need to be rebased +``` + +### How to read the CI logs + +The commitchecker runs two passes and logs the results of each: + +- `Validating all N commits between X..Y` — the total number of commits reachable + from end but not start, regardless of branch topology. +- `Validating N direct commits between X..Y` — the number of commits on the direct + ancestry path from the merge-base to end. Only these are enforced. + +### How to fix it + +Rebase or merge your PR branch onto the current `main` so that your commits become +direct descendants of `PULL_BASE_SHA`, then re-run CI. diff --git a/commitchecker/ancestry-path.md b/commitchecker/ancestry-path.md new file mode 100644 index 0000000..ca5451d --- /dev/null +++ b/commitchecker/ancestry-path.md @@ -0,0 +1,107 @@ +# `--ancestry-path` Git Semantics + +## What `git log A..B` does normally + +`git log A..B` shows all commits reachable from `B` that are **not** reachable from +`A`. It walks backwards from `B` through all parent pointers (including merge parents) +until it hits commits that are ancestors of `A`. + +In a simple linear history this is intuitive: + +``` +A --- C --- D --- B +``` + +`A..B` = {C, D, B} + +## What `--ancestry-path` adds + +`--ancestry-path` further restricts the output to commits that are on a **direct path +of descent** from `A` to `B`. A commit is included only if it is both: + +1. A **descendant** of `A` — `A` is reachable by walking backwards through its parents +2. An **ancestor** of `B` — the commit is reachable by walking backwards from `B` + +In other words: the commit must sit on a chain `A -> ... -> commit -> ... -> B`. + +## Why the commitchecker needs it + +OpenShift's downstream rebase process creates a complex DAG. During a rebase, the +downstream repo effectively resets to the upstream state and then re-applies carry +patches. This creates a history like: + +``` +upstream: U1 --- U2 --- U3 --- U4 + \ +downstream: ... --- M (merge) --- carry1 --- carry2 --- HEAD + | + merge-base +``` + +Without `--ancestry-path`, `git log merge-base..HEAD` walks all parent pointers from +`HEAD`. Through the merge commit `M`, it reaches back into upstream history and finds +`U1, U2, U3, U4` — commits that don't follow the `UPSTREAM:` convention because they +are actual upstream commits, not downstream carries. The checker would flag all of them +as invalid. + +With `--ancestry-path`, only commits on the direct descendant path from `merge-base` +to `HEAD` are shown: `carry1, carry2`. The upstream commits reached via side paths +through merges are excluded. This is the correct set to validate. + +## How `--ancestry-path` can silently drop commits + +The problem arises in CI when a PR branch has fallen behind `main`. + +Consider this scenario: + +``` +main: A --- B --- C --- D (D = current main tip = PULL_BASE_SHA) + \ +PR branch: E --- F --- HEAD (PR commits forked from A) +``` + +CI merges the PR onto main, producing: + +``` + A --- B --- C --- D + \ \ + E --- F --- G (merge of PR onto main) +``` + +Now, `git log --ancestry-path D..G`: + +For a commit to be on the ancestry path from `D` to `G`, it must be a **descendant of +`D`**. Let's check each commit: + +- **`G` (the merge)** — descendant of `D`? Yes (D is a parent of G). Ancestor of `G`? + Trivially yes. **Included** (but filtered by `--no-merges`). +- **`E`** — descendant of `D`? **No.** `E`'s parent is `A`, not `D`. Walking backwards + from `E` gives `E -> A`, which never reaches `D`. **Excluded.** +- **`F`** — descendant of `D`? **No.** `F`'s parent is `E`, whose parent is `A`. + **Excluded.** + +Result: `--ancestry-path` returns **zero non-merge commits**. If used alone, the +checker would validate nothing and exit successfully — a false positive. + +Without `--ancestry-path`, `git log D..G` correctly returns `{E, F}` because they are +reachable from `G` (via its second merge parent) but not from `D`. + +### The key insight + +`--ancestry-path` requires the commit to be a **descendant** of the start ref — meaning +the start must be reachable by walking **backwards** from that commit through its parent +chain. For `E`, walking backwards gives `E -> A`, never reaching `D`. The commit is +reachable from `G` but not descended from `D`. + +## The trade-off + +| Scenario | `--ancestry-path` | Without `--ancestry-path` | +|---|---|---| +| Rebase PRs (complex merges) | Correct: filters out upstream commits | Wrong: includes upstream commits | +| Stale PR branches | Wrong: drops PR commits | Correct: finds all PR commits | + +No single `git log` invocation handles both cases correctly. The commitchecker +addresses this by running both: `--ancestry-path` for enforcement (to handle rebases +correctly) and without `--ancestry-path` to find all reachable commits. If commits +exist but none are on the direct ancestry path, the checker fails with an error +telling the user to rebase. \ No newline at end of file diff --git a/commitchecker/commitchecker.go b/commitchecker/commitchecker.go index 52385c1..da96009 100644 --- a/commitchecker/commitchecker.go +++ b/commitchecker/commitchecker.go @@ -45,30 +45,49 @@ func main() { start = mergeBase } - commits, err := commitchecker.CommitsBetween(start, opts.End) + // Diagnostic: find all commits reachable from end but not start. This catches + // every PR commit regardless of branch topology, so we can detect when the + // stricter direct-ancestry check below silently drops commits on stale branches. + allCommits := listCommits(commitchecker.AllCommitsBetween(opts.Start, opts.End)) + _, _ = fmt.Fprintf(os.Stdout, "Validating all %d commits between %s..%s\n", len(allCommits), opts.Start, opts.End) + validateCommits(allCommits) + + // Enforced: find only commits on the direct descent path from the upstream + // merge-base to end. This excludes upstream commits reached via side branches + // in the DAG created by the downstream rebase process. + directCommits := listCommits(commitchecker.DirectCommitsBetween(start, opts.End)) + _, _ = fmt.Fprintf(os.Stdout, "Validating %d direct commits between %s..%s\n", len(directCommits), start, opts.End) + if len(allCommits) > 0 && len(directCommits) == 0 { + _, _ = fmt.Fprintf(os.Stderr, "ERROR: found %d commits between %s..%s but none are on the direct ancestry path from %s; the PR branch may need to be rebased\n", len(allCommits), opts.Start, opts.End, start) + os.Exit(2) + } + if errs := validateCommits(directCommits); len(errs) > 0 { + os.Exit(2) + } +} + +func listCommits(commits []commitchecker.Commit, err error) []commitchecker.Commit { if err != nil { if err == commitchecker.ErrNotCommit { _, _ = fmt.Fprintf(os.Stderr, "WARNING: one of the provided commits does not exist, not a true branch\n") os.Exit(0) } - _, _ = fmt.Fprintf(os.Stderr, "ERROR: couldn't find commits from %s..%s: %v\n", opts.Start, opts.End, err) + _, _ = fmt.Fprintf(os.Stderr, "ERROR: couldn't list commits: %v\n", err) os.Exit(1) } + return commits +} - _, _ = fmt.Fprintf(os.Stdout, "Validating %d commits between %s...%s\n", len(commits), start, opts.End) +func validateCommits(commits []commitchecker.Commit) []string { var errs []string for _, commit := range commits { _, _ = fmt.Fprintf(os.Stdout, "Validating commit %+v\n", commit) for _, validate := range commitchecker.AllCommitValidators { - errs = append(errs, validate(commit)...) - } - } - - if len(errs) > 0 { - for _, e := range errs { - _, _ = fmt.Fprintf(os.Stderr, "%s\n\n", e) + for _, e := range validate(commit) { + _, _ = fmt.Fprintf(os.Stderr, "%s\n\n", e) + errs = append(errs, e) + } } - - os.Exit(2) } + return errs } diff --git a/commitchecker/pkg/commitchecker/commits_test.go b/commitchecker/pkg/commitchecker/commits_test.go new file mode 100644 index 0000000..889bf85 --- /dev/null +++ b/commitchecker/pkg/commitchecker/commits_test.go @@ -0,0 +1,143 @@ +package commitchecker_test + +import ( + "os" + "os/exec" + "testing" + + "github.com/openshift/build-machinery-go/commitchecker/pkg/commitchecker" +) + +// initRepo creates a bare-bones git repo in a temp dir, changes into it, and +// returns a cleanup function that restores the original working directory. +func initRepo(t *testing.T) func() { + t.Helper() + origDir, err := os.Getwd() + if err != nil { + t.Fatal(err) + } + dir := t.TempDir() + if err := os.Chdir(dir); err != nil { + t.Fatal(err) + } + git(t, "init") + git(t, "config", "user.email", "test@test.com") + git(t, "config", "user.name", "Test") + return func() { + if err := os.Chdir(origDir); err != nil { + t.Fatal(err) + } + } +} + +// git runs a git command and fails the test on error. +func git(t *testing.T, args ...string) string { + t.Helper() + cmd := exec.Command("git", args...) + out, err := cmd.CombinedOutput() + if err != nil { + t.Fatalf("git %v failed: %s\n%s", args, err, out) + } + return string(out) +} + +// commit creates an empty commit with the given message and returns its SHA. +func commit(t *testing.T, msg string) string { + t.Helper() + git(t, "commit", "--allow-empty", "-m", msg) + out := git(t, "rev-parse", "HEAD") + // trim trailing newline + return out[:len(out)-1] +} + +// TestStaleBranch verifies that DirectCommitsBetween returns no commits when +// the PR branch has fallen behind main, while AllCommitsBetween finds them. +// +// DAG: +// +// main: A --- B --- C --- D +// \ \ +// E --- F --- G (merge) +func TestStaleBranch(t *testing.T) { + cleanup := initRepo(t) + defer cleanup() + + // A: initial commit on main + commit(t, "UPSTREAM: : A initial") + git(t, "checkout", "-b", "main") + + // Branch off at A for the PR + git(t, "checkout", "-b", "pr-branch") + commit(t, "UPSTREAM: : E pr commit 1") + commit(t, "UPSTREAM: : F pr commit 2") + + // Advance main past A + git(t, "checkout", "main") + commit(t, "UPSTREAM: : B advance") + commit(t, "UPSTREAM: : C advance") + startSHA := commit(t, "UPSTREAM: : D advance") + + // Merge PR onto main (simulating CI merge). + // The merge commit itself is filtered by --no-merges, so --ancestry-path + // returns zero non-merge commits when E and F are not descendants of D. + git(t, "merge", "pr-branch", "--no-ff", "-m", "Merge PR") + mergeSHA := git(t, "rev-parse", "HEAD") + mergeSHA = mergeSHA[:len(mergeSHA)-1] + + // AllCommitsBetween should find the PR commits (E, F) + allCommits, err := commitchecker.AllCommitsBetween(startSHA, mergeSHA) + if err != nil { + t.Fatalf("AllCommitsBetween: %v", err) + } + if len(allCommits) == 0 { + t.Error("AllCommitsBetween returned 0 commits; expected PR commits to be found") + } + + // DirectCommitsBetween should find no non-merge commits because E and F + // are not descendants of D (the stale branch problem). + directCommits, err := commitchecker.DirectCommitsBetween(startSHA, mergeSHA) + if err != nil { + t.Fatalf("DirectCommitsBetween: %v", err) + } + + // The key assertion: all commits found, but direct commits missed them. + if len(allCommits) > 0 && len(directCommits) == 0 { + t.Logf("Correctly detected stale branch: %d all commits, %d direct commits", len(allCommits), len(directCommits)) + } else if len(directCommits) > 0 { + t.Errorf("Expected DirectCommitsBetween to return 0 commits on a stale branch, got %d", len(directCommits)) + } +} + +// TestUpToDateBranch verifies that both AllCommitsBetween and +// DirectCommitsBetween find the same commits when the branch is up to date. +// +// DAG: +// +// main: A --- B(start) --- C --- D(end) +func TestUpToDateBranch(t *testing.T) { + cleanup := initRepo(t) + defer cleanup() + + commit(t, "UPSTREAM: : A initial") + git(t, "checkout", "-b", "main") + startSHA := commit(t, "UPSTREAM: : B start") + commit(t, "UPSTREAM: : C middle") + commit(t, "UPSTREAM: : D end") + + allCommits, err := commitchecker.AllCommitsBetween(startSHA, "HEAD") + if err != nil { + t.Fatalf("AllCommitsBetween: %v", err) + } + + directCommits, err := commitchecker.DirectCommitsBetween(startSHA, "HEAD") + if err != nil { + t.Fatalf("DirectCommitsBetween: %v", err) + } + + if len(allCommits) != len(directCommits) { + t.Errorf("Expected same commit count, got all=%d direct=%d", len(allCommits), len(directCommits)) + } + if len(allCommits) != 2 { + t.Errorf("Expected 2 commits (C, D), got %d", len(allCommits)) + } +} diff --git a/commitchecker/pkg/commitchecker/git.go b/commitchecker/pkg/commitchecker/git.go index 8beb1fe..8ddb76d 100644 --- a/commitchecker/pkg/commitchecker/git.go +++ b/commitchecker/pkg/commitchecker/git.go @@ -37,14 +37,28 @@ func IsCommit(a string) bool { var ErrNotCommit = fmt.Errorf("one or both of the provided commits was not a valid commit") -func CommitsBetween(a, b string) ([]Commit, error) { +// DirectCommitsBetween returns commits on the direct ancestry path from a to b, +// using --ancestry-path to exclude commits reachable via side branches. +func DirectCommitsBetween(a, b string) ([]Commit, error) { + return gitLog(a, b, "--no-merges", "--ancestry-path") +} + +// AllCommitsBetween returns all non-merge commits reachable from b but not from a. +func AllCommitsBetween(a, b string) ([]Commit, error) { + return gitLog(a, b, "--no-merges") +} + +func gitLog(a, b string, extraArgs ...string) ([]Commit, error) { var commits []Commit - stdout, stderr, err := run("git", "log", "--no-merges", "--oneline", "--ancestry-path", fmt.Sprintf("%s..%s", a, b)) + args := []string{"git", "log", "--oneline"} + args = append(args, extraArgs...) + args = append(args, fmt.Sprintf("%s..%s", a, b)) + stdout, stderr, err := run(args...) if err != nil { if !IsCommit(a) || !IsCommit(b) { return nil, ErrNotCommit } - return nil, fmt.Errorf("error executing git log: %s: %s", stderr, err) + return nil, fmt.Errorf("error executing %s: %s: %s", strings.Join(args, " "), stderr, err) } for _, log := range strings.Split(stdout, "\n") { if len(log) == 0 { @@ -68,9 +82,6 @@ func NewCommitFromOnelineLog(log string) (Commit, error) { } commit.Sha = parts[0] commit.Summary = strings.Join(parts[1:], " ") - if err != nil { - return commit, err - } commit.Email, err = emailInCommit(commit.Sha) if err != nil { return commit, err