Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 13 additions & 4 deletions tx.go
Original file line number Diff line number Diff line change
Expand Up @@ -645,20 +645,29 @@ func (tx *Tx) page(id common.Pgid) *common.Page {
// forEachPage starts a page-tree walk at pgidnum by seeding a pgid stack with
// that single entry and handing it to forEachPageInternal together with fn.
// fn receives the page, an int (len(stack)-1 in forEachPageInternal), and the
// pgid stack itself.
//
// The stack is created with 10 slots but only its first element is passed on
// (stack[:1]); presumably the spare capacity lets append in the recursive
// walk reuse the same backing array for shallow trees — TODO confirm intent.
//
// NOTE(review): this span is a rendered diff. The two-argument call is the
// removed line; the three-argument call, which passes a fresh empty set of
// visited pgids, is its replacement.
func (tx *Tx) forEachPage(pgidnum common.Pgid, fn func(*common.Page, int, []common.Pgid)) {
stack := make([]common.Pgid, 10)
stack[0] = pgidnum
tx.forEachPageInternal(stack[:1], fn)
tx.forEachPageInternal(stack[:1], map[common.Pgid]struct{}{}, fn)
}

func (tx *Tx) forEachPageInternal(pgidstack []common.Pgid, fn func(*common.Page, int, []common.Pgid)) {
p := tx.page(pgidstack[len(pgidstack)-1])
func (tx *Tx) forEachPageInternal(pgidstack []common.Pgid, visited map[common.Pgid]struct{}, fn func(*common.Page, int, []common.Pgid)) {
pgid := pgidstack[len(pgidstack)-1]
p := tx.page(pgid)

// Execute function.
fn(p, len(pgidstack)-1, pgidstack)

// Stop descending on a revisit so a corrupted db with a page cycle
// cannot drive this recursion to stack overflow. fn still runs above,
// so verifyPageReachable's "multiple references" diagnostic fires.
if _, ok := visited[pgid]; ok {
return
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The question is what we're going to do here: reading from a corrupted db should definitely be surfaced.

I (personally) would rather side with a panic than change the function interface to return an error.

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Good push — I audited the forEachPage callers and you're right that the soft-return is a blind spot for one of them:

  • tx_check.go:148 (Check): the fn is verifyPageReachable, which already emits "page N: multiple references" on the error channel for duplicate pgids (tx_check.go:163-165), and tx.check wraps its body in a deferred recover() that converts panics into panicked{} errors on the same channel (tx_check.go:39-43). Surfacing is fine here under either strategy.
  • bucket.go:621 (Bucket.Stats): the fn just accumulates counters and recursively re-enters sub-buckets via subStats.Add(b.openBucket(...).Stats()). On a revisit it silently inflates KeyN/LeafPageN/BranchPageN and double-enters sub-buckets. No recover on this path. With the current soft-return, bbolt stats on a cycle-corrupted db prints wrong numbers with no indication — exactly the silent behavior you're flagging.

A few ways forward, in increasing cost:

  1. Panic on cycle (your suggestion). One line in forEachPageInternal. Check stays clean via the existing recover; Stats / bbolt stats crash loudly with the offending pgid. Simplest, matches your preference.
  2. Panic + db.Logger().Errorf first. Same as (1), plus a log entry via the existing Logger interface so the cycle is recorded even if some caller recovers the panic. Marginal, but ~free.
  3. Add Corrupted bool to BucketStats (additive, non-breaking). forEachPage would route the cycle through a sink that Stats reads, so the CLI could print a prominent warning alongside the (inflated) numbers instead of crashing. Strictly the most graceful UX, but it's more scope than the bug this PR is fixing — probably wants its own issue/PR.

My default would be (1), possibly (2). Happy to go straight to (3) in a follow-up if you'd rather keep this PR tight. Which direction do you want?

}
visited[pgid] = struct{}{}

// Recursively loop over children.
if p.IsBranchPage() {
for i := 0; i < int(p.Count()); i++ {
elem := p.BranchPageElement(uint16(i))
tx.forEachPageInternal(append(pgidstack, elem.Pgid()), fn)
tx.forEachPageInternal(append(pgidstack, elem.Pgid()), visited, fn)
}
}
}
Expand Down
80 changes: 80 additions & 0 deletions tx_whitebox_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
package bbolt

import (
"fmt"
"path/filepath"
"testing"

"github.com/stretchr/testify/require"

"go.etcd.io/bbolt/internal/common"
"go.etcd.io/bbolt/internal/guts_cli"
"go.etcd.io/bbolt/internal/surgeon"
)

// TestTx_forEachPage_CycleTerminates corrupts a db so that a branch page's
// child list loops back on itself, then verifies that tx.forEachPage
// terminates instead of recursing until the goroutine stack overflows.
// See issue #701 for the real-world corruption pattern.
func TestTx_forEachPage_CycleTerminates(t *testing.T) {
path := filepath.Join(t.TempDir(), "db")
db, err := Open(path, 0600, nil)
require.NoError(t, err)
require.NoError(t, db.Update(func(tx *Tx) error {
b, err := tx.CreateBucketIfNotExists([]byte("data"))
if err != nil {
return err
}
for i := 0; i < 500; i++ {
if err := b.Put([]byte(fmt.Sprintf("%04d", i)), make([]byte, 100)); err != nil {
return err
}
}
return nil
}))
require.NoError(t, db.Close())

// Pick a branch ancestor and one of its leaf descendants; overwriting the
// leaf with a copy of the branch leaves the leaf as a branch whose child
// list still references the leaf's own pgid, forming a cycle.
xray := surgeon.NewXRay(path)
paths, err := xray.FindPathsToKey([]byte("0001"))
require.NoError(t, err)
require.NotEmpty(t, paths)
p0 := paths[0]
require.GreaterOrEqual(t, len(p0), 2, "need at least one branch above the leaf")
ancestor := p0[len(p0)-2]
leaf := p0[len(p0)-1]
require.NoError(t, surgeon.CopyPage(path, ancestor, leaf))

ancestorPage, _, err := guts_cli.ReadPage(path, uint64(ancestor))
require.NoError(t, err)
require.True(t, ancestorPage.IsBranchPage())
var hasLeafAsChild bool
for i := uint16(0); i < ancestorPage.Count(); i++ {
if ancestorPage.BranchPageElement(i).Pgid() == common.Pgid(leaf) {
hasLeafAsChild = true
break
}
}
require.True(t, hasLeafAsChild, "expected ancestor to reference the leaf directly")

db, err = Open(path, 0600, nil)
require.NoError(t, err)
defer func() { _ = db.Close() }()

require.NoError(t, db.View(func(tx *Tx) error {
rootPage := tx.Bucket([]byte("data")).RootPage()
var count int
leafVisits := 0
tx.forEachPage(rootPage, func(p *common.Page, _ int, _ []common.Pgid) {
count++
if p.Id() == common.Pgid(leaf) {
leafVisits++
}
})
require.Less(t, count, 10_000, "forEachPage walked %d pages; cycle detection likely missing", count)
require.GreaterOrEqual(t, leafVisits, 2, "expected fn to fire on the cycle back-edge so duplicate-reference diagnostics still run")
return nil
}))
}
Loading