From cd878929f918a087141549466e5d1971616b6dc8 Mon Sep 17 00:00:00 2001 From: Roman Khimov Date: Thu, 29 May 2025 23:43:05 +0300 Subject: [PATCH] db: rework metalock into RWMutex MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit metalock is documented to protect meta page access, but its real functionality is broader and less obvious: * meta page itself is accessed in t.init(), but it's not changed, so an exclusive lock is not needed * db.opened is somewhat related, but it's changed in close() only which takes all the locks anyway * db.data is used close to this lock, but it's protected by mmaplock * db.freelist is in fact changed when metalock is taken and it has no synchronization of its own (notice that db.freelist pointer is changed in close() as well, so some synchronization is needed for it also) So we have a lock that tries to protect meta page (and does it after 249746fef43a12e0b5368d98006be51d3afa4d3a), but in fact it helps freelists more than the others. What we're trying to do in freelist is to keep track of open RO transaction IDs, maintaining a slice of them. Transaction IDs in their turn are not exactly IDs in that they do not identify transactions uniquely, rather it's an ID of the latest transaction that changed the state and at any given point all of the new readers use the same latest ID. The other important aspect of these IDs is that any RW transaction always uses a new ID that is incremented from the previous one, IDs only move forward (not considering uint64 overflow). 
At the very minimum this means that: * the most common case is when this list has exactly one unique ID, which is the latest * occasionally we can have a previous one as well * only in rare cases of long-running read transactions can we have some set of older IDs * once an old ID is gone it'll never return, since new transactions use the latest one Which in turn means that: * keeping a list of IDs wastes memory for nothing, since we usually have a lot of duplicates there * what we need in fact is some sort of reference counting for a limited number of past IDs * no new IDs are possible unless there is an RW transaction Reference counting can be implemented with atomic variables, not requiring locks; the only problem is to always have an appropriate ID to count its users. This can be done via a separate RW-transaction-only API managing a list of ID-ref structures. It requires some protection from readers, but this can be tied to the meta page update, since RO transactions can't use a new ID earlier than it appears there. Then RO transactions can rely on this list always having the current ID that they can use, which means they can use an RO lock for the list itself while doing changes to the counter. This removes the final exclusive lock for read-only transactions (with disabled statistics) and drastically improves concurrent View() test results as well as real application behavior with many readers. 
Before: workers samples min avg 50% 80% 90% max 1 10 78.358µs 964.847µs 1.059159ms 1.073256ms 1.07551ms 1.07551ms 10 100 32.802µs 304.922µs 80.924µs 674.54µs 1.069298ms 1.220625ms 100 1000 30.758µs 304.541µs 64.192µs 397.094µs 1.101991ms 2.183302ms 1000 10000 30.558µs 1.05711ms 92.426µs 2.111896ms 3.317894ms 11.790014ms 10000 100000 30.548µs 10.98898ms 90.742µs 21.740659ms 33.020076ms 135.33094ms After: workers samples min avg 50% 80% 90% max 1 10 96.093µs 969.267µs 1.063618ms 1.066473ms 1.085629ms 1.085629ms 10 100 32.803µs 252.33µs 73.71µs 827.159µs 934.392µs 1.071142ms 100 1000 31.339µs 239.554µs 38.703µs 613.263µs 888.616µs 1.403793ms 1000 10000 30.518µs 195.822µs 41.538µs 101.031µs 360.474µs 4.075932ms 10000 100000 30.478µs 165.346µs 39.054µs 89.981µs 144.544µs 11.35032ms Signed-off-by: Roman Khimov --- db.go | 19 ++++++------ internal/freelist/freelist.go | 3 ++ internal/freelist/shared.go | 56 +++++++++++++++++++++++++---------- tx.go | 1 + 4 files changed, 54 insertions(+), 25 deletions(-) diff --git a/db.go b/db.go index 5d3e26496..2ac0ffefc 100644 --- a/db.go +++ b/db.go @@ -143,7 +143,7 @@ type DB struct { batch *batch rwlock sync.Mutex // Allows only one writer at a time. - metalock sync.Mutex // Protects meta page access. + metalock sync.RWMutex // Protects meta page access. mmaplock sync.RWMutex // Protects mmap access during remapping. statlock sync.RWMutex // Protects stats access. @@ -422,6 +422,7 @@ func (db *DB) getPageSizeFromSecondMeta() (int, bool, error) { func (db *DB) loadFreelist() { db.freelistLoad.Do(func() { db.freelist = newFreelist(db.FreelistType) + db.freelist.AddCurrentTXID(db.meta().Txid()) if !db.hasSyncedFreelist() { // Reconstruct free list by scanning the DB. db.freelist.Init(db.freepages()) @@ -776,7 +777,7 @@ func (db *DB) beginTx() (*Tx, error) { // Lock the meta pages while we initialize the transaction. We obtain // the meta lock before the mmap lock because that's the order that the // write transaction will obtain them. 
- db.metalock.Lock() + db.metalock.RLock() // Obtain a read-only lock on the mmap. When the mmap is remapped it will // obtain a write lock so all transactions must finish before it can be @@ -786,14 +787,14 @@ func (db *DB) beginTx() (*Tx, error) { // Exit if the database is not open yet. if !db.opened { db.mmaplock.RUnlock() - db.metalock.Unlock() + db.metalock.RUnlock() return nil, berrors.ErrDatabaseNotOpen } // Exit if the database is not correctly mapped. if db.data == nil { db.mmaplock.RUnlock() - db.metalock.Unlock() + db.metalock.RUnlock() return nil, berrors.ErrInvalidMapping } @@ -806,7 +807,7 @@ func (db *DB) beginTx() (*Tx, error) { } // Unlock the meta pages. - db.metalock.Unlock() + db.metalock.RUnlock() // Update the transaction stats. if db.stats != nil { @@ -831,8 +832,8 @@ func (db *DB) beginRWTx() (*Tx, error) { // Once we have the writer lock then we can lock the meta pages so that // we can set up the transaction. - db.metalock.Lock() - defer db.metalock.Unlock() + db.metalock.RLock() + defer db.metalock.RUnlock() // Exit if the database is not open yet. if !db.opened { @@ -860,14 +861,14 @@ func (db *DB) removeTx(tx *Tx) { db.mmaplock.RUnlock() // Use the meta lock to restrict access to the DB object. - db.metalock.Lock() + db.metalock.RLock() if db.freelist != nil { db.freelist.RemoveReadonlyTXID(tx.meta.Txid()) } // Unlock the meta pages. - db.metalock.Unlock() + db.metalock.RUnlock() // Merge statistics. if db.stats != nil { diff --git a/internal/freelist/freelist.go b/internal/freelist/freelist.go index 2b819506b..5c2fba1e3 100644 --- a/internal/freelist/freelist.go +++ b/internal/freelist/freelist.go @@ -22,6 +22,9 @@ type Interface interface { // Init initializes this freelist with the given list of pages. Init(ids common.Pgids) + // AddCurrentTXID adds the latest known ID to the list. + AddCurrentTXID(tid common.Txid) + // Allocate tries to allocate the given number of contiguous pages // from the free list pages. 
It returns the starting page ID if // available; otherwise, it returns 0. diff --git a/internal/freelist/shared.go b/internal/freelist/shared.go index f2d113008..879763905 100644 --- a/internal/freelist/shared.go +++ b/internal/freelist/shared.go @@ -3,7 +3,9 @@ package freelist import ( "fmt" "math" + "slices" "sort" + "sync/atomic" "unsafe" "go.etcd.io/bbolt/internal/common" @@ -15,13 +17,18 @@ type txPending struct { lastReleaseBegin common.Txid // beginning txid of last matching releaseRange } +type txIdReference struct { + txid common.Txid + refs atomic.Int32 +} + type shared struct { Interface - readonlyTXIDs []common.Txid // all readonly transaction IDs. - allocs map[common.Pgid]common.Txid // mapping of Txid that allocated a pgid. - cache map[common.Pgid]struct{} // fast lookup of all free and pending page ids. - pending map[common.Txid]*txPending // mapping of soon-to-be free page ids by tx. + readerRefs []*txIdReference + allocs map[common.Pgid]common.Txid // mapping of Txid that allocated a pgid. + cache map[common.Pgid]struct{} // fast lookup of all free and pending page ids. + pending map[common.Txid]*txPending // mapping of soon-to-be free page ids by tx. } func newShared() *shared { @@ -118,15 +125,18 @@ func (t *shared) Rollback(txid common.Txid) { } func (t *shared) AddReadonlyTXID(tid common.Txid) { - t.readonlyTXIDs = append(t.readonlyTXIDs, tid) + for _, r := range t.readerRefs { + if r.txid == tid { + r.refs.Add(1) + break + } + } } func (t *shared) RemoveReadonlyTXID(tid common.Txid) { - for i := range t.readonlyTXIDs { - if t.readonlyTXIDs[i] == tid { - last := len(t.readonlyTXIDs) - 1 - t.readonlyTXIDs[i] = t.readonlyTXIDs[last] - t.readonlyTXIDs = t.readonlyTXIDs[:last] + for _, r := range t.readerRefs { + if r.txid == tid { + r.refs.Add(-1) break } } @@ -140,23 +150,37 @@ func (t txIDx) Less(i, j int) bool { return t[i] < t[j] } func (t *shared) ReleasePendingPages() { // Free all pending pages prior to the earliest open transaction. 
- sort.Sort(txIDx(t.readonlyTXIDs)) minid := common.Txid(math.MaxUint64) - if len(t.readonlyTXIDs) > 0 { - minid = t.readonlyTXIDs[0] + + for i := range t.readerRefs { + if t.readerRefs[i].refs.Load() != 0 && minid > t.readerRefs[i].txid { + minid = t.readerRefs[i].txid + } } if minid > 0 { t.release(minid - 1) } // Release unused txid extents. - for _, tid := range t.readonlyTXIDs { - t.releaseRange(minid, tid-1) - minid = tid + 1 + for _, e := range t.readerRefs { + t.releaseRange(minid, e.txid-1) + minid = e.txid + 1 } t.releaseRange(minid, common.Txid(math.MaxUint64)) // Any page both allocated and freed in an extent is safe to release. } +func (t *shared) AddCurrentTXID(tid common.Txid) { + inUseID := tid - 1 // New readers can still use it till meta pages are updated. + + t.readerRefs = slices.DeleteFunc(t.readerRefs, func(e *txIdReference) bool { + // tid can be left by previous unsuccessful transaction, there + // can't be any readers using it, but we'll append it anyway + // below (this just simplifies code). + return (e.txid < inUseID && e.refs.Load() == 0) || e.txid == tid + }) + t.readerRefs = append(t.readerRefs, &txIdReference{txid: tid}) +} + func (t *shared) release(txid common.Txid) { m := make(common.Pgids, 0) for tid, txp := range t.pending { diff --git a/tx.go b/tx.go index f32a20931..8c4567f46 100644 --- a/tx.go +++ b/tx.go @@ -569,6 +569,7 @@ func (tx *Tx) writeMeta() error { lg.Errorf("writeAt failed, pgid: %d, pageSize: %d, error: %v", p.Id(), tx.db.pageSize, err) return err } + tx.db.freelist.AddCurrentTXID(tx.meta.Txid()) tx.db.metalock.Unlock() if !tx.db.NoSync || common.IgnoreNoSync { // gofail: var beforeSyncMetaPage struct{}