From 8e1b4f76aaec24fdb503eba8dee172a5bfa7b006 Mon Sep 17 00:00:00 2001 From: Anton Evangelatov Date: Tue, 21 Apr 2026 16:33:05 +0300 Subject: [PATCH] fix(op-supernode): guard chain container vn against torn interface read Supernode.Start fires the Interop activity and chain-container goroutines in parallel. Interop's checkChainsReady immediately calls c.OptimisticAt -> c.SyncStatus, which reads c.vn while the chain container is concurrently assigning it in its restart loop. A torn two-word interface read can yield (type=*simpleVirtualNode, data=nil); the `c.vn == nil` guard returns false, the method dispatches on a nil receiver, and v.mu.Lock() panics at offset 0x58 (the mu field inside simpleVirtualNode). Protect c.vn with a sync.RWMutex and route all reads/writes through getVN/setVN helpers so every method takes one consistent snapshot. Also add nil-receiver guards on the *simpleVirtualNode methods as a defense against typed-nil interfaces slipping through in tests or future refactors. Co-Authored-By: Claude Opus 4.7 (1M context) --- .../chain_container/chain_container.go | 59 ++++++++++++++----- .../virtual_node/virtual_node.go | 15 +++++ 2 files changed, 58 insertions(+), 16 deletions(-) diff --git a/op-supernode/supernode/chain_container/chain_container.go b/op-supernode/supernode/chain_container/chain_container.go index 1b60980e058..484730e4b31 100644 --- a/op-supernode/supernode/chain_container/chain_container.go +++ b/op-supernode/supernode/chain_container/chain_container.go @@ -99,6 +99,12 @@ type virtualNodeFactory func(cfg *opnodecfg.Config, log gethlog.Logger, initOver type ResetCallback func(chainID eth.ChainID, timestamp uint64, invalidatedBlock eth.BlockRef) type simpleChainContainer struct { + // vnMu protects vn. The chain container restart loop writes vn concurrently + // with activity goroutines (e.g. Interop.checkChainsReady) reading it, so an + // unsynchronized access produces a torn interface read: non-nil type with + // nil data pointer, which slips past a plain `vn == nil` check and panics + // on the next method dispatch. + vnMu sync.RWMutex vn virtual_node.VirtualNode vncfg *opnodecfg.Config cfg config.CLIConfig @@ -229,6 +235,22 @@ func defaultVirtualNodeFactory(cfg *opnodecfg.Config, log gethlog.Logger, initOv return virtual_node.NewVirtualNode(cfg, log, initOverload, appVersion) } +// getVN returns a consistent snapshot of c.vn. Always read c.vn through this +// helper; a bare read races with the restart loop's assignment. +func (c *simpleChainContainer) getVN() virtual_node.VirtualNode { + c.vnMu.RLock() + defer c.vnMu.RUnlock() + return c.vn +} + +// setVN writes c.vn under the lock. Used by the restart loop when it installs +// a new virtual node on each iteration. +func (c *simpleChainContainer) setVN(vn virtual_node.VirtualNode) { + c.vnMu.Lock() + defer c.vnMu.Unlock() + c.vn = vn +} + func (c *simpleChainContainer) subPath(path string) string { return filepath.Join(c.cfg.DataDir, c.chainID.String(), path) } @@ -263,7 +285,7 @@ func (c *simpleChainContainer) Start(ctx context.Context) error { c.initOverload.SuperAuthority = c } // Pass in the chain container as a SuperAuthority - c.vn = c.virtualNodeFactory(c.vncfg, c.log, c.initOverload, c.appVersion, c) + c.setVN(c.virtualNodeFactory(c.vncfg, c.log, c.initOverload, c.appVersion, c)) if c.pause.Load() { // Check for stop/cancellation even while paused, so teardown doesn't hang. // Without this, a stuck pause (e.g. from RewindEngine exiting before Resume) @@ -281,19 +303,20 @@ func (c *simpleChainContainer) Start(ctx context.Context) error { } // start the virtual node - err := c.vn.Start(ctx) + vn := c.getVN() + err := vn.Start(ctx) if err != nil { - c.log.Warn("virtual node exited with error", "vn_id", c.vn, "error", err) + c.log.Warn("virtual node exited with error", "vn_id", vn, "error", err) } else { - c.log.Info("virtual node exited", "vn_id", c.vn) + c.log.Info("virtual node exited", "vn_id", vn) } // always stop the virtual node after it exits stopCtx, cancel := context.WithTimeout(context.Background(), 10*time.Second) - if stopErr := c.vn.Stop(stopCtx); stopErr != nil { + if stopErr := vn.Stop(stopCtx); stopErr != nil { c.log.Error("error stopping virtual node", "error", stopErr) } else { - c.log.Info("virtual node stopped", "vn_id", c.vn) + c.log.Info("virtual node stopped", "vn_id", vn) } cancel() @@ -322,8 +345,8 @@ func (c *simpleChainContainer) Stop(ctx context.Context) error { c.rollupClient.Close() } - if c.vn != nil { - if err := c.vn.Stop(stopCtx); err != nil { + if vn := c.getVN(); vn != nil { + if err := vn.Stop(stopCtx); err != nil { c.log.Error("error stopping virtual node", "error", err) } } @@ -397,13 +420,14 @@ func (c *simpleChainContainer) LocalSafeBlockAtTimestamp(ctx context.Context, ts // SyncStatus returns the in-process op-node sync status for this chain. func (c *simpleChainContainer) SyncStatus(ctx context.Context) (*eth.SyncStatus, error) { - if c.vn == nil { + vn := c.getVN() + if vn == nil { if c.log != nil { c.log.Warn("SyncStatus: virtual node not initialized") } return nil, virtual_node.ErrVirtualNodeNotRunning } - st, err := c.vn.SyncStatus(ctx) + st, err := vn.SyncStatus(ctx) if err != nil { return nil, err } @@ -424,7 +448,8 @@ func (c *simpleChainContainer) OutputRootAtL2BlockNumber(ctx context.Context, l2 // safeDBAtL2 delegates to the virtual node to resolve the earliest L1 at which the L2 became safe. func (c *simpleChainContainer) safeDBAtL2(ctx context.Context, l2 eth.BlockID) (eth.BlockID, error) { - if c.vn == nil { + vn := c.getVN() + if vn == nil { return eth.BlockID{}, fmt.Errorf("virtual node not initialized") } status, err := c.SyncStatus(ctx) @@ -433,7 +458,7 @@ func (c *simpleChainContainer) safeDBAtL2(ctx context.Context, l2 eth.BlockID) ( } currentL1 := status.CurrentL1 c.log.Debug("safeDBAtL2", "l2", l2, "currentL1", currentL1, "err", err) - l1, err := c.vn.L1AtSafeHead(ctx, l2) + l1, err := vn.L1AtSafeHead(ctx, l2) if err != nil { // Map L1AtSafeHeadNotFound to ethereum.NotFound so callers treat chain lag as "not ready" if errors.Is(err, virtual_node.ErrL1AtSafeHeadNotFound) { @@ -572,7 +597,8 @@ func (c *simpleChainContainer) RewindEngine(ctx context.Context, timestamp uint6 } defer c.resetting.Store(false) - if c.vn == nil { + vn := c.getVN() + if vn == nil { return fmt.Errorf("virtual node not initialized") } if c.engine == nil { @@ -591,7 +617,7 @@ func (c *simpleChainContainer) RewindEngine(ctx context.Context, timestamp uint6 c.log.Info("chain_container/RewindEngine: paused container") // stop the vn - err = c.vn.Stop(ctx) + err = vn.Stop(ctx) if err != nil { return err } @@ -643,10 +669,11 @@ func (c *simpleChainContainer) PauseAndStopVN(ctx context.Context) error { if err := c.Pause(ctx); err != nil { return err } - if c.vn == nil { + vn := c.getVN() + if vn == nil { return nil } - return c.vn.Stop(ctx) + return vn.Stop(ctx) } // SetResetCallback sets a callback that is invoked when the chain resets. diff --git a/op-supernode/supernode/chain_container/virtual_node/virtual_node.go b/op-supernode/supernode/chain_container/virtual_node/virtual_node.go index d39cf6ecc81..0ddf5989ca6 100644 --- a/op-supernode/supernode/chain_container/virtual_node/virtual_node.go +++ b/op-supernode/supernode/chain_container/virtual_node/virtual_node.go @@ -96,6 +96,9 @@ func NewVirtualNode(cfg *opnodecfg.Config, log gethlog.Logger, initOverload *rol } func (v *simpleVirtualNode) Start(ctx context.Context) error { + if v == nil { + return ErrVirtualNodeNotRunning + } // Accquire lock while setting up inner node v.mu.Lock() if v.state != VNStateNotStarted { @@ -174,6 +177,9 @@ func (v *simpleVirtualNode) Start(ctx context.Context) error { } func (v *simpleVirtualNode) Stop(ctx context.Context) error { + if v == nil { + return nil + } v.mu.Lock() defer v.mu.Unlock() @@ -198,6 +204,9 @@ func (v *simpleVirtualNode) State() VNState { // SafeHeadAtL1 returns the recorded mapping of L1 block -> L2 safe head at or before the given L1 block number. func (v *simpleVirtualNode) SafeHeadAtL1(ctx context.Context, l1BlockNum uint64) (eth.BlockID, eth.BlockID, error) { + if v == nil { + return eth.BlockID{}, eth.BlockID{}, ErrVirtualNodeNotRunning + } v.mu.Lock() inner := v.inner v.mu.Unlock() @@ -216,6 +225,9 @@ var ErrL1AtSafeHeadNotFound = errors.New("l1 at safe head not found") // L1AtSafeHead finds the earliest L1 block at which the provided L2 block became local safe, // using the monotonicity of SafeDB (L2 safe head number is non-decreasing over L1). func (v *simpleVirtualNode) L1AtSafeHead(ctx context.Context, target eth.BlockID) (eth.BlockID, error) { + if v == nil { + return eth.BlockID{}, ErrVirtualNodeNotRunning + } v.mu.Lock() inner := v.inner v.mu.Unlock() @@ -279,6 +291,9 @@ func (v *simpleVirtualNode) L1AtSafeHead(ctx context.Context, target eth.BlockID } func (v *simpleVirtualNode) SyncStatus(ctx context.Context) (*eth.SyncStatus, error) { + if v == nil { + return nil, ErrVirtualNodeNotRunning + } v.mu.Lock() inner := v.inner v.mu.Unlock()