Skip to content
Open
Show file tree
Hide file tree
Changes from 26 commits
Commits
Show all changes
32 commits
Select commit Hold shift + click to select a range
7e59d71
deterministic state sync
marcello33 Mar 25, 2026
fdb0082
temp bump heimdall to committed version for testing purposes
marcello33 Mar 26, 2026
3ae74d8
fix parsing
marcello33 Mar 26, 2026
405e5e6
fix unmarshalling of RecordListVisibleAtHeightResponse
marcello33 Mar 26, 2026
1893557
change heimdall dep for testing
marcello33 Mar 26, 2026
f48218f
Merge branch 'develop' into mardizzone/POS-3441_deterministic-ss
marcello33 Mar 27, 2026
f3446c4
add DeterministicStateSyncBlock to GatherForks
marcello33 Mar 27, 2026
2874ed1
Merge branch 'develop' into mardizzone/POS-3441_deterministic-ss
marcello33 Mar 27, 2026
2934d3e
Merge branch 'develop' into mardizzone/POS-3441_deterministic-ss
marcello33 Mar 31, 2026
47f48bc
better comment on go.mod
marcello33 Mar 31, 2026
13a51b8
fix linter
marcello33 Mar 31, 2026
96d8797
address comments
marcello33 Apr 1, 2026
c6f0d55
address comments
marcello33 Apr 1, 2026
a94f1a9
remove omitempty
marcello33 Apr 1, 2026
46e1750
update banner
marcello33 Apr 1, 2026
653d8b6
timeout for StateSyncEventsAtHeight
marcello33 Apr 1, 2026
bdb52b0
address comments
marcello33 Apr 1, 2026
cd46ca1
address comments
marcello33 Apr 1, 2026
d508db4
added tests
marcello33 Apr 1, 2026
360e533
address minor err shadowing and fix lint
marcello33 Apr 1, 2026
c1c1e3d
test single endpoint
marcello33 Apr 3, 2026
9697c13
update heimdall-v2 dependency to DSS-test branch
marcello33 Apr 3, 2026
925c5e5
address comments
marcello33 Apr 4, 2026
5dfe949
solve lint issue
marcello33 Apr 4, 2026
8d06151
address comments
marcello33 Apr 4, 2026
3a98421
address comments
marcello33 Apr 4, 2026
a1fa6b9
address comments
marcello33 Apr 4, 2026
dce6483
address comments
marcello33 Apr 4, 2026
ca9ee85
unify endpoints
marcello33 Apr 6, 2026
85521b2
address comments
marcello33 Apr 6, 2026
4326272
remove dead code
marcello33 Apr 6, 2026
5c6214f
Merge branch 'develop' into mardizzone/POS-3441_deterministic-ss
marcello33 Apr 7, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
30 changes: 25 additions & 5 deletions consensus/bor/bor.go
Original file line number Diff line number Diff line change
Expand Up @@ -1760,12 +1760,32 @@ func (c *Bor) CommitStates(
// Wait for heimdall to be synced before fetching state sync events
c.spanStore.waitUntilHeimdallIsSynced(c.ctx)

eventRecords, err = c.HeimdallClient.StateSyncEvents(c.ctx, from, to.Unix())
if err != nil {
log.Error("Error occurred when fetching state sync events", "fromID", from, "to", to.Unix(), "err", err)
if c.config.IsDeterministicStateSync(header.Number) {
log.Info("Using deterministic state sync", "cutoff", to.Unix())

eventRecords, err = c.HeimdallClient.StateSyncEventsByTime(c.ctx, from, to.Unix())
if err != nil {
// Liveness-over-safety tradeoff (matches pre-fork behavior):
// FetchWithRetry already retries aggressively and MultiHeimdallClient
// provides failover, so errors reaching here are persistent. Returning
// empty lets the proposer build a block with 0 state syncs. If other
// validators succeed, they will derive a different state root and reject
// this block — the proposer misses a slot but no silent divergence
// occurs. Skipped events are retried at the next sprint since `from`
// is derived from the on-chain LastStateId.
log.Error("Error fetching deterministic state sync events", "fromID", from, "to", to.Unix(), "err", err)

stateSyncs := make([]*types.StateSyncData, 0)
return stateSyncs, nil
return make([]*types.StateSyncData, 0), nil
Comment on lines +1763 to +1778
Copy link

Copilot AI Apr 4, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Post-fork deterministic mode currently logs and returns an empty state-sync set when StateSyncEventsByTime errors. That outcome depends on each node’s local Heimdall connectivity; if some validators succeed and others hit a transient error, they will apply different state updates and compute different state roots for the same block. For consensus-critical finalization, this should fail block processing (return the error) or use a deterministic fallback that guarantees all nodes derive the same result (e.g., only treat specific, globally-observable errors as skippable).

Copilot uses AI. Check for mistakes.
Copy link
Copy Markdown
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The pre-fork path has the same behavior (log + return empty). I'll probably keep this comment open and discuss with the team.
Anyway, the risk is mitigated by multiple factors:

  1. FetchWithRetry retries aggressively
  2. waitUntilHeimdallIsSynced gates CommitStates until heimdall is caught up
  3. CommitStates only runs at sprint boundaries

}
Comment thread
marcello33 marked this conversation as resolved.
Comment thread
marcello33 marked this conversation as resolved.
Comment thread
marcello33 marked this conversation as resolved.
Comment thread
marcello33 marked this conversation as resolved.
} else {
eventRecords, err = c.HeimdallClient.StateSyncEvents(c.ctx, from, to.Unix())
if err != nil {
// Pre-fork: preserve existing behavior (returning empty, no error)
log.Error("Error occurred when fetching state sync events", "fromID", from, "to", to.Unix(), "err", err)

stateSyncs := make([]*types.StateSyncData, 0)
return stateSyncs, nil
}
}

// This if statement checks if there are any state sync record overrides configured for the current block number.
Expand Down
225 changes: 225 additions & 0 deletions consensus/bor/bor_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -119,6 +119,15 @@
// FetchStatus always fails: it returns a nil SyncInfo and a fixed error.
func (f *failingHeimdallClient) FetchStatus(ctx context.Context) (*ctypes.SyncInfo, error) {
	err := errors.New("fetch status failed")
	return nil, err
}
// GetBlockHeightByTime always fails: it returns a zero height and a fixed error.
func (f *failingHeimdallClient) GetBlockHeightByTime(_ context.Context, _ int64) (int64, error) {
	err := errors.New("get block height by time failed")
	return 0, err
}
// StateSyncEventsAtHeight always fails: it returns no records and a fixed error.
func (f *failingHeimdallClient) StateSyncEventsAtHeight(_ context.Context, _ uint64, _ int64, _ int64) ([]*clerk.EventRecordWithTime, error) {
	err := errors.New("state sync events at height failed")
	return nil, err
}
// StateSyncEventsByTime always fails: it returns no records and a fixed error.
func (f *failingHeimdallClient) StateSyncEventsByTime(_ context.Context, _ uint64, _ int64) ([]*clerk.EventRecordWithTime, error) {
	err := errors.New("state sync events by time failed")
	return nil, err
}

// newStateDBForTest creates a fresh state database for testing.
func newStateDBForTest(t *testing.T, root common.Hash) *state.StateDB {
Expand Down Expand Up @@ -2974,6 +2983,15 @@
// FetchStatus reports a status whose CatchingUp flag is false and never fails.
func (m *mockHeimdallClient) FetchStatus(ctx context.Context) (*ctypes.SyncInfo, error) {
	status := &ctypes.SyncInfo{CatchingUp: false}
	return status, nil
}
// GetBlockHeightByTime is a stub: it returns height 0 and a nil error for any input.
func (m *mockHeimdallClient) GetBlockHeightByTime(_ context.Context, _ int64) (int64, error) {
	var height int64
	return height, nil
}
// StateSyncEventsAtHeight is a stub: it returns a nil slice and a nil error.
func (m *mockHeimdallClient) StateSyncEventsAtHeight(_ context.Context, _ uint64, _ int64, _ int64) ([]*clerk.EventRecordWithTime, error) {
	var records []*clerk.EventRecordWithTime
	return records, nil
}
// StateSyncEventsByTime returns the mock's preconfigured events, ignoring its
// arguments, and never fails.
func (m *mockHeimdallClient) StateSyncEventsByTime(_ context.Context, _ uint64, _ int64) ([]*clerk.EventRecordWithTime, error) {
	records := m.events
	return records, nil
}
func TestEncodeSigHeader_WithBaseFee(t *testing.T) {
t.Parallel()
h := &types.Header{
Expand Down Expand Up @@ -5259,3 +5277,210 @@
require.NotErrorIs(t, err, errMissingGiuglianoFields)
}
}

// trackingHeimdallClient is a test double that counts how often selected
// IHeimdallClient methods are invoked and replies with canned results.
// Fields are grouped per tracked method: call counter, then the value and
// error that method returns.
type trackingHeimdallClient struct {
	// StateSyncEvents
	stateSyncEventsCalled int
	events                []*clerk.EventRecordWithTime
	eventsErr             error

	// StateSyncEventsAtHeight
	stateSyncEventsAtHeightCalled int
	eventsAtHeight                []*clerk.EventRecordWithTime
	eventsAtHeightErr             error

	// StateSyncEventsByTime
	stateSyncEventsByTimeCalled int
	eventsByTime                []*clerk.EventRecordWithTime
	eventsByTimeErr             error

	// GetBlockHeightByTime
	getBlockHeightByTimeCalled int
	blockHeight                int64
	blockHeightErr             error
}

// Close is a no-op for the tracking test double.
func (t *trackingHeimdallClient) Close() {
	// Intentionally empty: the tracker only holds counters and canned
	// results, so there is nothing to release.
}

Check failure on line 5301 in consensus/bor/bor_test.go

View check run for this annotation

SonarQubeCloud / SonarCloud Code Analysis

Add a nested comment explaining why this function is empty or complete the implementation.

See more on https://sonarcloud.io/project/issues?id=0xPolygon_bor&issues=AZ1FPy8poC2Lq8aFv3Hb&open=AZ1FPy8poC2Lq8aFv3Hb&pullRequest=2177
// StateSyncEvents bumps its call counter and returns the configured events
// and eventsErr, ignoring all arguments.
func (t *trackingHeimdallClient) StateSyncEvents(_ context.Context, _ uint64, _ int64) ([]*clerk.EventRecordWithTime, error) {
	t.stateSyncEventsCalled++

	records, err := t.events, t.eventsErr

	return records, err
}
// StateSyncEventsAtHeight bumps its call counter and returns the configured
// eventsAtHeight and eventsAtHeightErr, ignoring all arguments.
func (t *trackingHeimdallClient) StateSyncEventsAtHeight(_ context.Context, _ uint64, _ int64, _ int64) ([]*clerk.EventRecordWithTime, error) {
	t.stateSyncEventsAtHeightCalled++

	records, err := t.eventsAtHeight, t.eventsAtHeightErr

	return records, err
}
// GetSpan is an untracked stub: it returns a nil span and a nil error.
func (t *trackingHeimdallClient) GetSpan(_ context.Context, _ uint64) (*borTypes.Span, error) {
	var span *borTypes.Span
	return span, nil
}
// GetLatestSpan is an untracked stub: it returns a nil span and a nil error.
func (t *trackingHeimdallClient) GetLatestSpan(_ context.Context) (*borTypes.Span, error) {
	var span *borTypes.Span
	return span, nil
}
// FetchCheckpoint is an untracked stub: it returns a nil checkpoint and a nil error.
func (t *trackingHeimdallClient) FetchCheckpoint(_ context.Context, _ int64) (*checkpoint.Checkpoint, error) {
	var cp *checkpoint.Checkpoint
	return cp, nil
}
// FetchCheckpointCount is an untracked stub: it returns 0 and a nil error.
func (t *trackingHeimdallClient) FetchCheckpointCount(_ context.Context) (int64, error) {
	var count int64
	return count, nil
}
// FetchMilestone is an untracked stub: it returns a nil milestone and a nil error.
func (t *trackingHeimdallClient) FetchMilestone(_ context.Context) (*milestone.Milestone, error) {
	var ms *milestone.Milestone
	return ms, nil
}
// FetchMilestoneCount is an untracked stub: it returns 0 and a nil error.
func (t *trackingHeimdallClient) FetchMilestoneCount(_ context.Context) (int64, error) {
	var count int64
	return count, nil
}
// FetchStatus reports a status whose CatchingUp flag is false and never fails.
func (t *trackingHeimdallClient) FetchStatus(_ context.Context) (*ctypes.SyncInfo, error) {
	status := &ctypes.SyncInfo{CatchingUp: false}
	return status, nil
}
// GetBlockHeightByTime bumps its call counter and returns the configured
// blockHeight and blockHeightErr, ignoring all arguments.
func (t *trackingHeimdallClient) GetBlockHeightByTime(_ context.Context, _ int64) (int64, error) {
	t.getBlockHeightByTimeCalled++

	height, err := t.blockHeight, t.blockHeightErr

	return height, err
}
// StateSyncEventsByTime bumps its call counter and returns the configured
// eventsByTime and eventsByTimeErr, ignoring all arguments.
func (t *trackingHeimdallClient) StateSyncEventsByTime(_ context.Context, _ uint64, _ int64) ([]*clerk.EventRecordWithTime, error) {
	t.stateSyncEventsByTimeCalled++

	records, err := t.eventsByTime, t.eventsByTimeErr

	return records, err
}

// deterministicBorConfig builds a small BorConfig for tests in which
// DeterministicStateSyncBlock is set to forkBlock. The remaining values are
// fixed: sprint 16, period 2, Indore at block 0, no state-sync confirmation
// delay, and Rio at block 1000000.
func deterministicBorConfig(forkBlock int64) *params.BorConfig {
	cfg := new(params.BorConfig)

	cfg.Sprint = map[string]uint64{"0": 16}
	cfg.Period = map[string]uint64{"0": 2}
	cfg.IndoreBlock = big.NewInt(0)
	cfg.StateSyncConfirmationDelay = map[string]uint64{"0": 0}
	cfg.RioBlock = big.NewInt(1000000)
	cfg.DeterministicStateSyncBlock = big.NewInt(forkBlock)

	return cfg
}

func TestCommitStates_DeterministicForkSwitch(t *testing.T) {
t.Parallel()

addr1 := common.HexToAddress("0x1")
sp := &fakeSpanner{vals: []*valset.Validator{{Address: addr1, VotingPower: 1}}}
mockGC := &mockGenesisContractForCommitStatesIndore{lastStateID: 0}

// Fork activates at block 100
borCfg := deterministicBorConfig(100)
genesisTime := uint64(time.Now().Unix()) - 200
chain, b := newChainAndBorForTest(t, sp, borCfg, true, addr1, genesisTime)
b.GenesisContractsClient = mockGC

genesis := chain.HeaderChain().GetHeaderByNumber(0)
now := time.Now()

// Pre-fork: block 16 should use StateSyncEvents (old legacy path)
tracker := &trackingHeimdallClient{
events: []*clerk.EventRecordWithTime{},
}
b.SetHeimdallClient(tracker)

stateDb := newStateDBForTest(t, genesis.Root)
h := &types.Header{Number: big.NewInt(16), ParentHash: genesis.Hash(), Time: uint64(now.Unix())}

_, err := b.CommitStates(stateDb, h, statefull.ChainContext{Chain: chain.HeaderChain(), Bor: b})
require.NoError(t, err)
require.Equal(t, 1, tracker.stateSyncEventsCalled, "pre-fork should call StateSyncEvents")
require.Equal(t, 0, tracker.getBlockHeightByTimeCalled, "pre-fork should not call GetBlockHeightByTime")
require.Equal(t, 0, tracker.stateSyncEventsAtHeightCalled, "pre-fork should not call StateSyncEventsAtHeight")
require.Equal(t, 0, tracker.stateSyncEventsByTimeCalled, "pre-fork should not call StateSyncEventsByTime")

Comment thread
marcello33 marked this conversation as resolved.
// Post-fork: block 112 should use StateSyncEventsByTime (deterministic state sync)
tracker2 := &trackingHeimdallClient{
eventsByTime: []*clerk.EventRecordWithTime{},
}
b.SetHeimdallClient(tracker2)

stateDb2 := newStateDBForTest(t, genesis.Root)
h2 := &types.Header{Number: big.NewInt(112), ParentHash: genesis.Hash(), Time: uint64(now.Unix())}

_, err = b.CommitStates(stateDb2, h2, statefull.ChainContext{Chain: chain.HeaderChain(), Bor: b})
require.NoError(t, err)
require.Equal(t, 0, tracker2.stateSyncEventsCalled, "post-fork should not call StateSyncEvents")
require.Equal(t, 1, tracker2.stateSyncEventsByTimeCalled, "post-fork should call StateSyncEventsByTime")
require.Equal(t, 0, tracker2.getBlockHeightByTimeCalled, "post-fork should not call GetBlockHeightByTime")
require.Equal(t, 0, tracker2.stateSyncEventsAtHeightCalled, "post-fork should not call StateSyncEventsAtHeight")
}

// assertPostForkFetchFailureYieldsEmpty drives CommitStates on a post-fork
// block (fork at block 0, header number 16) using a Heimdall client whose
// StateSyncEventsByTime fails with fetchErr. It asserts that:
//   - CommitStates swallows the error and returns an empty result,
//   - StateSyncEventsByTime was called exactly once,
//   - neither the legacy StateSyncEvents endpoint nor the
//     GetBlockHeightByTime/StateSyncEventsAtHeight pair was used as a fallback.
//
// Both resilience tests below share this body; they differ only in the error
// injected into the client.
func assertPostForkFetchFailureYieldsEmpty(t *testing.T, fetchErr error) {
	t.Helper()

	addr1 := common.HexToAddress("0x1")
	sp := &fakeSpanner{vals: []*valset.Validator{{Address: addr1, VotingPower: 1}}}
	mockGC := &mockGenesisContractForCommitStatesIndore{lastStateID: 0}

	// Fork activates at block 0 so all blocks are post-fork.
	borCfg := deterministicBorConfig(0)
	genesisTime := uint64(time.Now().Unix()) - 200
	chain, b := newChainAndBorForTest(t, sp, borCfg, true, addr1, genesisTime)
	b.GenesisContractsClient = mockGC

	genesis := chain.HeaderChain().GetHeaderByNumber(0)
	now := time.Now()

	// Only StateSyncEventsByTime is configured to fail.
	tracker := &trackingHeimdallClient{eventsByTimeErr: fetchErr}
	b.SetHeimdallClient(tracker)

	stateDb := newStateDBForTest(t, genesis.Root)
	h := &types.Header{Number: big.NewInt(16), ParentHash: genesis.Hash(), Time: uint64(now.Unix())}

	result, err := b.CommitStates(stateDb, h, statefull.ChainContext{Chain: chain.HeaderChain(), Bor: b})

	// Post-fork errors are resilient: log + return empty, no error.
	require.NoError(t, err, "post-fork should not return error on StateSyncEventsByTime failure")
	require.Empty(t, result, "post-fork should return empty on StateSyncEventsByTime failure")

	// StateSyncEventsByTime should have been called.
	require.Equal(t, 1, tracker.stateSyncEventsByTimeCalled,
		"StateSyncEventsByTime should have been called once")
	// Must not fall back to StateSyncEvents.
	require.Equal(t, 0, tracker.stateSyncEventsCalled,
		"post-fork should NOT fall back to StateSyncEvents on error")
	// Old two-call pattern should not be used.
	require.Equal(t, 0, tracker.getBlockHeightByTimeCalled,
		"GetBlockHeightByTime should NOT be called")
	require.Equal(t, 0, tracker.stateSyncEventsAtHeightCalled,
		"StateSyncEventsAtHeight should NOT be called")
}

// TestCommitStates_ResilientPostFork covers a generic StateSyncEventsByTime
// failure after the deterministic state sync fork.
func TestCommitStates_ResilientPostFork(t *testing.T) {
	t.Parallel()

	assertPostForkFetchFailureYieldsEmpty(t, errors.New("heimdall state sync by time failed"))
}

// TestCommitStates_ResilientPostFork_ReturnsEmptyOnError covers the same
// scenario with an HTTP-style error message.
func TestCommitStates_ResilientPostFork_ReturnsEmptyOnError(t *testing.T) {
	t.Parallel()

	assertPostForkFetchFailureYieldsEmpty(t, errors.New("HTTP 503: service unavailable"))
}
3 changes: 3 additions & 0 deletions consensus/bor/heimdall.go
Original file line number Diff line number Diff line change
Expand Up @@ -14,13 +14,16 @@ import (
//go:generate mockgen -source=heimdall.go -destination=../../tests/bor/mocks/IHeimdallClient.go -package=mocks
type IHeimdallClient interface {
StateSyncEvents(ctx context.Context, fromID uint64, to int64) ([]*clerk.EventRecordWithTime, error)
StateSyncEventsAtHeight(ctx context.Context, fromID uint64, toTime int64, heimdallHeight int64) ([]*clerk.EventRecordWithTime, error)
StateSyncEventsByTime(ctx context.Context, fromID uint64, toTime int64) ([]*clerk.EventRecordWithTime, error)
GetSpan(ctx context.Context, spanID uint64) (*types.Span, error)
Comment thread
marcello33 marked this conversation as resolved.
GetLatestSpan(ctx context.Context) (*types.Span, error)
FetchCheckpoint(ctx context.Context, number int64) (*checkpoint.Checkpoint, error)
FetchCheckpointCount(ctx context.Context) (int64, error)
FetchMilestone(ctx context.Context) (*milestone.Milestone, error)
FetchMilestoneCount(ctx context.Context) (int64, error)
FetchStatus(ctx context.Context) (*ctypes.SyncInfo, error)
GetBlockHeightByTime(ctx context.Context, cutoffTime int64) (int64, error)
Close()
}

Expand Down
Loading
Loading