diff --git a/.github/actions/setup-guest-toolchain/action.yml b/.github/actions/setup-guest-toolchain/action.yml index 80f5712..972153d 100644 --- a/.github/actions/setup-guest-toolchain/action.yml +++ b/.github/actions/setup-guest-toolchain/action.yml @@ -28,6 +28,10 @@ inputs: description: "Foundry version tag" required: false default: "v1.4.3" + rust-toolchain: + description: "Rust toolchain version or channel" + required: false + default: "1.95.0" rust-components: description: "Extra rustup components (comma-separated)" required: false @@ -109,6 +113,7 @@ runs: - name: Install Rust toolchain uses: dtolnay/rust-toolchain@stable with: + toolchain: ${{ inputs.rust-toolchain }} components: ${{ inputs.rust-components }} - name: Cache Rust artifacts diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index f203d8b..65d9ecc 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -5,6 +5,7 @@ on: pull_request: env: + RUST_TOOLCHAIN: "1.95.0" XGENEXT2FS_VERSION: v1.5.6 XGENEXT2FS_SHA256_AMD64: 996e4e68a638b5dc5967d3410f92ecb8d2f41e32218bbe0f8b4c4474d7eebc59 XGENEXT2FS_SHA256_ARM64: e5aca81164b762bbe5447bacef41e4fa9e357fd9c8f44e519c5206227d43144d @@ -25,6 +26,8 @@ jobs: run: | sudo apt-get update sudo apt-get install -y \ + faketime \ + libfaketime \ lua5.4 \ liblua5.4-dev \ libslirp-dev @@ -32,6 +35,7 @@ jobs: - name: Install Rust toolchain uses: dtolnay/rust-toolchain@stable with: + toolchain: ${{ env.RUST_TOOLCHAIN }} components: rustfmt, clippy - name: Cache Rust artifacts @@ -53,7 +57,7 @@ jobs: - name: Test timeout-minutes: 15 - run: RUN_ANVIL_TESTS=1 cargo test --workspace --all-targets --all-features --locked + run: cargo test --workspace --all-targets --all-features --locked canonical-guest: runs-on: ubuntu-latest @@ -67,6 +71,7 @@ jobs: - name: Setup guest toolchain uses: ./.github/actions/setup-guest-toolchain with: + rust-toolchain: ${{ env.RUST_TOOLCHAIN }} xgenext2fs-version: ${{ env.XGENEXT2FS_VERSION }} 
xgenext2fs-sha256-amd64: ${{ env.XGENEXT2FS_SHA256_AMD64 }} xgenext2fs-sha256-arm64: ${{ env.XGENEXT2FS_SHA256_ARM64 }} @@ -92,6 +97,7 @@ jobs: - name: Setup guest toolchain uses: ./.github/actions/setup-guest-toolchain with: + rust-toolchain: ${{ env.RUST_TOOLCHAIN }} xgenext2fs-version: ${{ env.XGENEXT2FS_VERSION }} xgenext2fs-sha256-amd64: ${{ env.XGENEXT2FS_SHA256_AMD64 }} xgenext2fs-sha256-arm64: ${{ env.XGENEXT2FS_SHA256_ARM64 }} @@ -100,5 +106,10 @@ jobs: cartesi-machine-sha256-arm64: ${{ env.CARTESI_MACHINE_SHA256_ARM64 }} install-foundry: "true" + - name: Install faketime + run: | + sudo apt-get update + sudo apt-get install -y faketime libfaketime + - name: Run rollups E2E tests run: just test-rollups-e2e diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index a561bcf..7d23ac3 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -20,6 +20,7 @@ permissions: contents: write env: + RUST_TOOLCHAIN: "1.95.0" XGENEXT2FS_VERSION: v1.5.6 XGENEXT2FS_SHA256_AMD64: 996e4e68a638b5dc5967d3410f92ecb8d2f41e32218bbe0f8b4c4474d7eebc59 XGENEXT2FS_SHA256_ARM64: e5aca81164b762bbe5447bacef41e4fa9e357fd9c8f44e519c5206227d43144d @@ -59,6 +60,7 @@ jobs: - name: Install Rust toolchain uses: dtolnay/rust-toolchain@stable with: + toolchain: ${{ env.RUST_TOOLCHAIN }} targets: ${{ matrix.target }} - name: Cache Rust artifacts @@ -123,6 +125,7 @@ jobs: - name: Setup guest toolchain uses: ./.github/actions/setup-guest-toolchain with: + rust-toolchain: ${{ env.RUST_TOOLCHAIN }} xgenext2fs-version: ${{ env.XGENEXT2FS_VERSION }} xgenext2fs-sha256-amd64: ${{ env.XGENEXT2FS_SHA256_AMD64 }} xgenext2fs-sha256-arm64: ${{ env.XGENEXT2FS_SHA256_ARM64 }} diff --git a/.gitignore b/.gitignore index 822d909..0359111 100644 --- a/.gitignore +++ b/.gitignore @@ -7,3 +7,4 @@ sequencer.db-wal /out/ /.DS_Store soljson-latest.js +**/states/ diff --git a/AGENTS.md b/AGENTS.md index 2c468e0..d444823 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -1,110 +1,277 
@@ # AGENTS.md -This file tells AI coding agents how to work effectively in this repository. +This file tells AI coding agents and human contributors how to work effectively in this repository. Start here. ## Mission -Build and evolve a **sequencer prototype** for a future DeFi stack. +Build and evolve a **DeFi sequencer** — the off-chain component that gives users low-latency soft confirmations while preserving the on-chain scheduler's canonical authority. -Current scope is intentionally small: a **dummy wallet app** that supports: -- `Transfer` -- `Withdrawal` +This is **security-critical infrastructure**. Treat every change with the care that financial systems demand. Correctness, determinism, and safety come before features. -Primary objective in this phase: make sequencer behavior, safety checks, and persistence reliable before adding "real world" execution logic. +The current application (`examples/app-core/`) is a **hardcoded placeholder** (deposit, transfer, withdrawal). It will be replaced by a production DeFi application. The sequencer itself is the product; the app is a stand-in for development. -## Project Snapshot +## Requirements -- Language: Rust (`edition = 2024`) -- API: Axum -- Queueing: Tokio MPSC -- Commit path: single blocking inclusion lane (hot path) -- Storage: SQLite (`rusqlite`, WAL mode) -- Signing: EIP-712 (`alloy`) -- Method payload encoding: SSZ +In order of importance: -## Glossary +1. **Low latency** — `POST /tx` ack under 500 ms. +2. **Financially sustainable** — the system must pay for itself through fees. +3. **Low cost transactions** — cheaper than native L1. -- `chunk`: small bounded list of user ops processed/executed and persisted together to amortize SQLite cost and keep low-latency ack behavior. -- `frame`: canonical ordering boundary that commits a `safe_block` plus a list of user ops; canonical execution drains all direct inputs safe at that block before executing the frame’s user ops. 
-- `batch`: list of frames that will be posted on-chain as one unit. -- `inclusion lane`: the hot-path single-lane loop that dequeues user ops, executes app logic, persists ordering, and rotates frame/batch boundaries. +## Invariants + +- **Dispute compatibility** — the design already accounts for rollup dispute resolution. Preserve it. +- **Wallet-compatible signing** — users sign with standard wallets via EIP-712. Never introduce custom signing schemes. +- **Deposit availability < 10 minutes** — happy path. The censorship-resistance backstop (`MAX_WAIT_BLOCKS`, ~4h) is the worst case. + +## Design Principles + +- **App-specific sequencer.** The sequencer may link against the application, enabling validation and execution at ingress time. This is a deliberate design choice. +- **Soft confirmations may be invalidated.** Under adversarial conditions (network, infrastructure, provider, or L1 outages), soft confirmations can be rolled back via recovery. This is by design, not a bug — it is what makes the sequencer sound in the face of liveness failures. +- **App UX may depend on the sequencer.** Without the sequencer, user experience may degrade substantially. This is an acceptable tradeoff: the on-chain scheduler remains the canonical source of truth; the sequencer only accelerates the UX. + +## Sequencer / Scheduler Duality + +The system has two components in an asymmetric relationship: + +### Scheduler — on-chain canonical authority + +The scheduler runs inside the rollup and **defines the canonical transaction ordering**. For each batch read from L1 safe inputs, it processes frames in order: drain all pending direct inputs whose block number is ≤ `safe_block`, then execute the frame's user ops. **The scheduler treats the sequencer as potentially Byzantine** — it enforces ordering and staleness rules regardless of what the sequencer claims. + +### Sequencer — off-chain predictor + +The sequencer knows the scheduler's algorithm. 
It uses that knowledge to **predict** what the canonical ordering will be once its batches land on L1, and issues soft confirmations to users ahead of time. The sequencer has **write priority on the execution queue**: as long as it keeps advancing `safe_block` and submitting batches, it controls ordering. + +### The `safe_block` synchronization primitive + +Each frame carries a `safe_block` chosen by the sequencer. It serves two purposes: + +- It tells the scheduler how far to drain direct inputs before executing the frame's user ops. +- It is the sequencer's commitment that it has accounted for all direct inputs up to that block. + +The sequencer must advance `safe_block` honestly. If it freezes `safe_block` (to censor deposits) or stops submitting batches, the staleness mechanism detects this and forces recovery. + +### When soft confirmations match canonical order + +Under honest sequencer operation and no infrastructure outages, soft confirmations match the canonical order. This is an **optimistic guarantee** — the sequencer is predicting a future the scheduler has not yet computed. When the sequencer goes offline, submits stale batches, or tries to censor direct inputs, the scheduler's force-drain backstop kicks in and the affected soft confirmations become invalid. + +## Batch Staleness and Recovery + +### Staleness + +A batch is **stale** when `inclusion_block - first_frame.safe_block >= MAX_WAIT_BLOCKS` (1200 blocks, ~4h). Staleness catches two failure modes: + +1. **Liveness failure** — the sequencer went offline and failed to submit batches in time. +2. **Censorship** — the sequencer kept submitting batches but froze `safe_block` to hold back direct inputs. + +When the scheduler encounters a stale batch, it **skips it entirely** — no nonce consumed, no state change. This is the **censorship-resistance backstop**: the sequencer cannot hold write priority indefinitely without advancing the drain cursor. 
Direct inputs are force-drained at `MAX_WAIT_BLOCKS`, guaranteeing deposit availability within ~4h even under adversarial conditions. + +### Cascading invalidation + +If a batch is stale, all existing subsequent batches are also invalid. The scheduler's expected-nonce counter does not advance on a stale skip, so every subsequent batch arrives at an unexpected nonce and is rejected. Invalidation is a suffix operation: marking batch `N` invalid cascades to `N+1`, `N+2`, …, including the open batch. New batches created after recovery are unaffected. + +### Preemptive recovery + +Rather than waiting for a batch to go stale on L1, the sequencer uses a **danger threshold** (`MAX_WAIT_BLOCKS − MARGIN`). The cycle crosses a process boundary by design: + +1. **Detector trips + process exits** — the in-process [`DangerDetector`](sequencer/src/recovery/detector.rs) polls `Storage::check_danger` on a cadence. When either the strict block-based or wall-clock-adjusted arm fires, the detector exits with `DetectorExit::DangerZone`, the runtime maps that to `RunError::DangerZoneDetected`, and the process exits with a non-zero status. Stopping the process is how the sequencer goes offline: no more user-op acceptance, no more batch submission. +2. **Orchestrator respawns** — systemd/k8s/etc. restarts the process. +3. **Startup flushes the mempool** — [`MempoolFlusher`](sequencer/src/recovery/flusher.rs) submits no-op transactions at every pending wallet-nonce slot and waits for safe finality. This consumes all pending slots so adversarially-delayed "zombie" batch submissions cannot land later. The flusher is load-bearing, not defense-in-depth. +4. **Startup runs recovery** — on fully finalized L1 state: cascade-invalidate stale batches, open a recovery batch, re-drain direct inputs from invalidated batches. Driven by [`run_preemptive_recovery`](sequencer/src/recovery/mod.rs) with the decision table in the pure [`decide_startup_action`](sequencer/src/recovery/mod.rs). +5. 
**Normal operation resumes** — the lane, submitter, input reader, and a fresh detector all start up. + +### Detection: safe-only, with wall-clock fallback + +Staleness is only checked against L1 **safe** state, never latest. Stale batches in latest that haven't reached safe yet will eventually become safe, and the check will fire at that point. This avoids reacting to L1 reorgs. + +When L1 is unreachable, the DB-based staleness check sees a frozen `current_safe_block` and may fail to trigger. The danger detector falls back to **wall-clock estimation**: `estimated_missed_blocks = (now − last_l1_success) / seconds_per_block`, and the danger threshold is adjusted downward by this estimate. Prevents silently issuing doomed soft confirmations during extended L1 outages. + +### Formal verification + +The preemptive recovery design is verified by bounded TLA+ model checking. See [`docs/recovery/`](docs/recovery/) for the full design, TLA+ specs, and design history. When touching recovery code, read the TLA+ first. + +## Threat Model (brief) + +See [`docs/threat-model/README.md`](docs/threat-model/README.md) for the full model. Key points when reading or writing code: + +- **Trusted:** InputBox contract, our own Ethereum node (fail-stop, not byzantine), operator config, batch-submitter key. +- **Adversarial:** `POST /tx` callers, direct-input senders, the L1 mempool and block builders (zombie transactions are a first-class threat). +- **Semi-trusted, fail-stop:** fallback RPC providers (Infura / Alchemy). +- **Self-trust:** the sequencer trusts its own code is correct. Bugs that emit malformed batches are fault states requiring manual intervention, not threats to defend against at runtime. +- **In scope:** correctness bugs *and* exploitation. Under rollup semantics, a correctness bug that causes scheduler/sequencer state divergence is as severe as direct theft. + +Open findings from staged security review live in [`SECURITY_TODO.md`](SECURITY_TODO.md). 
## Architecture Map -- `sequencer/src/main.rs`: thin binary entrypoint. -- `sequencer/src/lib.rs`: public sequencer API (`run`, `RunConfig`). -- `sequencer/src/config.rs`: runtime input parsing and EIP-712 domain construction. -- `sequencer/src/runtime.rs`: bootstrap and runtime wiring. -- `sequencer/src/api/mod.rs`: `POST /tx` and `GET /ws/subscribe` endpoints (tx ingress + replay feed). -- `sequencer/src/api/error.rs`: API error model + HTTP mapping. -- `sequencer/src/inclusion_lane/mod.rs`: inclusion-lane exports and public surface. -- `sequencer/src/inclusion_lane/lane.rs`: batched execution/commit loop (single lane). -- `sequencer/src/inclusion_lane/types.rs`: inclusion-lane queue item and pipeline error types. -- `sequencer/src/inclusion_lane/error.rs`: inclusion-lane runtime and catch-up error types. -- `sequencer/src/input_reader/`: safe-input ingestion from InputBox into SQLite. -- `sequencer/src/l2_tx_feed/mod.rs`: DB-backed ordered-L2Tx feed used by WS subscriptions. -- `sequencer/src/storage/mod.rs`: DB open, migrations, frame persistence, and direct-input broker APIs. -- `sequencer/src/storage/migrations/`: DB schema/bootstrapping (`0001`). -- `sequencer-core/src/`: shared domain types/interfaces (`Application`, `SignedUserOp`, `SequencedL2Tx`, broadcast message model). -- `examples/app-core/src/application/mod.rs`: wallet prototype implementing `Application`. -- `tests/benchmarks/src/`: benchmark harnesses and self-contained benchmark runtime. - -## Domain Truths (Important) - -- This is a **sequencer prototype**, not a full DeFi stack yet. -- API validates signature and enqueues signed `UserOp`; method decoding happens during application execution. -- Deposits are direct-input-only (L1 -> L2) and must not be represented as user ops. -- Rejections (`InvalidNonce`, fee cap too low, insufficient gas balance) produce no state mutation and are not persisted. 
-- Included txs are persisted as frame/batch data in `batches`, `frames`, `user_ops`, `safe_inputs`, and `sequenced_l2_txs`. -- Frame fee is persisted in `frames.fee` and is fixed for the lifetime of that frame. -- The next frame fee is sampled from `batch_policy_derived.recommended_fee` when rotating to a new frame (defaults follow `batch_policy` bootstrap rows; tune `gas_price` / `alpha` via SQLite if needed). -- `/ws/subscribe` currently has internal guardrails: subscriber cap `64`, catch-up cap `50000`. -- When that catch-up window is exceeded, `/ws/subscribe` upgrades and then closes with websocket close code `1008` (`POLICY`) and reason `catch-up window exceeded`. -- Wallet state (balances/nonces) is in-memory right now (not persisted). -- EIP-712 domain name/version are fixed in code; chain ID and verifying contract come from `SEQ_CHAIN_ID` and `SEQ_APP_ADDRESS` (validated against the RPC chain id at startup). +Top-level layout follows the system's data flow. Each sequencer module corresponds to a writer role; the matching `storage/.rs` holds its storage half. + +### Workspace + +- `sequencer/` — main sequencer binary and library. +- `sequencer-core/` — shared domain types (`Application`, `SignedUserOp`, `SequencedL2Tx`, `Batch`, `Frame`). +- `examples/app-core/` — placeholder wallet app implementing the `Application` trait. +- `examples/canonical-app/` — on-chain scheduler reference implementation. +- `examples/canonical-test/` — e2e test harness for the canonical app. +- `sdk/rust-client/` — Rust client library for the sequencer API. +- `tests/{benchmarks,e2e,harness}/` — test infrastructure. + +### Sequencer module layout + +- `sequencer/src/main.rs` — thin binary entrypoint. +- `sequencer/src/lib.rs` — public sequencer API (`run`, `RunConfig`). +- `sequencer/src/http.rs` — shared HTTP error type, JSON `ErrorResponse`, `ApiConfig`, and `axum::serve` orchestration. 
+- `sequencer/src/runtime/` — process bootstrap, `RunConfig`, EIP-712 domain, `ShutdownSignal`, shared `clock::unix_now_ms`. +- `sequencer/src/ingress/` — public write path. + - `api.rs` — `POST /tx` handler, JSON-rejection mapping. + - `inclusion_lane/` — single-lane hot-path loop (`mod.rs`), catch-up replay, config, error types. +- `sequencer/src/egress/` — internal read path. + - `api/` — `/ws/subscribe`, `/livez`, `/readyz`, `/healthz`. + - `l2_tx_feed/` — DB-backed ordered-tx feed. +- `sequencer/src/l1/` — L1 client surface. + - `reader.rs` — safe-input ingestion from InputBox into SQLite. + - `submitter/` — stateless batch submitter (`worker.rs` + `poster.rs`). + - `provider.rs` — alloy provider construction. + - `partition.rs` — long-block-range retry helper. +- `sequencer/src/recovery/` — preemptive recovery startup procedure (`mod.rs`), runtime danger detector (`detector.rs`), and mempool flusher (`flusher.rs`). +- `sequencer/src/storage/` — SQLite persistence, split by writer role (`ingress`, `egress`, `l1_inputs`, `l1_submission`, `recovery`, `admin`, plus shared `mod`, `open`, `internals`, and `migrations/`). + +## Key Concepts + +- **Chunk** — bounded list of user ops processed and persisted together to amortize SQLite cost. +- **Frame** — ordering boundary; commits `safe_block` + user ops. +- **Batch** — list of frames posted on-chain as one L1 transaction (SSZ-encoded). +- **Inclusion lane** — hot-path single-lane loop that dequeues, executes, persists, and rotates frame/batch boundaries. The only writer of open batch/frame state. +- **Batch submitter** — stateless worker that bulk-submits all pending batches each tick. Nonces are assigned by storage (structural `parent.nonce + 1`) when batches are closed; the submitter just reads them. +- **Danger detector** — background worker that polls `Storage::check_danger` on a fixed cadence and exits with `DangerZone` when the strict or wall-clock-adjusted check fires. 
Never writes to the DB; never talks to L1. Crashes the process so startup recovery can run. +- **Input reader** — ingests safe inputs from L1 InputBox into SQLite. +- **L2 tx feed** — DB-backed ordered-tx stream used by WS subscribers. +- **Soft confirmation** — sequencer's predicted ordering, emitted before the batch lands on L1. + +## Domain Truths + +- API validates the EIP-712 signature and enqueues a `SignedUserOp`. Method payload decoding happens during application execution, not at ingress. +- **Deposits are direct-input-only** (L1 → L2) and must not be represented as user ops. +- Rejections (`InvalidNonce`, `InvalidMaxFee`, `InsufficientGasBalance`) produce no state mutation and are not persisted. +- Included txs are persisted as frame/batch data in `batches`, `frames`, `user_ops`, `safe_inputs`, and `sequenced_l2_txs`. Recovery metadata lives in `safe_accepted_batches`; batch lifecycle state (sealed/invalidated) lives on the `batches` row itself as write-once timestamps. +- Frame fee is persisted in `frames.fee` and is fixed for the lifetime of that frame. The next frame's fee is sampled from `batch_policy_derived.recommended_fee` at rotation. +- Wallet state (balances, nonces) is in-memory today — not persisted. +- **EIP-712 domain fields:** `name`, `version`, `chainId`, `verifyingContract`. `chainId` and `verifyingContract` come from `SEQ_CHAIN_ID` and `SEQ_APP_ADDRESS` (validated against the RPC chain id at startup). All four fields must be present on both sides — see [`SECURITY_TODO.md`](SECURITY_TODO.md) for the open divergence finding. + +### InputBox payload classification + +- The input reader ingests every `InputAdded` event from InputBox. Each event carries an authenticated `msg_sender` (delivered by the Cartesi framework from `EvmAdvanceCall`). +- **Classification is by sender address**, not by a tag byte: + - Sender == batch-submitter address → SSZ-decoded as `Batch` (scheduler side). 
The sequencer does not ingest its own batch submissions as direct inputs. + - Any other sender → stored verbatim as a direct input (deposit). +- The payload is opaque to the classification layer. Application-specific decoding happens inside `Application::execute_direct_input`. + +## Application Trait Contract + +Implementors of the `Application` trait must respect these contracts. The sequencer assumes them without runtime enforcement. + +### Replay determinism + +The sequencer persists every included user op and every ingested direct input. On restart, catch-up replays them in order against a fresh `Application` instance to rebuild state. **Any input that succeeded live must succeed on replay.** + +- `execute_direct_input` and `execute_valid_user_op` must not return `AppError::Internal` for any byte sequence that previously executed successfully. Catch-up treats `Internal` as fatal: it aborts startup and leaves the sequencer unable to resume. +- Prefer `ExecutionOutcome::Invalid` for malformed or ill-typed input caught at the app level. Reserve `AppError::Internal` for genuine invariant violations ("validated user op cannot pay fee") — real bugs, not adversarial inputs. `Invalid` is replay-safe; `Internal` is not. +- `validate_user_op` must be pure over the current app state. No side effects, no time dependence, no randomness. + +### No implicit state + +Application state changes must flow exclusively through `execute_valid_user_op` and `execute_direct_input`. Mutating state from `validate_user_op` or `current_user_nonce` breaks replay determinism. ## Hot-Path Invariants - API ack is tied to chunk durability, not frame/batch closure. - Chunk commit and ack remain low-latency; frame closure is orthogonal and can happen less frequently. -- API overload for `POST /tx` is currently defined by inclusion-lane queue admission: if `try_send` hits a full queue, the handler returns `429 OVERLOADED` with message `queue full`. 
+- `POST /tx` queue admission: `try_send` on a full queue returns `429 OVERLOADED` with message `queue full`. - Frame closure happens when direct inputs are drained, and also whenever batch closure happens. - Batch closure is controlled by batch policy (size and/or deadline). -- Preserve single-lane deterministic ordering; do not introduce extra concurrency in hot-path ordering logic without explicit approval. +- Preserve single-lane deterministic ordering. Do not introduce extra concurrency in hot-path ordering logic without explicit approval. ## Storage Invariants - Storage model is append-oriented; avoid mutable status flags for open/closed entities. -- Open batch/frame are derived by “latest row” convention. -- A frame’s leading direct-input prefix is derivable from `sequenced_l2_txs` plus `frames.safe_block`. -- `safe_inputs` contains only L1 app direct input **bodies**. InputBox payload first byte: **0x00** = direct input (tag stripped, body stored and executed), **0x01** = batch submission (for scheduler, not stored), **others** = discarded (invalid/garbage). The input reader only accepts 0x00-tagged payloads and stores `payload[1..]`. +- Open batch/frame are derived by "latest row" convention. +- A frame's leading direct-input prefix is derivable from `sequenced_l2_txs` plus `frames.safe_block`. - Safe cursor/head values should be derived from persisted facts when possible, not duplicated as mutable fields. -- Replay/catch-up must use persisted ordering plus persisted frame fee (`frames.fee`) to mirror inclusion semantics. -- Included user-op identity is constrained by `UNIQUE(sender, nonce)`. +- Replay/catch-up uses persisted ordering plus persisted frame fee (`frames.fee`) to mirror inclusion semantics exactly. +- Cursor pagination for ordered L2 txs uses **SQLite rowid**, not count-based offsets. Holes from invalidated batches would break count-based pagination. 
+- Included user-op identity is tracked by application nonce logic; no DB uniqueness constraint (removed to allow resubmission after recovery). +- **Reads over batch data go through `valid_batches`, `valid_closed_batches`, `valid_open_batch`, and `valid_sequenced_l2_txs` views.** These encapsulate the "exclude invalidated rows" filter so individual queries don't repeat it. Writers go to the base tables. +- **`batches` row columns partition cleanly by writer.** `sealed_at_ms` is owned by the inclusion lane (set when closing a batch); `invalidated_at_ms` is owned by recovery (set during cascade). Each is write-once (NULL → non-NULL, never back) and enforced by triggers. The partial unique index `ux_single_valid_tip` guarantees at most one row has both NULL — the Tip. +- The inclusion lane is the **only writer** of open batch/frame state. `Storage::append_user_ops_chunk` and the `close_*` methods trust the in-memory `WriteHead`; FK + PK constraints catch the dangerous failure modes. ## Type Boundaries -- `SignedUserOp`: ingress/API signature domain. -- `ValidUserOp`: app execution domain after validation boundary. -- `SequencedL2Tx`: ordered replay/fanout domain (`UserOp | DirectInput`). -- Keep private DB-only helper/intermediary types private to storage modules; prefer shared domain types at module boundaries. +- `SignedUserOp` — ingress/API signature domain (post-validation, pre-execution). +- `ValidUserOp` — application execution domain (after validation boundary). +- `SequencedL2Tx` — ordered replay/fanout domain (`UserOp | DirectInput`). +- Keep DB-only helper types private to storage modules; prefer shared domain types at module boundaries. + +## HTTP Endpoints + +- **Ingress** (public-facing): `POST /tx`. +- **Egress** (internal indexers): `GET /ws/subscribe`, `GET /livez`, `GET /readyz`, `GET /healthz`. 
+ +Today both sides serve from one listener; the planned API split puts each side on its own port (same binary) so internal probes and subscribers can be firewalled from public submit traffic. + +`/ws/subscribe` internal guardrails: subscriber cap 64, catch-up cap 50000. When the catch-up window is exceeded, the handler upgrades and then closes with WebSocket close code `1008` (`POLICY`), reason `catch-up window exceeded`. + +Health semantics: `/livez` — 200 if the process is alive. `/readyz` — 200 if shutdown not requested AND inclusion-lane channel still open, else 503. `/healthz` — JSON `{ status, inclusion_lane }` mirroring the same 200/503. + +## Environment Variables + +**Required:** + +- `SEQ_ETH_RPC_URL` +- `SEQ_CHAIN_ID` +- `SEQ_APP_ADDRESS` +- `SEQ_BATCH_SUBMITTER_PRIVATE_KEY` or `SEQ_BATCH_SUBMITTER_PRIVATE_KEY_FILE` + +**Optional:** + +- `SEQ_HTTP_ADDR` (default `127.0.0.1:3000`) +- `SEQ_DATA_DIR` (default `sequencer-data`; DB file `sequencer.db` inside it) +- `SEQ_LONG_BLOCK_RANGE_ERROR_CODES` +- `SEQ_BATCH_SUBMITTER_IDLE_POLL_INTERVAL_MS` (default 5000) +- `SEQ_BATCH_SUBMITTER_CONFIRMATION_DEPTH` (default 2) +- `SEQ_PREEMPTIVE_MARGIN_BLOCKS` (default 75) +- `SEQ_SECONDS_PER_BLOCK` (default 12) + +## Coding Conventions + +- Prefer small, composable functions at module boundaries (`ingress::api` → `ingress::inclusion_lane` → `storage::ingress`; `egress::l2_tx_feed` ← `storage::egress`). +- Keep application validation and execution deterministic for a given input/state. No `SystemTime::now()`, `HashMap` iteration order, or floating-point in consensus paths. +- Surface user-facing errors via `ApiError` (in `http.rs`); keep internal failures descriptive but safe. +- Avoid introducing heavy dependencies without strong reason. +- Documentation style: lean. Module headers (1–4 lines) + docs on public methods only when the contract isn't obvious from name+signature. Use inline comments for **why**, never for **what**. 
+- **Don't layer defense-in-depth checks against sequencer self-bugs.** Correctness is enforced via tests and review. See "Self-trust" in [`docs/threat-model/README.md`](docs/threat-model/README.md). + +## Testing Guidance + +Focus tests on: + +- Signature + sender-validation edge cases. +- Nonce progression rules. +- Fee and rejection behavior. +- Included-vs-rejected commit behavior. +- Storage batch atomicity and uniqueness constraints. +- Scheduler/sequencer agreement — any invariant the two sides share should have at least one test that exercises both. -## Agent Priorities +Prefer black-box tests around `POST /tx` and commit outcomes for integration. -When making changes, optimize for: -1. Deterministic sequencing semantics. -2. Safety and correctness of transaction validation/execution. -3. Clear, testable boundaries between API, application logic, and storage. -4. Backward-compatible, explicit error handling. -5. Minimal, focused diffs. +Some `sequencer` tests use Anvil (Foundry). They run by default and fail with a clear message if `anvil` is not on PATH. Install Foundry or use `nix develop`. ## Fast Start Commands -Run from repo root: +See [`CLAUDE.md`](CLAUDE.md) for shell setup and the full command list. 
In short: ```bash cargo check -cargo test +cargo test --workspace --exclude canonical-test cargo fmt --all cargo clippy --all-targets --all-features -- -D warnings ``` @@ -119,34 +286,22 @@ SEQ_BATCH_SUBMITTER_PRIVATE_KEY=0xac0974bec39a17e36ba4a6b4d238ff944bacb478cbed5e cargo run -p sequencer ``` -Optional env vars: -- `SEQ_HTTP_ADDR` -- `SEQ_DATA_DIR` (default `sequencer-data`; DB file `sequencer.db` inside it) -- `SEQ_LONG_BLOCK_RANGE_ERROR_CODES` -- `SEQ_BATCH_SUBMITTER_PRIVATE_KEY_FILE` (alternative to `SEQ_BATCH_SUBMITTER_PRIVATE_KEY`) -- `SEQ_BATCH_SUBMITTER_IDLE_POLL_INTERVAL_MS`, `SEQ_BATCH_SUBMITTER_CONFIRMATION_DEPTH` - -Required env vars: -- `SEQ_ETH_RPC_URL` -- `SEQ_CHAIN_ID` -- `SEQ_APP_ADDRESS` -- `SEQ_BATCH_SUBMITTER_PRIVATE_KEY` or `SEQ_BATCH_SUBMITTER_PRIVATE_KEY_FILE` - ## Always / Ask First / Never ### Always -- Keep behavior explicit for transaction inclusion vs rejection. -- Preserve API error shape and status code mapping unless intentionally changing API contract. +- Keep inclusion-vs-rejection semantics explicit for transaction handling. +- Preserve API error shape and status code mapping unless intentionally changing the API contract. - Add or update tests when logic changes. - Run at least `cargo check` before finishing. +- Read `docs/recovery/` before touching recovery code, and `docs/threat-model/` before touching trust-boundary code. ### Ask First - Changing tx wire format (`UserOp`, SSZ payload layout, EIP-712 domain fields). - Changing DB schema or migration strategy. - Altering rejection semantics (what consumes nonce/gas vs what is rejected). -- Introducing concurrency changes to commit ordering guarantees. +- Introducing concurrency changes to commit ordering. - Changing chunk/frame/batch closure or ack semantics. ### Never @@ -156,50 +311,26 @@ Required env vars: - Rely on implicit defaults for consensus-relevant values. - Remove guardrails around queue backpressure or inclusion-lane error reporting. 
-## Coding Conventions for This Repo - -- Prefer small, composable functions at module boundaries (`api` -> `application` -> `storage`). -- Keep application validation/execution deterministic for a given input/state. -- Surface user-facing errors via `ApiError`; keep internal failures descriptive but safe. -- Avoid introducing heavy dependencies without strong reason. - -## Testing Guidance - -Focus tests on: -- signature + sender validation edge cases -- nonce progression rules -- fee/rejection behavior -- included vs rejected commit behavior -- storage batch atomicity and uniqueness constraints +## Migration Policy -If adding integration tests, prefer black-box tests around `POST /tx` and commit outcomes. +At this stage it is acceptable to rewrite baseline migrations for clarity. There are no deployed environments requiring forward-only migrations. Keep schema bootstrap (initial open rows and invariants) explicit and deterministic. -Some `sequencer` tests use Anvil and are opt-in locally: +Once environments are shared or deployed, switch to append-only forward migrations. -```bash -RUN_ANVIL_TESTS=1 cargo test -p sequencer --lib -``` +## Definition of Done -## Definition of Done for Agent Changes +Before finishing a change, ensure: -Before finishing, ensure: 1. Code compiles (`cargo check`). -2. Changed behavior is covered by tests (or explain why tests are pending). -3. Formatting/lints are clean (or list any unresolved warnings explicitly). -4. PR summary includes: - - what changed - - why it changed - - risk/compatibility notes - -## Near-Term Roadmap Hints - -Expected future evolution areas: -- stronger typing around tx metadata -- persistence for app state or deterministic replay -- explicit L1 block progression input - -## Migration Policy - -- Current prototype stage: it is acceptable to rewrite baseline migrations for clarity. -- Once environments are shared/deployed: switch to append-only forward migrations. 
-- Keep schema bootstrap (initial open rows/invariants) explicit and deterministic. +2. Changed behavior is covered by tests, or explain why tests are pending. +3. Formatting and lints are clean, or list any unresolved warnings explicitly. +4. PR summary includes **what changed**, **why it changed**, and **risk / compatibility notes**. + +## Related Documents + +- [`README.md`](README.md) — product framing, user-facing trust model. +- [`CLAUDE.md`](CLAUDE.md) — shell setup, quick reference, pointer back here. +- [`docs/threat-model/README.md`](docs/threat-model/README.md) — trust boundaries, in-scope and out-of-scope threats. +- [`docs/recovery/README.md`](docs/recovery/README.md) — recovery design, TLA+ formal verification, design history. +- [`SECURITY_TODO.md`](SECURITY_TODO.md) — open security findings from staged review. +- [`sequencer-core/`](sequencer-core/) — shared domain types and protocol contracts. diff --git a/CLAUDE.md b/CLAUDE.md new file mode 100644 index 0000000..6a0a35b --- /dev/null +++ b/CLAUDE.md @@ -0,0 +1,58 @@ +# CLAUDE.md + +Quick reference for working in this repository. For the full guide — architecture, duality, recovery, invariants, threat model, and rules — read [`AGENTS.md`](AGENTS.md). + +## Shell Environment + +This project uses Nix + direnv. Before running any command that needs project tools (Foundry, TLA+, etc.), activate the direnv environment: + +```bash +eval "$(direnv export bash 2>/dev/null)" +``` + +This makes `anvil`, `forge`, `cast`, `tlc`, and other Nix-provided tools available. Cargo and rustc are available without direnv. + +## Commands + +```bash +cargo check # compile check +cargo test --workspace --exclude canonical-test # run tests (canonical-test needs libslirp) +cargo fmt --all # format +cargo clippy --all-targets --all-features -- -D warnings # lint +cargo test -p sequencer --lib # includes Anvil-backed tests (needs Foundry on PATH) +``` + +## What This Is + +Off-chain sequencer for an app-specific DeFi rollup. 
Accepts signed user operations, issues low-latency soft confirmations, and posts batches to L1. Currently backed by a placeholder wallet app (transfer, withdrawal). **Security-critical infrastructure** — handle every change accordingly.
+
+Rust edition 2024 / Axum API / SQLite (rusqlite, WAL) / EIP-712 signing / SSZ encoding.
+
+## Workspace Layout
+
+- `sequencer/` — main sequencer binary and library.
+- `sequencer-core/` — shared domain types consumed by both sequencer and scheduler.
+- `examples/app-core/` — placeholder wallet app implementing `Application`.
+- `examples/canonical-app/` — on-chain scheduler reference implementation.
+- `examples/canonical-test/` — e2e test harness for the canonical app.
+- `sdk/rust-client/` — Rust client library for the sequencer API.
+- `tests/{benchmarks,e2e,harness}/` — test infrastructure.
+
+## Sequencer Module Layout
+
+`sequencer/src/` is organized by writer role; `storage/<role>.rs` holds each role's storage half.
+
+- `runtime/` — bootstrap, config, shutdown, shared clock.
+- `ingress/` — public write path: `api.rs` (`POST /tx`) + `inclusion_lane/` (hot path).
+- `egress/` — internal read path: `api/` (WS subscribe + health) + `l2_tx_feed/`.
+- `l1/` — reader, submitter, provider, partition helper.
+- `recovery/` — startup preemptive-recovery procedure, runtime danger detector, mempool flusher.
+- `storage/` — SQLite persistence, split per writer role.
+- `http.rs` — shared HTTP error type + `axum::serve` orchestration.
+
+## Before You Start Real Work
+
+- **[`AGENTS.md`](AGENTS.md)** — mission, requirements, invariants, duality, recovery, conventions, rules.
+- **[`docs/threat-model/README.md`](docs/threat-model/README.md)** — trust boundaries and in-scope threats.
+- **[`docs/recovery/README.md`](docs/recovery/README.md)** — preemptive recovery design + TLA+ proofs.
+- **[`SECURITY_TODO.md`](SECURITY_TODO.md)** — open security findings awaiting fixes.
diff --git a/Cargo.lock b/Cargo.lock index 2d42e6b..1156d1b 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3581,6 +3581,7 @@ dependencies = [ "ethereum_ssz", "futures-util", "k256", + "rusqlite", "sequencer-core", "sequencer-rust-client", "serde", @@ -3870,6 +3871,7 @@ dependencies = [ "ethereum_ssz_derive", "futures-util", "k256", + "rollups-harness", "rusqlite", "rusqlite_migration", "sequencer-core", diff --git a/README.md b/README.md index d8b8997..96d84cf 100644 --- a/README.md +++ b/README.md @@ -1,189 +1,101 @@ -# Sequencer Prototype +# Sequencer -Prototype sequencer, currently backed by a dummy wallet app (`Transfer`, `Withdrawal`). +A sequencer for Cartesi app-specific rollups. Provides low-latency soft confirmations for user operations, posts them to L1 in batches, and maintains a deterministic replay feed that matches the application's final execution order. -Current focus is reliability of sequencing, persistence, and replay semantics. +**Security-critical infrastructure.** Handle every change with the care financial systems demand. -## Status +## What It Does -- Language: Rust (edition 2024) -- API: Axum (`POST /tx`, `GET /ws/subscribe`) -- Hot path: single blocking inclusion lane -- Storage: SQLite (`rusqlite`, WAL) -- Signing: EIP-712 (`alloy`) -- Payload encoding: SSZ +Rollup applications need fast transaction confirmations. Waiting for L1 finality on every user action (minutes) makes interactive applications impractical. The sequencer bridges this gap: it accepts signed user operations, immediately confirms them (soft confirmation), and asynchronously posts batches to L1. The application sees these batches posted on chain. -## Core Design +The core guarantee: **the off-chain sequencer and the rollup's on-chain scheduler produce identical execution order.** Users get instant feedback while the system converges to L1 truth. -- **User ops** arrive through the API, are validated, executed, and persisted by the inclusion lane. 
-- **Direct inputs** are stored in SQLite (`safe_inputs`) and sequenced in append-only replay order (`sequenced_l2_txs`). -- **Deposits** are direct-input-only (L1 -> L2) and are not accepted as user ops. -- **Ordering** is deterministic and persisted. Replay/catch-up reads `sequenced_l2_txs` joined with `user_ops` and `safe_inputs`. -- **Frame fee** is fixed per frame (`frames.fee`): - - users sign `max_fee` - - inclusion validates `max_fee >= current_frame_fee` - - execution charges `current_frame_fee` - - when opening a new frame or batch, the sequencer samples **`recommended_fee`** from the `batch_policy_derived` SQLite view (derived from `gas_price`, amortization `alpha`, and on-chain DA constants in `batch_policy`) -- **Batch closure by size** uses **`batch_size_target`** from the same view (stored on `WriteHead` as `max_batch_user_op_bytes`). The inclusion lane compares it to a **worst-case estimate** of in-batch user-op bytes (`batch_user_op_count × (per-op metadata cap + max method payload)`), not the exact SSZ-encoded batch size. A **time-based** max open duration also closes batches. +## Two Chains Synchronizing -## Quick Start +The sequencer maintains an optimistic chain of batches — a tree that normally degenerates into a list. Each batch contains frames, and each frame contains user operations plus a `safe_block` reference. The `safe_block` is the synchronization primitive: it tells the on-chain scheduler "drain all direct inputs (deposits) up to this L1 block, then execute these user ops." Both sides follow the rule, producing identical state. 
-From repo root: - -```bash -cargo check -cargo test -cargo fmt --all -cargo clippy --all-targets --all-features -- -D warnings ``` - -Run the server (example uses Anvil account #0 as batch submitter; use your own key in production): - -```bash -SEQ_ETH_RPC_URL=http://127.0.0.1:8545 \ -SEQ_CHAIN_ID=31337 \ -SEQ_APP_ADDRESS=0x1111111111111111111111111111111111111111 \ -SEQ_BATCH_SUBMITTER_PRIVATE_KEY=0xac0974bec39a17e36ba4a6b4d238ff944bacb478cbed5efcae784d7bf4f2ff80 \ -cargo run -p sequencer +Sequencer (off-chain) Scheduler (on-chain) + frame: safe_block=100 drain directs up to block 100 + user_ops=[A, B, C] execute A, B, C + frame: safe_block=105 drain directs up to block 105 + user_ops=[D] execute D ``` -At startup the process checks that the RPC `eth_chainId` matches `SEQ_CHAIN_ID`. - -Optional runtime inputs: - -- `SEQ_HTTP_ADDR` defaults to `127.0.0.1:3000` -- `SEQ_DATA_DIR` defaults to `sequencer-data` (SQLite file is `sequencer.db` inside that directory; the directory is created if missing) -- `SEQ_LONG_BLOCK_RANGE_ERROR_CODES` defaults to `-32005,-32600,-32602,-32616` -- `SEQ_BATCH_SUBMITTER_PRIVATE_KEY_FILE` instead of `SEQ_BATCH_SUBMITTER_PRIVATE_KEY` (first line of the file is the key) -- `SEQ_BATCH_SUBMITTER_IDLE_POLL_INTERVAL_MS`, `SEQ_BATCH_SUBMITTER_CONFIRMATION_DEPTH` - -Required runtime inputs: +When things go well, the sequencer's chain and the scheduler's view converge. When they don't — batches arrive stale on L1 — the sequencer detects the divergence and recovers. -- `SEQ_ETH_RPC_URL` -- `SEQ_CHAIN_ID` -- `SEQ_APP_ADDRESS` -- `SEQ_BATCH_SUBMITTER_PRIVATE_KEY` or `SEQ_BATCH_SUBMITTER_PRIVATE_KEY_FILE` +## Trust Model -Fixed protocol identity (EIP-712): +The sequencer is a **centralized, single-writer** system. It cannot steal funds or forge invalid state — the rollup validates everything independently, and the proof system later enforces it. 
But the sequencer can: -- domain name: `CartesiAppSequencer` -- domain version: `1` -- `chain_id` and `verifying_contract` come from `SEQ_CHAIN_ID` and `SEQ_APP_ADDRESS` +- **Censor** — refuse to include a user's operations. +- **Go offline** — stop providing soft confirmations. +- **Diverge** — if batches fail to land on L1 in time, soft confirmations that were issued become invalid. -Most queue sizes, polling intervals, and safety limits are now internal runtime constants instead of public launch-time configuration. +**Direct inputs** (L1 → L2 messages, used for deposits) bypass the sequencer entirely. They are posted directly to L1 and are **uncensorable** by the sequencer — the scheduler drains them at every `safe_block` boundary. A censoring sequencer can delay when a direct input is executed (up to `MAX_WAIT_BLOCKS`, ~4h), but cannot prevent it. -## API +The third case is handled by the recovery subsystem. Batches that are too old when they reach L1 (`inclusion_block − safe_block ≥ MAX_WAIT_BLOCKS`) are skipped by the scheduler. This "staleness" poisons the nonce counter: all subsequent batches become unreachable regardless of their individual freshness. The sequencer detects this via a danger-zone threshold, preemptively goes offline, flushes the L1 mempool, and cascade-invalidates the doomed chain. See [`docs/recovery/`](docs/recovery/) for the full design, TLA+ formal verification, and design history. -### `POST /tx` +The sequencer trusts its own code is bug-free. Recovery means recovery from liveness failures, which can legitimately happen even in the absence of bugs (infrastructure outages, network failures, gateway failure). Code-level bugs are a separate problem handled by tests and review. See [`docs/threat-model/README.md`](docs/threat-model/README.md) for the complete threat model applied across the codebase. -Request shape: +## Failure Modes -```json -{ - "message": { - "nonce": 0, - "max_fee": 1, - "data": "0x..." 
- }, - "signature": "0x...", - "sender": "0x..." -} -``` - -Notes: +The sequencer is designed to handle: -- `signature` must be 65 bytes. -- `sender` is required and must match the recovered signer. -- `message.data` is SSZ-encoded method payload bytes. -- payload size is bounded at ingress; oversized requests are rejected before entering the hot path. -- overload is enforced at queue admission: if the inclusion-lane queue is full, `POST /tx` returns HTTP `429` with code `OVERLOADED` and message `queue full`. -- queue capacity is an internal runtime constant tuned alongside inclusion-lane chunking to absorb short bursts; if this starts triggering persistently, it is a signal to revisit runtime sizing or throughput rather than add another admission layer. +- **L1 provider outages** — workers retry with exponential backoff. The inclusion lane and API continue operating locally. A wall-clock fallback detects when an outage pushes batches into the danger zone. +- **Process crashes** — recovery runs at startup. All recovery state is derived from SQLite (atomic transactions) and L1 safe state. No external coordination needed. +- **Extended downtime** — on restart, the sequencer syncs to the current L1 safe head, flushes if needed, and recovers. +- **Adversarial L1 mempool** — block builders and private mempools are treated as adversarial. The recovery flusher consumes every pending nonce slot with a no-op so delayed "zombie" submissions cannot land later. -### `GET /ws/subscribe?from_offset=` +## Interfaces -WebSocket stream of sequenced L2 transactions from persisted order. +### User Operations -Notes: +Users submit signed operations via `POST /tx` (JSON). Operations are signed with EIP-712 using the rollup's chain ID and app address. The sequencer validates the signature, executes the operation against the current app state, and returns a soft confirmation. -- `from_offset` is optional and defaults to `0`. -- messages are JSON text frames. 
-- binary fields are hex-encoded (`0x`-prefixed). -- the current runtime enforces a subscriber cap of `64` and a catch-up cap of `50000` events. -- if the requested catch-up window exceeds that cap, the server upgrades and then immediately closes the socket with close code `1008` (`POLICY`) and reason `catch-up window exceeded`. +### Sequenced Transaction Feed -Message shapes: +Subscribers connect via `GET /ws/subscribe?from_offset=` (WebSocket). The feed delivers all sequenced transactions (user ops + direct inputs) in deterministic order, matching the on-chain execution order. This is the primary interface for downstream consumers (frontends, indexers). The endpoint is designed for a small number of indexer subscribers, which serve users directly. -```json -{ "kind": "user_op", "offset": 10, "sender": "0x...", "fee": 1, "data": "0x..." } -``` +### Batch Submission -```json -{ "kind": "direct_input", "offset": 11, "payload": "0x..." } -``` +The batch submitter posts closed batches to L1's InputBox contract. Each batch carries a sequential nonce for deduplication; L1 wallet nonces guarantee ordering. The submitter is stateless — it derives pending work from SQLite and L1 state each tick. 
-Success response: +## Running -```json -{ - "ok": true, - "sender": "0x...", - "nonce": 0 -} +```bash +SEQ_ETH_RPC_URL=http://127.0.0.1:8545 \ +SEQ_CHAIN_ID=31337 \ +SEQ_APP_ADDRESS=0x1111111111111111111111111111111111111111 \ +SEQ_BATCH_SUBMITTER_PRIVATE_KEY=0xac09...f2ff80 \ +cargo run -p sequencer ``` -## Storage Model - -- `batches`: batch metadata -- `frames`: frame boundaries within each batch -- `frames.fee`: committed fee for each frame -- `user_ops`: included user operations -- `sequenced_l2_txs`: append-only ordered replay rows (`UserOp` xor `DirectInput`); inserting into `user_ops` also appends the corresponding replay row via trigger `trg_sequence_user_op` -- `safe_inputs`: direct-input payload stream -- `batch_policy`: singleton knobs and constants for DA-style batch sizing and fee derivation; `batch_policy_derived` view exposes `recommended_fee` and `batch_size_target` - -## Project Layout - -- `sequencer/src/main.rs`: thin binary entrypoint -- `sequencer/src/lib.rs`: public crate surface -- `sequencer/src/config.rs`: runtime input parsing and EIP-712 domain construction -- `sequencer/src/runtime.rs`: sequencer bootstrap and component wiring -- `sequencer/src/api/`: HTTP API and error mapping -- `sequencer/src/inclusion_lane/`: hot-path inclusion loop, chunk/frame/batch rotation, catch-up -- `sequencer/src/input_reader/`: safe-input ingestion from InputBox into SQLite -- `sequencer/src/l2_tx_feed/`: DB-backed ordered-L2Tx feed for WS subscriptions -- `sequencer/src/storage/`: schema, migrations, SQLite persistence, and replay reads -- `sequencer-core/src/`: shared domain types and interfaces (`Application`, `SignedUserOp`, `SequencedL2Tx`, feed message types) -- `examples/app-core/src/`: wallet prototype implementing `Application` -- `tests/benchmarks/`: benchmark harnesses and benchmark spec +Required: `SEQ_ETH_RPC_URL`, `SEQ_CHAIN_ID`, `SEQ_APP_ADDRESS`, `SEQ_BATCH_SUBMITTER_PRIVATE_KEY` (or `_FILE`). 
-## Prototype Limits +Optional: `SEQ_HTTP_ADDR` (default `127.0.0.1:3000`), `SEQ_DATA_DIR` (default `sequencer-data`), `SEQ_PREEMPTIVE_MARGIN_BLOCKS` (default `75`), `SEQ_SECONDS_PER_BLOCK` (default `12`), `SEQ_BATCH_SUBMITTER_IDLE_POLL_INTERVAL_MS`, `SEQ_BATCH_SUBMITTER_CONFIRMATION_DEPTH`. -- Wallet state is in-memory and not persisted. -- Schema and migrations are still in prototype mode and may change. - -## Local Test Prerequisites - -- Some `sequencer` tests spin up `Anvil`; install Foundry locally if you want the full test suite: -- Self-contained benchmarks also spawn `Anvil` from a preloaded rollups state dump. +## Development ```bash -foundryup +cargo check # compile +cargo test --workspace --exclude canonical-test # test (canonical-test needs libslirp) +cargo fmt --all # format +cargo clippy --all-targets --all-features -- -D warnings # lint ``` -- Prepare local benchmark + guest build dependencies: - -```bash -just setup -``` +Some tests require [Foundry](https://getfoundry.sh) (`anvil` on PATH). They run by default and fail with a clear message if unavailable. This project uses Nix + direnv for tooling — `direnv allow` provides Foundry, TLA+, and other dependencies. -- Enable the Anvil-backed reader tests explicitly: +## Further Reading -```bash -RUN_ANVIL_TESTS=1 cargo test -p sequencer --lib -``` +- [`AGENTS.md`](AGENTS.md) — developer guide: architecture, conventions, duality, recovery, invariants, rules. +- [`CLAUDE.md`](CLAUDE.md) — quick reference for shell setup and commands. +- [`docs/threat-model/README.md`](docs/threat-model/README.md) — trust boundaries, in-scope and out-of-scope threats. +- [`docs/recovery/README.md`](docs/recovery/README.md) — recovery design, TLA+ formal verification, design history. +- [`SECURITY_TODO.md`](SECURITY_TODO.md) — open security findings. +- [`sequencer-core/`](sequencer-core/) — shared domain types (`Application`, `SignedUserOp`, `Batch`, `Frame`). 
+- [`examples/app-core/`](examples/app-core/) — placeholder wallet app implementing the `Application` trait. ## License -Apache-2.0. See `LICENSE`. - -Authors are listed in `AUTHORS`. +Apache-2.0. See [`LICENSE`](LICENSE). Authors in [`AUTHORS`](AUTHORS). diff --git a/SECURITY_TODO.md b/SECURITY_TODO.md new file mode 100644 index 0000000..8bcf7d3 --- /dev/null +++ b/SECURITY_TODO.md @@ -0,0 +1,205 @@ +# Security Review TODO + +Open findings from the staged security review. The threat model being applied is documented in [`docs/threat-model/README.md`](docs/threat-model/README.md). + +Findings accumulate here section by section as review parts complete. Fixes are batched after all passes finish to avoid interleaving changes with ongoing review. + +## Severity legend + +- **Critical** — protocol break or directly exploitable; must be fixed before any public deployment +- **High** — exploitable under realistic conditions +- **Medium** — real issue, conditional impact +- **Low** — defense-in-depth / hardening + +--- + +## Part 1 — Scheduler + +### [Critical] EIP-712 domain mismatch between scheduler and sequencer + +**Locations:** +- Scheduler uses `name: None, version: None` — [`examples/canonical-app/src/scheduler/core.rs:328`](examples/canonical-app/src/scheduler/core.rs) +- Sequencer uses `name: Some("CartesiAppSequencer"), version: Some("1")` — [`sequencer/src/runtime/config.rs:8`](sequencer/src/runtime/config.rs) and [`sequencer/src/runtime/config.rs:116`](sequencer/src/runtime/config.rs) + +**What it is.** The two sides disagree on which optional fields are present in the EIP-712 domain struct. Presence vs absence of `name` and `version` changes the `typeHash` used in `hashStruct(EIP712Domain)`, which changes the domain separator, which changes the final signing hash. The same signature recovers a different address (or fails) under each domain. 
+ +`UserOp` has no `from` field ([`sequencer-core/src/user_op.rs:10`](sequencer-core/src/user_op.rs)), so the address returned by `recover_address_from_prehash` is authoritative. The scheduler passes it directly to `validate_and_execute_user_op(sender, ...)` without cross-check ([`examples/canonical-app/src/scheduler/core.rs:251`](examples/canonical-app/src/scheduler/core.rs)). + +**Impact.** Every honest user transaction that the sequencer admits is undeliverable on the scheduler. The sequencer's WS feed and HTTP responses promise soft confirmations for transactions the rollup cannot execute. Off-chain state diverges from canonical state on every tx. + +**Why no existing test catches it.** `examples/canonical-test/src/main.rs:233` constructs the domain with the same `None, None` form used by the scheduler, so scheduler-local tests agree with themselves while failing to cross-check against real sequencer-produced signatures. + +**Action items:** +- [ ] Promote `DOMAIN_NAME` and `DOMAIN_VERSION` into `sequencer-core` and expose a shared `build_input_domain(chain_id, app_address) -> Eip712Domain` constructor. +- [ ] Replace the local constructors in `sequencer/src/runtime/config.rs::build_domain` and `examples/canonical-app/src/scheduler/core.rs::input_domain` with the shared one. +- [ ] Add an integration test that signs a `UserOp` through the sequencer's signing path and asserts a scheduler-side recovery yields the same address. +- [ ] Update `examples/canonical-test/src/main.rs` to use the shared constructor so the harness cannot mask future drift. + +**Threat model note.** This is a correctness bug, not an attacker-triggered exploit. Under the rollup's security model, a correctness bug that causes scheduler/sequencer state divergence is as severe as direct theft — the sequencer's soft-confirmation guarantee is structurally broken. 
+ +--- + +## Part 2 — `sequencer-core` (excluding `fee.rs`) + +### [Low] `INPUT_TAG_DIRECT_INPUT` is a dead-code constant with self-contradicting documentation + +**Locations:** +- [`sequencer-core/src/batch.rs:6-9`](sequencer-core/src/batch.rs) — constant and its stale docstring +- [`sequencer-core/src/batch.rs:40-41`](sequencer-core/src/batch.rs) — authoritative (and correct) contract documentation in the same file +- [`AGENTS.md:103`](AGENTS.md) — reinforces the stale claim + +**What it is.** The constant is documented as if it were part of the wire contract (`0x00 || body`), but zero code in the workspace reads it. Input classification is actually by `msg_sender`, with the payload treated as opaque bytes — which the adjacent paragraph correctly states. The two paragraphs in the same file contradict each other. + +**Impact.** No runtime exploit today; both sides agree on "ignore any tag byte." The forward-looking risk is that a future change acting on the misleading doc could add tag checking on one side but not the other, silently causing scheduler/sequencer divergence. + +**Action items:** +- [ ] Remove the `INPUT_TAG_DIRECT_INPUT` constant and its docstring from `sequencer-core/src/batch.rs`. +- [x] Remove the corresponding paragraph in `AGENTS.md`. *(Done in the 2026-04-15 AGENTS.md rewrite — classification is now documented as by `msg_sender`, payload opaque.)* +- [ ] Keep the correct paragraph at `batch.rs:40-41` as the authoritative wire contract. 
+ +--- + +### [Low] Protocol invariant `max_fee >= current_fee` lives per-impl instead of in the shared trait default + +**Locations:** +- [`sequencer-core/src/application/mod.rs:99-116`](sequencer-core/src/application/mod.rs) — default `validate_and_execute_user_op`, no pre-check +- [`examples/canonical-app/src/scheduler/core.rs:247-250`](examples/canonical-app/src/scheduler/core.rs) — scheduler's explicit protocol-level pre-check +- [`examples/app-core/src/application/wallet.rs:150-155`](examples/app-core/src/application/wallet.rs) — wallet impl correctly enforces the same rule + +**What it is.** The scheduler treats `max_fee >= fee_price` as a protocol-level invariant, checked *before* dispatch into the `Application` trait. The sequencer's side relies on each `Application` impl to enforce the rule via its own `validate_user_op`. The shared `sequencer-core` trait default does not encode the invariant. An app impl that omits the check would cause the sequencer to admit ops the scheduler silently drops — structural soft-confirmation break. + +**Impact.** Latent. The shipping `WalletApp` enforces the check correctly. The concern is that a protocol invariant lives in two places (scheduler source + each app impl) rather than in the shared crate. + +**Action items:** +- [ ] Move the `max_fee < current_fee` check into the default `validate_and_execute_user_op` in `sequencer-core/src/application/mod.rs` (return `ExecutionOutcome::Invalid(InvalidMaxFee { .. })` before dispatching to `validate_user_op`). +- [ ] Optional: remove the now-redundant pre-check at `scheduler/core.rs:247-250`, or leave it as defense-in-depth. +- [ ] Optional: remove the now-redundant check from `WalletApp::validate_user_op`. + +--- + +## Part 5 — L1 Interaction + +No vulnerability findings. See Hardening section below for two defense-in-depth items surfaced by the Part 5 review. 
+ +--- + +## Part 6 — Recovery + +### [Low] `open_recovery_batch_in_tx` masks `l1_safe_head` corruption with silent zero + +**Location:** [`sequencer/src/storage/recovery.rs:388`](sequencer/src/storage/recovery.rs) + +**What it is.** During recovery, the safe block is read via `query_current_safe_block(tx).unwrap_or(0)`. If the `l1_safe_head` singleton row is missing (DB corruption, manual tampering, forgotten migration), the recovery batch is opened with `safe_block = 0`. + +**Impact.** A recovery batch with `safe_block = 0` is immediately stale on any chain older than `MAX_WAIT_BLOCKS` blocks (i.e., effectively always). The scheduler skips it. The sequencer's danger-detection fires again on the next tick → new recovery → new batch with `safe_block = 0` → stale again. Infinite recovery loop, bounded only by the batch submitter's gas budget. + +Every other `query_current_safe_block` caller in the codebase propagates the error. This is an unprincipled silent-failure path in the one subsystem where silent failure is worst. + +**Why not higher severity.** The triggering condition is not adversary-reachable — it requires DB corruption. Under self-trust, operator-caused DB state is not a threat we runtime-defend. The finding is filed because the Part 6 threat-model calibration calls for extra rigor in recovery-internal correctness, and this is a silent-fail regression vs the rest of the codebase. + +**Action items:** +- [ ] Replace `.unwrap_or(0)` with `?` propagation: `let safe_block = query_current_safe_block(tx)?;` +- [ ] Add a test asserting `open_recovery_batch_in_tx` returns an error (not silent zero) when `l1_safe_head` has no row. 
+ +--- + +*Vulnerability findings from subsequent review parts will be appended here, above the Hardening section.* + +--- + +## Hardening / Defense-in-Depth + +Not vulnerabilities under the project's threat model — filed here to track opportunistic hardening that reduces surface area or information disclosure without addressing concrete exploits. Apply when convenient; no urgency. + +### [Hardening] rusqlite error text echoed to 500 response body + +**Location:** [`sequencer/src/ingress/inclusion_lane/mod.rs:244-247`](sequencer/src/ingress/inclusion_lane/mod.rs) + +**What it is.** `append_user_ops_chunk` failures are mapped into the client-facing 500 JSON body via `SequencerError::internal(format!("db error: {err}"))`. `rusqlite::Error::Display` can include SQL fragments, table / column / constraint names, and SQLite detail messages. These then appear verbatim in the `message` field of the JSON response. + +**Why not a vulnerability.** Not adversary-reachable — no user-submitted field hits a UNIQUE constraint or FK, and the schema is visible in the open migration file anyway. The path only fires on operational incidents (disk full, WAL contention, migration drift). Surfaced in Part 4 review. + +**Action item:** +- [ ] Replace the interpolated `{err}` with a constant client-facing string (e.g. `"internal storage error"`). Keep the detailed `rusqlite::Error` on the lane-crash / structured-log path only. Mirrors the existing `ApiError::internal_error("inclusion lane dropped response")` pattern. + +### [Hardening] axum `JsonRejection` Display text echoed to 400 response body + +**Location:** [`sequencer/src/ingress/api.rs:94-100`](sequencer/src/ingress/api.rs) + +**What it is.** `map_json_rejection` wraps axum's raw `JsonRejection::Display` into `ApiError::bad_request(format!("invalid JSON: {err}"))`. 
For malformed bodies the Display text includes serde's line/column and an excerpt of the offending token, enabling parser-version fingerprinting and reflecting attacker-submitted bytes.
+
+**Why not a vulnerability.** Response content-type is `application/json`, so no XSS. The attacker is reflecting their own bytes back to themselves — no credential or third-party data exposure. Fingerprinting axum/serde versions is low-impact (dep versions are recoverable from `Cargo.lock`). Surfaced in Part 4 review.
+
+**Action item:**
+- [ ] Replace `{err}` interpolation with a fixed taxonomy driven by the `JsonRejection` variant: `"invalid JSON"`, `"missing content type"`, `"unsupported content type"`, `"request body too large"`. Log the full `err` for operators.
+- [ ] Audit any other handler that maps extractor rejections into user-visible error bodies and apply the same pattern.
+
+### [Hardening] Private-key parse error may echo key bytes into the error string
+
+**Location:** [`sequencer/src/l1/provider.rs:52-54`](sequencer/src/l1/provider.rs)
+
+**What it is.** `create_signer_provider` formats the underlying parse error as `format!("invalid private key: {e}")`. alloy's `LocalSignerError` wraps `hex::FromHexError::InvalidHexCharacter { c, index }`, which echoes a character from the input and its index. For a key that *almost* parsed (typo, stray whitespace, extra characters), the error string includes one character of the intended secret plus its position — enough to substantially narrow the secret for an observer with access to the startup log.
+
+**Why not a vulnerability.** Operator-trusted surface; not adversary-triggered. Surfaced in Part 5 review.
+
+**Action item:**
+- [ ] Replace the interpolated `{e}` with a fixed string, e.g. `.map_err(|_| "invalid private key".to_string())`. Mirror in `runtime/mod.rs` and any other callsite that maps `PrivateKeySigner::from_str` errors.
+ +### [Hardening] Provider accepts `http://` URLs with no scheme enforcement + +**Location:** [`sequencer/src/l1/provider.rs:20-47`](sequencer/src/l1/provider.rs) + +**What it is.** `create_client` accepts any URL parseable by `reqwest::Url`. No guard against `http://` for non-loopback hosts. Our node and Infura/Alchemy fallback are both trusted fail-stop under the threat model (MITM is byzantine, out of scope), but a scheme typo in a remote RPC URL makes MITM newly possible — a concrete operational foot-gun. + +**Why not a vulnerability.** The threat being prevented is out-of-scope byzantine RPC. This guard just reduces the blast radius of operator misconfig. Surfaced in Part 5 review. + +**Action item:** +- [ ] In `create_client`, reject non-`https` schemes unless the host is a loopback address (`127.0.0.1`, `::1`, `localhost`). Three-line guard. + +### [Hardening] Flusher bumps `max_priority_fee_per_gas` but not `max_fee_per_gas` + +**Location:** [`sequencer/src/recovery/flusher.rs:147-155`](sequencer/src/recovery/flusher.rs) + +**What it is.** The flusher submits no-op txs with `max_priority_fee_per_gas` doubled vs the current fee estimate, but `max_fee_per_gas` unchanged. Ethereum's local-node replacement rule requires **both** fields to bump by ≥10% to evict an existing tx at the same `(sender, nonce)`. If a previously-submitted batch tx is still in our node's mempool when the flusher runs, the no-op replacement will be rejected by our own node. + +**Why not a vulnerability.** The outer `flush_and_wait` loop is unbounded (runs until `pending ≤ safe`), so eventual inclusion of either the original batch tx or the no-op resolves the slot. Safety holds regardless of which lands; only operational efficiency suffers. Surfaced in Part 6 review. + +**Action items:** +- [ ] Bump `max_fee_per_gas` by ≥10% in the flusher too, mirroring the priority-fee bump. 
+- [ ] Add a sentence to `docs/recovery/README.md` clarifying that flush safety does not depend on eviction — it depends on the unbounded outer loop. + +### [Hardening] Hardcoded 12s block time in flusher's confirmation timeout + +**Location:** [`sequencer/src/recovery/flusher.rs:22, 25`](sequencer/src/recovery/flusher.rs) + +**What it is.** `MempoolFlusher::CONFIRMATION_TIMEOUT = 120 seconds` hardcodes 10 × 12s = Ethereum cadence. On slower chains the per-tx watch fires spuriously; on faster chains, it's needlessly conservative. The related `SEQ_SECONDS_PER_BLOCK` is already operator-configurable for the wall-clock danger estimate but not wired into the flusher. + +**Why not a vulnerability.** Inner `watch_txs` timeout only affects retry cadence; the outer loop retries. No correctness impact. Surfaced in Part 6 review. + +**Action item:** +- [ ] Derive `confirmation_timeout` from `SEQ_SECONDS_PER_BLOCK * N` (e.g., N = 10), mirroring the batch poster's existing formula. + +### [Hardening] Chain-id mismatch check runs late in bootstrap, after recovery writes to DB + +**Location:** [`sequencer/src/runtime/mod.rs:211-257`](sequencer/src/runtime/mod.rs) and [`sequencer/src/runtime/mod.rs:132`](sequencer/src/runtime/mod.rs) (cache write) + +**What it is.** `assert_eq!(rpc_chain_id, config.chain_id)` runs at line 257 — **after** `run_preemptive_recovery` (line 211), `input_reader.start()` (line 232), and the L1-cache write at line 132. The cache stores `config.chain_id` (operator-supplied), not the live RPC value. On a misconfigured chain_id, recovery pulls safe inputs from the wrong chain's InputBox before the mismatch panic fires. On crash-loop (systemd/k8s restart), each boot accumulates more wrong-chain `safe_inputs` rows. + +**Why not a vulnerability.** Operator-config triggered; not adversary-reachable. Per the threat model, operator config is trusted. Filed as hardening because the fix is a genuine bootstrap-correctness improvement. Surfaced in Part 8 review. 
+ +**Action items:** + - [ ] Move the chain_id check to immediately after `provider` construction, before any `sync_to_current_safe_head` or `input_reader.start`. + - [ ] Return a typed `RunError`, not `assert_eq!` panic. + - [ ] Store the live-queried chain_id in the L1 cache (not `config.chain_id`), so the cache-fallback path at line 160 has independent evidence. + +### [Hardening] `SEQ_SECONDS_PER_BLOCK=0` causes divide-by-zero panic during wall-clock fallback + +**Location:** [`sequencer/src/runtime/config.rs:111`](sequencer/src/runtime/config.rs) (config), [`sequencer/src/recovery/mod.rs:210`](sequencer/src/recovery/mod.rs) (use site) + +**What it is.** `SEQ_SECONDS_PER_BLOCK` is parsed as unbounded `u64` with no min validation. Used directly as divisor: `elapsed_secs / seconds_per_block`. An operator typo `=0` panics the process during the L1-outage fallback path — the worst time for the sequencer to crash. + +**Why not a vulnerability.** Operator-config triggered. Surfaced in Part 8 review. + +**Action items:** + - [ ] Add a clap `value_parser` on `seconds_per_block` requiring `>= 1`. + - [ ] Optionally mirror a guard at the use site in `wall_clock_danger_estimate` for defense in depth. diff --git a/SESSION_NOTES.md b/SESSION_NOTES.md new file mode 100644 index 0000000..08bb816 --- /dev/null +++ b/SESSION_NOTES.md @@ -0,0 +1,286 @@ +# Session Handoff — 2026-04-18 / 2026-04-19 + +Ephemeral note for the next agent. Delete after absorbing. + +## TL;DR + +This session landed **seventeen new e2e tests** (19 → 36 passing) across +three batches: + +1. §11 outage matrix + recovery critical path (8 tests). +2. Tier A e2e follow-up — WS cursor edges, direct-input drain corners, + replay determinism, input reader retry (6 tests). +3. Tier A bootstrap edges — first-boot-no-cache, chain-id mismatch via + live RPC, nonce-0 first-batch recovery (3 tests). + +Plus the harness primitives that unlocked them. All work under `tests/`.
+ +- **T7 (libfaketime dynamic)** was already in place from the prior session. +- **T8 (orchestrator-restart)** added: `RespawnAttemptOutcome` / + `RespawnPolicy` / `respawn_and_watch` / `respawn_until_stable`. +- **T2 (Anvil runtime toggle)** added: `set_automine(bool)` + + `drop_all_pending_txs` (via `anvil_setAutomine` / `anvil_dropAllTransactions`). +- **`reset_l1_safe_head_synced_at_ms`** added for §7.8.2. +- **`observe_for(Duration)`** added for §7.3.5-style negative controls. + +**§7.1.1 deliberately left `[-]` (out of scope).** See "Decisions" +below. + +## State of the tree + +- **New/modified files** (uncommitted; user plans a squash-later strategy): + - `tests/harness/src/sequencer.rs` + - `tests/harness/src/rollups.rs` + - `tests/harness/src/lib.rs` + - `tests/e2e/src/test_cases.rs` — 17 new scenarios. + - `tests/TEST_PLAN.md` — rows flipped; new T2 + T8 tooling rows. +- **Tests**: 36 e2e passing (`just test-rollups-e2e`). Unit/integration + suite not re-run this session (no sequencer code changed). +- **Lint**: `cargo fmt --all --check` + `cargo clippy --all-targets + --all-features -- -D warnings` clean. 
+ +## Tests landed (this session, both iterations combined) + +Outage matrix / recovery critical path: + +| Row | Test | Shape | +|-----|------|-------| +| §11.4.1 | `provider_outage_short_hiccup_no_recovery_test` | Brief proxy disconnect, no L1/wall-clock advance; POST /tx keeps working, zero invalidation | +| §11.3.2 | `both_down_danger_zone_sequencer_first_refuses_boot_test` | Both stopped, advance into danger zone, sequencer respawn refuses while L1 still unreachable | +| §11.3.3 | `both_down_danger_zone_proxy_first_restart_cycle_recovers_test` | Both stopped, advance into danger zone, proxy reconnects first; `respawn_until_stable` drives to convergence with cascade | +| §11.1.5 | `sequencer_outage_danger_zone_coupled_restart_cycle_recovers_test` | Coupled wall+L1 advance into danger; orchestrator loop converges | +| §11.2.2-followup | `provider_outage_danger_zone_mid_run_exit_then_restart_cycle_recovers_test` | Mid-run DangerZone exit + reconnect + restart cycle → cascade | +| §7.8.2 | `first_boot_l1_unreachable_never_synced_refuses_boot_test` | `synced_at_ms == 0` branch of wall-clock fallback refuses to boot | +| §11.1.4 | `delayed_inclusion_cascades_on_restart_test` | Mempool-held submission, dropped, advance past MAX_WAIT, respawn cascades | +| §7.3.5 | `aging_open_tip_tolerated_by_zombie_check_test` | Submitter's closed-only zombie check tolerates aging open Tip; fires on subsequent auto-close | + +Tier A e2e follow-up (WS / drain / replay / input reader): + +| Row | Test | Shape | +|-----|------|-------| +| §4.4.2 | `ws_reconnect_at_invalidated_offset_skips_cleanly_test` | Reconnect at a previously-observed offset that got invalidated; cursor skips cleanly and delivers only post-recovery events | +| §4.1.3 | `ws_subscribe_from_future_offset_waits_silently_test` | Pin the "subscribe beyond head waits silently" contract (consistent with `from_offset=0` on an empty head) | +| §7.4.2 | `recovery_drains_safe_but_undrained_direct_input_test` | Deposit that was 
safe but never-drained before the sequencer stopped lands in the recovery batch's first frame on respawn | +| §7.4.3 | `recovery_batch_opens_empty_when_no_direct_inputs_pending_test` | Negative control: no deposits → recovery batch opens empty, cascade still fires on aged empty initial Tip | +| §10.1.1 | `replay_matches_live_for_mixed_workload_test` | 3-user mixed workload; post-restart WS catch-up produces per-user state identical to the live replay | +| §5.4.1 / §5.4.2 | `provider_outage_input_reader_retries_after_reconnect_test` | T1 proxy disconnect + L1 deposit (bypassing proxy) + reconnect → reader's retry loop catches up without crashing | + +Tier A bootstrap edges (the final batch this session): + +| Row | Test | Shape | +|-----|------|-------| +| §8.1.2 | `first_boot_no_cache_l1_unreachable_refuses_boot_test` | `clear_l1_bootstrap_cache` after a normal boot, then respawn through a disconnected proxy. Bootstrap discovery has nothing to fall back to → refuses boot. Distinct from §7.8.2 (wall-clock fallback): hits the *earlier* `InputReader::new` discovery step. | +| §8.2.1 / §8.3.1 / §6.5.1 | `chain_id_mismatch_via_live_rpc_refuses_boot_test` | H7 RPC-path regression. Spawns the full sequencer binary against real Anvil with a mismatched `--chain-id` (override via new `set_chain_id_override` harness method); bootstrap-time RPC check returns `RunError::ChainIdMismatch`. Reset-and-respawn proves the failed attempt didn't poison the cache. The previous integration-level scaffolding in `sequencer/tests/chain_id_validation.rs` (cache path) stays — these complement each other. | +| §7.5.1 / §7.5.2 | `nonce_zero_recovery_invalidates_then_accepts_at_nonce_zero_test` | Nonce-0 first-batch recovery edge. Uses T2 to ensure the first-ever batch's L1 submission is dropped before reaching the chain. Cascade fires; recovery batch reuses nonce 0 (parent NULL — no genesis sentinel). 
Then drives 150 transfers + 2 explicit L1 confirmations to land the recovery batch in `safe_accepted_batches` at the reused nonce, proving §7.5.2 (`populate_safe_accepted_batches_inner` cursor handles reuse). | + +## Harness primitives added + +Inline-documented in `tests/harness/src/sequencer.rs`: + +- `respawn_and_watch(stabilization) -> RespawnAttemptOutcome` — classifies a + single respawn attempt as `Stable` / `RespawnFailed(String)` / + `ExitedPostRespawn(ExitStatus)`. +- `respawn_until_stable(policy) -> Vec` — loops + `respawn_and_watch`, advancing L1+wall by `policy.advance_per_retry` + between failed attempts. Required for the danger-zone-to-cascade + convergence path (closed batch only cascades once it ages past + `MAX_WAIT_BLOCKS`, so each retry needs L1 + wall-clock drift). +- `set_automine(bool)` + `drop_all_pending_txs()` — T2. Toggle Anvil's + auto-mining and flush its mempool without respawning Anvil or affecting + other tests. Chosen over `--no-mining` spawn flag precisely because + it's runtime-toggleable. +- `reset_l1_safe_head_synced_at_ms()` — zeros the DB's + `l1_safe_head.synced_at_ms` while the sequencer is stopped, to simulate + "never synced L1" without reconstructing a truly-blank DB. +- `observe_for(grace) -> Option` — watches the child for + `grace` without consuming its exit handle. Returns `None` if still + alive (safe to continue), `Some(status)` if the child exited within + the window. Used by §7.3.5 as a negative-control "stayed up" check. +- `clear_l1_bootstrap_cache()` — DELETE on `l1_bootstrap_cache`. Used + by §8.1.2 to mimic a never-bootstrapped DB, and by §8.2.1 to force + the live-RPC chain-id check (bypasses the cache-path that would + catch the mismatch first). +- `set_chain_id_override(Option)` — overrides the `--chain-id` + argument the sequencer is spawned with on the next respawn. Used + by §8.2.1 / §8.3.1 to inject a deliberately wrong chain id and + exercise the bootstrap-time RPC mismatch path. 
+- `count_safe_accepted_batches() -> (count, min_nonce)` — read-only + snapshot of `safe_accepted_batches`. Used by §7.5.2 to verify that + the recovery batch's L1 submission lands and gets accepted at its + expected (reused-zero) nonce. + +## Decisions worth remembering + +### §7.1.1 — skipped, marked `[-]` + +Originally on the Tier A list. After investigating: + +- **Unique submitter-side code path it would exercise** (live + `check_danger_zone` firing on closed-in-danger batch): **already + covered** by §7.3.5. Both tests reach the same submitter state + (closed batch in `batches`, not in `safe_accepted_batches`); the + setup story differs (§7.3.5 = aged Tip auto-closes; §7.1.1 = mempool + lost submission), but the code path through + `BatchSubmitterError::DangerZone` is identical. +- **Other unique path** + (`populate_safe_accepted_batches_inner`'s `batch_age_is_stale` + continue, i.e., the scheduler's "skip past-stale inclusion" logic): + has a unit test. Hard to exercise e2e because Anvil's `anvil_mine(N)` + mines any pending tx into the first mined block — you can't hold a + tx in the mempool while L1 advances. +- **Bonus obstacle**: the submitter's + `wait_for_confirmations` timeout is `(confirmation_depth + 1) × 2 × + ETHEREUM_BLOCK_TIME_SECS`, hard-coded against + `ETHEREUM_BLOCK_TIME_SECS = 12s`. Minimum 24 s at depth 0. Tokio's + `Instant`-based timers aren't intercepted by libfaketime on macOS, so + we can't fast-forward through that wait. + +Verdict: the effort-to-value ratio doesn't justify adding the test. If +T3 ever lands (sub-second poll interval + config-tunable +`ETHEREUM_BLOCK_TIME_SECS`), §7.1.1 becomes a small marginal win; until +then, treat §7.3.5 + §11.1.4 as covering the delayed-inclusion space. + +### `set_faketime_offset` wants `"+Ns"`, not `"+2h5m"` + +I initially wrote §7.3.5 using `"+2h5m"` for the wall-clock jump past +`max_batch_open`. The test hung in `wait_for_exit`; libfaketime +doesn't parse combined unit forms reliably. 
Fix: use `"+7500s"` (same +format `advance_wall_and_mine` writes). Safer default going forward. + +### `§7.3.5`'s `observe_for` invariant + +The 8 s observation window isn't arbitrary — it must span at least one +full `batch_submitter_idle_poll_interval_ms` (default 5 s) + input +reader poll (~2 s). If someone lowers those defaults in the future, +consider whether §7.3.5's window is still large enough (it currently +has ~1 s of headroom). + +### `§11.1.5`'s `outcomes.len() >= 2` assertion + +Load-bearing: without it, a future change that made the first respawn +converge (e.g., startup recovery cascading at `danger_threshold` +instead of `MAX_WAIT_BLOCKS`) would silently turn this test into a +trivial single-respawn test, losing the flush/shutdown-path coverage. + +### §11.1.4's re-enable-auto-mining-before-respawn step + +Also load-bearing: the startup flusher submits a no-op at the stuck +wallet-nonce slot and needs auto-mining on to see it confirm. +Otherwise the flush hangs. Don't reorder the setup. + +### §4.4.2's "reconnect across invalidation" reframing + +The original TEST_PLAN phrasing ("live subscriber at the time of +invalidation") is structurally impossible — invalidation fires inside +`run_preemptive_recovery`, after the sequencer exits (DangerZone or +stop), so the WS socket always dies before the cascade. The +meaningful test is the reconnect arc: captured offset → kill → +cascade → reconnect at captured offset → cursor skips cleanly. Row +in TEST_PLAN is updated to match. + +### §10.1.1 complements, doesn't replace, `restart_and_replay_test` + +The existing `restart_and_replay_test` already does a restart + WS +catch-up + assert-replay-state for a single-user workload, and it +pins the specific balances. §10.1.1 adds a distinct test because the +property being asserted is *general* (any live workload must replay +deterministically), not the particular expected values, and because +it sweeps a wider multi-sender / multi-op workload. 
Keep both — the +single-sender test catches value regressions; the mixed-workload +test catches replay-divergence regressions. + +### Wallet endpoints don't survive respawn + +`runtime.endpoint()` rebinds to a fresh local port on every respawn +(see `build_local_endpoint`). Any `WalletL2Client` / `WsClient` +created BEFORE a respawn still holds the old endpoint string and +will fail with "tcp connect failed" on the next call. + +Idiom: re-create both via `runtime.wallet_l2(...)` and `runtime.ws(...)` +after every respawn. Caught this in §7.5.x during development; the +post-recovery transfer phase failed until the wallet was recreated. + +### §7.5.2's confirmation timing + +The submitter's `wait_for_confirmations` is hard-coded against +`ETHEREUM_BLOCK_TIME_SECS = 12` and waits for `confirmation_depth + +1 = 3` confirmations. With Anvil's instamine, the submission lands +at 1 confirmation (the block carrying it). To unblock the wait +without sitting through the 72 s timeout, §7.5.2 explicitly mines 2 +extra blocks via `mine_l1_blocks(2)` after the submission. If T3 +ever lands and `confirmation_depth` becomes test-tunable, this +manual mining can go away. + +## Open items + +### Tier A — remaining recovery-critical-path work + +Nothing. §7.1.1 closed as `[-]`; the critical path is fully covered. + +### Tier B — tooling quality-of-life + +- **T3** — plumb `SEQ_BATCH_SUBMITTER_IDLE_POLL_INTERVAL_MS` through + `ManagedSequencerConfig`. Would shorten §11.1.4's 7 s sleep and + §7.3.5's 8 s observation window, and open up §7.1.1 as a cheap test + (if combined with a config-tunable `ETHEREUM_BLOCK_TIME_SECS` at the + poster layer). Medium work. + +### Tier C — broader e2e coverage (mostly done) + +The remaining `tests/e2e` gaps are very small after this session: + +- **§4.3.1** — 65th WS subscriber rejected. 
Already covered at + integration level (`ws_subscribe_rejects_when_subscriber_limit_is_reached` + in `sequencer/tests/ws_broadcaster.rs`); duplicating at e2e is + marginal and CI fd-limit-prone. +- **§9.1.3 / §9.1.4** — shutdown during batch submission / input + reader poll. Timing-sensitive; would need T2 + careful mid-flight + signaling. Lower priority than what's left in other layers. +- **§2.1.2** — soft-confirmation latency budget (POST → WS within 500 + ms). Useful as a regression guard but flaky on slow CI; probably + needs a generous bound. + +Everything else of value at the e2e layer has landed. + +### Tier D — better at other layers + +- **§2.3.1–5** (API body hardening) — better in + `sequencer/tests/e2e_sequencer.rs`. Spinning up the full e2e stack + for a 400/413 check is wasteful. +- **§12.1.1** (schema CHECKs) — unit tests in `storage/`. +- **§7.7.4/5** (flusher H5/H6) — better in + `batch_submitter_integration.rs`; assertions are on tx field + values, not end-to-end flows. + +### Tier E — needs sequencer-side work (out of scope here) + +- **T5 failpoints** — gates §2.10.1 / §5.3.1 / §7.2.2 / §7.6.3. +- **TLA+ alignment** — docs/spec sync with the parent-pointer schema + refactor. + +## Commit hygiene + +The user has opted for a squash-later strategy on this branch. As of +handoff, all work is uncommitted. Natural squash boundaries if the +user changes their mind: + +1. **T8 + five §11.x tests** (orchestrator-restart + matrix closure) +2. **T2 + §11.1.4** (delayed-inclusion) +3. **§7.8.2** (first-boot L1 down) +4. **§7.3.5** (aging Tip negative control) +5. **TEST_PLAN + SESSION_NOTES updates** bundled through each above + +## Context a new agent will need + +All doc pointers from prior handoffs remain accurate. 
Specific to this +session: + +- New harness primitives are documented inline in + `tests/harness/src/sequencer.rs` (`respawn_and_watch`, + `respawn_until_stable`, `set_automine`, `drop_all_pending_txs`, + `reset_l1_safe_head_synced_at_ms`, `observe_for`). +- `cargo run -p rollups-e2e -- <scenario-name>` runs a single scenario; + `just test-rollups-e2e` runs all 36 (~145 s). +- Before running tests fresh in a clean worktree: `just setup` + + `just canonical-build-machine-image`. Both were run earlier in this + session. diff --git a/TODO.md b/TODO.md deleted file mode 100644 index f9e02e5..0000000 --- a/TODO.md +++ /dev/null @@ -1,83 +0,0 @@ -# TODO - -## North Star - -Build a robust sequencer prototype for a future DeFi stack, with deterministic ordering, low-latency acks, and strong replay/canonical alignment. - ---- - -## Done - -### Sequencer Foundation - -- Thin binary entrypoint plus library runtime (`sequencer::run`, `RunConfig`). -- Simplified runtime/config surface with explicit EIP-712 deployment inputs. -- Hardened write path: API -> inclusion lane -> app execution -> persistence -> ack. -- `L2Tx` broadcaster with WebSocket fanout of ordered `L2Tx`s. -- Bounded WebSocket catch-up window plus subscriber guardrails. -- Shared shutdown supervision across API, inclusion lane, and broadcaster. -- Paged replay/catch-up in inclusion lane and broadcaster to avoid unbounded startup memory growth. -- Persisted `safe_block` frontier model for frames, with leading direct inputs materialized when opening a new frame. - -### Benchmarks & Tooling - -- Benchmark harnesses in `tests/benchmarks/` for ack latency, round-trip latency, sweeps, and unit hot path. -- Baseline reporting for p50 / p95 / p99, throughput, and RSS trends. -- Same-host benchmark workflows and docs aligned with the current runtime/config model. - ---- - -## MVP Scope (Remaining) - -### 1) Sequencer Core - -- Implement direct-input reader from blockchain (ingests into `safe_inputs`).
-- Implement batch submitter (reads closed batches and submits on-chain). -- Implement inclusion fee estimator module that updates the suggested fee in DB (`batch_policy`, e.g. `gas_price` or related knobs). -- Add paginated historical `L2Tx` sync endpoint so lagging readers can backfill over HTTP before switching to `/ws/subscribe` for live updates. -- Keep storage/replay semantics deterministic and catch-up-safe as direct-input ingestion, batch submission, and recovery flows land. - -### 2) Recovery / Canonicality - -- Define how canonical progress is derived from persisted facts so replay stays deterministic. -- Detect when scheduler/canonical execution invalidates previously closed batches. -- Define the recovery procedure when persisted batches are invalidated: - - fail fast if the persisted state is inconsistent with canonical inputs - - rebuild or flush invalidated batches before resuming normal service - - notify readers when batches are invalidated - - notify readers when batches become final on-chain - -### 3) Canonical App / Scheduler - -- Implement scheduler behavior in `examples/canonical-app` using shared `sequencer-core` + `examples/app-core`. -- Ensure deterministic ordering model compatible with persisted sequencer order. -- Keep the canonical app as the state-transition artifact used by verification flow (Cartesi Machine / RISC-V path), not by sequencer runtime itself. -- Add focused tests for queue/drain/backstop behavior and ordering invariants. - -### 4) Benchmarks & Evaluation - -- Add canonical network-aware benchmark runs (client/server on different hosts or with injected latency/jitter). -- Turn target evaluation into a real pass/fail mode for the canonical network profile, not just same-host comparison. -- Tune queue / broadcaster / buffer sizing from benchmark evidence instead of ad hoc guesses. -- Revisit inclusion-lane adaptive chunk sizing only after the baseline latency/throughput envelopes are stable. 
- -### 5) Client / API Ergonomics - -- Add API endpoint to query current suggested inclusion fee. -- Decide whether wallet-specific convenience endpoints belong in the sequencer or in the application/client layer: - - current nonce / tx count - - EIP-712 domain discovery -- If those helper endpoints stay in the sequencer, implement them with a clear separation between core sequencer state and wallet-app-specific state. - ---- - -## Post-MVP (Nice to Have / Dogfooding Artifacts) - -- `sdk/ts-client/`: TypeScript client library for browser/server JS callers. -- `sdk/cli/`: Rust CLI for manual tx submission and debugging flows. -- `examples/web-demo/`: browser demo app consuming `sdk/ts-client`. - -Notes: - -- These are intentionally outside MVP scope. -- Still valuable for dogfooding and contributor onboarding. diff --git a/docs/recovery/README.md b/docs/recovery/README.md new file mode 100644 index 0000000..67b78bc --- /dev/null +++ b/docs/recovery/README.md @@ -0,0 +1,296 @@ +# Batch Recovery + +This document describes the recovery design for the sequencer: how the system detects that batches are failing to land on L1, and how it recovers to a consistent state. The design is verified with bounded TLA+ model checking ([`preemptive.tla`](preemptive.tla)). + +See `AGENTS.md` "Batch Staleness and Recovery" for quick-reference tables and function names. + +## Runtime lifecycle at a glance + +The sequencer's recovery loop spans two process lifetimes: an in-process **danger detector** observes and crashes the process; an external orchestrator (systemd, k8s, …) respawns; the fresh boot runs `run_preemptive_recovery` before any writers come online. 
+ +```text + steady state danger + ┌──────────┐ ┌──────────┐ + │ running │───detector tick──▶ 🚨 │ exiting │ + └──────────┘ └─────┬────┘ + ▲ │ RunError::DangerZoneDetected + │ ▼ + ┌────┴─────┐ ┌─────────────────┐ + │ normal │◀────────────────│ orchestrator │──respawn──▶ startup + │ ticks │ │ (systemd/k8s) │ │ + └──────────┘ └─────────────────┘ ▼ + ┌────────────────────────┐ + │ run_preemptive_recovery│ + │ 1. sync L1 safe head │ + │ 2. decide action │ + │ (pure function) │ + │ 3. flush mempool │ + │ + re-sync │ + │ 4. detect_and_recover│ + └────────────────────────┘ +``` + +Key abstractions, by responsibility: + +- **`DangerDetector`** ([`recovery/detector.rs`](../../sequencer/src/recovery/detector.rs)): tiny background task that calls `Storage::check_danger` on a cadence. Never writes to the DB, never talks to L1. Exits with `DetectorExit::DangerZone` when either the strict or wall-clock-adjusted check fires. The runtime converts that into `RunError::DangerZoneDetected` and the process exits. +- **`BatchSubmitter`** ([`l1/submitter/worker.rs`](../../sequencer/src/l1/submitter/worker.rs)): makes L1 progress only — never checks danger. Productive ticks re-enter immediately; idle/transient ticks sleep `idle_poll_interval`. A pure `decide_submit_start` function folds observed L1 nonces over the scheduler-accepted frontier. +- **`decide_startup_action`** ([`recovery/mod.rs`](../../sequencer/src/recovery/mod.rs)): pure function. Takes `(danger, l1_reachable, last_safe_progress_ms)` and returns `Proceed | FlushAndCascade | Refuse(reason)`. The side-effectful driver executes the chosen action. +- **`MempoolFlusher`** ([`recovery/flusher.rs`](../../sequencer/src/recovery/flusher.rs)): submits no-op transactions to consume all pending wallet-nonce slots and waits for safe finality. Does **not** retry internally on provider errors — the orchestrator's respawn loop is the retry mechanism. 
+- **`ProtocolConfig`** ([`sequencer-core/src/protocol.rs`](../../sequencer-core/src/protocol.rs)): single source of truth for the scheduler-mirroring fields (`batch_submitter`, `max_wait_blocks`) plus the sequencer-local tuning knobs (`preemptive_margin_blocks`, `seconds_per_block`). Exposes `scheduler_accepts`, `is_scheduler_stale`, `danger_threshold`. + +All five pieces are replaceable at the abstraction boundary: the tick decision is a pure function; the storage surface returns structs, not ad-hoc tuples; the danger detector and submitter are independently testable. + +## The Batch Tree + +Batches form a tree where each node is a batch and edges point from child to parent. Each batch has a single parent: the preceding batch in the valid chain. + +Batches have two identifiers: + +- **Index** (`batch_index`): monotonically increasing, unique, never reused. Creation order. +- **Nonce** (`batch_nonce`): depth of the node in the tree. Assigned by the batch submitter to valid closed batches. + +In normal operation the tree degenerates into a list -- index and nonce increase in lockstep. Branches appear only after recovery, when a suffix of the chain is invalidated and a new batch forks from the last valid ancestor. + +There is always exactly one **valid path** (root to leaf) that constitutes the current batch chain. The valid path splits into a **prefix** (safe on L1, accepted by the scheduler) and a **suffix** (pending or confirming). + +### Genesis sentinel (nonce-0 edge case) + +Recovery requires at least one Gold ancestor (the cascade invalidates a suffix and forks from the last Gold batch). If the very first batch (nonce 0) goes stale before any batch becomes Gold, there is no ancestor to fork from. + +The TLA+ model handles this with a **genesis sentinel**: the initial state starts with a Gold batch at nonce 0. This is a modeling technique that eliminates the nonce-0 special case, allowing Resolve to use uniform logic (the `fng > 1` guard is always satisfied). 
Without it, the model would need a separate Resolve action with different arithmetic for the "no Gold ancestor" case. + +The implementation can handle the nonce-0 case either by submitting a sentinel batch at first startup, or by special-casing the recovery code for the "no Gold ancestor" branch. + +## Coloring + +Every batch on the valid path has exactly one color. Dead branches are lead (permanently invalid). + +### Simplified model (three colors) + +| Color | Meaning | Terminal? | +|------------|----------------------------------------------------------------|-----------| +| **Gold** | Safe on L1 and accepted by the scheduler | Yes | +| **Silver** | Valid, optimistically executed, but not yet safe/accepted | No | +| **Lead** | Invalid (has `batches.invalidated_at_ms` set) | Yes | + +Gold batches form a contiguous prefix of the valid path. Silver batches form a contiguous suffix (after the gold prefix up to the open batch). Lead batches hang off gold nodes as dead branches -- the first lead in any cascade always has a gold parent. + +### Extended model (five colors) + +To model the full lifecycle including L1 submission: + +| Color | Meaning | Has `w_nonce`? | +|-------------|--------------------------------------------------------|----------------| +| **Tip** | Open batch, not yet closed | No | +| **Pending** | Closed, may or may not be submitted to mempool | Maybe | +| **Bronze** | Included in an L1 block, block not yet safe | Yes | +| **Silver** | Included, block has reached safe finality | Yes | +| **Gold** | Safe, accepted and executed by the scheduler | Yes | + +The spine ordering invariant: `Gold* Silver* Bronze* Pending* Tip` + +A Pending batch may have a `w_nonce` (submitted to the L1 mempool but not yet included in a block) or not (not yet submitted). The batch submitter assigns `w_nonce`s to all unsubmitted Pending batches at once, in spine-position order. + +## Nonce Poisoning + +The scheduler maintains a single counter: "I expect batch nonce N next." 
+ +When a batch with nonce N arrives stale, the scheduler **skips it entirely** -- no nonce increment, no state change, no report. It is a true noop in nonce-space. + +This poisons the nonce counter. Every subsequent batch (nonce N+1, N+2, ...) is dead on arrival. Not because they are individually stale, but because the scheduler still expects nonce N. The only batch with nonce N was stale and skipped, so the counter will never advance past N. + +Cascade invalidation is therefore **exact, not conservative**. The sequencer's `WHERE batch_index >= stale_batch_index` mirrors precisely what the scheduler will do (refuse). The entire silver suffix is unreachable once any batch in it is stale. + +Recovery is the only way forward: create a new batch with nonce N, giving the scheduler what it needs to resume. + +## Two Staleness References + +The staleness formula is `reference_block - first_frame_safe_block >= MAX_WAIT_BLOCKS`, but the reference block differs by context: + +### Inclusion staleness (scheduler's perspective) + +``` +inclusion_block - first_frame_safe_block >= MAX_WAIT_BLOCKS +``` + +Used by `populate_safe_accepted_batches` to simulate what the scheduler accepts. Each batch has its own inclusion block (the L1 block where its submission landed). **Not monotonic** across batches -- a promptly submitted old batch can be healthy while a late-submitted newer batch is stale. + +Inclusion staleness determines the **gold frontier**: the set of batches the scheduler has accepted. + +### Current staleness (sequencer's detection) + +``` +current_safe_block - first_frame_safe_block >= MAX_WAIT_BLOCKS +``` + +Used by the danger threshold detector. The reference block (`current_safe_block`) is the same for all batches. **Monotonic within the valid path** -- earlier batches have smaller `first_frame_safe_block`, so larger difference. If the frontier batch is not stale by this measure, no batch is. + +Current staleness triggers **preemptive recovery** (see below). 
+ +## Nonce Uniqueness on the Valid Path + +`batches.nonce` can repeat across the full table -- a recovery batch inherits `parent.nonce + 1` from the last valid ancestor, which is the same nonce the first invalidated suffix batch had. Among **valid batches** (those with `invalidated_at_ms IS NULL`), nonces are unique because the valid path is a strict chain via `parent_batch_index`. + +This matters because L1 works in nonce-space (the scheduler identifies batches by nonce) while the sequencer works in index-space (local `batch_index`). The recovery path needs to translate between them: "which batch indexes should we invalidate?" Nonce uniqueness on the valid path is what makes this mapping unambiguous. + +## The L1 Stream + +L1 processes transactions in `w_nonce` order. At each slot (a given `w_nonce` value), exactly one transaction is included. If multiple transactions compete for the same slot (e.g., a dead batch and a flush no-op), L1 non-deterministically picks one. The loser is discarded. + +This is the interface between the sequencer and the scheduler. The scheduler sees a stream of entries ordered by `w_nonce`, each with a `batch_nonce`, `inclusion_block`, and `safe_block`. It processes them in order, accepting or rejecting based on nonce match and staleness. + +## The Uncertainty Interval + +The core insight behind the recovery design is that **mempool uncertainty is bounded by a time interval**. + +Once a batch's `safe_block` is old enough that `current_safe_block - safe_block >= MAX_WAIT_BLOCKS`, we know it is stale no matter when it lands on L1 (because `inclusion_block >= current_safe_block`). Any batch in the mempool with that `safe_block` is dead-on-arrival. This means mempool uncertainty has a natural expiration: after `MAX_WAIT_BLOCKS`, the L1 outcome doesn't matter. 
+
+This gives us three regimes:
+
+```
+|---------- safe ----------|-- danger zone --|-- past MAX_WAIT --|
+         no action           flush + recover     self-resolved
+```
+
+- **Before the danger zone**: batches are young. Nothing to do.
+- **In the danger zone**: batches might land stale, or might still make it. This is the window of uncertainty. For **closed unresolved batches**, the flush resolves it by forcing every `w_nonce` slot to finalize (batch wins or no-op wins). After the flush, the sequencer reads the scheduler's finalized state and cascades if needed. An **open Tip** has no `w_nonce` slot yet, so it is not part of this uncertainty set.
+- **Past MAX_WAIT**: all unresolved batches are guaranteed stale by L1 monotonicity (`inclusion_block >= current_safe_block >= safe_block + MAX_WAIT`). For closed unresolved batches, the L1 outcome no longer matters because every eventual inclusion is stale, but wallet-nonce slots may still need to be flushed (or naturally consumed) before recovery can reconstruct the scheduler frontier. For an aging open Tip, there is no L1-slot uncertainty at all, so startup recovery can invalidate it directly.
+
+**What TLA+ proves vs external reasoning**: the TLA+ model ([`preemptive.tla`](preemptive.tla)) proves that after all `w_nonce` slots are resolved (however that happens), ZombieSafety holds. It does not model the danger threshold or the passage of time. The claim that "past MAX_WAIT, staleness self-resolves" is an external argument from L1 monotonicity (`inclusion_block >= current_safe_block`), not something TLA+ checks.
+
+Any recovery design must wait out this uncertainty. The question is how. The preemptive design (implemented here) forces resolution by going offline and flushing. An alternative optimistic design lets the uncertainty resolve naturally but keeps serving soft confirmations -- see [`history/`](history/) for that approach and why we preferred preemptive.
+ +## Silver-Only for Submitted Batches + +The Silver-only constraint applies to **submitted batches whose L1 slot outcome is still relevant**. This is the zombie path, and it is where the optimistic-design counterexample from [`history/`](history/) still matters. + +A Silver batch's L1 entry is permanent -- no mempool competition can kill it. The scheduler **will** see it, at a `w_nonce` lower than any recovery batch, and be poisoned. This ordering guarantee is what makes nonce poisoning reliable. + +Detecting staleness on Pending or Bronze submitted batches *before wallet-nonce uncertainty is resolved* is unsafe: a recovery batch can take the frontier's L1 slot via wallet-nonce mutual exclusion, preventing the scheduler from ever seeing the stale frontier, and allowing non-frontier dead batches to pass the nonce check. TLA+ model checking found this bug; see [`history/`](history/) for the counterexample. + +The open Tip is different. It has no L1 transaction yet, so there is no `w_nonce` competition and no zombie risk. Once `current_safe_block - first_frame_safe_block >= MAX_WAIT_BLOCKS`, startup recovery can invalidate the stale Tip directly and open a fresh one. Likewise, after a preemptive flush has resolved all competing `w_nonce` slots for closed batches, the atomic recovery transaction can safely use **current staleness** on the oldest unresolved batch (closed or open). + +## Preemptive Recovery Design + +The sequencer uses a preemptive approach: detect danger early, go offline, flush the mempool, then recover on solid ground. This design was preferred over the optimistic alternative because it is simpler to reason about and produces fewer invalidated soft confirmations (the sequencer stops issuing them before the cascade). + +### Step 1: Danger threshold + +Define `DANGER_THRESHOLD = MAX_WAIT_BLOCKS - MARGIN`. When the frontier batch's current staleness (`current_safe_block - safe_block`) reaches `DANGER_THRESHOLD`, trigger preemptive recovery. 
+ +The margin must cover: flush submission time + L1 safe finality wait (~15 min on Ethereum) + recovery execution time. With `MAX_WAIT_BLOCKS = 1200` (~4 hours), a margin of ~75 blocks (~15 min) is conservative. + +### Step 2: Go offline + +Stop accepting new user operations. From the outside world, the sequencer is temporarily unavailable. This eliminates concurrent batch creation during recovery. + +### Step 3: Flush mempool + +Query the latest confirmed `w_nonce` (N) and the pending `w_nonce` (M). Submit `M - N` no-op transactions (e.g., self-transfer of 0 ETH) at nonces N, N+1, ..., M-1. These compete with any batches in the mempool at the same slots. + +Wait for all `M - N` slots to reach L1 safe finality. + +### Step 4: Post-flush state + +Every `w_nonce` slot from N to M-1 is now resolved: + +- **Batch won**: the batch is on L1 and safe (Silver or Gold) +- **No-op won**: the batch is dead forever, its slot consumed + +There are no more mempool entries. All uncertainty is resolved. + +### Step 5: Run recovery + +This is an atomic SQLite transaction operating on fully-finalized L1 state: + +1. **Populate gold frontier** (`populate_safe_accepted_batches`): scan L1 safe inputs, simulate scheduler acceptance logic. Learn `schedulerExpected` -- the next batch nonce the scheduler needs. +2. **Detect staleness**: find the oldest unresolved batch (first closed batch past the accepted frontier, otherwise the open Tip). If its **current staleness** (`current_safe_block - first_frame_safe_block`) has reached `MAX_WAIT_BLOCKS`, cascade-invalidate it and all successors (set `invalidated_at_ms` on each). Closed-batch cascades rely on the preceding flush/safe-head sync to remove wallet-nonce uncertainty; Tip cascades need no flush because the Tip has no L1 slot yet. If nothing is stale, skip to step 6 (Resume). +3. **Open recovery batch**: fresh batch whose `parent_batch_index` is the last valid ancestor. 
Its `nonce` is structurally `parent.nonce + 1`, which equals `schedulerExpected`. Re-drain direct inputs from invalidated batches. + +### Step 6: Resume + +Restart the batch submitter and user-op acceptance. The sequencer is back online. + +### Startup behavior + +On startup, the sequencer doesn't know whether it was a preemptive shutdown, a spurious restart, or coming online after a long outage. It therefore splits the check in two: + +1. **Closed unresolved frontier batch in danger**: run the zombie-path check (`check_danger_zone`). If the first closed batch past the accepted frontier has entered the danger zone, flush (step 3), wait for finality (step 4), then run recovery (step 5). +2. **No closed batch in danger**: skip the flush and run the atomic recovery transaction directly. This is the normal path on a clean restart, and it is also how startup handles an open Tip that has already crossed `MAX_WAIT_BLOCKS`. + +This means "danger at startup" is not one unified flow: + +- **Closed unresolved batches** still need the flush because their `w_nonce` slots may contain zombie uncertainty. +- **An aging open Tip** can be recovered directly because there is no L1 slot to resolve. +- **Closed unresolved batches already past `MAX_WAIT_BLOCKS`** are guaranteed stale by monotonicity, but the sequencer still flushes before recovery so `populate_safe_accepted_batches` can reconstruct the scheduler frontier from fully resolved safe inputs. + +**What TLA+ proves here**: the model still abstracts away the full startup cutover/flush decision. It proves ZombieSafety once wallet-nonce slots resolve, and separately models direct recovery of an aging open Tip. The claim that past `MAX_WAIT`, closed-batch staleness self-resolves is external reasoning from L1 monotonicity. + +### L1 unreachability + +The danger zone check and the flush both require L1. If L1 is unreachable, the sequencer must decide whether to proceed (before danger zone) or block (in danger zone). 
+ +**At startup**: the sequencer attempts to sync the safe head from L1. If this fails, it falls back to a **wall-clock danger estimate** based on the persisted last-L1-sync marker: compute `estimated_missed_blocks = (now - last_l1_sync_ms) / seconds_per_block`, adjust the danger threshold downward by that estimate, and run the unresolved-batch danger check against the stale DB view. If the estimate is before the danger zone, the sequencer proceeds with stale DB data — the input reader and batch submitter will catch up when L1 returns. If the estimate is in or past the danger zone, the sequencer refuses to start (it can't safely issue soft confirmations without knowing L1 state). + +**At runtime**: the batch submitter retries on L1 errors (provider failures). On each retry, it runs the same wall-clock estimate: `estimated_missed_blocks = (now - last_l1_sync_ms) / seconds_per_block`. It adjusts the danger threshold downward by this estimate. If the adjusted check triggers, the batch submitter crashes for recovery. This ensures the sequencer doesn't keep issuing soft confirmations while disconnected from L1 long enough to cross the danger zone. + +**Other workers during L1 outages**: the inclusion lane and API are purely local (SQLite) and continue operating. The input reader retries L1 polling with error logging. All L1-dependent workers log errors at the `error` level to alert operators. + +The `seconds_per_block` parameter (default: 12 for Ethereum) is configurable via `SEQ_SECONDS_PER_BLOCK`. The wall-clock estimate is conservative — it may overestimate age (if blocks are slower than assumed), which causes earlier detection. This is correct: better to crash early than to issue doomed soft confirmations. + +## Dead Batches + +After cascade invalidation, submitted Pending batches (those with `w_nonce` assigned) are **dead batches**. They are still in the L1 mempool, competing with their flush no-op transactions. 
+ +Two outcomes per dead batch, non-deterministic: + +- **Dead batch beats no-op**: lands on L1, scheduler sees it, rejects it (stale by inclusion, or nonce-poisoned by a preceding stale/missing batch) +- **No-op beats dead batch**: dead batch killed forever, scheduler never sees it (the scheduler skips the gap) + +A killed batch acts as **silent nonce poison**: the scheduler never sees it, so `schedulerExpected` stays stuck at its `batch_nonce`. All subsequent batches have wrong nonces. + +Dead batches occupy `w_nonce` slots strictly below `walletNonce`. Recovery batches occupy `w_nonce` slots at or above `walletNonce`. **No overlap.** This is why no mutual exclusion is needed between dead batches and recovery batches -- they live in non-overlapping `w_nonce` ranges. + +## Implementation Constraints + +These constraints were discovered during TLA+ model checking and are required for correctness: + +1. **`walletNonce` must NOT be reset during recovery.** Recovery batches must use `w_nonces` strictly past all dead batch slots. The flush consumes dead batch slots by advancing `nextL1Slot` up to `walletNonce`. Recovery starts fresh from there. + +2. **`SubmitBatch` must use `max(walletNonce, nextL1Slot)`.** Prevents assigning `w_nonce` values for slots L1 has already consumed. + +3. **`SubmitBatch` must assign ALL pending batches at once, in spine-position order.** If batches are submitted individually, a flush-win can bump one batch's `w_nonce` past a later batch's, violating the spine ordering invariant. + +4. **Wall-clock fallback when L1 is unreachable.** The batch submitter must track the last successful L1 communication time. On provider errors, it must estimate block progression from wall-clock time (`elapsed / seconds_per_block`) and crash if the estimated age exceeds the danger threshold. Without this, an L1 outage can silently push batches past the danger zone while the DB-based check sees stale (frozen) data. 
+ +## Formal Verification + +The recovery design is verified with bounded TLA+ model checking. The canonical spec is [`preemptive.tla`](preemptive.tla). An alternative optimistic design is preserved in [`history/optimistic.tla`](history/optimistic.tla). + +**Scope and limitations**: these are bounded safety models. They exhaustively check all reachable states within the configured bounds, but do not prove liveness (eventual progress), do not model the danger threshold trigger or timing margins, and do not model crash/restart (the implementation relies on SQLite atomic transactions for crash safety). + +### `preemptive.tla` -- Slot-level safety under adversarial flush + +Models the core slot-level mechanics of preemptive recovery. At every `w_nonce` slot, L1 non-deterministically includes the spine batch OR a flush no-op (killing the batch). This covers the case where the frontier batch itself is killed during flush. The model also treats the open Tip's `safe_block` as meaningful, so it can explicitly recover an aging Tip that has no L1 footprint yet. + +The model is a **safety over-approximation**: it allows `AdvanceTip` and `SubmitBatch` to interleave freely with recovery, which the real protocol prevents (the sequencer goes offline). This makes the proof stronger -- if `ZombieSafety` holds under more interleavings, it holds under fewer. However, the model does not verify the full sequential protocol phases (cutover, flush, wait, recover, resume) described above; in particular, the startup decision of whether a closed unresolved batch must flush before recovery remains an external argument layered on top of the slot-level proof. + +**Verified**: 157M states, 0 violations. 
+ +| Invariant | Meaning | +|-----------|---------| +| ZombieSafety | `schedulerExpected = CountGold(spine)` -- scheduler accepts exactly the Gold prefix | +| BatchNoncesContiguous | Batch nonces are 0..N-1 for non-Tip spine | +| InvalidOnlyOnGold | Dead branches only hang off Gold nodes | +| L1WNonceUnique | No two L1 entries share a `w_nonce` | +| L1BeforeCursor | All L1 entries have `w_nonce < nextL1Slot` | +| SchedulerBehindL1 | Scheduler cursor doesn't pass L1 cursor | +| DeadNotYetIncluded | Dead batches have `w_nonce >= nextL1Slot` | + +### Running the spec + +```bash +tlc -workers auto -deadlock docs/recovery/preemptive.tla # ~90s +``` + +Bounds are in `preemptive.cfg`. The `MaxWalletNonce` bound keeps the state space finite (kill/resubmit cycles generate new `w_nonce` values). Increase bounds for higher confidence at the cost of longer runtime. diff --git a/docs/recovery/history/README.md b/docs/recovery/history/README.md new file mode 100644 index 0000000..74e173e --- /dev/null +++ b/docs/recovery/history/README.md @@ -0,0 +1,56 @@ +# Recovery Design History + +This directory preserves the optimistic recovery design -- an alternative to the preemptive approach documented in the parent [`README.md`](../README.md). Both designs are sound. We preferred preemptive for its operational properties. + +## The Optimistic Design + +In the optimistic design, the sequencer keeps accepting user operations and building batches while recovery plays out in the background. If a batch goes stale, the system detects it when the batch becomes Silver (safe on L1), cascade-invalidates, and submits recovery batches -- all while the sequencer continues serving soft confirmations. + +The TLA+ spec [`optimistic.tla`](optimistic.tla) models this design with a scheduler, wallet nonces, zombie batches (invalidated batches still in the L1 mempool), and adversarial L1 inclusion. 
At each `w_nonce` slot where a zombie and a recovery batch compete, L1 non-deterministically picks one (wallet-nonce mutual exclusion).
+
+**Verified**: 194M states, 0 violations (after the Silver-only fix below).
+
+## The Silver-Only Constraint
+
+Both designs share a critical constraint: **recovery must wait for the frontier batch to be Silver before cascade-invalidating.**
+
+This constraint was discovered through the optimistic model. The original design allowed staleness detection on Pending or Bronze batches (a "short-circuit" for faster recovery). TLA+ found a counterexample:
+
+Three batches with `MAX_WAIT_BLOCKS = 2`:
+
+```
+batch   bn=0   bn=1   bn=2
+sb      0      0      1
+wn      0      1      2
+```
+
+With `currentSafeBlock = 2`, `bn=1` is stale by current block, `bn=2` is fresh. If we cascade from `bn=1`, both become zombies. Recovery creates a new `bn=1` at `wn=1`.
+
+At L1 slot 1, zombie `bn=1` and recovery `bn=1` compete (same `w_nonce`):
+
+- **Zombie wins**: scheduler sees it, stale, skip. Nonce poisoned. Safe.
+- **Recovery wins**: zombie `bn=1` dies (never reaches L1). Recovery accepted. `schedulerExpected` advances to 2. Zombie `bn=2(wn=2)` is fresh (`inclusion_block - safe_block = 1 < 2`), matches expected nonce -> **accepted**. The scheduler executes invalidated batch data.
+
+The two protection layers (wallet-nonce mutual exclusion and nonce poisoning) undercut each other: mutual exclusion kills the batch that nonce poisoning needs.
+
+The fix: only detect staleness when the frontier is Silver (safe on L1, immutable). The scheduler is guaranteed to see it before any recovery batch.
+
+## Why We Chose Preemptive
+
+Both designs are sound once Silver-only detection is enforced. The difference is operational:
+
+**Both designs wait.** Any recovery design must wait for the frontier to become Silver before cascading. In the optimistic design, the sequencer keeps issuing soft confirmations during this wait -- confirmations that will be invalidated when the cascade fires. 
In the preemptive design, the sequencer goes offline before the cascade, so no doomed soft confirmations are issued. + +**Preemptive is simpler to reason about.** The optimistic design has concurrent actors: the batch submitter, the inclusion lane, L1 mempool competition, and recovery all interleave. The preemptive design is sequential: stop, flush, recover, resume. Each step has clear preconditions and postconditions. + +**Preemptive eliminates mempool races.** The flush resolves all `w_nonce` slot uncertainty before recovery runs. Recovery operates on fully-finalized L1 state. No zombie mutual exclusion needed. + +**The cost is downtime.** Preemptive recovery takes the sequencer offline for the duration of the flush + safe finality wait (~15-20 minutes on Ethereum). For a rare event (a batch approaching the 4-hour staleness deadline), this is acceptable. + +## Running the Spec + +```bash +tlc -workers auto -deadlock docs/recovery/history/optimistic.tla # ~3min +``` + +Bounds are in `optimistic.cfg`. diff --git a/docs/recovery/history/optimistic.cfg b/docs/recovery/history/optimistic.cfg new file mode 100644 index 0000000..1bb370c --- /dev/null +++ b/docs/recovery/history/optimistic.cfg @@ -0,0 +1,9 @@ +SPECIFICATION Spec + +CONSTANTS + MaxBatchIndex = 6 + MaxSafeBlock = 7 + MAX_WAIT_BLOCKS = 2 + +INVARIANTS + Inv diff --git a/docs/recovery/history/optimistic.tla b/docs/recovery/history/optimistic.tla new file mode 100644 index 0000000..8340ae5 --- /dev/null +++ b/docs/recovery/history/optimistic.tla @@ -0,0 +1,460 @@ +---------------------------- MODULE optimistic ----------------------------- +(* + * Formal model of sequencer batch tree with scheduler, wallet nonces, + * zombie batches, and adversarial L1 inclusion. + * + * Proves: ZombieSafety == schedulerExpected = CountGold(spine) + * + * After recovery, no zombie batch from an invalidated chain is ever + * accepted by the scheduler. 
+ * + * Colors (spine ordering): Gold* Silver* Bronze* Pending* Tip + * - Tip: open batch (not yet closed) + * - Pending: closed, may have w_nonce (submitted to L1 mempool) + * - Bronze: included in an L1 block (not yet safe) + * - Silver: included in a safe L1 block + * - Gold: accepted by the scheduler + * + * Key mechanism — two-layer zombie protection: + * (1) Wallet nonce mutual exclusion: zombie and recovery batch compete + * for the same L1 slot. Loser's w_nonce is bumped. + * (2) Nonce poisoning: stale batch is a no-op in the scheduler (does + * not increment expected nonce), making all subsequent zombies + * have wrong batch_nonce. + * + * Actions: + * AdvanceTip -- close tip -> Pending, append new Tip + * SubmitBatch -- assign w_nonce to first unsubmitted Pending + * L1Include -- include tx at nextL1Slot (spine or zombie wins) + * AdvanceSafeBlock -- L1 safe block advances, Bronze -> Silver + * SchedulerStep -- scheduler processes next safe L1 entry + Gold + * Resolve -- detect staleness, cascade, create zombies + * + * See docs/recovery.md for the conceptual model. 
+ *) + +EXTENDS Integers, Sequences, FiniteSets + +CONSTANTS + MaxBatchIndex, \* bound on total batch creations + MaxSafeBlock, \* bound on L1 safe block + MAX_WAIT_BLOCKS \* staleness threshold + +NONE == -1 \* sentinel: "no w_nonce assigned" + +--------------------------------------------------------------------------- +(* Colors *) + +Gold == "Gold" +Silver == "Silver" +Bronze == "Bronze" +Pending == "Pending" +Tip == "Tip" + +Colors == {Gold, Silver, Bronze, Pending, Tip} + +ColorOrd(c) == + CASE c = Gold -> 0 + [] c = Silver -> 1 + [] c = Bronze -> 2 + [] c = Pending -> 3 + [] c = Tip -> 4 + +--------------------------------------------------------------------------- +(* Variables *) + +VARIABLES + spine, \* Seq of [index, color, safe_block, inclusion_block, + \* w_nonce, batch_nonce] + invalid, \* Seq of Nat: dead-branch count per spine position + nextIndex, \* Nat: next batch index + currentSafeBlock, \* Nat: L1 safe block (environment) + walletNonce, \* Nat: next w_nonce for mempool submission + zombies, \* Set of [batch_nonce, w_nonce, safe_block] + nextL1Slot, \* Nat: L1 nonce cursor (next w_nonce to include) + l1Included, \* Set of [batch_nonce, w_nonce, inclusion_block, + \* safe_block, is_safe] + schedulerCursor, \* Nat: next w_nonce the scheduler will process + schedulerExpected \* Nat: scheduler's expected batch nonce + +vars == <> + +--------------------------------------------------------------------------- +(* Helpers *) + +CountGold(s) == Cardinality({i \in 1..Len(s) : s[i].color = Gold}) + +FirstNonGold(s) == + IF \E i \in 1..Len(s) : s[i].color # Gold + THEN CHOOSE i \in 1..Len(s) : + s[i].color # Gold /\ \A j \in 1..i-1 : s[j].color = Gold + ELSE 0 + +\* First Pending without a w_nonce. 
+FirstUnsubmitted(s) == + IF \E i \in 1..Len(s) : s[i].color = Pending /\ s[i].w_nonce = NONE + THEN CHOOSE i \in 1..Len(s) : + s[i].color = Pending /\ s[i].w_nonce = NONE + /\ \A j \in 1..i-1 : ~(s[j].color = Pending /\ s[j].w_nonce = NONE) + ELSE 0 + +\* Spine position of Pending batch with a given w_nonce. +PendingAtWNonce(s, wn) == + IF \E i \in 1..Len(s) : s[i].color = Pending /\ s[i].w_nonce = wn + THEN CHOOSE i \in 1..Len(s) : s[i].color = Pending /\ s[i].w_nonce = wn + ELSE 0 + +\* Spine position of Silver batch with a given batch_nonce. +SilverAtBN(s, bn) == + IF \E i \in 1..Len(s) : s[i].color = Silver /\ s[i].batch_nonce = bn + THEN CHOOSE i \in 1..Len(s) : s[i].color = Silver /\ s[i].batch_nonce = bn + ELSE 0 + +--------------------------------------------------------------------------- +(* Staleness *) + +IsStaleByInclusion(b) == b.inclusion_block - b.safe_block >= MAX_WAIT_BLOCKS +IsStaleByCurrentBlock(b) == currentSafeBlock - b.safe_block >= MAX_WAIT_BLOCKS + +--------------------------------------------------------------------------- +(* Invariants *) + +TypeOK == + /\ Len(spine) >= 1 + /\ nextIndex \in Nat + /\ currentSafeBlock \in Nat + /\ walletNonce \in Nat + /\ nextL1Slot \in Nat + /\ schedulerCursor \in Nat + /\ schedulerExpected \in Nat + +\* Gold* Silver* Bronze* Pending* Tip +SpineOrdering == + /\ spine[Len(spine)].color = Tip + /\ \A i \in 1..Len(spine)-1 : + ColorOrd(spine[i].color) <= ColorOrd(spine[i+1].color) + +SafeBlockMonotonic == + \A i \in 1..Len(spine)-1 : + (spine[i].color # Tip /\ spine[i+1].color # Tip) + => spine[i].safe_block <= spine[i+1].safe_block + +InvalidOnlyOnGold == + \A i \in 1..Len(spine) : invalid[i] > 0 => spine[i].color = Gold + +CurrentStalenessMonotonic == + \A i, j \in 1..Len(spine) : + (i < j /\ spine[i].color # Tip /\ spine[j].color # Tip + /\ IsStaleByCurrentBlock(spine[j])) + => IsStaleByCurrentBlock(spine[i]) + +BatchNoncesContiguous == + \A i \in 1..Len(spine) : + spine[i].color # Tip => 
spine[i].batch_nonce = i - 1 + +\* ------- THE KEY THEOREM ------- +ZombieSafety == schedulerExpected = CountGold(spine) + +\* Supporting L1 invariants +L1WNonceUnique == + \A e1, e2 \in l1Included : e1.w_nonce = e2.w_nonce => e1 = e2 + +ZombieNotYetIncluded == + \A z \in zombies : z.w_nonce >= nextL1Slot + +L1BeforeCursor == + \A e \in l1Included : e.w_nonce < nextL1Slot + +SchedulerBehindL1 == + schedulerCursor <= nextL1Slot + +Inv == + /\ TypeOK + /\ SpineOrdering + /\ SafeBlockMonotonic + /\ InvalidOnlyOnGold + /\ CurrentStalenessMonotonic + /\ BatchNoncesContiguous + /\ ZombieSafety + /\ L1WNonceUnique + /\ ZombieNotYetIncluded + /\ L1BeforeCursor + /\ SchedulerBehindL1 + +--------------------------------------------------------------------------- +(* Initial state *) + +Init == + /\ spine = <<[index |-> 0, color |-> Tip, safe_block |-> 0, + inclusion_block |-> 0, w_nonce |-> NONE, batch_nonce |-> 0]>> + /\ invalid = <<0>> + /\ nextIndex = 1 + /\ currentSafeBlock = 0 + /\ walletNonce = 0 + /\ zombies = {} + /\ nextL1Slot = 0 + /\ l1Included = {} + /\ schedulerCursor = 0 + /\ schedulerExpected = 0 + +--------------------------------------------------------------------------- +(* + * AdvanceTip: close the current Tip -> Pending, append new Tip. + * Assigns safe_block (from environment) and batch_nonce. 
+ *) +AdvanceTip == + /\ nextIndex <= MaxBatchIndex + /\ LET tipPos == Len(spine) + IN + /\ spine[tipPos].color = Tip + /\ \E sb \in 0..currentSafeBlock : + /\ (tipPos > 1 => sb >= spine[tipPos - 1].safe_block) + /\ spine' = [i \in 1..Len(spine) + 1 |-> + IF i < tipPos THEN spine[i] + ELSE IF i = tipPos + THEN [index |-> spine[tipPos].index, + color |-> Pending, + safe_block |-> sb, + inclusion_block |-> 0, + w_nonce |-> NONE, + batch_nonce |-> tipPos - 1] + ELSE [index |-> nextIndex, + color |-> Tip, + safe_block |-> 0, + inclusion_block |-> 0, + w_nonce |-> NONE, + batch_nonce |-> 0]] + /\ invalid' = [i \in 1..Len(spine) + 1 |-> + IF i <= Len(spine) THEN invalid[i] ELSE 0] + /\ nextIndex' = nextIndex + 1 + /\ UNCHANGED <> + +--------------------------------------------------------------------------- +(* + * SubmitBatch: assign w_nonces to ALL unsubmitted Pending batches + * at once, in spine-position order. This models the real batch + * submitter which reads the on-chain nonce and submits every + * pending batch each tick. + *) +SubmitBatch == + LET unsubPos == {i \in 1..Len(spine) : + spine[i].color = Pending /\ spine[i].w_nonce = NONE} + \* Read on-chain nonce: can't use a slot L1 already consumed + wn0 == IF walletNonce >= nextL1Slot THEN walletNonce ELSE nextL1Slot + IN + /\ unsubPos # {} + /\ spine' = [i \in 1..Len(spine) |-> + IF i \in unsubPos + THEN [spine[i] EXCEPT + !.w_nonce = wn0 + Cardinality({j \in unsubPos : j < i})] + ELSE spine[i]] + /\ walletNonce' = wn0 + Cardinality(unsubPos) + /\ UNCHANGED <> + +--------------------------------------------------------------------------- +(* + * L1Include: include one transaction at w_nonce = nextL1Slot. + * + * If both a spine Pending and a zombie exist at this slot, L1 + * non-deterministically picks one (mempool competition). + * + * Spine wins: Pending -> Bronze (or Silver if block already safe). + * Zombie wins: zombie included; competing Pending's w_nonce bumped. 
+ * + * inclusion_block >= currentSafeBlock (L1 monotonicity: transactions + * are included in current or future blocks) and >= all previous + * inclusion blocks (block numbers are monotonic). + *) + +L1IncludeSpine == + LET pos == PendingAtWNonce(spine, nextL1Slot) + IN + /\ pos > 0 + /\ \E ib \in currentSafeBlock..MaxSafeBlock : + \* Block ordering: non-decreasing inclusion_block + /\ \A e \in l1Included : ib >= e.inclusion_block + /\ LET isSafe == ib <= currentSafeBlock + newColor == IF isSafe THEN Silver ELSE Bronze + IN + /\ spine' = [spine EXCEPT ![pos].color = newColor, + ![pos].inclusion_block = ib] + /\ l1Included' = l1Included \union + {[batch_nonce |-> spine[pos].batch_nonce, + w_nonce |-> nextL1Slot, + inclusion_block |-> ib, + safe_block |-> spine[pos].safe_block, + is_safe |-> isSafe]} + /\ nextL1Slot' = nextL1Slot + 1 + \* Kill zombie at this slot if it existed + /\ zombies' = {z \in zombies : z.w_nonce # nextL1Slot} + /\ UNCHANGED <> + +L1IncludeZombie == + /\ \E z \in zombies : z.w_nonce = nextL1Slot + /\ LET z == CHOOSE zz \in zombies : zz.w_nonce = nextL1Slot + IN + \E ib \in currentSafeBlock..MaxSafeBlock : + /\ \A e \in l1Included : ib >= e.inclusion_block + /\ l1Included' = l1Included \union + {[batch_nonce |-> z.batch_nonce, + w_nonce |-> nextL1Slot, + inclusion_block |-> ib, + safe_block |-> z.safe_block, + is_safe |-> (ib <= currentSafeBlock)]} + /\ nextL1Slot' = nextL1Slot + 1 + /\ zombies' = {zz \in zombies : zz.w_nonce # nextL1Slot} + \* If a spine Pending was competing at this slot, reset ALL + \* submitted Pending w_nonces. The batch submitter will + \* re-read the on-chain nonce and resubmit everything. 
+ /\ LET hasConflict == PendingAtWNonce(spine, nextL1Slot) > 0 + IN + IF hasConflict + THEN /\ spine' = [i \in 1..Len(spine) |-> + IF spine[i].color = Pending + /\ spine[i].w_nonce # NONE + THEN [spine[i] EXCEPT !.w_nonce = NONE] + ELSE spine[i]] + /\ walletNonce' = nextL1Slot + 1 + ELSE /\ UNCHANGED spine + /\ UNCHANGED walletNonce + /\ UNCHANGED <> + +L1Include == L1IncludeSpine \/ L1IncludeZombie + +--------------------------------------------------------------------------- +(* + * AdvanceSafeBlock: environment advances the L1 safe block. + * Bronze -> Silver on spine when inclusion_block becomes safe. + * Marks l1Included entries as safe. + *) +AdvanceSafeBlock == + /\ currentSafeBlock < MaxSafeBlock + /\ \E sb \in (currentSafeBlock + 1)..MaxSafeBlock : + /\ currentSafeBlock' = sb + /\ spine' = [i \in 1..Len(spine) |-> + IF spine[i].color = Bronze /\ spine[i].inclusion_block <= sb + THEN [spine[i] EXCEPT !.color = Silver] + ELSE spine[i]] + /\ l1Included' = {[e EXCEPT !.is_safe = + (e.is_safe \/ (e.inclusion_block <= sb))] + : e \in l1Included} + /\ UNCHANGED <> + +--------------------------------------------------------------------------- +(* + * SchedulerStep: process the L1 entry at schedulerCursor. + * + * The on-chain scheduler sees L1 inputs in w_nonce order and + * maintains an expected batch nonce counter. + * + * Accept: batch_nonce matches AND not stale by inclusion. + * -> increment schedulerExpected, promote spine Silver -> Gold. + * Skip: nonce mismatch OR stale (nonce poisoning). + * -> schedulerExpected unchanged. + * + * If accepted but the batch is not on the spine (zombie was accepted), + * spine is unchanged but schedulerExpected increments. ZombieSafety + * would then be violated — which is exactly what we're proving + * cannot happen. 
+ *) +SchedulerStep == + /\ \E e \in l1Included : e.w_nonce = schedulerCursor /\ e.is_safe + /\ LET entry == CHOOSE e \in l1Included : + e.w_nonce = schedulerCursor /\ e.is_safe + IN + LET stale == entry.inclusion_block - entry.safe_block + >= MAX_WAIT_BLOCKS + accepted == entry.batch_nonce = schedulerExpected /\ ~stale + IN + /\ schedulerCursor' = schedulerCursor + 1 + /\ IF accepted + THEN /\ schedulerExpected' = schedulerExpected + 1 + /\ LET gp == SilverAtBN(spine, schedulerExpected) + IN IF gp > 0 + THEN spine' = [spine EXCEPT ![gp].color = Gold] + ELSE UNCHANGED spine + ELSE /\ UNCHANGED schedulerExpected + /\ UNCHANGED spine + /\ UNCHANGED <> + +--------------------------------------------------------------------------- +(* + * Resolve: detect staleness at the frontier, cascade-invalidate, + * create zombies from submitted Pending batches, open recovery Tip. + * + * CRITICAL: the frontier must be Silver (safe on L1) before we + * cascade. This guarantees the stale batch is permanently on L1 + * and the scheduler WILL see it and be poisoned — no mempool + * mutual exclusion can kill it. Detecting staleness on Bronze + * or Pending would allow a race where the recovery batch takes + * the frontier's L1 slot, preventing nonce poisoning and letting + * non-frontier zombies be accepted (see counterexample in commit + * history). + * + * Only submitted Pending batches (w_nonce # NONE) become zombies. + * Bronze/Silver batches are already in l1Included; the scheduler + * will process and reject them (stale or nonce mismatch). + * + * walletNonce is reset to nextL1Slot: the sequencer reads the + * latest on-chain nonce and resubmits from there. 
+ *) +Resolve == + /\ nextIndex <= MaxBatchIndex + /\ LET fng == FirstNonGold(spine) + IN + /\ fng > 0 + /\ fng > 1 \* need a Gold parent + /\ spine[fng].color = Silver \* ONLY Silver — must be safe on L1 + /\ IsStaleByInclusion(spine[fng]) + /\ LET newLen == fng \* (fng-1) Golds + 1 new Tip + \* Zombies from submitted Pending batches in the cascade + newZombies == + {[batch_nonce |-> spine[i].batch_nonce, + w_nonce |-> spine[i].w_nonce, + safe_block |-> spine[i].safe_block] : + i \in {j \in fng..Len(spine) : + spine[j].color = Pending /\ spine[j].w_nonce # NONE}} + IN + /\ spine' = [i \in 1..newLen |-> + IF i < fng THEN spine[i] \* all Gold + ELSE [index |-> nextIndex, + color |-> Tip, + safe_block |-> 0, + inclusion_block |-> 0, + w_nonce |-> NONE, + batch_nonce |-> 0]] + /\ invalid' = [i \in 1..newLen |-> + IF i = fng - 1 + THEN invalid[fng - 1] + (Len(spine) - fng + 1) + ELSE IF i < fng THEN invalid[i] + ELSE 0] + /\ nextIndex' = nextIndex + 1 + /\ zombies' = zombies \union newZombies + /\ walletNonce' = nextL1Slot + /\ UNCHANGED <> + +--------------------------------------------------------------------------- +(* Spec *) + +Next == + \/ AdvanceTip + \/ SubmitBatch + \/ L1Include + \/ AdvanceSafeBlock + \/ SchedulerStep + \/ Resolve + +Spec == Init /\ [][Next]_vars + +========================================================================= diff --git a/docs/recovery/justfile b/docs/recovery/justfile new file mode 100644 index 0000000..a35604a --- /dev/null +++ b/docs/recovery/justfile @@ -0,0 +1,12 @@ +tlc := env("TLC", "tlc") + +# Check the preemptive recovery spec (~90s) +check-preemptive: + {{tlc}} -workers auto -deadlock preemptive.tla + +# Check the optimistic recovery spec (~3min) +check-optimistic: + {{tlc}} -workers auto -deadlock history/optimistic.tla + +# Check all specs +check-all: check-preemptive check-optimistic diff --git a/docs/recovery/preemptive.cfg b/docs/recovery/preemptive.cfg new file mode 100644 index 0000000..a5ce60d --- /dev/null +++ 
b/docs/recovery/preemptive.cfg @@ -0,0 +1,10 @@ +SPECIFICATION Spec + +CONSTANTS + MaxBatchIndex = 5 + MaxSafeBlock = 5 + MAX_WAIT_BLOCKS = 2 + MaxWalletNonce = 8 + +INVARIANTS + Inv diff --git a/docs/recovery/preemptive.tla b/docs/recovery/preemptive.tla new file mode 100644 index 0000000..1991540 --- /dev/null +++ b/docs/recovery/preemptive.tla @@ -0,0 +1,445 @@ +---------------------------- MODULE preemptive ----------------------------- +(* + * Full operational model of the preemptive recovery design. + * + * Extends V3 with flush modeling: at each w_nonce slot, L1 + * non-deterministically includes the spine batch OR a flush no-op + * (killing the batch). This captures the complete flush lifecycle + * including the case where the frontier batch itself is killed. + * + * A killed batch acts as silent poison: the scheduler never sees it, + * so schedulerExpected stays stuck at its batch_nonce. All subsequent + * batches — whether alive on L1 or dead — have wrong nonces. + * Recovery resubmits the killed batch; if stale by inclusion, Resolve + * cascades; if fresh, the scheduler accepts it. Resolve can also + * discard an aging open Tip whose current-safe-block age has reached + * MAX_WAIT_BLOCKS. + * + * Colors on the spine: Gold* Silver* Bronze* Pending* Tip + * During flush, SpineOrdering can be temporarily violated (a killed + * Pending appears before a surviving Silver). This is transient — + * recovery restores Gold* + Tip. SpineOrdering is NOT checked as + * an invariant. 
+ * + * Proves: ZombieSafety == schedulerExpected = CountGold(spine) + * + * Actions: + * AdvanceTip -- close tip -> Pending, append new Tip + * SubmitBatch -- assign w_nonces to unsubmitted Pendings + * L1IncludeSpine -- spine batch wins its slot -> Bronze/Silver + * L1SkipSpine -- flush no-op wins, spine batch killed + * L1IncludeDead -- dead batch beats its flush no-op + * L1SkipDead -- flush no-op wins, dead batch killed + * AdvanceSafeBlock -- L1 safe block advances, Bronze -> Silver + * SchedulerStep -- scheduler processes next safe entry -> Gold + * SchedulerSkip -- scheduler skips gap (no-op slot) + * Resolve -- stale unresolved frontier -> cascade, recover + *) + +EXTENDS Integers, Sequences, FiniteSets + +CONSTANTS + MaxBatchIndex, + MaxSafeBlock, + MAX_WAIT_BLOCKS, + MaxWalletNonce \* bound on wallet nonce to keep state space finite + +NONE == -1 + +--------------------------------------------------------------------------- +(* Colors *) + +Gold == "Gold" +Silver == "Silver" +Bronze == "Bronze" +Pending == "Pending" +Tip == "Tip" + +Colors == {Gold, Silver, Bronze, Pending, Tip} + +ColorOrd(c) == + CASE c = Gold -> 0 + [] c = Silver -> 1 + [] c = Bronze -> 2 + [] c = Pending -> 3 + [] c = Tip -> 4 + +--------------------------------------------------------------------------- +(* Variables *) + +VARIABLES + spine, + invalid, + nextIndex, + currentSafeBlock, + walletNonce, + nextL1Slot, + l1Included, + schedulerCursor, + schedulerExpected, + deadBatches + +vars == <> + +--------------------------------------------------------------------------- +(* Helpers *) + +CountGold(s) == Cardinality({i \in 1..Len(s) : s[i].color = Gold}) + +FirstNonGold(s) == + IF \E i \in 1..Len(s) : s[i].color # Gold + THEN CHOOSE i \in 1..Len(s) : + s[i].color # Gold /\ \A j \in 1..i-1 : s[j].color = Gold + ELSE 0 + +PendingAtWNonce(s, wn) == + IF \E i \in 1..Len(s) : s[i].color = Pending /\ s[i].w_nonce = wn + THEN CHOOSE i \in 1..Len(s) : s[i].color = Pending /\ s[i].w_nonce = 
wn + ELSE 0 + +SilverAtBN(s, bn) == + IF \E i \in 1..Len(s) : s[i].color = Silver /\ s[i].batch_nonce = bn + THEN CHOOSE i \in 1..Len(s) : s[i].color = Silver /\ s[i].batch_nonce = bn + ELSE 0 + +--------------------------------------------------------------------------- +(* Staleness *) + +IsStaleByInclusion(b) == b.inclusion_block - b.safe_block >= MAX_WAIT_BLOCKS + +IsStaleByCurrent(b) == currentSafeBlock - b.safe_block >= MAX_WAIT_BLOCKS + +--------------------------------------------------------------------------- +(* Invariants *) + +TypeOK == + /\ Len(spine) >= 1 + /\ nextIndex \in Nat + /\ currentSafeBlock \in Nat + /\ walletNonce \in Nat + /\ nextL1Slot \in Nat + /\ schedulerCursor \in Nat + /\ schedulerExpected \in Nat + +\* Batch nonces are contiguous (0..N-1) for non-Tip spine elements. +BatchNoncesContiguous == + \A i \in 1..Len(spine) : + spine[i].color # Tip => spine[i].batch_nonce = i - 1 + +\* Dead branches only hang off Gold nodes. +InvalidOnlyOnGold == + \A i \in 1..Len(spine) : invalid[i] > 0 => spine[i].color = Gold + +\* ------- THE KEY THEOREM ------- +\* The scheduler accepts exactly the Gold prefix. +ZombieSafety == schedulerExpected = CountGold(spine) + +\* L1 consistency +L1WNonceUnique == + \A e1, e2 \in l1Included : e1.w_nonce = e2.w_nonce => e1 = e2 + +L1BeforeCursor == + \A e \in l1Included : e.w_nonce < nextL1Slot + +SchedulerBehindL1 == + schedulerCursor <= nextL1Slot + +DeadNotYetIncluded == + \A d \in deadBatches : d.w_nonce >= nextL1Slot + +Inv == + /\ TypeOK + /\ BatchNoncesContiguous + /\ InvalidOnlyOnGold + /\ ZombieSafety + /\ L1WNonceUnique + /\ L1BeforeCursor + /\ SchedulerBehindL1 + /\ DeadNotYetIncluded + +--------------------------------------------------------------------------- +(* Initial state *) + +(* + * Initial state: Genesis sentinel (nonce 0) is already Gold. + * This is a modeling technique that eliminates the nonce-0 edge + * case, allowing Resolve to use uniform logic. 
The implementation + * can handle nonce-0 however is simplest (see README.md). + * + * Tip.safe_block models the first frame's safe_block of the open batch. + * Keeping it meaningful lets the spec represent a Tip that ages past + * MAX_WAIT_BLOCKS before ever getting an L1 transaction. + *) +Init == + /\ spine = <<[index |-> 0, color |-> Gold, safe_block |-> 0, + inclusion_block |-> 0, w_nonce |-> 0, batch_nonce |-> 0], + [index |-> 1, color |-> Tip, safe_block |-> 0, + inclusion_block |-> 0, w_nonce |-> NONE, batch_nonce |-> 0]>> + /\ invalid = <<0, 0>> + /\ nextIndex = 2 + /\ currentSafeBlock = 0 + /\ walletNonce = 1 + /\ nextL1Slot = 1 + /\ l1Included = {[batch_nonce |-> 0, w_nonce |-> 0, + inclusion_block |-> 0, safe_block |-> 0, + is_safe |-> TRUE]} + /\ schedulerCursor = 1 + /\ schedulerExpected = 1 + /\ deadBatches = {} + +--------------------------------------------------------------------------- +(* AdvanceTip: close tip -> Pending, append new Tip *) + +AdvanceTip == + /\ nextIndex <= MaxBatchIndex + /\ LET tipPos == Len(spine) IN + /\ spine[tipPos].color = Tip + /\ spine[tipPos].safe_block <= currentSafeBlock + /\ (tipPos > 1 => spine[tipPos].safe_block >= spine[tipPos - 1].safe_block) + /\ spine' = [i \in 1..Len(spine) + 1 |-> + IF i < tipPos THEN spine[i] + ELSE IF i = tipPos + THEN [index |-> spine[tipPos].index, + color |-> Pending, + safe_block |-> spine[tipPos].safe_block, + inclusion_block |-> 0, + w_nonce |-> NONE, + batch_nonce |-> tipPos - 1] + ELSE [index |-> nextIndex, + color |-> Tip, + safe_block |-> currentSafeBlock, + inclusion_block |-> 0, + w_nonce |-> NONE, + batch_nonce |-> 0]] + /\ invalid' = [i \in 1..Len(spine) + 1 |-> + IF i <= Len(spine) THEN invalid[i] ELSE 0] + /\ nextIndex' = nextIndex + 1 + /\ UNCHANGED <> + +--------------------------------------------------------------------------- +(* + * SubmitBatch: assign w_nonces to ALL unsubmitted Pending batches + * at once, in spine-position order. 
+ *)
+SubmitBatch ==
+    LET unsubPos == {i \in 1..Len(spine) :
+                        spine[i].color = Pending /\ spine[i].w_nonce = NONE}
+        wn0 == IF walletNonce >= nextL1Slot THEN walletNonce ELSE nextL1Slot
+    IN
+    /\ unsubPos # {}
+    /\ wn0 + Cardinality(unsubPos) <= MaxWalletNonce  \* bound check
+    /\ spine' = [i \in 1..Len(spine) |->
+                    IF i \in unsubPos
+                    THEN [spine[i] EXCEPT
+                            !.w_nonce = wn0 + Cardinality({j \in unsubPos : j < i})]
+                    ELSE spine[i]]
+    /\ walletNonce' = wn0 + Cardinality(unsubPos)
+    /\ UNCHANGED <<invalid, nextIndex, currentSafeBlock, nextL1Slot,
+                   l1Included, schedulerCursor, schedulerExpected, deadBatches>>
+
+---------------------------------------------------------------------------
+(*
+ * L1 actions: the L1 stream processes transactions in w_nonce order.
+ * At each slot, if both a spine batch and a flush no-op exist,
+ * L1 non-deterministically picks one.
+ *
+ * inclusion_block >= currentSafeBlock (L1 monotonicity) and
+ * >= all previous inclusion_blocks (block ordering).
+ *)
+
+\* Spine batch wins its slot -> Bronze or Silver.
+L1IncludeSpine ==
+    LET pos == PendingAtWNonce(spine, nextL1Slot) IN
+    /\ pos > 0
+    /\ \E ib \in currentSafeBlock..MaxSafeBlock :
+        /\ \A e \in l1Included : ib >= e.inclusion_block
+        /\ LET isSafe   == ib <= currentSafeBlock
+               newColor == IF isSafe THEN Silver ELSE Bronze
+           IN
+           /\ spine' = [spine EXCEPT ![pos].color = newColor,
+                                     ![pos].inclusion_block = ib]
+           /\ l1Included' = l1Included \union
+                  {[batch_nonce     |-> spine[pos].batch_nonce,
+                    w_nonce         |-> nextL1Slot,
+                    inclusion_block |-> ib,
+                    safe_block      |-> spine[pos].safe_block,
+                    is_safe         |-> isSafe]}
+           /\ nextL1Slot' = nextL1Slot + 1
+           /\ UNCHANGED <<invalid, nextIndex, currentSafeBlock, walletNonce,
+                          schedulerCursor, schedulerExpected, deadBatches>>
+
+\* Flush no-op wins at a spine Pending's slot.
+\* The batch is killed: w_nonce reset to NONE.
+\* The scheduler never sees it — silent nonce poison.
+L1SkipSpine ==
+    LET pos == PendingAtWNonce(spine, nextL1Slot) IN
+    /\ pos > 0
+    /\ spine' = [spine EXCEPT ![pos].w_nonce = NONE]
+    /\ nextL1Slot' = nextL1Slot + 1
+    /\ UNCHANGED <<invalid, nextIndex, currentSafeBlock, walletNonce,
+                   l1Included, schedulerCursor, schedulerExpected, deadBatches>>
+
+\* Dead batch (from cascade) beats its flush no-op.
+L1IncludeDead ==
+    /\ \E d \in deadBatches : d.w_nonce = nextL1Slot
+    /\ LET d == CHOOSE dd \in deadBatches : dd.w_nonce = nextL1Slot IN
+       \E ib \in currentSafeBlock..MaxSafeBlock :
+           /\ \A e \in l1Included : ib >= e.inclusion_block
+           /\ l1Included' = l1Included \union
+                  {[batch_nonce     |-> d.batch_nonce,
+                    w_nonce         |-> nextL1Slot,
+                    inclusion_block |-> ib,
+                    safe_block      |-> d.safe_block,
+                    is_safe         |-> (ib <= currentSafeBlock)]}
+           /\ deadBatches' = deadBatches \ {d}
+           /\ nextL1Slot' = nextL1Slot + 1
+           /\ UNCHANGED <<spine, invalid, nextIndex, currentSafeBlock,
+                          walletNonce, schedulerCursor, schedulerExpected>>
+
+\* Flush no-op wins at a dead batch's slot.
+L1SkipDead ==
+    /\ \E d \in deadBatches : d.w_nonce = nextL1Slot
+    /\ LET d == CHOOSE dd \in deadBatches : dd.w_nonce = nextL1Slot IN
+       /\ deadBatches' = deadBatches \ {d}
+       /\ nextL1Slot' = nextL1Slot + 1
+       /\ UNCHANGED <<spine, invalid, nextIndex, currentSafeBlock, walletNonce,
+                      l1Included, schedulerCursor, schedulerExpected>>
+
+L1Include ==
+    \/ L1IncludeSpine
+    \/ L1SkipSpine
+    \/ L1IncludeDead
+    \/ L1SkipDead
+
+---------------------------------------------------------------------------
+(* AdvanceSafeBlock: L1 safe block advances, Bronze -> Silver *)
+
+AdvanceSafeBlock ==
+    /\ currentSafeBlock < MaxSafeBlock
+    /\ \E sb \in (currentSafeBlock + 1)..MaxSafeBlock :
+        /\ currentSafeBlock' = sb
+        /\ spine' = [i \in 1..Len(spine) |->
+                        IF spine[i].color = Bronze /\ spine[i].inclusion_block <= sb
+                        THEN [spine[i] EXCEPT !.color = Silver]
+                        ELSE spine[i]]
+        /\ l1Included' = {[e EXCEPT !.is_safe =
+                              (e.is_safe \/ (e.inclusion_block <= sb))]
+                          : e \in l1Included}
+        /\ UNCHANGED <<invalid, nextIndex, walletNonce, nextL1Slot,
+                       schedulerCursor, schedulerExpected, deadBatches>>
+
+---------------------------------------------------------------------------
+(*
+ * SchedulerStep: process the L1 entry at schedulerCursor.
+ * Accept: batch_nonce matches AND not stale -> Gold promotion.
+ * Skip: nonce mismatch OR stale (nonce poisoning).
+ *)
+SchedulerStep ==
+    /\ \E e \in l1Included : e.w_nonce = schedulerCursor /\ e.is_safe
+    /\ LET entry == CHOOSE e \in l1Included :
+                        e.w_nonce = schedulerCursor /\ e.is_safe
+       IN
+       LET stale    == entry.inclusion_block - entry.safe_block
+                           >= MAX_WAIT_BLOCKS
+           accepted == entry.batch_nonce = schedulerExpected /\ ~stale
+       IN
+       /\ schedulerCursor' = schedulerCursor + 1
+       /\ IF accepted
+          THEN /\ schedulerExpected' = schedulerExpected + 1
+               /\ LET gp == SilverAtBN(spine, schedulerExpected)
+                  IN IF gp > 0
+                     THEN spine' = [spine EXCEPT ![gp].color = Gold]
+                     ELSE UNCHANGED spine
+          ELSE /\ UNCHANGED schedulerExpected
+               /\ UNCHANGED spine
+       /\ UNCHANGED <<invalid, nextIndex, currentSafeBlock, walletNonce,
+                      nextL1Slot, l1Included, deadBatches>>
+
+(*
+ * SchedulerSkip: advance cursor over a gap (no-op consumed the slot,
+ * so no l1Included entry exists).
+ *)
+SchedulerSkip ==
+    /\ schedulerCursor < nextL1Slot
+    /\ ~(\E e \in l1Included : e.w_nonce = schedulerCursor)
+    /\ schedulerCursor' = schedulerCursor + 1
+    /\ UNCHANGED <<spine, invalid, nextIndex, currentSafeBlock, walletNonce,
+                   nextL1Slot, l1Included, schedulerExpected, deadBatches>>
+
+---------------------------------------------------------------------------
+(*
+ * Resolve: the oldest unresolved batch is definitely stale ->
+ * cascade-invalidate.
+ *
+ * Two cases are modeled:
+ *   1. the frontier unresolved batch is Silver and stale by inclusion
+ *      (the submitted-batch zombie path), or
+ *   2. the frontier unresolved batch is Tip and stale by currentSafeBlock
+ *      (the aging open-batch path).
+ *
+ * Cascade-invalidated batches already on L1 (Silver/Bronze) remain
+ * in l1Included. Submitted Pendings become dead batches.
+ * Unsubmitted Pendings are discarded.
+ *
+ * walletNonce is NOT reset — recovery batches use w_nonces past
+ * all dead batch slots.
+ *
+ * The genesis sentinel guarantees fng > 1 (there is always at
+ * least one Gold ancestor).
+ *) +Resolve == + /\ nextIndex <= MaxBatchIndex + /\ LET fng == FirstNonGold(spine) IN + /\ fng > 1 + /\ ((spine[fng].color = Silver /\ IsStaleByInclusion(spine[fng])) + \/ (spine[fng].color = Tip /\ IsStaleByCurrent(spine[fng]))) + /\ LET newLen == fng + newDead == + {[batch_nonce |-> spine[i].batch_nonce, + w_nonce |-> spine[i].w_nonce, + safe_block |-> spine[i].safe_block] : + i \in {j \in fng..Len(spine) : + spine[j].color = Pending /\ spine[j].w_nonce # NONE}} + IN + /\ spine' = [i \in 1..newLen |-> + IF i < fng THEN spine[i] + ELSE [index |-> nextIndex, + color |-> Tip, + safe_block |-> currentSafeBlock, + inclusion_block |-> 0, + w_nonce |-> NONE, + batch_nonce |-> 0]] + /\ invalid' = [i \in 1..newLen |-> + IF i = fng - 1 + THEN invalid[fng - 1] + (Len(spine) - fng + 1) + ELSE IF i < fng THEN invalid[i] + ELSE 0] + /\ nextIndex' = nextIndex + 1 + /\ deadBatches' = deadBatches \union newDead + /\ UNCHANGED <> + +--------------------------------------------------------------------------- +(* Spec *) + +Next == + \/ AdvanceTip + \/ SubmitBatch + \/ L1Include + \/ AdvanceSafeBlock + \/ SchedulerStep + \/ SchedulerSkip + \/ Resolve + +Spec == Init /\ [][Next]_vars + +========================================================================= diff --git a/docs/threat-model/README.md b/docs/threat-model/README.md new file mode 100644 index 0000000..6b61cd2 --- /dev/null +++ b/docs/threat-model/README.md @@ -0,0 +1,102 @@ +# Threat Model + +The security posture this codebase defends against. Defines what is in scope for security review, what is out of scope, and the trust level assigned to each actor and interface. + +See [`../recovery/README.md`](../recovery/README.md) for the recovery subsystem, which operationalizes parts of this threat model (adversarial mempool, fail-stop L1 provider). + +## Assets + +What we are protecting: + +- **Rollup state integrity.** The canonical on-chain state must reflect a deterministic replay of user operations and direct inputs. 
Any divergence between the sequencer's off-chain view and the scheduler's on-chain execution is a state-integrity failure. +- **Soft-confirmation honesty.** Every soft confirmation issued by the sequencer must land on L1 as promised, or be explicitly revoked via recovery. +- **User funds.** No user operation, replay, or protocol break can cause users to lose funds. +- **Batch-submitter key.** Held in operator infra; not hijackable by network attackers. + +## Actors and trust + +| Actor | Trust | Capabilities | +|-------|-------|--------------| +| InputBox contract | Trusted | Authenticates `msg_sender` on `addInput`. Use correctly; do not model forgery. | +| Our Ethereum node | Trusted, fail-stop | Inside our infra. May become unreachable; will never lie. | +| Fallback RPC (Infura / Alchemy) | Semi-trusted, fail-stop | Liveness fallback during primary outages. May withhold or delay. Never byzantine. | +| Operator env / CLI flags | Trusted | Configuration is authoritative. | +| Batch-submitter private key | Private | Held in operator infra. Not reachable by the network. | +| Sequencer's own code | Trusted (bug-free is a precondition) | Bugs are caught via tests and review, not defended against at runtime. See "self-trust" below. | +| **L1 mempool and block builders** | **Fully adversarial** | May reorder, delay, drop, or selectively include submitted transactions. Private mempools mean "dropped" is indistinguishable from "delayed indefinitely." | +| HTTP clients at `POST /tx` | Untrusted | Arbitrary public callers. May submit malformed, malicious, or replay payloads. | +| WebSocket subscribers at `/ws/subscribe` | Internal, but untrusted for data-exposure | Intended for internal indexers. Treat as public for what is exposed. | +| Direct-input senders on L1 | Untrusted | Arbitrary L1 accounts calling InputBox. May submit any calldata. | + +### Self-trust + +The sequencer trusts that its own code is correct. 
If the sequencer emits a malformed batch, frame, or user op, it is already in a bug state that requires manual intervention — we do not layer runtime defenses against sequencer self-misbehavior. Recovery addresses liveness failures (infrastructure outages, network partitions, gateway failure), not bug-induced malformed state. + +This is not an excuse to skip validation at trust boundaries. Inputs from untrusted actors are validated rigorously. Internal invariants are enforced by type system, SQL constraints, and tests — not by defensive runtime checks against hypothetical self-misbehavior. + +## In-scope failure modes + +- L1 provider outages (primary and fallback), minutes to hours +- Process crashes at arbitrary points, including mid-transaction +- **Adversarial mempool:** reorder, delay, drop, selective inclusion by builders +- **Zombie transactions:** a submitted batch may sit in a private mempool indefinitely and land long after we believed it was gone. The recovery flusher is load-bearing for this threat: it consumes every pending `w_nonce` slot with a no-op so zombies cannot claim them. +- L1 reorgs up to safe depth +- Malicious `POST /tx` callers: malformed signatures, spoofed sender, replay across chains or apps, nonce manipulation +- Malicious direct-input senders: arbitrary payload, any intent; sender authenticity is guaranteed by InputBox +- Scheduler/sequencer protocol divergence of any kind (ordering, nonce rules, signature validity, fee semantics) + +## Out of scope + +- **DoS, rate limiting, resource exhaustion.** Handled by infrastructure (WAF, load balancer, connection limits). Not addressed at the Rust layer. +- **Byzantine L1 provider.** Our own node; honest by assumption. +- **Byzantine InputBox.** Audited L1 contract; trusted. +- **Memory safety.** Rust eliminates this class. +- **Secrets-at-rest security.** Handled by operator infra (secrets manager, file permissions, encrypted volumes). 
+- **Supply-chain compromise of dependencies.** Tracked via dependency pinning and out-of-band vulnerability feeds, not by code review. +- **Sequencer self-bugs as an attack vector.** Addressed via correctness review, tests, and manual intervention when they occur — see "Self-trust" above. + +## External assumptions we rely on + +These are preconditions the sequencer takes as given. They are neither "trust" nor "threat" — they are invariants about the environment that must hold for the design to be sound. If they break, the sequencer's safety guarantees degrade. + +### L1 block-time coupling + +The wall-clock fallback in [`sequencer/src/recovery/mod.rs`](../../sequencer/src/recovery/mod.rs) estimates missed blocks as: + +``` +estimated_missed_blocks = (now - last_sync_ms) / SEQ_SECONDS_PER_BLOCK +``` + +This assumes a **known, bounded-variance relationship** between elapsed wall-clock time and mined L1 block count. The assumption has three parts: + +1. **Known average block time** — `SEQ_SECONDS_PER_BLOCK` (default 12s, Ethereum mainnet) accurately reflects the target chain's block cadence. +2. **Bounded variance** — over the danger-threshold window (~4h on mainnet), the delta between `elapsed_seconds / avg_block_time` and actual mined blocks is small. On Ethereum mainnet this holds: slot proposers occasionally skip, but >99% of slots produce a block. +3. **Wall clock is monotonic and accurate** — the host's `SystemTime::now()` does not jump backward significantly or drift. Handled by saturating subtraction against clock backward jumps, but not against systematic drift. + +**Where it matters.** Only on the fallback path — when L1 is unreachable and we cannot observe block numbers directly. When L1 is up, observed block numbers are authoritative and this assumption is not consulted. 
+ +**Violation modes.** +- **Chain with unstable block time.** A chain where average block time drifts substantially (e.g., PoW networks under major hashrate swings) would make the estimate less reliable. Mitigation: `SEQ_SECONDS_PER_BLOCK` should be tuned conservatively (overestimate block time → underestimate missed blocks → more cautious recovery triggers). +- **Operator misconfigures `SEQ_SECONDS_PER_BLOCK`.** Typo or copy-paste error pointing at the wrong chain's cadence. Operator-trust scope. +- **Significant host clock drift.** A sequencer host whose clock lags or leads the real-world by minutes per day could slowly desynchronize its danger estimates from reality. + +**Corollary for test design.** To deterministically exercise the wall-clock fallback, tests must maintain this coupling: when advancing the L1 block count, they should also advance (or simulate) the corresponding wall-clock interval. Our e2e harness does the reverse — it rewinds `l1_safe_head.synced_at_ms` to an older timestamp, which is semantically equivalent to advancing the wall clock. See [`tests/TEST_PLAN.md`](../../tests/TEST_PLAN.md) §7.8 and tool T7. + +## How to apply this doc in code review + +For each code path under review: + +1. **Where does the input come from?** Map the source to the actor table. Untrusted sources require validation; trusted sources do not. +2. **What are the downstream effects?** DB write, signed L1 submission, WS broadcast, process control. The more consequential the effect, the tighter the validation must be. +3. **Does the code assume any actor behaves better than the table says?** Common mistakes: + - Assuming the mempool won't hold a tx indefinitely. + - Assuming a tx we "gave up on" is permanently dead. + - Assuming `safe_block` is current during an RPC outage. + - Assuming the sequencer's own code is correct where a bug would breach a trust boundary (e.g., emit signed state to L1). +4. **Correctness or exploitation?** Both are in scope. 
Under rollup semantics, a correctness bug that causes state divergence is as severe as a direct exploit. + +## Related documents + +- [`../recovery/README.md`](../recovery/README.md) — recovery design, TLA+ formal verification +- [`../../AGENTS.md`](../../AGENTS.md) — architecture, coding conventions, hot-path invariants +- [`../../SECURITY_TODO.md`](../../SECURITY_TODO.md) — open findings from staged security review diff --git a/examples/app-core/src/application/wallet.rs b/examples/app-core/src/application/wallet.rs index e1db37a..d4f5f55 100644 --- a/examples/app-core/src/application/wallet.rs +++ b/examples/app-core/src/application/wallet.rs @@ -145,14 +145,8 @@ impl Application for WalletApp { }); } - let max_fee = user_op.max_fee; - // Users sign a cap (log-space exponent); sequencer executes against the committed frame fee. - if max_fee < current_fee { - return Err(InvalidReason::InvalidMaxFee { - max_fee, - base_fee: current_fee, - }); - } + // max_fee < current_fee is already checked by the trait default in + // validate_and_execute_user_op. No need to repeat here. 
let gas_cost = sequencer_core::fee::fee_to_linear(current_fee); let balance = self.balance_of(&sender); @@ -183,33 +177,31 @@ impl Application for WalletApp { let method = Method::from_ssz_bytes(user_op.data.as_slice()).ok(); match method.as_ref() { - Some(Method::Transfer(transfer)) => { - if self.debit_if_possible(sender, transfer.amount) { - self.credit(transfer.to, transfer.amount); - outputs.push(AppOutput::Notice( - TransferNotice { - sender, - recipient: transfer.to, - amount: transfer.amount, - } - .abi_encode(), - )); - } + Some(Method::Transfer(transfer)) if self.debit_if_possible(sender, transfer.amount) => { + self.credit(transfer.to, transfer.amount); + outputs.push(AppOutput::Notice( + TransferNotice { + sender, + recipient: transfer.to, + amount: transfer.amount, + } + .abi_encode(), + )); } - Some(Method::Withdrawal(withdrawal)) => { - if self.debit_if_possible(sender, withdrawal.amount) { - outputs.push(AppOutput::Voucher { - destination: self.config.supported_erc20_token, - value: U256::ZERO, - payload: Erc20Transfer { - recipient: sender, - amount: withdrawal.amount, - } - .abi_encode(), - }); - } + Some(Method::Withdrawal(withdrawal)) + if self.debit_if_possible(sender, withdrawal.amount) => + { + outputs.push(AppOutput::Voucher { + destination: self.config.supported_erc20_token, + value: U256::ZERO, + payload: Erc20Transfer { + recipient: sender, + amount: withdrawal.amount, + } + .abi_encode(), + }); } - None => {} + _ => {} } self.executed_input_count = self.executed_input_count.saturating_add(1); @@ -279,6 +271,8 @@ mod tests { #[test] fn validate_rejects_when_max_fee_below_current_fee() { + use sequencer_core::application::{Application, ExecutionOutcome}; + let mut app = WalletApp::new(WalletConfig::default()); let sender = Address::from_slice(&[0x11; 20]); app.balances.insert(sender, U256::from(10_u64)); @@ -289,15 +283,17 @@ mod tests { data: Vec::::new().into(), }; - let err = app - .validate_user_op(sender, &user_op, 2) - 
.expect_err("max_fee < current_fee should be invalid"); + // The max_fee < current_fee check now lives in the trait default + // (validate_and_execute_user_op), not in validate_user_op directly. + let result = app + .validate_and_execute_user_op(sender, &user_op, 2) + .expect("should return Ok(Invalid), not Err"); assert_eq!( - err, - InvalidReason::InvalidMaxFee { + result, + ExecutionOutcome::Invalid(InvalidReason::InvalidMaxFee { max_fee: 1, base_fee: 2 - } + }) ); } diff --git a/examples/canonical-app/justfile b/examples/canonical-app/justfile index 10f08f0..f3eb915 100644 --- a/examples/canonical-app/justfile +++ b/examples/canonical-app/justfile @@ -2,7 +2,13 @@ set shell := ["bash", "-euo", "pipefail", "-c"] out_dir := "out" source_date_epoch := "0" +cartesi_machine_version := "0.20.0" +linux_image_release := "v0.20.0" +linux_kernel_filename := "linux-6.5.13-ctsi-1-v0.20.0.bin" linux_kernel := out_dir + "/linux.bin" +linux_kernel_sha512 := linux_kernel + ".sha512" +linux_kernel_url := "https://github.com/cartesi/machine-linux-image/releases/download/" + linux_image_release + "/" + linux_kernel_filename +linux_kernel_sha512_url := linux_kernel_url + ".sha512" rootfs_tar := out_dir + "/canonical-rootfs.tar" rootfs_ext2 := out_dir + "/canonical-rootfs.ext2" machine_image := out_dir + "/canonical-machine-image" @@ -13,7 +19,17 @@ machine_image_sepolia := out_dir + "/canonical-machine-image-sepolia" download-deps: @mkdir -p {{out_dir}} - @if [[ ! -f {{linux_kernel}} ]]; then wget https://github.com/cartesi/image-kernel/releases/download/v0.20.0/linux-6.5.13-ctsi-1-v0.20.0.bin -O {{linux_kernel}}; fi + @kernel_tmp="{{linux_kernel}}.tmp"; checksum_tmp="{{linux_kernel_sha512}}.tmp"; \ + verify_kernel() { (cd {{out_dir}} && shasum -a 512 -c "$(basename {{linux_kernel_sha512}})" >/dev/null); }; \ + if [[ ! -s {{linux_kernel}} || ! -s {{linux_kernel_sha512}} ]] || ! 
verify_kernel; then \ + rm -f "{{linux_kernel}}" "{{linux_kernel_sha512}}" "$kernel_tmp" "$checksum_tmp"; \ + wget "{{linux_kernel_url}}" -O "$kernel_tmp"; \ + wget "{{linux_kernel_sha512_url}}" -O "$checksum_tmp"; \ + mv "$kernel_tmp" "{{linux_kernel}}"; \ + sed "s# artifacts/[^ ]*\$# $(basename {{linux_kernel}})#" "$checksum_tmp" > "{{linux_kernel_sha512}}"; \ + rm -f "$checksum_tmp"; \ + verify_kernel; \ + fi build-dapp: build-dapp-devnet @@ -59,8 +75,9 @@ clean: rm -rf {{out_dir}} build-machine-image: clean-machine-image build-rootfs-devnet - test -f {{linux_kernel}} || { echo "missing {{linux_kernel}}; run 'just setup' first"; exit 1; } + test -s {{linux_kernel}} || { echo "missing or empty {{linux_kernel}}; run 'just setup' first"; exit 1; } cartesi-machine \ + --assert-version={{cartesi_machine_version}} \ --ram-length=128Mi \ --ram-image={{linux_kernel}} \ --flash-drive=label:root,data_filename:{{rootfs_ext2}} \ @@ -70,8 +87,9 @@ build-machine-image: clean-machine-image build-rootfs-devnet --store={{machine_image}} build-machine-image-sepolia: clean-machine-image-sepolia build-rootfs-sepolia - test -f {{linux_kernel}} || { echo "missing {{linux_kernel}}; run 'just setup' first"; exit 1; } + test -s {{linux_kernel}} || { echo "missing or empty {{linux_kernel}}; run 'just setup' first"; exit 1; } cartesi-machine \ + --assert-version={{cartesi_machine_version}} \ --ram-length=128Mi \ --ram-image={{linux_kernel}} \ --flash-drive=label:root,data_filename:{{rootfs_ext2}} \ diff --git a/examples/canonical-app/src/scheduler/core.rs b/examples/canonical-app/src/scheduler/core.rs index 95618e2..90b49bd 100644 --- a/examples/canonical-app/src/scheduler/core.rs +++ b/examples/canonical-app/src/scheduler/core.rs @@ -13,7 +13,7 @@ pub const DEVNET_SEQUENCER_ADDRESS: Address = address!("0xf39Fd6e51aad88F6F4ce6aB8827279cffFb92266"); pub const SEPOLIA_SEQUENCER_ADDRESS: Address = address!("0x16d5FF3Fdd14e2a86FBA77cbcE6B3Cd9C32b8Ff3"); -pub const MAX_WAIT_BLOCKS: u64 = 1200; 
+pub const MAX_WAIT_BLOCKS: u64 = sequencer_core::MAX_WAIT_BLOCKS; #[derive(Debug, Clone, PartialEq, Eq)] pub struct SchedulerConfig { @@ -187,7 +187,6 @@ impl Scheduler { self.config.max_wait_blocks, inclusion_block, ) { - self.advance_expected_batch_nonce(); return ProcessResult::without_outputs(ProcessOutcome::BatchSkippedStale); } @@ -245,6 +244,8 @@ impl Scheduler { for user_op in &frame.user_ops { if let Some(sender) = self.recover_sender(domain, user_op) { let plain = user_op.to_user_op(); + // Defense-in-depth: the trait default in validate_and_execute_user_op + // now centralizes this check, but we keep it here as an extra guard. if plain.max_fee < frame.fee_price { eprintln!("scheduler skipped frame user-op due to max_fee < fee_price"); continue; @@ -327,13 +328,7 @@ fn has_elapsed_since(start_block: u64, wait_blocks: u64, current_block: u64) -> } pub(super) fn input_domain(chain_id: u64, verifying_contract: Address) -> Eip712Domain { - Eip712Domain { - name: None, - version: None, - chain_id: Some(U256::from(chain_id)), - verifying_contract: Some(verifying_contract), - salt: None, - } + sequencer_core::build_input_domain(chain_id, verifying_contract) } pub(super) fn block_to_u64(block: U256) -> u64 { @@ -619,7 +614,7 @@ mod tests { } #[test] - fn stale_batch_is_skipped_and_consumes_nonce() { + fn stale_batch_is_skipped_without_consuming_nonce() { let mut scheduler = Scheduler::new( RecordingApp::default(), SchedulerConfig { @@ -648,14 +643,16 @@ mod tests { let outcome = scheduler.process_input(batch_input(10, stale_batch)); assert_eq!(outcome, ProcessOutcome::BatchSkippedStale); assert_eq!(scheduler.app.events(), [RecordedTx::Direct(9)]); - assert_eq!(scheduler.next_expected_batch_nonce(), 1); + // Stale batches do NOT consume the nonce — they are true no-ops in nonce space. + assert_eq!(scheduler.next_expected_batch_nonce(), 0); + // The next valid batch reuses nonce 0. 
let fresh_signing_key = SigningKey::from_bytes((&[13_u8; 32]).into()).expect("fresh signing key"); let fresh_sender = address_from_signing_key(&fresh_signing_key); scheduler.app.credit(fresh_sender, 1); let fresh_batch = Batch { - nonce: 1, + nonce: 0, frames: vec![Frame { user_ops: vec![sign_wire_user_op( &test_domain(), diff --git a/examples/canonical-test/src/main.rs b/examples/canonical-test/src/main.rs index 775df0a..32d57f6 100644 --- a/examples/canonical-test/src/main.rs +++ b/examples/canonical-test/src/main.rs @@ -49,20 +49,23 @@ pub fn scheduler_rejected_batch_does_not_consume_nonce() -> TestResult { } #[testsi::test_dapp(kind("scheduler"))] -pub fn scheduler_stale_batch_consumes_nonce_without_report() -> TestResult { +pub fn scheduler_stale_batch_is_skipped_without_consuming_nonce() -> TestResult { let mut machine = devnet_machine()?; let stale_trigger_block = SchedulerConfig::devnet().max_wait_blocks as usize + 1; + // Stale batch (nonce 0, safe_block 1, inclusion block > max_wait_blocks) → skipped silently. let (outputs, reports) = machine.advance_state(batch_input( stale_trigger_block, batch_with_safe_blocks(0, &[1]), ))?; assert_no_outputs_or_reports(&outputs, &reports); + // Fresh batch with nonce 0 succeeds — stale batch did NOT consume the nonce. let (outputs, reports) = machine.advance_state(batch_input(stale_trigger_block + 1, empty_batch(0)))?; - assert_invalid_batch_step(&outputs, &reports); + assert_no_outputs_or_reports(&outputs, &reports); + // Next batch with nonce 1 also succeeds. 
let (outputs, reports) = machine.advance_state(batch_input(stale_trigger_block + 2, empty_batch(1)))?; assert_no_outputs_or_reports(&outputs, &reports); @@ -228,13 +231,7 @@ fn devnet_machine() -> Result> } fn input_domain() -> Eip712Domain { - Eip712Domain { - name: None, - version: None, - chain_id: Some(U256::from(TEST_CHAIN_ID)), - verifying_contract: Some(TEST_DAPP_ADDRESS), - salt: None, - } + sequencer_core::build_input_domain(TEST_CHAIN_ID, TEST_DAPP_ADDRESS) } fn signing_key(byte: u8) -> SigningKey { diff --git a/sequencer-core/src/application/mod.rs b/sequencer-core/src/application/mod.rs index d3eb462..671cfc0 100644 --- a/sequencer-core/src/application/mod.rs +++ b/sequencer-core/src/application/mod.rs @@ -102,6 +102,15 @@ pub trait Application: Send { user_op: &UserOp, current_fee: u16, ) -> Result { + // Protocol invariant: max_fee must cover the current frame fee. + // Enforced here so every Application impl inherits it. + if user_op.max_fee < current_fee { + return Ok(ExecutionOutcome::Invalid(InvalidReason::InvalidMaxFee { + max_fee: user_op.max_fee, + base_fee: current_fee, + })); + } + if let Err(reason) = self.validate_user_op(sender, user_op, current_fee) { return Ok(ExecutionOutcome::Invalid(reason)); } diff --git a/sequencer-core/src/batch.rs b/sequencer-core/src/batch.rs index ff20fdd..8343763 100644 --- a/sequencer-core/src/batch.rs +++ b/sequencer-core/src/batch.rs @@ -4,10 +4,6 @@ use crate::user_op::UserOp; use ssz_derive::{Decode, Encode}; -/// Tag byte for InputBox payloads that are L1 app direct inputs (e.g. deposits). -/// L1/app must post such inputs as `0x00 || body`. Only these are stored (body only) and executed. 
-pub const INPUT_TAG_DIRECT_INPUT: u8 = 0x00; - // --------------------------------------------------------------------------- // Gas-economics-derived batch sizing // @@ -76,23 +72,140 @@ impl WireUserOp { } } -#[derive(Debug, Clone, PartialEq, Eq)] -pub struct BatchForSubmission { - pub batch_index: u64, - pub created_at_ms: u64, - pub batch: Batch, -} +#[cfg(test)] +mod tests { + use super::*; + use ssz::{Decode, Encode}; + + fn sample_user_op(nonce: u32) -> WireUserOp { + WireUserOp { + nonce, + max_fee: 100, + data: vec![0xaa, 0xbb, 0xcc, 0xdd], + signature: vec![0xee; WireUserOp::SIGNATURE_BYTES], + } + } + + fn sample_frame(safe_block: u64, user_op_count: u32) -> Frame { + Frame { + user_ops: (0..user_op_count).map(sample_user_op).collect(), + safe_block, + fee_price: 42, + } + } + + fn sample_batch(nonce: u64, frame_count: u64) -> Batch { + Batch { + nonce, + frames: (0..frame_count).map(|i| sample_frame(100 + i, 2)).collect(), + } + } + + // ── §1.4 SSZ round-trip determinism ────────────────────────────────── -impl BatchForSubmission { - /// Encode the batch for the scheduler as a single SSZ payload. - /// - /// Payload is `ssz(Batch { nonce: batch_index, frames })`. The scheduler decodes this - /// and uses `batch.nonce` for deduplication; classification at the rollup is by msg_sender. 
- pub fn encode_for_scheduler(&self) -> Vec { + #[test] + fn ssz_roundtrip_empty_batch_is_identity() { let batch = Batch { - nonce: self.batch_index, - frames: self.batch.frames.clone(), + nonce: 0, + frames: vec![], }; - ssz::Encode::as_ssz_bytes(&batch) + let encoded = batch.as_ssz_bytes(); + let decoded = Batch::from_ssz_bytes(&encoded).expect("decode empty batch"); + assert_eq!(decoded, batch); + assert_eq!(decoded.as_ssz_bytes(), encoded); + } + + #[test] + fn ssz_roundtrip_populated_batch_is_identity() { + let batch = sample_batch(42, 3); + let encoded = batch.as_ssz_bytes(); + let decoded = Batch::from_ssz_bytes(&encoded).expect("decode populated batch"); + assert_eq!(decoded, batch); + assert_eq!(decoded.as_ssz_bytes(), encoded); + } + + #[test] + fn ssz_roundtrip_frame_with_empty_user_ops_is_identity() { + // Closed-empty frames (direct-input-only) are a real on-wire shape. + let frame = Frame { + user_ops: vec![], + safe_block: 7, + fee_price: 0, + }; + let encoded = frame.as_ssz_bytes(); + let decoded = Frame::from_ssz_bytes(&encoded).expect("decode"); + assert_eq!(decoded, frame); + } + + #[test] + fn ssz_roundtrip_wire_user_op_is_identity() { + let uop = sample_user_op(99); + let encoded = uop.as_ssz_bytes(); + let decoded = WireUserOp::from_ssz_bytes(&encoded).expect("decode wire user op"); + assert_eq!(decoded, uop); + } + + #[test] + fn ssz_encoding_is_deterministic_across_calls() { + // Determinism under the same input is a consensus requirement; encoding + // the same batch twice must produce byte-identical output. + let batch = sample_batch(7, 2); + assert_eq!(batch.as_ssz_bytes(), batch.as_ssz_bytes()); + } + + // ── §1.5 Decode robustness (no panics on adversarial bytes) ────────── + + #[test] + fn ssz_decode_empty_payload_returns_error() { + assert!(Batch::from_ssz_bytes(&[]).is_err()); + } + + #[test] + fn ssz_decode_below_fixed_header_returns_error() { + // Batch's fixed portion is 8 (nonce) + 4 (frames offset) = 12 bytes. 
+ for len in 0..12 { + let buf = vec![0u8; len]; + assert!( + Batch::from_ssz_bytes(&buf).is_err(), + "decoding {len} bytes below fixed header must fail", + ); + } + } + + #[test] + fn ssz_decode_truncated_valid_batch_returns_error() { + let batch = sample_batch(1, 2); + let full = batch.as_ssz_bytes(); + // Truncating anywhere before the full length must not round-trip. + for cut in 0..full.len() { + let truncated = &full[..cut]; + match Batch::from_ssz_bytes(truncated) { + Err(_) => {} + Ok(decoded) => assert_ne!( + decoded, batch, + "truncation at {cut} silently decoded to the original batch", + ), + } + } + } + + #[test] + fn ssz_decode_invalid_offset_returns_error() { + // Well-formed nonce (8 zero bytes), frames offset points far past the + // buffer end. SSZ must reject rather than read out of bounds. + let mut buf = vec![0u8; 12]; + buf[8..12].copy_from_slice(&0xffff_ffff_u32.to_le_bytes()); + assert!(Batch::from_ssz_bytes(&buf).is_err()); + } + + #[test] + fn ssz_decode_garbage_bytes_never_panics() { + // Adversarial fixed patterns. Decoding may Err or Ok; the invariant we + // care about is "no panic" — the test passing proves it. + for pattern in [0x00, 0x01, 0x42, 0x7f, 0x80, 0xff] { + for len in [1, 12, 64, 256, 1024] { + let _ = Batch::from_ssz_bytes(&vec![pattern; len]); + } + } } } diff --git a/sequencer-core/src/lib.rs b/sequencer-core/src/lib.rs index fe33e65..3f645ee 100644 --- a/sequencer-core/src/lib.rs +++ b/sequencer-core/src/lib.rs @@ -1,10 +1,40 @@ // (c) Cartesi and individual authors (see AUTHORS) // SPDX-License-Identifier: Apache-2.0 (see LICENSE) +use alloy_primitives::{Address, U256}; +use alloy_sol_types::Eip712Domain; + pub mod api; pub mod application; pub mod batch; pub mod broadcast; pub mod fee; pub mod l2_tx; +pub mod protocol; pub mod user_op; + +/// Maximum number of L1 blocks a batch can wait before the scheduler considers it stale. 
+/// Shared between the scheduler (canonical-app) and the sequencer (batch submitter, startup detection). +pub const MAX_WAIT_BLOCKS: u64 = 1200; + +/// EIP-712 domain name shared between sequencer and scheduler. +pub const DOMAIN_NAME: &str = "CartesiAppSequencer"; + +/// EIP-712 domain version shared between sequencer and scheduler. +pub const DOMAIN_VERSION: &str = "1"; + +/// Build the canonical EIP-712 domain for user-op signing and verification. +/// +/// Both the sequencer (signature verification at ingress) and the scheduler +/// (signature recovery during batch execution) MUST use this constructor. +/// A mismatch in any field changes the domain separator and causes every +/// signature to recover a different address. +pub fn build_input_domain(chain_id: u64, verifying_contract: Address) -> Eip712Domain { + Eip712Domain { + name: Some(DOMAIN_NAME.into()), + version: Some(DOMAIN_VERSION.into()), + chain_id: Some(U256::from(chain_id)), + verifying_contract: Some(verifying_contract), + salt: None, + } +} diff --git a/sequencer-core/src/protocol.rs b/sequencer-core/src/protocol.rs new file mode 100644 index 0000000..53c2642 --- /dev/null +++ b/sequencer-core/src/protocol.rs @@ -0,0 +1,312 @@ +// (c) Cartesi and individual authors (see AUTHORS) +// SPDX-License-Identifier: Apache-2.0 (see LICENSE) + +//! Protocol rules the sequencer mirrors from the scheduler, plus the +//! sequencer-side tuning knobs that govern preemptive self-protection. +//! +//! [`ProtocolConfig`] is the single source of truth for: +//! +//! - **Scheduler-acceptance** predicates (`scheduler_accepts`, `is_scheduler_stale`). +//! These match the on-chain scheduler's behavior exactly — mis-aligning them +//! would cause the sequencer's cached "gold frontier" to diverge from the +//! scheduler's actual accepted set. +//! - **Preemptive-recovery** tuning (`danger_threshold`, `seconds_per_block`). +//! These do not exist on the scheduler side; they control when the sequencer +//! 
proactively stops to avoid letting a batch age into the scheduler's skip +//! window. +//! +//! Keep the scheduler-mirroring fields (`batch_submitter`, `max_wait_blocks`) +//! aligned with the scheduler's config at deployment time. The two tuning +//! fields (`preemptive_margin_blocks`, `seconds_per_block`) are sequencer-local. + +use crate::batch::Batch; +use alloy_primitives::Address; + +/// Bundled protocol config: scheduler-acceptance parameters plus +/// sequencer-side preemptive-recovery tuning. +#[derive(Debug, Clone, Copy)] +pub struct ProtocolConfig { + /// L1 address that submits batches. The scheduler only accepts batches + /// whose `msg_sender` matches this. + pub batch_submitter: Address, + /// `MAX_WAIT_BLOCKS` — after this many L1 blocks, the scheduler skips a + /// submitted batch as stale. + pub max_wait_blocks: u64, + /// How many blocks before `max_wait_blocks` the sequencer triggers + /// preemptive recovery. Sequencer-local; must be strictly less than + /// `max_wait_blocks`. + pub preemptive_margin_blocks: u64, + /// Wall-clock estimate of L1 block time, used as a fallback when the L1 + /// safe head appears frozen. Sequencer-local. + pub seconds_per_block: u64, +} + +impl ProtocolConfig { + /// The block-age threshold at which preemptive recovery triggers. + /// + /// Panics if `preemptive_margin_blocks >= max_wait_blocks` — a threshold of + /// zero would make preemptive recovery indistinguishable from hard + /// staleness. Callers should catch this at startup. + pub fn danger_threshold(&self) -> u64 { + assert!( + self.preemptive_margin_blocks < self.max_wait_blocks, + "preemptive_margin_blocks ({}) must be less than max_wait_blocks ({})", + self.preemptive_margin_blocks, + self.max_wait_blocks, + ); + self.max_wait_blocks - self.preemptive_margin_blocks + } + + /// Scheduler's staleness predicate: a batch is stale when + /// `inclusion_block - first_frame_safe_block >= max_wait_blocks`. 
Used by + /// the scheduler to skip stale submissions, and by the sequencer's frontier + /// simulator to match that behavior. + pub fn is_scheduler_stale(&self, inclusion_block: u64, first_frame_safe_block: u64) -> bool { + age_exceeds( + inclusion_block, + first_frame_safe_block, + self.max_wait_blocks, + ) + } + + /// Off-chain simulation of the scheduler's batch-acceptance predicate. + /// + /// Returns `Some(AcceptedBatch)` iff the scheduler would accept the input + /// at the given `expected_nonce`. The caller threads `expected_nonce` + /// across a stream of inputs, advancing by one on each `Some`. + /// + /// Rejection paths (wrong sender, SSZ decode failure, stale by inclusion, + /// nonce mismatch) return `None` without advancing — matching what the + /// scheduler does on-chain. + pub fn scheduler_accepts( + &self, + input: SafeInputView<'_>, + expected_nonce: u64, + ) -> Option { + if input.sender != self.batch_submitter { + return None; + } + let batch = ::from_ssz_bytes(input.payload).ok()?; + let first_frame_safe_block = batch.frames.first().map(|f| f.safe_block).unwrap_or(0); + if !batch.frames.is_empty() + && self.is_scheduler_stale(input.inclusion_block, first_frame_safe_block) + { + return None; + } + if batch.nonce != expected_nonce { + return None; + } + Some(AcceptedBatch { + safe_input_index: input.safe_input_index, + nonce: batch.nonce, + first_frame_safe_block, + inclusion_block: input.inclusion_block, + }) + } +} + +/// Generic "age exceeds threshold" predicate shared between scheduler-staleness +/// and the preemptive danger-zone check. Saturating subtraction keeps the +/// arithmetic total over pathological inputs (safe head below a batch's first +/// frame). +pub fn age_exceeds(reference_block: u64, first_frame_safe_block: u64, threshold: u64) -> bool { + reference_block.saturating_sub(first_frame_safe_block) >= threshold +} + +/// Borrowed view of one safe-input row, in the shape scheduler_accepts needs. 
+/// Using a borrowed payload avoids copying during iteration. +#[derive(Debug, Clone, Copy)] +pub struct SafeInputView<'a> { + pub safe_input_index: u64, + pub sender: Address, + pub payload: &'a [u8], + pub inclusion_block: u64, +} + +/// One batch submission the scheduler would accept as part of its gold frontier. +#[derive(Debug, Clone, Copy)] +pub struct AcceptedBatch { + pub safe_input_index: u64, + pub nonce: u64, + pub first_frame_safe_block: u64, + pub inclusion_block: u64, +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::batch::{Batch, Frame}; + + const SUBMITTER: Address = Address::repeat_byte(0xAA); + const OTHER: Address = Address::repeat_byte(0xBB); + const MAX_WAIT: u64 = 1200; + + fn config() -> ProtocolConfig { + ProtocolConfig { + batch_submitter: SUBMITTER, + max_wait_blocks: MAX_WAIT, + preemptive_margin_blocks: 75, + seconds_per_block: 12, + } + } + + fn encode(batch: &Batch) -> Vec { + ssz::Encode::as_ssz_bytes(batch) + } + + fn single_frame_batch(nonce: u64, safe_block: u64) -> Batch { + Batch { + nonce, + frames: vec![Frame { + user_ops: vec![], + safe_block, + fee_price: 0, + }], + } + } + + #[test] + fn danger_threshold_is_max_wait_minus_margin() { + assert_eq!(config().danger_threshold(), MAX_WAIT - 75); + } + + #[test] + #[should_panic(expected = "preemptive_margin_blocks")] + fn danger_threshold_panics_when_margin_ge_max_wait() { + let cfg = ProtocolConfig { + preemptive_margin_blocks: MAX_WAIT, + ..config() + }; + let _ = cfg.danger_threshold(); + } + + #[test] + fn age_exceeds_saturates_on_underflow() { + assert!(!age_exceeds(5, 10, 1)); + assert!(age_exceeds(1200, 0, 1200)); + assert!(!age_exceeds(1199, 0, 1200)); + } + + // ── ProtocolConfig::is_scheduler_stale direct boundary tests ────────── + // + // Indirectly covered by `scheduler_accepts_boundary_just_below_stale`, but + // the staleness predicate is load-bearing on its own (the scheduler skips + // submissions that trip it) and deserves direct tests that don't go 
through + // SSZ decoding. + + #[test] + fn is_scheduler_stale_reports_false_below_threshold() { + // age = inclusion - first = MAX_WAIT - 1, strictly below. + assert!(!config().is_scheduler_stale(MAX_WAIT, 1)); + // age = 0 (safe head right at the first frame). + assert!(!config().is_scheduler_stale(100, 100)); + } + + #[test] + fn is_scheduler_stale_reports_true_at_and_past_threshold() { + // age = MAX_WAIT exactly — `>=` comparison trips. + assert!(config().is_scheduler_stale(MAX_WAIT, 0)); + // age = MAX_WAIT + 1, clearly past. + assert!(config().is_scheduler_stale(MAX_WAIT + 1, 0)); + } + + #[test] + fn is_scheduler_stale_saturates_when_first_frame_is_ahead() { + // Degenerate input: safe head is behind the first frame's safe_block. + // Saturating subtraction yields 0, strictly below threshold — never stale. + assert!(!config().is_scheduler_stale(50, 100)); + } + + #[test] + fn scheduler_accepts_fresh_batch_with_matching_nonce() { + let payload = encode(&single_frame_batch(3, 100)); + let input = SafeInputView { + safe_input_index: 7, + sender: SUBMITTER, + payload: payload.as_slice(), + inclusion_block: 500, + }; + let accepted = config() + .scheduler_accepts(input, 3) + .expect("matching nonce + fresh inclusion should be accepted"); + assert_eq!(accepted.safe_input_index, 7); + assert_eq!(accepted.nonce, 3); + assert_eq!(accepted.first_frame_safe_block, 100); + assert_eq!(accepted.inclusion_block, 500); + } + + #[test] + fn scheduler_rejects_wrong_sender() { + let payload = encode(&single_frame_batch(0, 0)); + let input = SafeInputView { + safe_input_index: 0, + sender: OTHER, + payload: payload.as_slice(), + inclusion_block: 0, + }; + assert!(config().scheduler_accepts(input, 0).is_none()); + } + + #[test] + fn scheduler_rejects_stale_by_inclusion() { + let payload = encode(&single_frame_batch(0, 0)); + let input = SafeInputView { + safe_input_index: 0, + sender: SUBMITTER, + payload: payload.as_slice(), + inclusion_block: MAX_WAIT, + }; + 
assert!(config().scheduler_accepts(input, 0).is_none()); + } + + #[test] + fn scheduler_accepts_boundary_just_below_stale() { + let payload = encode(&single_frame_batch(0, 1)); + let input = SafeInputView { + safe_input_index: 0, + sender: SUBMITTER, + payload: payload.as_slice(), + inclusion_block: MAX_WAIT, + }; + assert!(config().scheduler_accepts(input, 0).is_some()); + } + + #[test] + fn scheduler_rejects_nonce_mismatch() { + let payload = encode(&single_frame_batch(2, 100)); + let input = SafeInputView { + safe_input_index: 0, + sender: SUBMITTER, + payload: payload.as_slice(), + inclusion_block: 200, + }; + assert!(config().scheduler_accepts(input, 3).is_none()); + assert!(config().scheduler_accepts(input, 1).is_none()); + } + + #[test] + fn scheduler_rejects_garbage_payload() { + let input = SafeInputView { + safe_input_index: 0, + sender: SUBMITTER, + payload: &[0xFF, 0xEE, 0xDD], + inclusion_block: 0, + }; + assert!(config().scheduler_accepts(input, 0).is_none()); + } + + #[test] + fn scheduler_accepts_empty_frames_batch_regardless_of_age() { + let payload = encode(&Batch { + nonce: 0, + frames: vec![], + }); + let input = SafeInputView { + safe_input_index: 0, + sender: SUBMITTER, + payload: payload.as_slice(), + inclusion_block: MAX_WAIT.saturating_mul(10), + }; + assert!(config().scheduler_accepts(input, 0).is_some()); + } +} diff --git a/sequencer/Cargo.toml b/sequencer/Cargo.toml index 2d8d0d6..a2343b2 100644 --- a/sequencer/Cargo.toml +++ b/sequencer/Cargo.toml @@ -40,3 +40,7 @@ tokio-tungstenite = "0.28" k256 = "0.13.4" tempfile = "3" sequencer-rust-client = { path = "../sdk/rust-client" } +# Used for `TcpProxy` in inline tests that need to simulate provider disconnect +# (e.g., flusher survives extended outage). The sequencer crate doesn't depend +# on `rollups-harness` in production; only the test profile pulls it in. 
+rollups-harness = { path = "../tests/harness" } diff --git a/sequencer/src/api/error.rs b/sequencer/src/api/error.rs deleted file mode 100644 index 9a75d76..0000000 --- a/sequencer/src/api/error.rs +++ /dev/null @@ -1,115 +0,0 @@ -// (c) Cartesi and individual authors (see AUTHORS) -// SPDX-License-Identifier: Apache-2.0 (see LICENSE) - -use axum::Json; -use axum::http::StatusCode; -use axum::response::{IntoResponse, Response}; -use serde::Serialize; -use thiserror::Error; - -use crate::inclusion_lane::SequencerError; -use sequencer_core::api::TxRequestError; - -#[derive(Debug, Error, Clone)] -pub enum ApiError { - #[error("{0}")] - BadRequest(String), - #[error("{0}")] - PayloadTooLarge(String), - #[error("{0}")] - InvalidSignature(String), - #[error("{0}")] - ExecutionRejected(String), - #[error("{0}")] - Unavailable(String), - #[error("{0}")] - InternalError(String), - #[error("{0}")] - Overloaded(String), -} - -#[derive(Debug, Serialize)] -struct ErrorResponse { - ok: bool, - code: &'static str, - message: String, -} - -impl ApiError { - pub fn bad_request(message: impl Into) -> Self { - Self::BadRequest(message.into()) - } - - pub fn payload_too_large(message: impl Into) -> Self { - Self::PayloadTooLarge(message.into()) - } - - pub fn invalid_signature(message: impl Into) -> Self { - Self::InvalidSignature(message.into()) - } - - pub fn internal_error(message: impl Into) -> Self { - Self::InternalError(message.into()) - } - - pub fn unavailable(message: impl Into) -> Self { - Self::Unavailable(message.into()) - } - - pub fn overloaded(message: impl Into) -> Self { - Self::Overloaded(message.into()) - } - - pub fn status(&self) -> StatusCode { - match self { - Self::BadRequest(_) | Self::InvalidSignature(_) => StatusCode::BAD_REQUEST, - Self::PayloadTooLarge(_) => StatusCode::PAYLOAD_TOO_LARGE, - Self::ExecutionRejected(_) => StatusCode::UNPROCESSABLE_ENTITY, - Self::Unavailable(_) => StatusCode::SERVICE_UNAVAILABLE, - Self::InternalError(_) => 
StatusCode::INTERNAL_SERVER_ERROR, - Self::Overloaded(_) => StatusCode::TOO_MANY_REQUESTS, - } - } - - pub fn code(&self) -> &'static str { - match self { - Self::BadRequest(_) => "BAD_REQUEST", - Self::PayloadTooLarge(_) => "PAYLOAD_TOO_LARGE", - Self::InvalidSignature(_) => "INVALID_SIGNATURE", - Self::ExecutionRejected(_) => "EXECUTION_REJECTED", - Self::Unavailable(_) => "UNAVAILABLE", - Self::InternalError(_) => "INTERNAL_ERROR", - Self::Overloaded(_) => "OVERLOADED", - } - } -} - -impl From for ApiError { - fn from(value: SequencerError) -> Self { - match value { - SequencerError::Invalid(message) => Self::ExecutionRejected(message), - SequencerError::Unavailable(message) => Self::Unavailable(message), - SequencerError::Internal(message) => Self::InternalError(message), - } - } -} - -impl From for ApiError { - fn from(value: TxRequestError) -> Self { - match value { - TxRequestError::BadRequest(message) => Self::BadRequest(message), - TxRequestError::InvalidSignature(message) => Self::InvalidSignature(message), - } - } -} - -impl IntoResponse for ApiError { - fn into_response(self) -> Response { - let body = ErrorResponse { - ok: false, - code: self.code(), - message: self.to_string(), - }; - (self.status(), Json(body)).into_response() - } -} diff --git a/sequencer/src/api/mod.rs b/sequencer/src/api/mod.rs deleted file mode 100644 index cd7cf1e..0000000 --- a/sequencer/src/api/mod.rs +++ /dev/null @@ -1,118 +0,0 @@ -// (c) Cartesi and individual authors (see AUTHORS) -// SPDX-License-Identifier: Apache-2.0 (see LICENSE) - -mod error; -mod state; -mod tx; -mod ws; - -use std::io; -use std::sync::Arc; - -use alloy_sol_types::Eip712Domain; -use axum::Router; -use axum::extract::DefaultBodyLimit; -use axum::http::StatusCode; -use axum::routing::{get, post}; -use tokio::sync::mpsc; -use tower_http::trace::TraceLayer; - -pub use error::ApiError; -use state::ApiState; - -use crate::inclusion_lane::PendingUserOp; -use crate::l2_tx_feed::L2TxFeed; -use 
crate::shutdown::ShutdownSignal; -use sequencer_core::api::TxRequest; - -const DEFAULT_WS_MAX_SUBSCRIBERS: usize = 64; -const DEFAULT_WS_MAX_CATCHUP_EVENTS: u64 = 50_000; -const DEFAULT_MAX_BODY_BYTES: usize = TxRequest::MAX_JSON_BYTES_RECOMMENDED; -pub const WS_CATCHUP_WINDOW_EXCEEDED_REASON: &str = "catch-up window exceeded"; - -pub type ApiServerTask = tokio::task::JoinHandle>; - -#[derive(Debug, Clone, Copy)] -pub struct ApiConfig { - pub max_body_bytes: usize, - pub ws_max_subscribers: usize, - pub ws_max_catchup_events: u64, -} - -impl Default for ApiConfig { - fn default() -> Self { - Self { - max_body_bytes: DEFAULT_MAX_BODY_BYTES, - ws_max_subscribers: DEFAULT_WS_MAX_SUBSCRIBERS, - ws_max_catchup_events: DEFAULT_WS_MAX_CATCHUP_EVENTS, - } - } -} - -pub async fn start( - http_addr: impl tokio::net::ToSocketAddrs, - tx_sender: mpsc::Sender, - domain: Eip712Domain, - max_user_op_data_bytes: usize, - shutdown: ShutdownSignal, - tx_feed: L2TxFeed, - config: ApiConfig, -) -> io::Result { - let listener = tokio::net::TcpListener::bind(http_addr).await?; - Ok(start_on_listener( - listener, - tx_sender, - domain, - max_user_op_data_bytes, - shutdown, - tx_feed, - config, - )) -} - -pub fn start_on_listener( - listener: tokio::net::TcpListener, - tx_sender: mpsc::Sender, - domain: Eip712Domain, - max_user_op_data_bytes: usize, - shutdown: ShutdownSignal, - tx_feed: L2TxFeed, - config: ApiConfig, -) -> ApiServerTask { - let state = Arc::new(ApiState::new( - tx_sender, - domain, - max_user_op_data_bytes, - shutdown.clone(), - tx_feed, - config, - )); - let app = router(state, config.max_body_bytes); - - tokio::spawn(async move { - axum::serve(listener, app) - .with_graceful_shutdown(async move { - shutdown.wait_for_shutdown().await; - }) - .await - }) -} - -fn router(state: Arc, max_body_bytes: usize) -> Router { - Router::new() - .route("/tx", post(tx::submit_tx)) - .route("/ws/subscribe", get(ws::subscribe_l2_txs)) - .with_state(state) - // Enforces a raw 
request-body cap before JSON deserialization, including whitespace. - .layer(DefaultBodyLimit::max(max_body_bytes)) - .layer(TraceLayer::new_for_http()) -} - -// Keep non-413 JSON extractor failures normalized to 400 for a stable API contract. -fn map_json_rejection(err: axum::extract::rejection::JsonRejection) -> ApiError { - if err.status() == StatusCode::PAYLOAD_TOO_LARGE { - ApiError::payload_too_large(format!("request body too large: {err}")) - } else { - ApiError::bad_request(format!("invalid JSON: {err}")) - } -} diff --git a/sequencer/src/api/tx.rs b/sequencer/src/api/tx.rs deleted file mode 100644 index dad6617..0000000 --- a/sequencer/src/api/tx.rs +++ /dev/null @@ -1,168 +0,0 @@ -// (c) Cartesi and individual authors (see AUTHORS) -// SPDX-License-Identifier: Apache-2.0 (see LICENSE) - -use std::sync::Arc; -use std::time::SystemTime; - -use axum::extract::{Json, State}; -use tokio::sync::mpsc::error::TrySendError; -use tokio::sync::oneshot; -use tracing::debug; - -use super::{ApiError, ApiState}; -use crate::inclusion_lane::PendingUserOp; -use sequencer_core::api::{TxRequest, TxResponse}; -use sequencer_core::user_op::SignedUserOp; - -pub(super) async fn submit_tx( - State(state): State>, - req: Result, axum::extract::rejection::JsonRejection>, -) -> Result, ApiError> { - let Json(req) = req.map_err(super::map_json_rejection)?; - - let signed = req - .into_signed_user_op(&state.domain, state.max_user_op_data_bytes) - .map_err(ApiError::from)?; - let nonce = signed.user_op.nonce; - let sender = signed.sender; - let ack = enqueue_verified_tx(state.as_ref(), signed)?; - - let commit_result = ack - .await - .map_err(|_| ApiError::internal_error("inclusion lane dropped response"))?; - commit_result.map_err(ApiError::from)?; - debug!(sender = %sender, nonce, "tx committed"); - - Ok(Json(TxResponse { - ok: true, - sender: sender.to_string(), - nonce, - })) -} - -fn enqueue_verified_tx( - state: &ApiState, - signed: SignedUserOp, -) -> Result>, ApiError> { - 
state.reject_if_shutting_down()?; - - let (respond_to, recv) = oneshot::channel(); - let pending = PendingUserOp { - signed, - respond_to, - received_at: SystemTime::now(), - }; - - match state.tx_sender.try_send(pending) { - Ok(()) => Ok(recv), - Err(TrySendError::Full(_)) => Err(ApiError::overloaded("queue full")), - Err(TrySendError::Closed(_)) => Err(ApiError::internal_error("inclusion lane unavailable")), - } -} - -#[cfg(test)] -mod tests { - use super::*; - - use alloy_primitives::{Address, Signature}; - use alloy_sol_types::Eip712Domain; - use alloy_sol_types::SolStruct; - use axum::http::StatusCode; - use k256::ecdsa::SigningKey; - use k256::ecdsa::signature::hazmat::PrehashSigner; - use std::sync::Arc; - use tempfile::TempDir; - use tokio::sync::mpsc; - - use crate::storage::Storage; - use sequencer_core::user_op::UserOp; - - #[tokio::test(flavor = "current_thread")] - async fn submit_tx_rejects_when_shutdown_has_started() { - let db = TempDir::new().expect("create temp dir"); - let db_path = db.path().join("sequencer.db"); - let _storage = Storage::open(&db_path.to_string_lossy(), "NORMAL").expect("create db"); - let shutdown = crate::shutdown::ShutdownSignal::default(); - let tx_feed = crate::l2_tx_feed::L2TxFeed::new( - db_path.to_string_lossy().into_owned(), - shutdown.clone(), - crate::l2_tx_feed::L2TxFeedConfig { - idle_poll_interval: std::time::Duration::from_millis(2), - page_size: 64, - batch_submitter_address: None, - }, - ); - - shutdown.request_shutdown(); - - let (tx_sender, _rx) = mpsc::channel::(1); - let state = Arc::new(ApiState::new( - tx_sender, - Eip712Domain { - name: None, - version: None, - chain_id: None, - verifying_contract: None, - salt: None, - }, - 128, - shutdown, - tx_feed.clone(), - crate::api::ApiConfig { - max_body_bytes: 128, - ws_max_subscribers: 1, - ws_max_catchup_events: 1, - }, - )); - - let signing_key = SigningKey::from_bytes((&[7_u8; 32]).into()).expect("create signing key"); - let sender = 
address_from_signing_key(&signing_key); - let user_op = UserOp { - nonce: 0, - max_fee: 0, - data: Vec::new().into(), - }; - let request = TxRequest { - message: user_op.clone(), - signature: sign_user_op_hex(&state.domain, &user_op, &signing_key), - sender: sender.to_string(), - }; - - let result = submit_tx(State(state), Ok(Json(request))).await; - - let err = result.expect_err("submit should be rejected during shutdown"); - assert_eq!(err.status(), StatusCode::SERVICE_UNAVAILABLE); - assert_eq!(err.code(), "UNAVAILABLE"); - } - - fn sign_user_op_hex( - domain: &Eip712Domain, - user_op: &UserOp, - signing_key: &SigningKey, - ) -> String { - let hash = user_op.eip712_signing_hash(domain); - let k256_sig = signing_key - .sign_prehash(hash.as_slice()) - .expect("sign user op hash"); - - let sender = address_from_signing_key(signing_key); - let signature = [false, true] - .into_iter() - .map(|parity| Signature::from_signature_and_parity(k256_sig, parity)) - .find(|candidate| { - candidate - .recover_address_from_prehash(&hash) - .ok() - .map(|value| value == sender) - .unwrap_or(false) - }) - .expect("recoverable parity for signature"); - - alloy_primitives::hex::encode_prefixed(signature.as_bytes()) - } - - fn address_from_signing_key(signing_key: &SigningKey) -> Address { - let verifying = signing_key.verifying_key().to_encoded_point(false); - Address::from_raw_public_key(&verifying.as_bytes()[1..]) - } -} diff --git a/sequencer/src/batch_submitter/batch_poster.rs b/sequencer/src/batch_submitter/batch_poster.rs deleted file mode 100644 index 86cb27a..0000000 --- a/sequencer/src/batch_submitter/batch_poster.rs +++ /dev/null @@ -1,219 +0,0 @@ -// (c) Cartesi and individual authors (see AUTHORS) -// SPDX-License-Identifier: Apache-2.0 (see LICENSE) - -use alloy::providers::{DynProvider, Provider}; -use async_trait::async_trait; -use cartesi_rollups_contracts::input_box::InputBox; -use sequencer_core::batch::Batch; -use thiserror::Error; - -use 
crate::partition::{decode_evm_advance_input, get_input_added_events}; - -pub type TxHash = alloy_primitives::B256; - -#[derive(Debug, Clone)] -pub struct BatchPosterConfig { - pub l1_submit_address: alloy_primitives::Address, - pub app_address: alloy_primitives::Address, - pub batch_submitter_address: alloy_primitives::Address, - pub start_block: u64, - pub confirmation_depth: u64, - /// Error codes that trigger `get_logs` retries with a shorter block range. - pub long_block_range_error_codes: Vec, -} - -#[derive(Debug, Error)] -pub enum BatchPosterError { - #[error("provider/transport: {0}")] - Provider(String), -} - -#[async_trait] -pub trait BatchPoster: Send + Sync { - async fn submit_batch(&self, payload: Vec) -> Result; - - async fn observed_submitted_batch_nonces( - &self, - from_block: u64, - ) -> Result, BatchPosterError>; -} - -#[derive(Clone)] -pub struct EthereumBatchPoster { - provider: DynProvider, - config: BatchPosterConfig, -} - -impl EthereumBatchPoster { - pub fn new(provider: DynProvider, config: BatchPosterConfig) -> Self { - Self { provider, config } - } -} - -#[async_trait] -impl BatchPoster for EthereumBatchPoster { - async fn submit_batch(&self, payload: Vec) -> Result { - let input_box = InputBox::new(self.config.l1_submit_address, &self.provider); - let pending = input_box - .addInput(self.config.app_address, payload.into()) - .send() - .await - .map_err(|err| BatchPosterError::Provider(err.to_string()))?; - let tx_hash = *pending.tx_hash(); - - pending - .with_required_confirmations(self.config.confirmation_depth.saturating_add(1)) - .watch() - .await - .map_err(|err| BatchPosterError::Provider(err.to_string()))?; - - Ok(tx_hash) - } - - async fn observed_submitted_batch_nonces( - &self, - from_block: u64, - ) -> Result, BatchPosterError> { - let latest = self - .provider - .get_block_number() - .await - .map_err(|err| BatchPosterError::Provider(err.to_string()))?; - let end_block = latest.saturating_sub(self.config.confirmation_depth); 
- let start_block = from_block.max(self.config.start_block); - if start_block > end_block { - return Ok(Vec::new()); - } - - let events = get_input_added_events( - &self.provider, - self.config.app_address, - &self.config.l1_submit_address, - start_block, - end_block, - self.config.long_block_range_error_codes.as_slice(), - ) - .await - .map_err(|errs| { - BatchPosterError::Provider( - errs.into_iter() - .next() - .map(|e| e.to_string()) - .unwrap_or_default(), - ) - })?; - - let mut observed_nonces = Vec::new(); - for (event, _log) in events { - let evm_advance = decode_evm_advance_input(event.input.as_ref()) - .map_err(BatchPosterError::Provider)?; - if evm_advance.msgSender != self.config.batch_submitter_address { - continue; - } - let batch: Batch = ssz::Decode::from_ssz_bytes(evm_advance.payload.as_ref()) - .map_err(|err| BatchPosterError::Provider(format!("{err:?}")))?; - observed_nonces.push(batch.nonce); - } - - Ok(observed_nonces) - } -} - -#[cfg(test)] -pub(crate) mod mock { - use super::{Batch, BatchPoster, BatchPosterError, TxHash}; - use async_trait::async_trait; - use std::sync::Mutex; - - #[derive(Debug)] - pub struct MockBatchPoster { - pub submissions: Mutex>, - pub fail_submit: Mutex, - pub observed_submitted_nonces: Mutex>, - pub observed_submitted_error: Mutex>, - pub last_from_block: Mutex>, - } - - impl MockBatchPoster { - pub fn new() -> Self { - Self { - submissions: Mutex::new(Vec::new()), - fail_submit: Mutex::new(false), - observed_submitted_nonces: Mutex::new(Vec::new()), - observed_submitted_error: Mutex::new(None), - last_from_block: Mutex::new(None), - } - } - - pub fn submissions(&self) -> Vec<(u64, usize)> { - self.submissions.lock().expect("lock").clone() - } - - pub fn set_observed_submitted_nonces(&self, value: Vec) { - *self.observed_submitted_nonces.lock().expect("lock") = value; - } - - pub fn set_observed_submitted_error(&self, value: Option<&str>) { - *self.observed_submitted_error.lock().expect("lock") = 
value.map(str::to_string); - } - - pub fn last_from_block(&self) -> Option { - *self.last_from_block.lock().expect("lock") - } - } - - #[async_trait] - impl BatchPoster for MockBatchPoster { - async fn submit_batch(&self, payload: Vec) -> Result { - if *self.fail_submit.lock().expect("lock") { - return Err(BatchPosterError::Provider("mock submit fail".into())); - } - let batch_index = ssz::Decode::from_ssz_bytes(payload.as_ref()) - .map(|b: Batch| b.nonce) - .unwrap_or(0); - self.submissions - .lock() - .expect("lock") - .push((batch_index, payload.len())); - Ok(TxHash::ZERO) - } - - async fn observed_submitted_batch_nonces( - &self, - from_block: u64, - ) -> Result, BatchPosterError> { - *self.last_from_block.lock().expect("lock") = Some(from_block); - if let Some(err) = self.observed_submitted_error.lock().expect("lock").clone() { - return Err(BatchPosterError::Provider(err)); - } - let configured = self.observed_submitted_nonces.lock().expect("lock").clone(); - if !configured.is_empty() { - return Ok(configured); - } - Ok(self - .submissions - .lock() - .expect("lock") - .iter() - .map(|(idx, _)| *idx) - .collect()) - } - } -} - -#[cfg(test)] -mod tests { - use super::{BatchPoster, mock::MockBatchPoster}; - - #[tokio::test] - async fn mock_poster_tracks_requested_suffix_start_block() { - let poster = MockBatchPoster::new(); - let observed = poster - .observed_submitted_batch_nonces(42) - .await - .expect("observe submitted batches"); - - assert!(observed.is_empty()); - assert_eq!(poster.last_from_block(), Some(42)); - } -} diff --git a/sequencer/src/batch_submitter/mod.rs b/sequencer/src/batch_submitter/mod.rs deleted file mode 100644 index 7b33556..0000000 --- a/sequencer/src/batch_submitter/mod.rs +++ /dev/null @@ -1,18 +0,0 @@ -// (c) Cartesi and individual authors (see AUTHORS) -// SPDX-License-Identifier: Apache-2.0 (see LICENSE) - -//! Batch submitter: posts closed batches to L1 with at-least-once semantics. -//! -//! 
The batch index is used as the batch nonce (id). The scheduler checks that nonces are -//! strictly increasing and invalidates otherwise, so duplicates are deduplicated at the -//! scheduler level. See `worker` for the wake → read S → compare → submit → sleep loop. - -mod batch_poster; -mod config; -mod worker; - -pub use batch_poster::{ - BatchPoster, BatchPosterConfig, BatchPosterError, EthereumBatchPoster, TxHash, -}; -pub use config::BatchSubmitterConfig; -pub use worker::{BatchSubmitter, BatchSubmitterError, TickOutcome}; diff --git a/sequencer/src/batch_submitter/worker.rs b/sequencer/src/batch_submitter/worker.rs deleted file mode 100644 index b5a79cc..0000000 --- a/sequencer/src/batch_submitter/worker.rs +++ /dev/null @@ -1,382 +0,0 @@ -// (c) Cartesi and individual authors (see AUTHORS) -// SPDX-License-Identifier: Apache-2.0 (see LICENSE) - -//! Batch submitter worker: at-least-once submission to L1, deduplicated by the scheduler. -//! -//! The worker is intentionally stateless with respect to submitted-batch progress. -//! On each tick it derives the highest submitted batch nonce from L1, compares that -//! with locally closed batches, submits the first missing batch if any, then loops. 
- -use std::sync::Arc; -use std::time::Duration; - -use alloy_primitives::Address; -use thiserror::Error; -use tracing::{debug, info, warn}; - -use crate::batch_submitter::{BatchPoster, BatchPosterError, BatchSubmitterConfig, TxHash}; -use crate::shutdown::ShutdownSignal; -use crate::storage::{Storage, StorageOpenError}; - -#[derive(Debug, Error)] -pub enum BatchSubmitterError { - #[error(transparent)] - OpenStorage(#[from] StorageOpenError), - #[error(transparent)] - Storage(#[from] rusqlite::Error), - #[error("batch submitter join error: {0}")] - Join(String), - #[error(transparent)] - Poster(#[from] BatchPosterError), -} - -#[derive(Debug, Clone, Copy, PartialEq, Eq)] -pub enum TickOutcome { - Idle, - Submitted { batch_index: u64, tx_hash: TxHash }, -} - -pub struct BatchSubmitter { - db_path: String, - batch_submitter_address: Address, - poster: Arc

, - idle_poll_interval: Duration, - shutdown: ShutdownSignal, -} - -impl BatchSubmitter

{ - pub fn new( - db_path: impl Into, - batch_submitter_address: Address, - poster: Arc

, - shutdown: ShutdownSignal, - config: BatchSubmitterConfig, - ) -> Self { - Self { - db_path: db_path.into(), - batch_submitter_address, - poster, - idle_poll_interval: config.idle_poll_interval(), - shutdown, - } - } - - pub fn start( - self, - ) -> Result>, StorageOpenError> { - let _ = Storage::open_read_only(self.db_path.as_str())?; - Ok(tokio::spawn(async move { self.run_forever().await })) - } - - async fn run_forever(self) -> Result<(), BatchSubmitterError> { - loop { - if self.shutdown.is_shutdown_requested() { - return Ok(()); - } - - match self.tick_once().await { - Ok(TickOutcome::Submitted { .. }) => continue, - Ok(TickOutcome::Idle) => {} - Err(BatchSubmitterError::Poster(source)) => { - warn!(error = %source, "batch submitter tick failed, will retry"); - } - Err(err) => return Err(err), - } - - tokio::select! { - _ = self.shutdown.wait_for_shutdown() => return Ok(()), - _ = tokio::time::sleep(self.idle_poll_interval) => {} - } - } - } - - pub(crate) async fn tick_once(&self) -> Result { - let latest_batch_opt = self.load_latest_batch_index().await?; - let Some(latest_batch_index) = latest_batch_opt else { - return Ok(TickOutcome::Idle); - }; - - if latest_batch_index == 0 { - return Ok(TickOutcome::Idle); - } - - let last_closed = latest_batch_index - 1; - let next_expected = { - let (safe_block, safe_next_expected) = - self.load_safe_next_expected_batch_nonce().await?; - - let recent_observed_nonces = self - .poster - .observed_submitted_batch_nonces(safe_block.saturating_add(1)) - .await?; - advance_expected_batch_nonce(safe_next_expected, recent_observed_nonces) - }; - let latest_submitted = next_expected.checked_sub(1); - let first_to_submit = latest_submitted.map(|s| s + 1).unwrap_or(0); - if first_to_submit > last_closed { - return Ok(TickOutcome::Idle); - } - if first_to_submit < last_closed { - let pending_batches = last_closed - first_to_submit + 1; - warn!( - first_to_submit, - last_closed, pending_batches, "multiple closed batches are 
pending submission" - ); - } - - let batch = self.load_batch_for_submission(first_to_submit).await?; - debug!(batch_index = first_to_submit, "submitting batch to L1"); - let tx_hash = self - .poster - .submit_batch(batch.encode_for_scheduler()) - .await?; - info!(batch_index = first_to_submit, %tx_hash, "batch submitted to L1"); - - Ok(TickOutcome::Submitted { - batch_index: first_to_submit, - tx_hash, - }) - } - - async fn load_latest_batch_index(&self) -> Result, BatchSubmitterError> { - let db_path = self.db_path.clone(); - tokio::task::spawn_blocking(move || { - let mut storage = Storage::open_read_only(&db_path)?; - storage - .latest_batch_index() - .map_err(BatchSubmitterError::from) - }) - .await - .map_err(|err| BatchSubmitterError::Join(err.to_string()))? - } - - const SAFE_NONCE_PAGE_SIZE: u64 = 256; - - async fn load_safe_next_expected_batch_nonce(&self) -> Result<(u64, u64), BatchSubmitterError> { - let db_path = self.db_path.clone(); - let batch_submitter_address = self.batch_submitter_address; - tokio::task::spawn_blocking(move || { - let mut storage = Storage::open_read_only(&db_path)?; - storage - .advance_safe_batch_nonce_for_sender( - batch_submitter_address, - Self::SAFE_NONCE_PAGE_SIZE, - ) - .map_err(BatchSubmitterError::from) - }) - .await - .map_err(|err| BatchSubmitterError::Join(err.to_string()))? - } - - async fn load_batch_for_submission( - &self, - batch_index: u64, - ) -> Result { - let db_path = self.db_path.clone(); - tokio::task::spawn_blocking(move || { - let mut storage = Storage::open_read_only(&db_path)?; - storage - .load_batch_for_submission(batch_index) - .map_err(BatchSubmitterError::from) - }) - .await - .map_err(|err| BatchSubmitterError::Join(err.to_string()))? 
- } -} - -fn advance_expected_batch_nonce( - mut expected: u64, - observed_nonces: impl IntoIterator, -) -> u64 { - for nonce in observed_nonces { - if nonce == expected { - expected = expected.saturating_add(1); - } - } - expected -} - -#[cfg(test)] -mod tests { - use std::sync::Arc; - - use alloy_primitives::Address; - - use crate::batch_submitter::{ - BatchSubmitterConfig, BatchSubmitterError, TickOutcome, batch_poster::mock::MockBatchPoster, - }; - use crate::shutdown::ShutdownSignal; - use crate::storage::{SafeInputRange, Storage, StoredSafeInput}; - use tempfile::TempDir; - - const SQLITE_SYNCHRONOUS_PRAGMA: &str = "NORMAL"; - const BATCH_SUBMITTER_ADDRESS: Address = Address::repeat_byte(0x11); - - fn temp_db(name: &str) -> (TempDir, String) { - let dir = tempfile::Builder::new() - .prefix(format!("sequencer-batch-submitter-{name}-").as_str()) - .tempdir() - .expect("create temporary test directory"); - let path = dir.path().join("sequencer.sqlite"); - (dir, path.to_string_lossy().into_owned()) - } - - fn seed_two_closed_batches(db_path: &str) { - let mut storage = Storage::open(db_path, SQLITE_SYNCHRONOUS_PRAGMA).expect("open storage"); - let mut head = storage - .initialize_open_state(0, SafeInputRange::empty_at(0)) - .expect("initialize open state"); - let next_safe = head.safe_block; - storage - .close_frame_and_batch(&mut head, next_safe) - .expect("close batch 0"); - storage - .close_frame_and_batch(&mut head, next_safe) - .expect("close batch 1"); - storage - .close_frame_and_batch(&mut head, next_safe) - .expect("close batch 2"); - } - - fn seed_safe_submitted_batches(db_path: &str, safe_block: u64, nonces: &[u64]) { - let mut storage = Storage::open(db_path, SQLITE_SYNCHRONOUS_PRAGMA).expect("open storage"); - let inputs: Vec<_> = nonces - .iter() - .map(|nonce| StoredSafeInput { - sender: BATCH_SUBMITTER_ADDRESS, - payload: ssz::Encode::as_ssz_bytes(&sequencer_core::batch::Batch { - nonce: *nonce, - frames: Vec::new(), - }), - block_number: 
safe_block, - }) - .collect(); - storage - .append_safe_inputs(safe_block, inputs.as_slice()) - .expect("append safe submitted batches"); - } - - #[tokio::test] - async fn tick_once_submits_first_missing_closed_batch() { - let (_dir, path) = temp_db("tick-submits"); - seed_two_closed_batches(&path); - - let mock = Arc::new(MockBatchPoster::new()); - let config = BatchSubmitterConfig { - idle_poll_interval_ms: 1000, - }; - let submitter = super::BatchSubmitter::new( - path.clone(), - BATCH_SUBMITTER_ADDRESS, - mock.clone(), - ShutdownSignal::default(), - config, - ); - - let outcome = submitter.tick_once().await.expect("tick once"); - assert_eq!( - outcome, - TickOutcome::Submitted { - batch_index: 0, - tx_hash: alloy_primitives::B256::ZERO - } - ); - - let submissions = mock.submissions(); - assert_eq!(submissions.len(), 1); - assert_eq!(submissions[0].0, 0); - } - - #[tokio::test] - async fn tick_once_submits_nothing_when_already_caught_up() { - let (_dir, path) = temp_db("tick-caught-up"); - seed_two_closed_batches(&path); - seed_safe_submitted_batches(&path, 10, &[0, 1]); - - let mock = Arc::new(MockBatchPoster::new()); - mock.set_observed_submitted_nonces(vec![2]); - let config = BatchSubmitterConfig { - idle_poll_interval_ms: 1000, - }; - let submitter = super::BatchSubmitter::new( - path.clone(), - BATCH_SUBMITTER_ADDRESS, - mock.clone(), - ShutdownSignal::default(), - config, - ); - - let outcome = submitter.tick_once().await.expect("tick once"); - assert_eq!(outcome, TickOutcome::Idle); - assert!(mock.submissions().is_empty()); - assert_eq!(mock.last_from_block(), Some(11)); - } - - #[tokio::test] - async fn tick_once_combines_safe_prefix_with_recent_chain_suffix() { - let (_dir, path) = temp_db("tick-combines-prefix-and-suffix"); - seed_two_closed_batches(&path); - seed_safe_submitted_batches(&path, 10, &[0]); - - let mock = Arc::new(MockBatchPoster::new()); - mock.set_observed_submitted_nonces(vec![1]); - let submitter = super::BatchSubmitter::new( - 
path.clone(), - BATCH_SUBMITTER_ADDRESS, - mock.clone(), - ShutdownSignal::default(), - BatchSubmitterConfig { - idle_poll_interval_ms: 1000, - }, - ); - - let outcome = submitter.tick_once().await.expect("tick once"); - assert_eq!( - outcome, - TickOutcome::Submitted { - batch_index: 2, - tx_hash: alloy_primitives::B256::ZERO - } - ); - assert_eq!(mock.last_from_block(), Some(11)); - } - - #[tokio::test] - async fn tick_once_propagates_poster_errors() { - let (_dir, path) = temp_db("tick-poster-error"); - seed_two_closed_batches(&path); - - let mock = Arc::new(MockBatchPoster::new()); - mock.set_observed_submitted_error(Some("rpc fail")); - let submitter = super::BatchSubmitter::new( - path, - BATCH_SUBMITTER_ADDRESS, - mock, - ShutdownSignal::default(), - BatchSubmitterConfig { - idle_poll_interval_ms: 1000, - }, - ); - - let err = submitter - .tick_once() - .await - .expect_err("poster error should propagate"); - assert!(matches!(err, BatchSubmitterError::Poster(_))); - } - - #[test] - fn advance_expected_batch_nonce_matches_scheduler_nonce_rule() { - assert_eq!(super::advance_expected_batch_nonce(0, Vec::::new()), 0); - assert_eq!(super::advance_expected_batch_nonce(0, vec![0, 1, 2]), 3); - assert_eq!(super::advance_expected_batch_nonce(0, vec![0, 2, 3]), 1); - assert_eq!(super::advance_expected_batch_nonce(0, vec![1, 2, 3]), 0); - assert_eq!(super::advance_expected_batch_nonce(0, vec![0, 1, 1, 2]), 3); - assert_eq!( - super::advance_expected_batch_nonce(0, vec![6, 4, 3, 2, 2, 0, 1]), - 2 - ); - assert_eq!(super::advance_expected_batch_nonce(0, vec![0, 2, 1]), 2); - assert_eq!(super::advance_expected_batch_nonce(2, vec![2, 3]), 4); - } -} diff --git a/sequencer/src/egress/api/health.rs b/sequencer/src/egress/api/health.rs new file mode 100644 index 0000000..ac783c7 --- /dev/null +++ b/sequencer/src/egress/api/health.rs @@ -0,0 +1,116 @@ +// (c) Cartesi and individual authors (see AUTHORS) +// SPDX-License-Identifier: Apache-2.0 (see LICENSE) + +//! 
Health probes (k8s-style): +//! +//! - `GET /livez` — process is up. Always 200. +//! - `GET /readyz` — ready to accept new transactions. 503 if shutdown is in +//! progress or the inclusion lane has dropped its receiver. +//! - `GET /healthz` — JSON status report. 200 / 503 mirroring `/readyz`. +//! +//! Lives on egress because operators (and kubelet, in practice) probe from the +//! internal-cluster side. + +use std::sync::Arc; + +use axum::Json; +use axum::extract::State; +use axum::http::StatusCode; +use axum::response::IntoResponse; +use serde::Serialize; +use tokio::sync::mpsc; + +use crate::ingress::inclusion_lane::PendingUserOp; +use crate::runtime::shutdown::ShutdownSignal; + +/// Narrow health-check state. Holds only the signals the probes inspect; the +/// `tx_sender` is a clone of the inclusion-lane channel and is closed iff the +/// lane has dropped its receiver. +#[derive(Clone)] +pub(crate) struct HealthState { + pub tx_sender: mpsc::Sender, + pub shutdown: ShutdownSignal, +} + +#[derive(Serialize)] +struct HealthStatus { + status: &'static str, + inclusion_lane: &'static str, +} + +pub(crate) async fn livez() -> StatusCode { + StatusCode::OK +} + +pub(crate) async fn readyz(State(state): State>) -> StatusCode { + if state.shutdown.is_shutdown_requested() || state.tx_sender.is_closed() { + StatusCode::SERVICE_UNAVAILABLE + } else { + StatusCode::OK + } +} + +pub(crate) async fn healthz(State(state): State>) -> impl IntoResponse { + let lane_ok = !state.tx_sender.is_closed(); + let shutting_down = state.shutdown.is_shutdown_requested(); + let all_ok = lane_ok && !shutting_down; + + let body = HealthStatus { + status: if all_ok { "ok" } else { "degraded" }, + inclusion_lane: if lane_ok { "ok" } else { "stopped" }, + }; + + let status = if all_ok { + StatusCode::OK + } else { + StatusCode::SERVICE_UNAVAILABLE + }; + (status, Json(body)) +} + +#[cfg(test)] +mod tests { + use super::*; + + fn fresh_state() -> (Arc, mpsc::Receiver) { + let (tx_sender, rx) 
= mpsc::channel::(1); + let state = Arc::new(HealthState { + tx_sender, + shutdown: ShutdownSignal::default(), + }); + (state, rx) + } + + #[tokio::test] + async fn livez_is_always_ok() { + assert_eq!(livez().await, StatusCode::OK); + } + + #[tokio::test] + async fn readyz_is_ok_when_lane_alive_and_not_shutting_down() { + let (state, _rx) = fresh_state(); + assert_eq!(readyz(State(state)).await, StatusCode::OK); + } + + #[tokio::test] + async fn readyz_is_unavailable_when_shutdown_requested() { + let (state, _rx) = fresh_state(); + state.shutdown.request_shutdown(); + assert_eq!(readyz(State(state)).await, StatusCode::SERVICE_UNAVAILABLE); + } + + #[tokio::test] + async fn readyz_is_unavailable_when_lane_dropped() { + let (state, rx) = fresh_state(); + drop(rx); + assert_eq!(readyz(State(state)).await, StatusCode::SERVICE_UNAVAILABLE); + } + + #[tokio::test] + async fn healthz_reports_lane_stopped_after_lane_drop() { + let (state, rx) = fresh_state(); + drop(rx); + let response = healthz(State(state)).await.into_response(); + assert_eq!(response.status(), StatusCode::SERVICE_UNAVAILABLE); + } +} diff --git a/sequencer/src/egress/api/mod.rs b/sequencer/src/egress/api/mod.rs new file mode 100644 index 0000000..5d112ea --- /dev/null +++ b/sequencer/src/egress/api/mod.rs @@ -0,0 +1,36 @@ +// (c) Cartesi and individual authors (see AUTHORS) +// SPDX-License-Identifier: Apache-2.0 (see LICENSE) + +//! Egress HTTP API routes: WebSocket subscribe + k8s-style health probes. +//! Additional read endpoints will land here. + +mod health; +mod state; +mod subscribe; + +use std::sync::Arc; + +use axum::Router; +use axum::routing::get; + +pub(crate) use health::HealthState; +pub(crate) use state::SubscribeState; + +/// Build the egress router. Each subrouter has its own state; the merge is +/// transparent to axum's routing. 
+pub(crate) fn router( + subscribe_state: Arc, + health_state: Arc, +) -> Router { + let subscribe_router = Router::new() + .route("/ws/subscribe", get(subscribe::subscribe_l2_txs)) + .with_state(subscribe_state); + + let health_router = Router::new() + .route("/livez", get(health::livez)) + .route("/readyz", get(health::readyz)) + .route("/healthz", get(health::healthz)) + .with_state(health_state); + + subscribe_router.merge(health_router) +} diff --git a/sequencer/src/api/state.rs b/sequencer/src/egress/api/state.rs similarity index 55% rename from sequencer/src/api/state.rs rename to sequencer/src/egress/api/state.rs index 752e254..de15396 100644 --- a/sequencer/src/api/state.rs +++ b/sequencer/src/egress/api/state.rs @@ -1,43 +1,36 @@ // (c) Cartesi and individual authors (see AUTHORS) // SPDX-License-Identifier: Apache-2.0 (see LICENSE) +//! Egress-side axum state — feeds the WS subscribe handler today; will grow as +//! more egress routes are added. + use std::sync::Arc; -use alloy_sol_types::Eip712Domain; -use tokio::sync::{OwnedSemaphorePermit, Semaphore, mpsc}; +use tokio::sync::{OwnedSemaphorePermit, Semaphore}; -use super::{ApiConfig, ApiError}; -use crate::inclusion_lane::PendingUserOp; -use crate::l2_tx_feed::L2TxFeed; -use crate::shutdown::ShutdownSignal; +use crate::egress::l2_tx_feed::L2TxFeed; +use crate::http::ApiError; +use crate::runtime::shutdown::ShutdownSignal; #[derive(Clone)] -pub(super) struct ApiState { - pub tx_sender: mpsc::Sender, - pub domain: Eip712Domain, - pub max_user_op_data_bytes: usize, +pub(crate) struct SubscribeState { pub shutdown: ShutdownSignal, pub ws_subscriber_limit: Arc, pub ws_max_catchup_events: u64, pub tx_feed: L2TxFeed, } -impl ApiState { - pub(super) fn new( - tx_sender: mpsc::Sender, - domain: Eip712Domain, - max_user_op_data_bytes: usize, +impl SubscribeState { + pub(crate) fn new( shutdown: ShutdownSignal, tx_feed: L2TxFeed, - config: ApiConfig, + ws_max_subscribers: usize, + ws_max_catchup_events: u64, ) -> 
Self { Self { - tx_sender, - domain, - max_user_op_data_bytes, shutdown, - ws_subscriber_limit: Arc::new(Semaphore::new(config.ws_max_subscribers)), - ws_max_catchup_events: config.ws_max_catchup_events, + ws_subscriber_limit: Arc::new(Semaphore::new(ws_max_subscribers)), + ws_max_catchup_events, tx_feed, } } diff --git a/sequencer/src/api/ws.rs b/sequencer/src/egress/api/subscribe.rs similarity index 77% rename from sequencer/src/api/ws.rs rename to sequencer/src/egress/api/subscribe.rs index 23aacf8..897f73f 100644 --- a/sequencer/src/api/ws.rs +++ b/sequencer/src/egress/api/subscribe.rs @@ -1,6 +1,10 @@ // (c) Cartesi and individual authors (see AUTHORS) // SPDX-License-Identifier: Apache-2.0 (see LICENSE) +//! `GET /ws/subscribe` — replay-then-live stream of ordered L2 txs. +//! Acquires a subscriber permit before upgrading; permit is held for the +//! lifetime of the session and released on disconnect via `Drop`. + use std::sync::Arc; use axum::extract::ws::{CloseFrame, Message, WebSocket, WebSocketUpgrade, close_code}; @@ -10,20 +14,21 @@ use serde::Deserialize; use tokio::sync::OwnedSemaphorePermit; use tracing::warn; -use crate::l2_tx_feed::{BroadcastTxMessage, L2TxFeed, SubscribeError}; +use crate::egress::l2_tx_feed::{BroadcastTxMessage, L2TxFeed, SubscribeError}; +use crate::http::WS_CATCHUP_WINDOW_EXCEEDED_REASON; -use super::{ApiState, WS_CATCHUP_WINDOW_EXCEEDED_REASON}; +use super::SubscribeState; const MAX_INBOUND_WS_MESSAGE_SIZE: usize = 8 * 1024; const MAX_INBOUND_WS_FRAME_SIZE: usize = 8 * 1024; #[derive(Debug, Deserialize)] -pub(super) struct SubscribeQuery { +pub(crate) struct SubscribeQuery { from_offset: Option, } -pub(super) async fn subscribe_l2_txs( - State(state): State>, +pub(crate) async fn subscribe_l2_txs( + State(state): State>, Query(query): Query, ws: WebSocketUpgrade, ) -> Response { @@ -67,32 +72,22 @@ async fn run_ws_session( max_catchup_events, "ws catch-up window exceeded; closing subscriber" ); - let _ = socket - 
.send(Message::Close(Some(CloseFrame { - code: close_code::POLICY, - reason: WS_CATCHUP_WINDOW_EXCEEDED_REASON.into(), - }))) - .await; + close_with_frame( + &mut socket, + close_code::POLICY, + WS_CATCHUP_WINDOW_EXCEEDED_REASON, + ) + .await; return; } Err(SubscribeError::OpenStorage { source }) => { warn!(error = %source, "ws subscription failed to open replay storage"); - let _ = socket - .send(Message::Close(Some(CloseFrame { - code: close_code::ERROR, - reason: "subscription unavailable".into(), - }))) - .await; + close_with_frame(&mut socket, close_code::ERROR, "subscription unavailable").await; return; } Err(SubscribeError::LoadHeadOffset { source }) => { warn!(error = %source, "ws subscription failed to read replay head"); - let _ = socket - .send(Message::Close(Some(CloseFrame { - code: close_code::ERROR, - reason: "subscription unavailable".into(), - }))) - .await; + close_with_frame(&mut socket, close_code::ERROR, "subscription unavailable").await; return; } }; @@ -127,6 +122,15 @@ async fn run_ws_session( } } +async fn close_with_frame(socket: &mut WebSocket, code: u16, reason: &str) { + let _ = socket + .send(Message::Close(Some(CloseFrame { + code, + reason: reason.into(), + }))) + .await; +} + async fn send_ws_event(socket: &mut WebSocket, event: &BroadcastTxMessage) -> Result<(), ()> { let payload = match serde_json::to_string(event) { Ok(value) => value, diff --git a/sequencer/src/l2_tx_feed/error.rs b/sequencer/src/egress/l2_tx_feed/error.rs similarity index 100% rename from sequencer/src/l2_tx_feed/error.rs rename to sequencer/src/egress/l2_tx_feed/error.rs diff --git a/sequencer/src/l2_tx_feed/feed.rs b/sequencer/src/egress/l2_tx_feed/mod.rs similarity index 78% rename from sequencer/src/l2_tx_feed/feed.rs rename to sequencer/src/egress/l2_tx_feed/mod.rs index 15c5e49..c7cf616 100644 --- a/sequencer/src/l2_tx_feed/feed.rs +++ b/sequencer/src/egress/l2_tx_feed/mod.rs @@ -1,15 +1,23 @@ // (c) Cartesi and individual authors (see AUTHORS) // 
SPDX-License-Identifier: Apache-2.0 (see LICENSE) +//! DB-backed ordered-L2-tx feed used by WS subscriptions and catch-up replay. + +mod error; + +#[cfg(test)] +mod tests; + +pub use error::{SubscribeError, SubscriptionError}; +pub use sequencer_core::broadcast::BroadcastTxMessage; + use std::time::Duration; use alloy_primitives::Address; -pub use sequencer_core::broadcast::BroadcastTxMessage; use sequencer_core::l2_tx::SequencedL2Tx; use tokio::sync::mpsc; -use super::{SubscribeError, SubscriptionError}; -use crate::shutdown::ShutdownSignal; +use crate::runtime::shutdown::ShutdownSignal; use crate::storage::Storage; #[derive(Debug, Clone, Copy)] @@ -66,8 +74,12 @@ impl L2TxFeed { from_offset: u64, max_catchup_events: u64, ) -> Result { - let head_offset = load_head_offset(self.db_path.as_str())?; - let catchup_events = head_offset.saturating_sub(from_offset); + let (head_offset, catchup_events) = load_catchup_info( + self.db_path.as_str(), + from_offset, + max_catchup_events, + self.batch_submitter_address, + )?; if catchup_events > max_catchup_events { return Err(SubscribeError::CatchUpWindowExceeded { requested_offset: from_offset, @@ -126,12 +138,29 @@ impl Subscription { } } -fn load_head_offset(db_path: &str) -> Result { +/// Returns `(head_offset, broadcastable_event_count_after_from_offset)`. +/// +/// Counts events the client will actually receive — excludes invalidated batches +/// and batch-submitter direct inputs (which are filtered before WS delivery). +fn load_catchup_info( + db_path: &str, + from_offset: u64, + max_catchup_events: u64, + batch_submitter_address: Option

, +) -> Result<(u64, u64), SubscribeError> { let mut storage = Storage::open_read_only(db_path) .map_err(|source| SubscribeError::OpenStorage { source })?; - storage - .ordered_l2_tx_count() - .map_err(|source| SubscribeError::LoadHeadOffset { source }) + let head_offset = storage + .ordered_l2_tx_head_offset() + .map_err(|source| SubscribeError::LoadHeadOffset { source })?; + let catchup_count = storage + .count_broadcastable_events_after( + from_offset, + max_catchup_events.saturating_add(1), + batch_submitter_address, + ) + .map_err(|source| SubscribeError::LoadHeadOffset { source })?; + Ok((head_offset, catchup_count)) } fn run_subscription( @@ -153,7 +182,7 @@ fn run_subscription( } let txs = storage - .load_ordered_l2_txs_page_from(next_offset, page_size) + .ordered_l2_txs_page_from(next_offset, page_size) .map_err(|source| SubscriptionError::LoadReplay { offset: next_offset, source, @@ -164,18 +193,18 @@ fn run_subscription( continue; } - for tx in txs { + for (db_offset, tx) in txs { if shutdown.is_shutdown_requested() || events_tx.is_closed() { return Ok(()); } + next_offset = db_offset; + if should_filter_from_broadcast(&tx, batch_submitter_address) { - next_offset = next_offset.saturating_add(1); continue; } - let event = BroadcastTxMessage::from_offset_and_tx(next_offset, tx); - next_offset = next_offset.saturating_add(1); + let event = BroadcastTxMessage::from_offset_and_tx(db_offset, tx); if events_tx.blocking_send(event).is_err() { return Ok(()); } diff --git a/sequencer/src/egress/l2_tx_feed/tests.rs b/sequencer/src/egress/l2_tx_feed/tests.rs new file mode 100644 index 0000000..e298e91 --- /dev/null +++ b/sequencer/src/egress/l2_tx_feed/tests.rs @@ -0,0 +1,375 @@ +// (c) Cartesi and individual authors (see AUTHORS) +// SPDX-License-Identifier: Apache-2.0 (see LICENSE) + +use std::time::{Duration, SystemTime}; + +use alloy_primitives::{Address, Signature}; +use tokio::sync::oneshot; + +use super::{BroadcastTxMessage, L2TxFeed, L2TxFeedConfig, 
SubscribeError}; +use crate::ingress::inclusion_lane::{PendingUserOp, SequencerError}; +use crate::runtime::shutdown::ShutdownSignal; +use crate::storage::test_helpers::temp_db; +use crate::storage::{SafeInputRange, Storage, StoredSafeInput}; +use sequencer_core::l2_tx::{DirectInput, SequencedL2Tx, ValidUserOp}; +use sequencer_core::user_op::UserOp; + +#[test] +fn broadcast_user_op_serializes_with_hex_data() { + let msg = BroadcastTxMessage::from_offset_and_tx( + 7, + SequencedL2Tx::UserOp(ValidUserOp { + sender: Address::from_slice(&[0x11; 20]), + fee: 3, + data: vec![0xaa, 0xbb], + }), + ); + let json = serde_json::to_string(&msg).expect("serialize"); + assert!(json.contains("\"kind\":\"user_op\"")); + assert!(json.contains("\"offset\":7")); + assert!(json.contains("\"fee\":3")); + assert!(json.contains("\"data\":\"0xaabb\"")); +} + +#[test] +fn broadcast_direct_input_serializes_with_hex_payload() { + let msg = BroadcastTxMessage::from_offset_and_tx( + 9, + SequencedL2Tx::Direct(DirectInput { + sender: Address::ZERO, + block_number: 42, + payload: vec![0xcc, 0xdd], + }), + ); + let json = serde_json::to_string(&msg).expect("serialize"); + assert!(json.contains("\"kind\":\"direct_input\"")); + assert!(json.contains("\"offset\":9")); + assert!(json.contains("\"sender\":\"0x0000000000000000000000000000000000000000\"")); + assert!(json.contains("\"block_number\":42")); + assert!(json.contains("\"payload\":\"0xccdd\"")); +} + +#[tokio::test(flavor = "multi_thread", worker_threads = 2)] +async fn subscribe_from_rejects_catchup_window() { + let db = temp_db("catchup-window"); + seed_ordered_txs(db.path.as_str()); + let feed = test_feed(db.path.as_str(), ShutdownSignal::default()); + + let result = feed.subscribe_from(0, 1); + + assert!(matches!( + result, + Err(SubscribeError::CatchUpWindowExceeded { + requested_offset: 0, + live_start_offset: 2, + max_catchup_events: 1, + }) + )); +} + +#[tokio::test(flavor = "multi_thread", worker_threads = 2)] +async fn 
subscribe_from_accepts_exact_catchup_window() { + let db = temp_db("catchup-window-exact"); + seed_ordered_txs(db.path.as_str()); + let feed = test_feed(db.path.as_str(), ShutdownSignal::default()); + + let subscription = feed.subscribe_from(0, 2); + + assert!( + subscription.is_ok(), + "exactly 2 replayable events should be allowed" + ); +} + +#[tokio::test(flavor = "multi_thread", worker_threads = 2)] +async fn subscription_replays_existing_rows_in_order() { + let db = temp_db("replay-existing"); + seed_ordered_txs(db.path.as_str()); + let feed = test_feed(db.path.as_str(), ShutdownSignal::default()); + + let mut subscription = feed.subscribe_from(0, u64::MAX).expect("subscribe"); + + let first = tokio::time::timeout(Duration::from_secs(1), subscription.recv()) + .await + .expect("wait first event") + .expect("first event"); + let second = tokio::time::timeout(Duration::from_secs(1), subscription.recv()) + .await + .expect("wait second event") + .expect("second event"); + + // DB offsets (SQLite rowid) start at 1. + assert_eq!(first.offset(), 1); + assert_eq!(second.offset(), 2); + + subscription.finish().await.expect("finish subscription"); +} + +#[tokio::test(flavor = "multi_thread", worker_threads = 2)] +async fn subscription_filters_batch_submitter_safe_inputs() { + let db = temp_db("filters-batch-submitter-inputs"); + let batch_submitter_address = Address::from([0xfe; 20]); + seed_ordered_txs_with_sender(db.path.as_str(), batch_submitter_address); + let feed = L2TxFeed::new( + db.path.clone(), + ShutdownSignal::default(), + L2TxFeedConfig { + idle_poll_interval: Duration::from_millis(2), + page_size: 64, + batch_submitter_address: Some(batch_submitter_address), + }, + ); + + let mut subscription = feed.subscribe_from(0, u64::MAX).expect("subscribe"); + let first = tokio::time::timeout(Duration::from_secs(1), subscription.recv()) + .await + .expect("wait first event") + .expect("first event"); + + // DB offsets start at 1. 
The user op is the first sequenced tx (offset=1), + // and the batch submitter's safe input (offset=2) is filtered out. + assert!(matches!( + first, + BroadcastTxMessage::UserOp { offset: 1, .. } + )); + + let no_second = tokio::time::timeout(Duration::from_millis(50), subscription.recv()).await; + assert!( + no_second.is_err(), + "filtered batch-submitter input should not be broadcast" + ); + + subscription.finish().await.expect("finish subscription"); +} + +#[tokio::test(flavor = "multi_thread", worker_threads = 2)] +async fn shutdown_signal_closes_subscription() { + let db = temp_db("shutdown-closes"); + seed_ordered_txs(db.path.as_str()); + let shutdown = ShutdownSignal::default(); + let feed = test_feed(db.path.as_str(), shutdown.clone()); + + let mut subscription = feed.subscribe_from(u64::MAX, u64::MAX).expect("subscribe"); + + shutdown.request_shutdown(); + + assert!( + tokio::time::timeout(Duration::from_secs(1), subscription.recv()) + .await + .expect("wait for subscription close") + .is_none() + ); + subscription.finish().await.expect("clean shutdown"); +} + +#[tokio::test(flavor = "multi_thread", worker_threads = 2)] +async fn catchup_window_not_inflated_by_invalidated_batch_holes() { + // Regression test: after batch invalidation, offset holes in sequenced_l2_txs + // must not inflate the catch-up event count. The check should count actual + // valid events, not subtract rowids. + let db = temp_db("catchup-holes"); + let mut storage = Storage::open(db.path.as_str()).expect("open storage"); + + // Create two closed batches, each with one direct input. 
+ let mut head = storage + .initialize_open_state(0, SafeInputRange::empty_at(0)) + .expect("initialize"); + storage + .append_safe_inputs( + 10, + &[StoredSafeInput { + sender: Address::ZERO, + payload: vec![0xaa], + block_number: 10, + }], + &sequencer_core::protocol::ProtocolConfig { + batch_submitter: Address::ZERO, + max_wait_blocks: sequencer_core::MAX_WAIT_BLOCKS, + preemptive_margin_blocks: 75, + seconds_per_block: 12, + }, + ) + .expect("append direct 0"); + storage + .close_frame_only(&mut head, 10, SafeInputRange::new(0, 1)) + .expect("close frame"); + storage + .close_frame_and_batch(&mut head, 10) + .expect("close batch 0"); + + storage + .append_safe_inputs( + 20, + &[StoredSafeInput { + sender: Address::ZERO, + payload: vec![0xbb], + block_number: 20, + }], + &sequencer_core::protocol::ProtocolConfig { + batch_submitter: Address::ZERO, + max_wait_blocks: sequencer_core::MAX_WAIT_BLOCKS, + preemptive_margin_blocks: 75, + seconds_per_block: 12, + }, + ) + .expect("append direct 1"); + storage + .close_frame_only(&mut head, 20, SafeInputRange::new(1, 2)) + .expect("close frame"); + drop(storage); + + // Before invalidation: 2 valid events. + // With max_catchup_events=1, subscribing from 0 should fail. + let feed = test_feed(db.path.as_str(), ShutdownSignal::default()); + assert!( + feed.subscribe_from(0, 1).is_err(), + "should reject: 2 valid events > max 1" + ); + + // Invalidate batch 0 — this creates a hole in the offset space. + // Now only 1 valid event remains (from batch 1). + let mut storage = Storage::open(db.path.as_str()).expect("reopen storage"); + storage.insert_invalid_batch(0).expect("invalidate batch 0"); + drop(storage); + + // After invalidation: only 1 valid event, so max_catchup_events=1 should succeed. 
+ let feed = test_feed(db.path.as_str(), ShutdownSignal::default()); + assert!( + feed.subscribe_from(0, 1).is_ok(), + "should accept: only 1 valid event after invalidation, despite rowid hole" + ); +} + +#[tokio::test(flavor = "multi_thread", worker_threads = 2)] +async fn catchup_window_excludes_batch_submitter_direct_inputs() { + // Regression test: batch-submitter direct inputs are filtered before WS + // delivery, so the catch-up window must not count them. Otherwise a + // reconnecting client could be rejected even when the number of + // replayable messages is within the limit. + let db = temp_db("catchup-submitter-filter"); + let batch_submitter = Address::from([0xfe; 20]); + let user_address = Address::from([0x01; 20]); + + let mut storage = Storage::open(db.path.as_str()).expect("open storage"); + let mut head = storage + .initialize_open_state(0, SafeInputRange::empty_at(0)) + .expect("initialize"); + + // Two direct inputs: one from the batch submitter, one from a user. + storage + .append_safe_inputs( + 10, + &[ + StoredSafeInput { + sender: batch_submitter, + payload: vec![0xaa], + block_number: 10, + }, + StoredSafeInput { + sender: user_address, + payload: vec![0xbb], + block_number: 10, + }, + ], + &sequencer_core::protocol::ProtocolConfig { + batch_submitter: Address::ZERO, + max_wait_blocks: sequencer_core::MAX_WAIT_BLOCKS, + preemptive_margin_blocks: 75, + seconds_per_block: 12, + }, + ) + .expect("append directs"); + storage + .close_frame_only(&mut head, 10, SafeInputRange::new(0, 2)) + .expect("close frame"); + drop(storage); + + // Without batch_submitter_address filtering: 2 events, max=1 should reject. 
+ let feed_no_filter = L2TxFeed::new( + db.path.clone(), + ShutdownSignal::default(), + L2TxFeedConfig { + batch_submitter_address: None, + ..L2TxFeedConfig::default() + }, + ); + assert!( + feed_no_filter.subscribe_from(0, 1).is_err(), + "without filter: 2 events > max 1" + ); + + // With batch_submitter_address filtering: only the user's event counts. + let feed_filtered = L2TxFeed::new( + db.path.clone(), + ShutdownSignal::default(), + L2TxFeedConfig { + batch_submitter_address: Some(batch_submitter), + ..L2TxFeedConfig::default() + }, + ); + assert!( + feed_filtered.subscribe_from(0, 1).is_ok(), + "with filter: only 1 broadcastable event, should accept" + ); +} + +fn test_feed(db_path: &str, shutdown: ShutdownSignal) -> L2TxFeed { + L2TxFeed::new( + db_path.to_string(), + shutdown, + L2TxFeedConfig { + idle_poll_interval: Duration::from_millis(2), + page_size: 64, + batch_submitter_address: None, + }, + ) +} + +fn seed_ordered_txs(db_path: &str) { + seed_ordered_txs_with_sender(db_path, Address::ZERO); +} + +fn seed_ordered_txs_with_sender(db_path: &str, direct_sender: Address) { + let mut storage = Storage::open(db_path).expect("open storage"); + let mut head = storage + .initialize_open_state(0, SafeInputRange::empty_at(0)) + .expect("initialize open state"); + + let (respond_to, _recv) = oneshot::channel::<Result<(), SequencerError>>(); + let pending = PendingUserOp { + signed: sequencer_core::user_op::SignedUserOp { + sender: Address::from_slice(&[0x11; 20]), + signature: Signature::test_signature(), + user_op: UserOp { + nonce: 0, + max_fee: 3, + data: vec![0x42].into(), + }, + }, + respond_to, + received_at: SystemTime::now(), + }; + + storage + .append_user_ops_chunk(&mut head, &[pending]) + .expect("append user-op chunk"); + storage + .append_safe_inputs( + 10, + &[StoredSafeInput { + sender: direct_sender, + payload: vec![0xaa], + block_number: 10, + }], + &sequencer_core::protocol::ProtocolConfig { + batch_submitter: Address::ZERO, + max_wait_blocks: 
sequencer_core::MAX_WAIT_BLOCKS, + preemptive_margin_blocks: 75, + seconds_per_block: 12, + }, + ) + .expect("append direct input"); + storage + .close_frame_only(&mut head, 10, SafeInputRange::new(0, 1)) + .expect("close frame with one drained direct input"); +} diff --git a/sequencer/src/egress/mod.rs b/sequencer/src/egress/mod.rs new file mode 100644 index 0000000..ac7b75a --- /dev/null +++ b/sequencer/src/egress/mod.rs @@ -0,0 +1,9 @@ +// (c) Cartesi and individual authors (see AUTHORS) +// SPDX-License-Identifier: Apache-2.0 (see LICENSE) + +//! Outbound side: WS subscribe (today), future read-only endpoints, and the +//! L2-tx feed that backs them. Operated for internal indexers; the future api +//! split puts these on a separate port from ingress. + +pub mod api; +pub mod l2_tx_feed; diff --git a/sequencer/src/http.rs b/sequencer/src/http.rs new file mode 100644 index 0000000..3ee56f1 --- /dev/null +++ b/sequencer/src/http.rs @@ -0,0 +1,231 @@ +// (c) Cartesi and individual authors (see AUTHORS) +// SPDX-License-Identifier: Apache-2.0 (see LICENSE) + +//! Shared HTTP surface: error type + JSON response shape used by both +//! ingress (`/tx`) and egress (`/ws/subscribe`, future routes), plus the +//! `axum::serve` orchestration that wires the two side routers together. +//! +//! Today both sides serve from one listener; the planned api split puts each +//! side on its own port (same binary, two listeners). When that lands, the +//! orchestration here becomes per-side `start_*` calls. 
+ +use std::io; +use std::sync::Arc; + +use alloy_sol_types::Eip712Domain; +use axum::Json; +use axum::Router; +use axum::extract::DefaultBodyLimit; +use axum::http::StatusCode; +use axum::response::{IntoResponse, Response}; +use serde::Serialize; +use thiserror::Error; +use tokio::sync::mpsc; +use tower_http::trace::TraceLayer; + +use crate::egress::api::SubscribeState; +use crate::egress::l2_tx_feed::L2TxFeed; +use crate::ingress::api::SubmitState; +use crate::ingress::inclusion_lane::{PendingUserOp, SequencerError}; +use crate::runtime::shutdown::ShutdownSignal; +use sequencer_core::api::{TxRequest, TxRequestError}; + +#[derive(Debug, Error, Clone)] +pub enum ApiError { + #[error("{0}")] + BadRequest(String), + #[error("{0}")] + PayloadTooLarge(String), + #[error("{0}")] + InvalidSignature(String), + #[error("{0}")] + ExecutionRejected(String), + #[error("{0}")] + Unavailable(String), + #[error("{0}")] + InternalError(String), + #[error("{0}")] + Overloaded(String), +} + +#[derive(Debug, Serialize)] +struct ErrorResponse { + ok: bool, + code: &'static str, + message: String, +} + +impl ApiError { + pub fn bad_request(message: impl Into<String>) -> Self { + Self::BadRequest(message.into()) + } + + pub fn payload_too_large(message: impl Into<String>) -> Self { + Self::PayloadTooLarge(message.into()) + } + + pub fn invalid_signature(message: impl Into<String>) -> Self { + Self::InvalidSignature(message.into()) + } + + pub fn internal_error(message: impl Into<String>) -> Self { + Self::InternalError(message.into()) + } + + pub fn unavailable(message: impl Into<String>) -> Self { + Self::Unavailable(message.into()) + } + + pub fn overloaded(message: impl Into<String>) -> Self { + Self::Overloaded(message.into()) + } + + pub fn status(&self) -> StatusCode { + match self { + Self::BadRequest(_) | Self::InvalidSignature(_) => StatusCode::BAD_REQUEST, + Self::PayloadTooLarge(_) => StatusCode::PAYLOAD_TOO_LARGE, + Self::ExecutionRejected(_) => StatusCode::UNPROCESSABLE_ENTITY, + Self::Unavailable(_) => 
StatusCode::SERVICE_UNAVAILABLE, + Self::InternalError(_) => StatusCode::INTERNAL_SERVER_ERROR, + Self::Overloaded(_) => StatusCode::TOO_MANY_REQUESTS, + } + } + + pub fn code(&self) -> &'static str { + match self { + Self::BadRequest(_) => "BAD_REQUEST", + Self::PayloadTooLarge(_) => "PAYLOAD_TOO_LARGE", + Self::InvalidSignature(_) => "INVALID_SIGNATURE", + Self::ExecutionRejected(_) => "EXECUTION_REJECTED", + Self::Unavailable(_) => "UNAVAILABLE", + Self::InternalError(_) => "INTERNAL_ERROR", + Self::Overloaded(_) => "OVERLOADED", + } + } +} + +impl From<SequencerError> for ApiError { + fn from(value: SequencerError) -> Self { + match value { + SequencerError::Invalid(message) => Self::ExecutionRejected(message), + SequencerError::Unavailable(message) => Self::Unavailable(message), + SequencerError::Internal(message) => Self::InternalError(message), + } + } +} + +impl From<TxRequestError> for ApiError { + fn from(value: TxRequestError) -> Self { + match value { + TxRequestError::BadRequest(message) => Self::BadRequest(message), + TxRequestError::InvalidSignature(message) => Self::InvalidSignature(message), + } + } +} + +impl IntoResponse for ApiError { + fn into_response(self) -> Response { + let body = ErrorResponse { + ok: false, + code: self.code(), + message: self.to_string(), + }; + (self.status(), Json(body)).into_response() + } +} + +// ── HTTP server orchestration ──────────────────────────────────────────────── +// +// Combines ingress + egress routers into one axum::serve. The api split will +// replace this with per-side starts on different ports. + +const DEFAULT_WS_MAX_SUBSCRIBERS: usize = 64; +const DEFAULT_WS_MAX_CATCHUP_EVENTS: u64 = 50_000; +const DEFAULT_MAX_BODY_BYTES: usize = TxRequest::MAX_JSON_BYTES_RECOMMENDED; + +/// Reason returned in the WS Close frame when the subscriber's requested +/// `from_offset` is too old for the catch-up window to bridge. 
+pub const WS_CATCHUP_WINDOW_EXCEEDED_REASON: &str = "catch-up window exceeded"; + +pub type ApiServerTask = tokio::task::JoinHandle<io::Result<()>>; + +#[derive(Debug, Clone, Copy)] +pub struct ApiConfig { + pub max_body_bytes: usize, + pub ws_max_subscribers: usize, + pub ws_max_catchup_events: u64, +} + +impl Default for ApiConfig { + fn default() -> Self { + Self { + max_body_bytes: DEFAULT_MAX_BODY_BYTES, + ws_max_subscribers: DEFAULT_WS_MAX_SUBSCRIBERS, + ws_max_catchup_events: DEFAULT_WS_MAX_CATCHUP_EVENTS, + } + } +} + +#[allow(clippy::too_many_arguments)] +pub async fn start( + http_addr: impl tokio::net::ToSocketAddrs, + tx_sender: mpsc::Sender<PendingUserOp>, + domain: Eip712Domain, + max_user_op_data_bytes: usize, + shutdown: ShutdownSignal, + tx_feed: L2TxFeed, + config: ApiConfig, +) -> io::Result<ApiServerTask> { + let listener = tokio::net::TcpListener::bind(http_addr).await?; + Ok(start_on_listener( + listener, + tx_sender, + domain, + max_user_op_data_bytes, + shutdown, + tx_feed, + config, + )) +} + +#[allow(clippy::too_many_arguments)] +pub fn start_on_listener( + listener: tokio::net::TcpListener, + tx_sender: mpsc::Sender<PendingUserOp>, + domain: Eip712Domain, + max_user_op_data_bytes: usize, + shutdown: ShutdownSignal, + tx_feed: L2TxFeed, + config: ApiConfig, +) -> ApiServerTask { + let health_state = Arc::new(crate::egress::api::HealthState { + tx_sender: tx_sender.clone(), + shutdown: shutdown.clone(), + }); + let submit_state = Arc::new(SubmitState::new( + tx_sender, + domain, + max_user_op_data_bytes, + shutdown.clone(), + )); + let subscribe_state = Arc::new(SubscribeState::new( + shutdown.clone(), + tx_feed, + config.ws_max_subscribers, + config.ws_max_catchup_events, + )); + + let app: Router = crate::ingress::api::router(submit_state) + .merge(crate::egress::api::router(subscribe_state, health_state)) + // Enforces a raw request-body cap before JSON deserialization, including whitespace. 
+ .layer(DefaultBodyLimit::max(config.max_body_bytes)) + .layer(TraceLayer::new_for_http()); + + tokio::spawn(async move { + axum::serve(listener, app) + .with_graceful_shutdown(async move { + shutdown.wait_for_shutdown().await; + }) + .await + }) +} diff --git a/sequencer/src/inclusion_lane/config.rs b/sequencer/src/inclusion_lane/config.rs deleted file mode 100644 index fff90d8..0000000 --- a/sequencer/src/inclusion_lane/config.rs +++ /dev/null @@ -1,32 +0,0 @@ -// (c) Cartesi and individual authors (see AUTHORS) -// SPDX-License-Identifier: Apache-2.0 (see LICENSE) - -use std::time::Duration; - -use alloy_primitives::Address; - -const DEFAULT_MAX_USER_OPS_PER_CHUNK: usize = 64; -const DEFAULT_SAFE_INPUT_BUFFER_CAPACITY: usize = 2048; -const DEFAULT_MAX_BATCH_OPEN: Duration = Duration::from_secs(2 * 60 * 60); -const DEFAULT_IDLE_POLL_INTERVAL: Duration = Duration::from_millis(10); - -#[derive(Debug, Clone, Copy)] -pub struct InclusionLaneConfig { - pub batch_submitter_address: Address, - pub max_user_ops_per_chunk: usize, - pub safe_input_buffer_capacity: usize, - pub max_batch_open: Duration, - pub idle_poll_interval: Duration, -} - -impl InclusionLaneConfig { - pub fn new(batch_submitter_address: Address) -> Self { - Self { - batch_submitter_address, - max_user_ops_per_chunk: DEFAULT_MAX_USER_OPS_PER_CHUNK, - safe_input_buffer_capacity: DEFAULT_SAFE_INPUT_BUFFER_CAPACITY, - max_batch_open: DEFAULT_MAX_BATCH_OPEN, - idle_poll_interval: DEFAULT_IDLE_POLL_INTERVAL, - } - } -} diff --git a/sequencer/src/inclusion_lane/lane.rs b/sequencer/src/inclusion_lane/lane.rs deleted file mode 100644 index 459833e..0000000 --- a/sequencer/src/inclusion_lane/lane.rs +++ /dev/null @@ -1,373 +0,0 @@ -// (c) Cartesi and individual authors (see AUTHORS) -// SPDX-License-Identifier: Apache-2.0 (see LICENSE) - -use std::thread; -use std::time::SystemTime; - -use tokio::sync::mpsc; -use tokio::task::JoinHandle; - -use crate::shutdown::ShutdownSignal; -use 
crate::storage::{SafeInputRange, Storage, StoredSafeInput, WriteHead}; -use sequencer_core::application::{AppError, Application, ExecutionOutcome}; -use sequencer_core::l2_tx::DirectInput; -use sequencer_core::user_op::SignedUserOp; - -use super::catch_up::catch_up_application; -use super::config::InclusionLaneConfig; -use super::{InclusionLaneError, PendingUserOp, SequencerError}; - -pub struct InclusionLane { - rx: mpsc::Receiver, - shutdown: ShutdownSignal, - app: A, - storage: Storage, - config: InclusionLaneConfig, -} - -impl InclusionLane { - pub fn start( - queue_capacity: usize, - shutdown: ShutdownSignal, - app: A, - storage: Storage, - config: InclusionLaneConfig, - ) -> ( - mpsc::Sender, - JoinHandle>, - ) { - let (tx, rx) = mpsc::channel::(queue_capacity.max(1)); - let handle = tokio::task::spawn_blocking(move || { - let mut lane = Self { - rx, - shutdown, - app, - storage, - config, - }; - lane.run_forever() - }); - (tx, handle) - } - - fn run_forever(&mut self) -> Result<(), InclusionLaneError> { - self.run_catch_up()?; - let mut included = Vec::with_capacity(self.config.max_user_ops_per_chunk.max(1)); - let mut safe_inputs = Vec::with_capacity(self.config.safe_input_buffer_capacity.max(1)); - let mut lane_state = self.load_or_initialize_lane_state(&mut safe_inputs)?; - - loop { - if self.shutdown.is_shutdown_requested() { - self.reject_pending_user_ops_due_to_shutdown(); - return Ok(()); - } - - let advanced_safe_frontier = - self.maybe_advance_safe_frontier(&mut lane_state, &mut safe_inputs)?; - - let included_user_op_count = - self.process_user_op_chunk(&mut lane_state.head, &mut included)?; - - if should_close_batch::(&lane_state.head, &self.config) { - let next_safe_block = lane_state.head.safe_block; - self.close_frame_and_batch(&mut lane_state.head, next_safe_block)?; - } else if !advanced_safe_frontier && included_user_op_count == 0 { - thread::sleep(self.config.idle_poll_interval); - } - } - } - - fn run_catch_up(&mut self) -> Result<(), 
InclusionLaneError> { - catch_up_application( - &mut self.app, - &mut self.storage, - self.config.batch_submitter_address, - ) - .map_err(|source| InclusionLaneError::CatchUp { source }) - } - - fn load_or_initialize_lane_state( - &mut self, - safe_inputs: &mut Vec, - ) -> Result { - let next_safe_input_index = self - .storage - .load_next_undrained_safe_input_index() - .map_err(|source| InclusionLaneError::LoadNextUndrainedDirectInputIndex { source })?; - - let last_drained_direct_range = SafeInputRange::empty_at(next_safe_input_index); - if let Some(head) = self - .storage - .load_open_state() - .map_err(|source| InclusionLaneError::LoadOpenState { source })? - { - return Ok(LaneState { - last_drained_direct_range, - head, - }); - } - - let frontier = self - .storage - .load_safe_frontier() - .map_err(|source| InclusionLaneError::LoadSafeInputs { source })?; - assert!( - frontier.end_exclusive >= last_drained_direct_range.end_exclusive, - "safe-input head regressed during lane initialization: safe_end={}, next={}", - frontier.end_exclusive, - last_drained_direct_range.end_exclusive - ); - - let leading_direct_range = last_drained_direct_range.advance_to(frontier.end_exclusive); - self.execute_safe_inputs_range(leading_direct_range, safe_inputs)?; - let head = self - .storage - .initialize_open_state(frontier.safe_block, leading_direct_range) - .map_err(|source| InclusionLaneError::LoadOpenState { source })?; - - Ok(LaneState { - last_drained_direct_range: leading_direct_range, - head, - }) - } - - fn process_user_op_chunk( - &mut self, - head: &mut WriteHead, - included: &mut Vec, - ) -> Result { - included.clear(); - dequeue_and_execute_user_op_chunk( - &mut self.rx, - &mut self.app, - head.frame_fee, - self.config.max_user_ops_per_chunk.max(1), - included, - )?; - let included_count = included.len(); - - self.persist_included_user_ops(head, included)?; - - for item in included.drain(..) 
{ - let _ = item.respond_to.send(Ok(())); - } - - Ok(included_count) - } - - fn maybe_advance_safe_frontier( - &mut self, - lane_state: &mut LaneState, - safe_inputs: &mut Vec, - ) -> Result { - let frontier = self - .storage - .load_safe_frontier() - .map_err(|source| InclusionLaneError::LoadSafeInputs { source })?; - assert!( - frontier.end_exclusive >= lane_state.last_drained_direct_range.end_exclusive, - "safe-input head regressed: safe_end={}, next={}", - frontier.end_exclusive, - lane_state.last_drained_direct_range.end_exclusive - ); - if frontier.safe_block <= lane_state.head.safe_block { - return Ok(false); - } - - let leading_direct_range = lane_state - .last_drained_direct_range - .advance_to(frontier.end_exclusive); - self.execute_safe_inputs_range(leading_direct_range, safe_inputs)?; - self.close_frame_only( - &mut lane_state.head, - frontier.safe_block, - leading_direct_range, - )?; - lane_state.last_drained_direct_range = leading_direct_range; - Ok(true) - } - - fn persist_included_user_ops( - &mut self, - head: &mut WriteHead, - included: &mut Vec, - ) -> Result<(), InclusionLaneError> { - self.storage - .append_user_ops_chunk(head, included.as_slice()) - .map_err(|source| { - Self::respond_internal_to_all(included, format!("db error: {source}")); - InclusionLaneError::AppendUserOps { source } - }) - } - - fn execute_safe_inputs_range( - &mut self, - direct_range: SafeInputRange, - chunk: &mut Vec, - ) -> Result { - let max_chunk_len = self.config.safe_input_buffer_capacity.max(1) as u64; - let mut chunk_start = direct_range.start_inclusive; - while chunk_start < direct_range.end_exclusive { - let chunk_end_exclusive = direct_range - .end_exclusive - .min(chunk_start.saturating_add(max_chunk_len)); - self.load_safe_inputs_chunk(chunk_start, chunk_end_exclusive, chunk)?; - self.execute_safe_inputs_chunk(chunk.as_slice())?; - chunk_start = chunk_end_exclusive; - } - - Ok(direct_range) - } - - fn close_frame_and_batch( - &mut self, - head: &mut 
WriteHead, - next_safe_block: u64, - ) -> Result<(), InclusionLaneError> { - self.storage - .close_frame_and_batch(head, next_safe_block) - .map_err(|source| InclusionLaneError::CloseFrameRotate { source }) - } - - fn close_frame_only( - &mut self, - head: &mut WriteHead, - next_safe_block: u64, - leading_direct_range: SafeInputRange, - ) -> Result<(), InclusionLaneError> { - self.storage - .close_frame_only(head, next_safe_block, leading_direct_range) - .map_err(|source| InclusionLaneError::CloseFrameRotate { source }) - } - - fn load_safe_inputs_chunk( - &mut self, - start_inclusive: u64, - end_exclusive: u64, - chunk: &mut Vec, - ) -> Result<(), InclusionLaneError> { - chunk.clear(); - self.storage - .fill_safe_inputs(start_inclusive, end_exclusive, chunk) - .map_err(|source| InclusionLaneError::LoadSafeInputs { source }) - } - - fn execute_safe_inputs_chunk( - &mut self, - chunk: &[StoredSafeInput], - ) -> Result<(), InclusionLaneError> { - for input in chunk { - if input.sender == self.config.batch_submitter_address { - continue; - } - let direct_input = DirectInput { - sender: input.sender, - block_number: input.block_number, - payload: input.payload.clone(), - }; - - self.app - .execute_direct_input(&direct_input) - .map_err(|source| InclusionLaneError::ExecuteDirectInput { source })?; - } - Ok(()) - } - - fn respond_internal_to_all(pending: &mut Vec, message: String) { - for item in pending.drain(..) 
{ - let _ = item - .respond_to - .send(Err(SequencerError::internal(message.clone()))); - } - } - - fn reject_pending_user_ops_due_to_shutdown(&mut self) { - loop { - match self.rx.try_recv() { - Ok(item) => { - let _ = item - .respond_to - .send(Err(SequencerError::unavailable("sequencer shutting down"))); - } - Err(mpsc::error::TryRecvError::Empty) - | Err(mpsc::error::TryRecvError::Disconnected) => return, - } - } - } -} - -fn should_close_batch(head: &WriteHead, config: &InclusionLaneConfig) -> bool { - should_close_batch_by_time(head, config) || should_close_batch_by_size::(head) -} - -fn should_close_batch_by_time(head: &WriteHead, config: &InclusionLaneConfig) -> bool { - let age = SystemTime::now() - .duration_since(head.batch_created_at) - .unwrap_or_default(); - age >= config.max_batch_open -} - -fn should_close_batch_by_size(head: &WriteHead) -> bool { - user_op_count_to_bytes::(head.batch_user_op_count) >= head.max_batch_user_op_bytes -} - -fn execute_user_op( - app: &mut impl Application, - item: PendingUserOp, - current_frame_fee: u16, - included: &mut Vec, -) { - match app.validate_and_execute_user_op( - item.signed.sender, - &item.signed.user_op, - current_frame_fee, - ) { - Ok(ExecutionOutcome::Included { .. 
}) => included.push(item), - Ok(ExecutionOutcome::Invalid(reason)) => { - let _ = item - .respond_to - .send(Err(SequencerError::invalid(reason.to_string()))); - } - Err(AppError::Internal { reason }) => { - let _ = item.respond_to.send(Err(SequencerError::internal(reason))); - } - } -} - -pub(super) fn dequeue_and_execute_user_op_chunk( - rx: &mut mpsc::Receiver, - app: &mut impl Application, - current_frame_fee: u16, - max_chunk: usize, - included: &mut Vec, -) -> Result<(), InclusionLaneError> { - let mut executed_user_ops = 0_usize; - - while executed_user_ops < max_chunk { - match rx.try_recv() { - Ok(item) => { - execute_user_op(app, item, current_frame_fee, included); - executed_user_ops = executed_user_ops.saturating_add(1); - } - Err(mpsc::error::TryRecvError::Empty) => return Ok(()), - Err(mpsc::error::TryRecvError::Disconnected) => { - if executed_user_ops == 0 { - return Err(InclusionLaneError::ChannelClosed); - } - return Ok(()); - } - } - } - - Ok(()) -} - -fn user_op_count_to_bytes(user_op_count: u64) -> u64 { - let one_user_op_bytes = SignedUserOp::max_batch_metadata() + A::MAX_METHOD_PAYLOAD_BYTES; - user_op_count.saturating_mul(one_user_op_bytes as u64) -} - -struct LaneState { - last_drained_direct_range: SafeInputRange, - head: WriteHead, -} diff --git a/sequencer/src/inclusion_lane/mod.rs b/sequencer/src/inclusion_lane/mod.rs deleted file mode 100644 index 7e52786..0000000 --- a/sequencer/src/inclusion_lane/mod.rs +++ /dev/null @@ -1,16 +0,0 @@ -// (c) Cartesi and individual authors (see AUTHORS) -// SPDX-License-Identifier: Apache-2.0 (see LICENSE) - -mod catch_up; -mod config; -mod error; -mod lane; -mod types; - -pub use config::InclusionLaneConfig; -pub use error::InclusionLaneError; -pub use lane::InclusionLane; -pub use types::{PendingUserOp, SequencerError}; - -#[cfg(test)] -mod tests; diff --git a/sequencer/src/ingress/api.rs b/sequencer/src/ingress/api.rs new file mode 100644 index 0000000..00f6940 --- /dev/null +++ 
b/sequencer/src/ingress/api.rs @@ -0,0 +1,285 @@ +// (c) Cartesi and individual authors (see AUTHORS) +// SPDX-License-Identifier: Apache-2.0 (see LICENSE) + +//! `POST /tx` — validate a signed user op, enqueue it for the inclusion lane, +//! and wait for the lane's commit ack before responding. Synchronous from the +//! client's perspective: 200 means included. + +use std::sync::Arc; +use std::time::SystemTime; + +use alloy_sol_types::Eip712Domain; +use axum::Router; +use axum::extract::{Json, State}; +use axum::http::StatusCode; +use axum::routing::post; +use tokio::sync::mpsc::{self, error::TrySendError}; +use tokio::sync::oneshot; +use tracing::debug; + +use crate::http::ApiError; +use crate::ingress::inclusion_lane::PendingUserOp; +use crate::runtime::shutdown::ShutdownSignal; +use sequencer_core::api::{TxRequest, TxResponse}; +use sequencer_core::user_op::SignedUserOp; + +/// State for the submit endpoint. Kept narrow — only what `/tx` actually needs. +#[derive(Clone)] +pub(crate) struct SubmitState { + pub tx_sender: mpsc::Sender<PendingUserOp>, + pub domain: Eip712Domain, + pub max_user_op_data_bytes: usize, + pub shutdown: ShutdownSignal, +} + +impl SubmitState { + pub(crate) fn new( + tx_sender: mpsc::Sender<PendingUserOp>, + domain: Eip712Domain, + max_user_op_data_bytes: usize, + shutdown: ShutdownSignal, + ) -> Self { + Self { + tx_sender, + domain, + max_user_op_data_bytes, + shutdown, + } + } + + fn reject_if_shutting_down(&self) -> Result<(), ApiError> { + if self.shutdown.is_shutdown_requested() { + Err(ApiError::unavailable("sequencer shutting down")) + } else { + Ok(()) + } + } +} + +/// Build the ingress router. Caller wires it into an `axum::serve` listener. 
+pub(crate) fn router(state: Arc<SubmitState>) -> Router { + Router::new() + .route("/tx", post(submit_tx)) + .with_state(state) +} + +async fn submit_tx( + State(state): State<Arc<SubmitState>>, + req: Result<Json<TxRequest>, axum::extract::rejection::JsonRejection>, +) -> Result<Json<TxResponse>, ApiError> { + let Json(req) = req.map_err(map_json_rejection)?; + + let signed = req + .into_signed_user_op(&state.domain, state.max_user_op_data_bytes) + .map_err(ApiError::from)?; + let nonce = signed.user_op.nonce; + let sender = signed.sender; + let ack = enqueue_verified_tx(state.as_ref(), signed)?; + + let commit_result = ack + .await + .map_err(|_| ApiError::internal_error("inclusion lane dropped response"))?; + commit_result.map_err(ApiError::from)?; + debug!(sender = %sender, nonce, "tx committed"); + + Ok(Json(TxResponse { + ok: true, + sender: sender.to_string(), + nonce, + })) +} + +/// Normalize JSON-extractor failures into fixed client-facing messages. +/// Keeps the public API contract stable across axum upgrades and avoids +/// reflecting parser internals (serde line/column, token excerpts) to callers. 
+fn map_json_rejection(err: axum::extract::rejection::JsonRejection) -> ApiError { + use axum::extract::rejection::JsonRejection; + + tracing::debug!(error = %err, "JSON extraction failed"); + + if err.status() == StatusCode::PAYLOAD_TOO_LARGE { + ApiError::payload_too_large("request body too large") + } else { + match err { + JsonRejection::MissingJsonContentType(_) => { + ApiError::bad_request("missing content type") + } + _ => ApiError::bad_request("invalid JSON"), + } + } +} + +fn enqueue_verified_tx( + state: &SubmitState, + signed: SignedUserOp, +) -> Result<oneshot::Receiver<Result<(), crate::ingress::inclusion_lane::SequencerError>>, ApiError> +{ + state.reject_if_shutting_down()?; + + let (respond_to, recv) = oneshot::channel(); + let pending = PendingUserOp { + signed, + respond_to, + received_at: SystemTime::now(), + }; + + match state.tx_sender.try_send(pending) { + Ok(()) => Ok(recv), + Err(TrySendError::Full(_)) => Err(ApiError::overloaded("queue full")), + Err(TrySendError::Closed(_)) => Err(ApiError::internal_error("inclusion lane unavailable")), + } +} + +#[cfg(test)] +mod tests { + use super::*; + + use alloy_primitives::{Address, Signature}; + use alloy_sol_types::Eip712Domain; + use alloy_sol_types::SolStruct; + use axum::http::StatusCode; + use k256::ecdsa::SigningKey; + use k256::ecdsa::signature::hazmat::PrehashSigner; + use std::sync::Arc; + use tempfile::TempDir; + use tokio::sync::mpsc; + + use crate::storage::Storage; + use sequencer_core::user_op::UserOp; + + #[tokio::test(flavor = "current_thread")] + async fn submit_tx_rejects_when_shutdown_has_started() { + let db = TempDir::new().expect("create temp dir"); + let db_path = db.path().join("sequencer.db"); + let _storage = Storage::open(&db_path.to_string_lossy()).expect("create db"); + let shutdown = ShutdownSignal::default(); + shutdown.request_shutdown(); + + let (tx_sender, _rx) = mpsc::channel::<PendingUserOp>(1); + let state = Arc::new(SubmitState::new( + tx_sender, + Eip712Domain { + name: None, + version: None, + chain_id: None, + verifying_contract: None, + salt: 
None, + }, + 128, + shutdown, + )); + + let signing_key = SigningKey::from_bytes((&[7_u8; 32]).into()).expect("create signing key"); + let sender = address_from_signing_key(&signing_key); + let user_op = UserOp { + nonce: 0, + max_fee: 0, + data: Vec::new().into(), + }; + let request = TxRequest { + message: user_op.clone(), + signature: sign_user_op_hex(&state.domain, &user_op, &signing_key), + sender: sender.to_string(), + }; + + let result = submit_tx(State(state), Ok(Json(request))).await; + + let err = result.expect_err("submit should be rejected during shutdown"); + assert_eq!(err.status(), StatusCode::SERVICE_UNAVAILABLE); + assert_eq!(err.code(), "UNAVAILABLE"); + } + + fn sign_user_op_hex( + domain: &Eip712Domain, + user_op: &UserOp, + signing_key: &SigningKey, + ) -> String { + let hash = user_op.eip712_signing_hash(domain); + let k256_sig = signing_key + .sign_prehash(hash.as_slice()) + .expect("sign user op hash"); + + let sender = address_from_signing_key(signing_key); + let signature = [false, true] + .into_iter() + .map(|parity| Signature::from_signature_and_parity(k256_sig, parity)) + .find(|candidate| { + candidate + .recover_address_from_prehash(&hash) + .ok() + .map(|value| value == sender) + .unwrap_or(false) + }) + .expect("recoverable parity for signature"); + + alloy_primitives::hex::encode_prefixed(signature.as_bytes()) + } + + fn address_from_signing_key(signing_key: &SigningKey) -> Address { + let verifying = signing_key.verifying_key().to_encoded_point(false); + Address::from_raw_public_key(&verifying.as_bytes()[1..]) + } + + // ── §1.7 S-malleability — no alternate signature can recover a different + // address at our boundary. Structurally guaranteed by alloy+k256; this is + // a regression lock. + + #[test] + fn s_malleable_signature_cannot_recover_a_different_address() { + use alloy_primitives::{B256, U256}; + + // secp256k1 curve order `n`. 
s' = n - s is the canonical malleable + // transform that pairs with flipped parity to produce an alternate + // signature recovering the same public key. + const SECP256K1_N: U256 = U256::from_be_slice(&[ + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0xFF, 0xFE, 0xBA, 0xAE, 0xDC, 0xE6, 0xAF, 0x48, 0xA0, 0x3B, 0xBF, 0xD2, 0x5E, 0x8C, + 0xD0, 0x36, 0x41, 0x41, + ]); + + let signing_key = SigningKey::from_bytes((&[0x42_u8; 32]).into()).expect("key"); + let expected_sender = address_from_signing_key(&signing_key); + + let msg_hash = B256::from([0xfe_u8; 32]); + let k256_sig = signing_key + .sign_prehash(msg_hash.as_slice()) + .expect("sign prehash"); + + // k256's `sign_prehash` returns a low-s signature by default. Find the + // parity that pairs with it to recover the expected signer. + let valid_sig = [false, true] + .into_iter() + .map(|p| Signature::from_signature_and_parity(k256_sig, p)) + .find(|s| { + s.recover_address_from_prehash(&msg_hash) + .ok() + .is_some_and(|a| a == expected_sender) + }) + .expect("low-s signature must recover the signer with one parity"); + + // Construct the S-malleable variant: same r, s' = n - s, flipped parity. + let malleable_sig = + Signature::new(valid_sig.r(), SECP256K1_N - valid_sig.s(), !valid_sig.v()); + assert_ne!( + malleable_sig.s(), + valid_sig.s(), + "malleable transform must actually change the signature", + ); + + match malleable_sig.recover_address_from_prehash(&msg_hash) { + Err(_) => { + // alloy rejected the high-s form (EIP-2 style). Impersonation + // via malleability is structurally impossible at recovery. + } + Ok(addr) => { + // alloy accepted high-s; it MUST return the same signer. + // Any other outcome would let an attacker grind a distinct + // signature that recovers a different address. 
+ assert_eq!( + addr, expected_sender, + "malleable signature recovered a DIFFERENT address — impersonation possible", + ); + } + } + } +} diff --git a/sequencer/src/inclusion_lane/catch_up.rs b/sequencer/src/ingress/inclusion_lane/catch_up.rs similarity index 79% rename from sequencer/src/inclusion_lane/catch_up.rs rename to sequencer/src/ingress/inclusion_lane/catch_up.rs index 8515b34..b01cff0 100644 --- a/sequencer/src/inclusion_lane/catch_up.rs +++ b/sequencer/src/ingress/inclusion_lane/catch_up.rs @@ -1,6 +1,10 @@ // (c) Cartesi and individual authors (see AUTHORS) // SPDX-License-Identifier: Apache-2.0 (see LICENSE) +//! Startup-only replay: walk the persisted ordered-L2-tx stream and feed it +//! to the application so its in-memory state matches the DB before the lane +//! starts taking new work. Runs once, before the hot loop. + use alloy_primitives::Address; use crate::storage::Storage; @@ -30,12 +34,14 @@ pub(super) fn catch_up_application_paged( batch_submitter_address: Address, page_size: usize, ) -> Result<(), CatchUpError> { - let mut next_offset = 0; + // Cursor tracks the DB offset of the last processed item. + // SQLite rowids start at 1, so 0 means "before all items". 
+ let mut next_offset: u64 = 0; let page_size = page_size.max(1); loop { let replay = storage - .load_ordered_l2_txs_page_from(next_offset, page_size) + .ordered_l2_txs_page_from(next_offset, page_size) .map_err(|source| CatchUpError::LoadReplay { offset: next_offset, source, @@ -45,9 +51,9 @@ pub(super) fn catch_up_application_paged( return Ok(()); } - for item in replay { + for (db_offset, item) in replay { replay_sequenced_l2_tx(app, batch_submitter_address, item)?; - next_offset = next_offset.saturating_add(1); + next_offset = db_offset; } } } diff --git a/sequencer/src/ingress/inclusion_lane/config.rs b/sequencer/src/ingress/inclusion_lane/config.rs new file mode 100644 index 0000000..1bc9f69 --- /dev/null +++ b/sequencer/src/ingress/inclusion_lane/config.rs @@ -0,0 +1,52 @@ +// (c) Cartesi and individual authors (see AUTHORS) +// SPDX-License-Identifier: Apache-2.0 (see LICENSE) + +//! Runtime knobs for the inclusion lane. Defaults tuned for low-latency +//! Ethereum L1 deployment; tests override individual fields directly. + +use std::time::Duration; + +use alloy_primitives::Address; + +const DEFAULT_MAX_USER_OPS_PER_CHUNK: usize = 64; +const DEFAULT_SAFE_INPUT_BUFFER_CAPACITY: usize = 2048; +const DEFAULT_MAX_BATCH_OPEN: Duration = Duration::from_secs(2 * 60 * 60); +const DEFAULT_IDLE_POLL_INTERVAL: Duration = Duration::from_millis(10); +/// Minimum gap between L1 safe-frontier polls. Bounds the SQL load when the +/// lane is otherwise idle. L1 safe head advances at ~12s cadence, so 1s is +/// well inside the responsiveness budget. +const DEFAULT_FRONTIER_MIN_INTERVAL: Duration = Duration::from_secs(1); + +#[derive(Debug, Clone, Copy)] +pub struct InclusionLaneConfig { + /// Address of the batch submitter wallet. Direct inputs from this sender + /// are skipped during application execution (they're our own batch + /// submissions; the application doesn't apply them as user-level inputs). 
+ pub batch_submitter_address: Address, + /// Cap on user ops dequeued per chunk. Bounds per-chunk SQL transaction + /// size and (more importantly) ack latency for the first op in each chunk. + pub max_user_ops_per_chunk: usize, + /// Reusable buffer size for safe-input loading. Doesn't bound work; just + /// the memory ceiling for the read-and-execute scratch buffer. + pub safe_input_buffer_capacity: usize, + /// Force a batch close after this much wall time, regardless of size. + pub max_batch_open: Duration, + /// Sleep duration when the lane has nothing to do (no queue, no advance). + pub idle_poll_interval: Duration, + /// Minimum gap between L1 safe-frontier polls. Bounds idle SQL load. See + /// `DEFAULT_FRONTIER_MIN_INTERVAL` for the rationale on the default. + pub frontier_min_interval: Duration, +} + +impl InclusionLaneConfig { + pub fn new(batch_submitter_address: Address) -> Self { + Self { + batch_submitter_address, + max_user_ops_per_chunk: DEFAULT_MAX_USER_OPS_PER_CHUNK, + safe_input_buffer_capacity: DEFAULT_SAFE_INPUT_BUFFER_CAPACITY, + max_batch_open: DEFAULT_MAX_BATCH_OPEN, + idle_poll_interval: DEFAULT_IDLE_POLL_INTERVAL, + frontier_min_interval: DEFAULT_FRONTIER_MIN_INTERVAL, + } + } +} diff --git a/sequencer/src/inclusion_lane/error.rs b/sequencer/src/ingress/inclusion_lane/error.rs similarity index 60% rename from sequencer/src/inclusion_lane/error.rs rename to sequencer/src/ingress/inclusion_lane/error.rs index 03333db..7849c75 100644 --- a/sequencer/src/inclusion_lane/error.rs +++ b/sequencer/src/ingress/inclusion_lane/error.rs @@ -1,6 +1,9 @@ // (c) Cartesi and individual authors (see AUTHORS) // SPDX-License-Identifier: Apache-2.0 (see LICENSE) +//! Lane-level error types. Returned from the lane's join handle; the runtime +//! logs them and may shut down depending on severity. 
+ use sequencer_core::application::AppError; use thiserror::Error; @@ -13,36 +16,18 @@ pub enum InclusionLaneError { #[source] source: CatchUpError, }, - #[error("cannot load next undrained safe-input index")] - LoadNextUndrainedDirectInputIndex { - #[source] - source: rusqlite::Error, - }, - #[error("cannot load safe inputs")] - LoadSafeInputs { + #[error(transparent)] + Storage(#[from] rusqlite::Error), + #[error("user op execution failed")] + ExecuteUserOp { #[source] - source: rusqlite::Error, - }, - #[error("cannot load/create open batch/frame")] - LoadOpenState { - #[source] - source: rusqlite::Error, - }, - #[error("append user ops failed")] - AppendUserOps { - #[source] - source: rusqlite::Error, + source: AppError, }, #[error("direct input execution failed")] ExecuteDirectInput { #[source] source: AppError, }, - #[error("failed to close/rotate frame")] - CloseFrameRotate { - #[source] - source: rusqlite::Error, - }, } #[derive(Debug, Error)] diff --git a/sequencer/src/ingress/inclusion_lane/mod.rs b/sequencer/src/ingress/inclusion_lane/mod.rs new file mode 100644 index 0000000..022376c --- /dev/null +++ b/sequencer/src/ingress/inclusion_lane/mod.rs @@ -0,0 +1,455 @@ +// (c) Cartesi and individual authors (see AUTHORS) +// SPDX-License-Identifier: Apache-2.0 (see LICENSE) + +//! Hot-path loop. The lane runs three layers of amortization on each iteration: +//! +//! - **Frontier check** (time-gated by `frontier_min_interval`): polls L1's +//! safe head; advances frame boundary if it moved. +//! - **Inner drain loop** (`run_inner_drain`): processes user-op chunks until +//! the queue empties or the batch hits its size target. +//! - **Per-chunk persistence** (`max_user_ops_per_chunk`): each chunk commits +//! in one SQL transaction, bounding ack latency for the first op in it. +//! +//! The lane is a single-thread `spawn_blocking` task. SQLite is the only +//! synchronization with other components (input reader, batch submitter). 
+ +mod catch_up; +mod config; +mod error; +mod types; + +#[cfg(test)] +mod tests; + +pub use config::InclusionLaneConfig; +pub use error::InclusionLaneError; +pub use types::{PendingUserOp, SequencerError}; + +use std::thread; +use std::time::{Duration, Instant, SystemTime}; + +use tokio::sync::mpsc; +use tokio::task::JoinHandle; + +use crate::runtime::shutdown::ShutdownSignal; +use crate::storage::{SafeInputRange, Storage, StoredSafeInput, WriteHead}; +use sequencer_core::application::{AppError, Application, ExecutionOutcome}; +use sequencer_core::l2_tx::DirectInput; +use sequencer_core::user_op::SignedUserOp; + +use catch_up::catch_up_application; + +/// Owns the application instance, the `Storage` write handle, and the user-op +/// receiver for the lifetime of the sequencer process. +pub struct InclusionLane { + rx: mpsc::Receiver, + shutdown: ShutdownSignal, + app: A, + storage: Storage, + config: InclusionLaneConfig, +} + +impl InclusionLane { + /// Spawn the lane on a blocking thread. Returns the input MPSC sender (for + /// the API to enqueue user ops) and the join handle (for the runtime to + /// observe lane shutdown). + /// + /// The handle resolves to `Ok(())` on graceful shutdown, or an + /// `InclusionLaneError` if the lane crashed. 
+ pub fn start( + queue_capacity: usize, + shutdown: ShutdownSignal, + app: A, + storage: Storage, + config: InclusionLaneConfig, + ) -> ( + mpsc::Sender, + JoinHandle>, + ) { + let (tx, rx) = mpsc::channel::(queue_capacity.max(1)); + let handle = tokio::task::spawn_blocking(move || { + let mut lane = Self { + rx, + shutdown, + app, + storage, + config, + }; + lane.run_forever() + }); + (tx, handle) + } + + fn run_forever(&mut self) -> Result<(), InclusionLaneError> { + self.run_catch_up()?; + let mut included = Vec::with_capacity(self.config.max_user_ops_per_chunk.max(1)); + let mut safe_inputs = Vec::with_capacity(self.config.safe_input_buffer_capacity.max(1)); + let mut lane_state = self.load_or_initialize_lane_state(&mut safe_inputs)?; + + loop { + if self.shutdown.is_shutdown_requested() { + self.reject_pending_user_ops_due_to_shutdown(); + return Ok(()); + } + + self.maybe_advance_safe_frontier(&mut lane_state, &mut safe_inputs)?; + let drain = self.run_inner_drain(&mut lane_state.head, &mut included)?; + + if drain.hit_batch_target() + || should_close_batch_by_time(&lane_state.head, &self.config) + { + let next_safe_block = lane_state.head.safe_block; + self.storage + .close_frame_and_batch(&mut lane_state.head, next_safe_block)?; + } else if !drain.drained_any() { + thread::sleep(self.config.idle_poll_interval); + } + } + } + + fn run_catch_up(&mut self) -> Result<(), InclusionLaneError> { + catch_up_application( + &mut self.app, + &mut self.storage, + self.config.batch_submitter_address, + ) + .map_err(|source| InclusionLaneError::CatchUp { source }) + } + + fn load_or_initialize_lane_state( + &mut self, + safe_inputs: &mut Vec, + ) -> Result { + let next_safe_input_index = self.storage.next_undrained_safe_input_index()?; + + let last_drained_direct_range = SafeInputRange::empty_at(next_safe_input_index); + if let Some(head) = self.storage.open_state()? 
{ + return Ok(LaneState::new(last_drained_direct_range, head)); + } + + let frontier = self.storage.safe_input_frontier()?; + assert!( + frontier.end_exclusive >= last_drained_direct_range.end(), + "safe-input head regressed during lane initialization: safe_end={}, next={}", + frontier.end_exclusive, + last_drained_direct_range.end() + ); + + let leading_direct_range = last_drained_direct_range.advance_to(frontier.end_exclusive); + self.execute_safe_inputs_range(leading_direct_range, safe_inputs)?; + let head = self + .storage + .initialize_open_state(frontier.safe_block, leading_direct_range)?; + + Ok(LaneState::new(leading_direct_range, head)) + } + + /// Drain user ops in chunks until the queue empties or we cross the batch + /// size target. Each chunk persists separately so ack latency stays bounded + /// by `max_user_ops_per_chunk`. + fn run_inner_drain( + &mut self, + head: &mut WriteHead, + included: &mut Vec, + ) -> Result { + let mut drained_any = false; + loop { + let (count, outcome) = self.process_user_op_chunk(head, included)?; + if count > 0 { + drained_any = true; + } + match outcome { + ChunkOutcome::QueueEmpty => { + return Ok(if drained_any { + DrainSummary::DrainedQueue + } else { + DrainSummary::Idle + }); + } + ChunkOutcome::HitBatchTarget => return Ok(DrainSummary::HitBatchTarget), + ChunkOutcome::MoreToProcess => continue, + } + } + } + + fn process_user_op_chunk( + &mut self, + head: &mut WriteHead, + included: &mut Vec, + ) -> Result<(usize, ChunkOutcome), InclusionLaneError> { + included.clear(); + let outcome = match dequeue_and_execute_user_op_chunk::( + &mut self.rx, + &mut self.app, + head.frame_fee, + self.config.max_user_ops_per_chunk.max(1), + head, + included, + ) { + Ok(outcome) => outcome, + Err(err) => { + Self::respond_internal_to_all(included, "application internal error".to_string()); + return Err(err); + } + }; + let included_count = included.len(); + + self.persist_included_user_ops(head, included)?; + + for item in 
included.drain(..) { + let _ = item.respond_to.send(Ok(())); + } + + Ok((included_count, outcome)) + } + + /// Time-gated to bound idle SQL load. High-throughput batches can delay + /// this past the gate, but a full batch is far less than 1s of work in + /// practice. + fn maybe_advance_safe_frontier( + &mut self, + lane_state: &mut LaneState, + safe_inputs: &mut Vec, + ) -> Result<(), InclusionLaneError> { + if !lane_state.frontier_check_due(self.config.frontier_min_interval) { + return Ok(()); + } + lane_state.mark_frontier_checked(); + + let frontier = self.storage.safe_input_frontier()?; + assert!( + frontier.end_exclusive >= lane_state.last_drained_direct_range.end(), + "safe-input head regressed: safe_end={}, next={}", + frontier.end_exclusive, + lane_state.last_drained_direct_range.end() + ); + if frontier.safe_block <= lane_state.head.safe_block { + return Ok(()); + } + + let leading_direct_range = lane_state + .last_drained_direct_range + .advance_to(frontier.end_exclusive); + self.execute_safe_inputs_range(leading_direct_range, safe_inputs)?; + self.storage.close_frame_only( + &mut lane_state.head, + frontier.safe_block, + leading_direct_range, + )?; + lane_state.last_drained_direct_range = leading_direct_range; + Ok(()) + } + + fn persist_included_user_ops( + &mut self, + head: &mut WriteHead, + included: &mut Vec, + ) -> Result<(), InclusionLaneError> { + self.storage + .append_user_ops_chunk(head, included.as_slice()) + .map_err(|err| { + Self::respond_internal_to_all(included, "internal storage error".to_string()); + InclusionLaneError::Storage(err) + }) + } + + fn execute_safe_inputs_range( + &mut self, + direct_range: SafeInputRange, + chunk: &mut Vec, + ) -> Result<(), InclusionLaneError> { + let max_chunk_len = self.config.safe_input_buffer_capacity.max(1) as u64; + for chunk_range in direct_range.chunks(max_chunk_len) { + self.storage.fill_safe_inputs(chunk_range, chunk)?; + self.execute_safe_inputs_chunk(chunk.as_slice())?; + } + Ok(()) + } + + 
fn execute_safe_inputs_chunk( + &mut self, + chunk: &[StoredSafeInput], + ) -> Result<(), InclusionLaneError> { + for input in chunk { + if input.sender == self.config.batch_submitter_address { + continue; + } + let direct_input = DirectInput { + sender: input.sender, + block_number: input.block_number, + payload: input.payload.clone(), + }; + + self.app + .execute_direct_input(&direct_input) + .map_err(|source| InclusionLaneError::ExecuteDirectInput { source })?; + } + Ok(()) + } + + fn respond_internal_to_all(pending: &mut Vec, message: String) { + for item in pending.drain(..) { + let _ = item + .respond_to + .send(Err(SequencerError::internal(message.clone()))); + } + } + + fn reject_pending_user_ops_due_to_shutdown(&mut self) { + while let Ok(item) = self.rx.try_recv() { + let _ = item + .respond_to + .send(Err(SequencerError::unavailable("sequencer shutting down"))); + } + } +} + +#[derive(Debug, PartialEq, Eq)] +enum DrainSummary { + /// Queue was empty; nothing was drained this pass. + Idle, + /// Drained the queue, no batch close needed (size-wise). + DrainedQueue, + /// Drained at least one op AND crossed the batch size target. + /// (`(false, true)` is unreachable: the size check fires only after a + /// successful execution, so `HitBatchTarget` always implies `drained_any`.) + HitBatchTarget, +} + +impl DrainSummary { + fn hit_batch_target(&self) -> bool { + matches!(self, Self::HitBatchTarget) + } + + fn drained_any(&self) -> bool { + !matches!(self, Self::Idle) + } +} + +#[derive(Debug, PartialEq, Eq)] +pub(super) enum ChunkOutcome { + /// Queue drained or sender disconnected with at least one op processed. + QueueEmpty, + /// Including the latest op pushed the batch over `max_batch_user_op_bytes`. + HitBatchTarget, + /// Hit `max_user_ops_per_chunk` cap; queue may still have more. 
+ MoreToProcess, +} + +fn should_close_batch_by_time(head: &WriteHead, config: &InclusionLaneConfig) -> bool { + let age = SystemTime::now() + .duration_since(head.batch_created_at) + .unwrap_or_default(); + age >= config.max_batch_open +} + +fn execute_user_op( + app: &mut impl Application, + item: PendingUserOp, + current_frame_fee: u16, + included: &mut Vec, +) -> Result<(), InclusionLaneError> { + match app.validate_and_execute_user_op( + item.signed.sender, + &item.signed.user_op, + current_frame_fee, + ) { + Ok(ExecutionOutcome::Included { .. }) => included.push(item), + Ok(ExecutionOutcome::Invalid(reason)) => { + let _ = item + .respond_to + .send(Err(SequencerError::invalid(reason.to_string()))); + } + Err(AppError::Internal { reason }) => { + let _ = item + .respond_to + .send(Err(SequencerError::internal(reason.clone()))); + return Err(InclusionLaneError::ExecuteUserOp { + source: AppError::Internal { reason }, + }); + } + } + Ok(()) +} + +/// Dequeue and execute up to `max_chunk` user ops, stopping early if the batch +/// would cross its size target. Returns the outcome that drove the stop. +/// +/// `head.batch_user_op_count` reflects already-persisted ops; `included.len()` +/// is the count we'd add by persisting now. When their sum's bytes equal or +/// exceed `head.max_batch_user_op_bytes`, we stop and the caller closes the +/// batch. 
+pub(super) fn dequeue_and_execute_user_op_chunk( + rx: &mut mpsc::Receiver, + app: &mut A, + current_frame_fee: u16, + max_chunk: usize, + head: &WriteHead, + included: &mut Vec, +) -> Result { + let mut executed = 0_usize; + + while executed < max_chunk { + match rx.try_recv() { + Ok(item) => { + execute_user_op(app, item, current_frame_fee, included)?; + executed = executed.saturating_add(1); + + let projected = head + .batch_user_op_count + .saturating_add(included.len() as u64); + if user_op_count_to_bytes::(projected) >= head.max_batch_user_op_bytes { + return Ok(ChunkOutcome::HitBatchTarget); + } + } + Err(mpsc::error::TryRecvError::Empty) => return Ok(ChunkOutcome::QueueEmpty), + Err(mpsc::error::TryRecvError::Disconnected) => { + if executed == 0 { + return Err(InclusionLaneError::ChannelClosed); + } + return Ok(ChunkOutcome::QueueEmpty); + } + } + } + + Ok(ChunkOutcome::MoreToProcess) +} + +fn user_op_count_to_bytes(user_op_count: u64) -> u64 { + let one_user_op_bytes = SignedUserOp::max_batch_metadata() + A::MAX_METHOD_PAYLOAD_BYTES; + user_op_count.saturating_mul(one_user_op_bytes as u64) +} + +/// Lane-local state threaded through every loop iteration. +/// +/// `head` and `last_drained_direct_range` stay in lockstep — every safe-frontier +/// advance updates both `head.safe_block` (persisted in the open frame) and +/// `last_drained_direct_range.end()` (in-memory drain cursor). +/// +/// `last_frontier_check` is the time gate's bookkeeping; `None` initially so +/// the first iteration always polls. 
+struct LaneState { + last_drained_direct_range: SafeInputRange, + head: WriteHead, + last_frontier_check: Option, +} + +impl LaneState { + fn new(last_drained_direct_range: SafeInputRange, head: WriteHead) -> Self { + Self { + last_drained_direct_range, + head, + last_frontier_check: None, + } + } + + fn frontier_check_due(&self, min_interval: Duration) -> bool { + self.last_frontier_check + .map(|t| t.elapsed() >= min_interval) + .unwrap_or(true) + } + + fn mark_frontier_checked(&mut self) { + self.last_frontier_check = Some(Instant::now()); + } +} diff --git a/sequencer/src/inclusion_lane/tests.rs b/sequencer/src/ingress/inclusion_lane/tests.rs similarity index 80% rename from sequencer/src/inclusion_lane/tests.rs rename to sequencer/src/ingress/inclusion_lane/tests.rs index 4778dcf..9f6069c 100644 --- a/sequencer/src/inclusion_lane/tests.rs +++ b/sequencer/src/ingress/inclusion_lane/tests.rs @@ -9,18 +9,18 @@ use std::time::{Duration, SystemTime}; use alloy_primitives::{Address, Signature, U256}; use app_core::application::MAX_METHOD_PAYLOAD_BYTES as WALLET_MAX_METHOD_PAYLOAD_BYTES; use rusqlite::params; -use tempfile::TempDir; use tokio::sync::{mpsc, oneshot}; -use crate::shutdown::ShutdownSignal; -use crate::storage::{SafeInputRange, Storage, StoredSafeInput}; +use crate::runtime::shutdown::ShutdownSignal; +use crate::storage::test_helpers::{default_protocol_config, temp_db}; +use crate::storage::{SafeInputRange, Storage, StoredSafeInput, WriteHead}; use sequencer_core::application::{AppError, AppOutputs, Application, InvalidReason}; use sequencer_core::l2_tx::{DirectInput, SequencedL2Tx, ValidUserOp}; use sequencer_core::user_op::{SignedUserOp, UserOp}; use super::catch_up::catch_up_application_paged; +use super::dequeue_and_execute_user_op_chunk; use super::error::CatchUpError; -use super::lane::dequeue_and_execute_user_op_chunk; use super::{InclusionLane, InclusionLaneConfig, InclusionLaneError, PendingUserOp}; #[derive(Default)] @@ -66,9 +66,33 @@ impl 
Application for TestApp { } } -struct TestDb { - _dir: TempDir, - path: String, +struct InternalUserOpApp; + +impl Application for InternalUserOpApp { + const MAX_METHOD_PAYLOAD_BYTES: usize = WALLET_MAX_METHOD_PAYLOAD_BYTES; + + fn current_user_nonce(&self, _sender: Address) -> u32 { + 0 + } + + fn current_user_balance(&self, _sender: Address) -> U256 { + U256::MAX + } + + fn validate_user_op( + &self, + _sender: Address, + _user_op: &UserOp, + _current_fee: u16, + ) -> Result<(), InvalidReason> { + Ok(()) + } + + fn execute_valid_user_op(&mut self, _user_op: &ValidUserOp) -> Result { + Err(AppError::Internal { + reason: "app invariant failed".to_string(), + }) + } } #[derive(Debug, Clone, PartialEq, Eq)] @@ -182,18 +206,6 @@ impl Application for ReplayRecordingApp { } } -fn temp_db(name: &str) -> TestDb { - let dir = tempfile::Builder::new() - .prefix(format!("sequencer-inclusion-lane-{name}-").as_str()) - .tempdir() - .expect("create temporary test directory"); - let path = dir.path().join("sequencer.sqlite"); - TestDb { - _dir: dir, - path: path.to_string_lossy().into_owned(), - } -} - fn default_test_config() -> InclusionLaneConfig { InclusionLaneConfig { batch_submitter_address: Address::from_slice(&[0xff; 20]), @@ -201,6 +213,9 @@ fn default_test_config() -> InclusionLaneConfig { safe_input_buffer_capacity: 16, max_batch_open: Duration::MAX, idle_poll_interval: Duration::from_millis(2), + // Tests should observe frontier changes immediately rather than wait + // for the production gate. 
+ frontier_min_interval: Duration::ZERO, } } @@ -212,16 +227,13 @@ async fn start_lane( ShutdownSignal, tokio::task::JoinHandle>, ) { - let storage = Storage::open(db_path, "NORMAL").expect("open storage"); + let storage = Storage::open(db_path).expect("open storage"); let shutdown = ShutdownSignal::default(); let (tx, handle) = InclusionLane::start(128, shutdown.clone(), TestApp::default(), storage, config); let initialized = wait_until(Duration::from_secs(2), || { - let mut storage = Storage::open(db_path, "NORMAL").expect("open storage"); - storage - .load_open_state() - .expect("load open state") - .is_some() + let mut storage = Storage::open(db_path).expect("open storage"); + storage.open_state().expect("load open state").is_some() }) .await; assert!(initialized, "lane should initialize its first open state"); @@ -238,7 +250,9 @@ fn make_pending_user_op( let (respond_to, recv) = oneshot::channel(); let user_op = UserOp { nonce: 0, - max_fee: 1, + // Must be >= the DB default recommended_fee (1060) to pass the + // protocol-level max_fee >= fee_price check in the trait default. 
+ max_fee: u16::MAX, data: vec![seed; 4].into(), }; ( @@ -256,7 +270,7 @@ fn make_pending_user_op( } fn seed_replay_fixture(db_path: &str) -> Vec { - let mut storage = Storage::open(db_path, "NORMAL").expect("open storage"); + let mut storage = Storage::open(db_path).expect("open storage"); let mut head = storage .initialize_open_state(0, SafeInputRange::empty_at(0)) .expect("initialize open state"); @@ -274,6 +288,7 @@ fn seed_replay_fixture(db_path: &str) -> Vec { payload: vec![0xaa], block_number: 10, }], + &default_protocol_config(), ) .expect("append first direct input"); storage @@ -292,6 +307,7 @@ fn seed_replay_fixture(db_path: &str) -> Vec { payload: vec![0xbb], block_number: 20, }], + &default_protocol_config(), ) .expect("append second direct input"); storage @@ -306,6 +322,7 @@ fn seed_replay_fixture(db_path: &str) -> Vec { payload: vec![0xcc], block_number: 30, }], + &default_protocol_config(), ) .expect("append third direct input"); storage @@ -344,14 +361,14 @@ fn seed_replay_fixture(db_path: &str) -> Vec { } fn read_count(db_path: &str, table: &str) -> i64 { - let conn = Storage::open_connection(db_path, "NORMAL").expect("open sqlite reader"); + let conn = Storage::open_connection(db_path).expect("open sqlite reader"); let sql = format!("SELECT COUNT(*) FROM {table}"); conn.query_row(sql.as_str(), [], |row| row.get(0)) .expect("count rows") } fn read_frame_direct_count(db_path: &str, batch_index: i64, frame_in_batch: i64) -> i64 { - let conn = Storage::open_connection(db_path, "NORMAL").expect("open sqlite reader"); + let conn = Storage::open_connection(db_path).expect("open sqlite reader"); conn.query_row( "SELECT COUNT(*) FROM sequenced_l2_txs WHERE batch_index = ?1 @@ -413,8 +430,7 @@ async fn ack_happens_after_chunk_commit_without_closing_frame() { async fn direct_inputs_close_frame_and_persist_drain() { let db = temp_db("directs-close-frame"); let (_tx, shutdown, lane_handle) = start_lane(db.path.as_str(), default_test_config()).await; - let 
mut feeder_storage = - Storage::open(db.path.as_str(), "NORMAL").expect("open feeder storage"); + let mut feeder_storage = Storage::open(db.path.as_str()).expect("open feeder storage"); feeder_storage .append_safe_inputs( @@ -424,6 +440,7 @@ async fn direct_inputs_close_frame_and_persist_drain() { payload: vec![0xaa], block_number: 10, }], + &default_protocol_config(), ) .expect("append safe direct input"); @@ -443,9 +460,9 @@ async fn sequenced_safe_inputs_are_drained_but_not_executed() { let db = temp_db("sequenced-safe-inputs-skip"); let batch_submitter_address = Address::from([0xfe; 20]); let executed_direct_inputs = Arc::new(AtomicU64::new(0)); - let storage = Storage::open(db.path.as_str(), "NORMAL").expect("open storage"); + let storage = Storage::open(db.path.as_str()).expect("open storage"); let shutdown = ShutdownSignal::default(); - let (tx, lane_handle) = InclusionLane::start( + let (_tx, lane_handle) = InclusionLane::start( 128, shutdown.clone(), SharedCountingApp { @@ -458,17 +475,13 @@ async fn sequenced_safe_inputs_are_drained_but_not_executed() { }, ); let initialized = wait_until(Duration::from_secs(2), || { - let mut storage = Storage::open(db.path.as_str(), "NORMAL").expect("open storage"); - storage - .load_open_state() - .expect("load open state") - .is_some() + let mut storage = Storage::open(db.path.as_str()).expect("open storage"); + storage.open_state().expect("load open state").is_some() }) .await; assert!(initialized, "lane should initialize open state"); - let mut feeder_storage = - Storage::open(db.path.as_str(), "NORMAL").expect("open feeder storage"); + let mut feeder_storage = Storage::open(db.path.as_str()).expect("open feeder storage"); feeder_storage .append_safe_inputs( 10, @@ -477,6 +490,7 @@ async fn sequenced_safe_inputs_are_drained_but_not_executed() { payload: vec![0xaa], block_number: 10, }], + &default_protocol_config(), ) .expect("append safe batch-submitter input"); @@ -484,7 +498,6 @@ async fn 
sequenced_safe_inputs_are_drained_but_not_executed() { read_frame_direct_count(db.path.as_str(), 0, 1) == 1 }) .await; - drop(tx); shutdown_lane(&shutdown, lane_handle).await; assert!( @@ -504,8 +517,7 @@ async fn direct_inputs_are_paginated_by_buffer_capacity() { let mut config = default_test_config(); config.safe_input_buffer_capacity = 2; let (_tx, shutdown, lane_handle) = start_lane(db.path.as_str(), config).await; - let mut feeder_storage = - Storage::open(db.path.as_str(), "NORMAL").expect("open feeder storage"); + let mut feeder_storage = Storage::open(db.path.as_str()).expect("open feeder storage"); let mut directs = Vec::new(); for index in 0..5_u64 { @@ -516,7 +528,7 @@ async fn direct_inputs_are_paginated_by_buffer_capacity() { }); } feeder_storage - .append_safe_inputs(10, directs.as_slice()) + .append_safe_inputs(10, directs.as_slice(), &default_protocol_config()) .expect("append safe direct inputs"); let drained = wait_until(Duration::from_secs(2), || { @@ -534,8 +546,7 @@ async fn direct_inputs_are_paginated_by_buffer_capacity() { async fn safe_inputs_already_available_are_sequenced_before_later_user_ops() { let db = temp_db("directs-before-later-userops"); let (tx, shutdown, lane_handle) = start_lane(db.path.as_str(), default_test_config()).await; - let mut feeder_storage = - Storage::open(db.path.as_str(), "NORMAL").expect("open feeder storage"); + let mut feeder_storage = Storage::open(db.path.as_str()).expect("open feeder storage"); feeder_storage .append_safe_inputs( @@ -545,6 +556,7 @@ async fn safe_inputs_already_available_are_sequenced_before_later_user_ops() { payload: vec![0xaa], block_number: 10, }], + &default_protocol_config(), ) .expect("append safe direct input"); @@ -564,11 +576,14 @@ async fn safe_inputs_already_available_are_sequenced_before_later_user_ops() { .expect("wait for ack") .expect("ack channel open"); - let replay = { - let mut storage = Storage::open(db.path.as_str(), "NORMAL").expect("open storage"); + let replay: Vec = 
{ + let mut storage = Storage::open(db.path.as_str()).expect("open storage"); storage - .load_ordered_l2_txs_from(0) + .ordered_l2_txs_page_from(0, 1_000_000) .expect("load ordered replay") + .into_iter() + .map(|(_offset, tx)| tx) + .collect() }; shutdown_lane(&shutdown, lane_handle).await; @@ -636,7 +651,7 @@ async fn batch_closes_when_max_user_op_bytes_is_reached() { // Set alpha high enough that batch_size_target ≤ one user op (126 bytes). // 55000*1000/(17000*26) = 124 bytes < 126. { - let mut storage = Storage::open(db.path.as_str(), "NORMAL").expect("open storage"); + let mut storage = Storage::open(db.path.as_str()).expect("open storage"); storage.set_alpha(17000, 1000).expect("set alpha"); } let config = default_test_config(); @@ -660,15 +675,32 @@ async fn batch_closes_when_max_user_op_bytes_is_reached() { assert_eq!(drain, 0); } +/// Test fixture: a `WriteHead` whose size budget is unbounded, so the early-stop +/// in `dequeue_and_execute_user_op_chunk` never triggers from the size check +/// alone. Tests that want to exercise the size check construct their own. 
+fn unbounded_head() -> WriteHead { + WriteHead { + batch_index: 0, + batch_created_at: SystemTime::now(), + frame_fee: 0, + safe_block: 0, + batch_user_op_count: 0, + open_frame_user_op_count: 0, + frame_in_batch: 0, + max_batch_user_op_bytes: u64::MAX, + } +} + #[test] fn dequeue_returns_channel_closed_when_disconnected() { let (tx, mut rx) = mpsc::channel::(1); drop(tx); let mut app = TestApp::default(); let mut included = Vec::new(); + let head = unbounded_head(); - let err = - dequeue_and_execute_user_op_chunk(&mut rx, &mut app, 1, 1, &mut included).unwrap_err(); + let err = dequeue_and_execute_user_op_chunk(&mut rx, &mut app, 1, 1, &head, &mut included) + .unwrap_err(); assert!(matches!(err, InclusionLaneError::ChannelClosed)); } @@ -681,16 +713,44 @@ fn dequeue_flushes_executed_ops_before_observing_disconnect() { let mut app = TestApp::default(); let mut included = Vec::new(); - dequeue_and_execute_user_op_chunk(&mut rx, &mut app, 1, 16, &mut included) + let head = unbounded_head(); + dequeue_and_execute_user_op_chunk(&mut rx, &mut app, 1, 16, &head, &mut included) .expect("should flush processed user ops before disconnect"); assert_eq!(included.len(), 1); } +#[test] +fn dequeue_returns_lane_error_when_app_reports_internal() { + let (tx, mut rx) = mpsc::channel::(1); + let (pending, recv) = make_pending_user_op(0x45); + tx.blocking_send(pending).expect("enqueue pending user op"); + + let mut app = InternalUserOpApp; + let mut included = Vec::new(); + let head = unbounded_head(); + let err = dequeue_and_execute_user_op_chunk(&mut rx, &mut app, 1, 16, &head, &mut included) + .expect_err("internal application error should stop the lane"); + + assert!(matches!(err, InclusionLaneError::ExecuteUserOp { .. 
})); + assert!( + included.is_empty(), + "internal errors must not leave an op ready to persist" + ); + let response = recv + .blocking_recv() + .expect("lane should respond to triggering op") + .expect_err("triggering op should receive internal error"); + assert!(matches!( + response, + super::SequencerError::Internal(message) if message == "app invariant failed" + )); +} + #[test] fn catch_up_replays_multiple_pages() { let db = temp_db("catch-up-multi-page"); let expected = seed_replay_fixture(db.path.as_str()); - let mut storage = Storage::open(db.path.as_str(), "NORMAL").expect("open storage"); + let mut storage = Storage::open(db.path.as_str()).expect("open storage"); let mut app = ReplayRecordingApp::default(); catch_up_application_paged(&mut app, &mut storage, Address::from([0xff; 20]), 2) @@ -704,7 +764,7 @@ fn catch_up_replays_multiple_pages() { fn catch_up_replays_from_storage_even_when_app_reports_executed_inputs() { let db = temp_db("catch-up-offset"); let expected = seed_replay_fixture(db.path.as_str()); - let mut storage = Storage::open(db.path.as_str(), "NORMAL").expect("open storage"); + let mut storage = Storage::open(db.path.as_str()).expect("open storage"); let mut app = ReplayRecordingApp::with_executed_input_count(3); catch_up_application_paged(&mut app, &mut storage, Address::from([0xff; 20]), 2) @@ -718,7 +778,7 @@ fn catch_up_replays_from_storage_even_when_app_reports_executed_inputs() { fn catch_up_handles_mixed_user_ops_and_direct_inputs_across_page_boundary() { let db = temp_db("catch-up-mixed-page-boundary"); let expected = seed_replay_fixture(db.path.as_str()); - let mut storage = Storage::open(db.path.as_str(), "NORMAL").expect("open storage"); + let mut storage = Storage::open(db.path.as_str()).expect("open storage"); let mut app = ReplayRecordingApp::default(); catch_up_application_paged(&mut app, &mut storage, Address::from([0xff; 20]), 4) @@ -730,8 +790,7 @@ fn 
catch_up_handles_mixed_user_ops_and_direct_inputs_across_page_boundary() { #[test] fn catch_up_load_error_reports_offset() { let db = temp_db("catch-up-load-error"); - let mut storage = - Storage::open_without_migrations(db.path.as_str(), "NORMAL").expect("open raw storage"); + let mut storage = Storage::open_without_migrations(db.path.as_str()).expect("open raw storage"); let mut app = ReplayRecordingApp::default(); let err = catch_up_application_paged(&mut app, &mut storage, Address::from([0xff; 20]), 2) diff --git a/sequencer/src/inclusion_lane/types.rs b/sequencer/src/ingress/inclusion_lane/types.rs similarity index 58% rename from sequencer/src/inclusion_lane/types.rs rename to sequencer/src/ingress/inclusion_lane/types.rs index 535dc89..b113db0 100644 --- a/sequencer/src/inclusion_lane/types.rs +++ b/sequencer/src/ingress/inclusion_lane/types.rs @@ -1,12 +1,17 @@ // (c) Cartesi and individual authors (see AUTHORS) // SPDX-License-Identifier: Apache-2.0 (see LICENSE) +//! Cross-module types: the unit of work the API hands the lane, and the +//! per-op outcome the lane sends back through the response channel. + use std::time::SystemTime; use sequencer_core::user_op::SignedUserOp; use thiserror::Error; use tokio::sync::oneshot; +/// A signed user op accepted by the API and queued for the inclusion lane. +/// The lane sends the inclusion outcome back through `respond_to`. #[derive(Debug)] pub struct PendingUserOp { pub signed: SignedUserOp, @@ -14,6 +19,11 @@ pub struct PendingUserOp { pub received_at: SystemTime, } +/// Per-op outcome reported back to the API caller via the response channel. +/// +/// - `Invalid` — application rejected the op (nonce mismatch, fee too low, etc.); maps to HTTP 4xx. +/// - `Unavailable` — sequencer can't currently accept (shutting down, queue full); maps to HTTP 503/429. +/// - `Internal` — bug or unrecoverable failure; maps to HTTP 500. 
#[derive(Debug, Error, Clone)] pub enum SequencerError { #[error("{0}")] diff --git a/sequencer/src/ingress/mod.rs b/sequencer/src/ingress/mod.rs new file mode 100644 index 0000000..3795ac2 --- /dev/null +++ b/sequencer/src/ingress/mod.rs @@ -0,0 +1,9 @@ +// (c) Cartesi and individual authors (see AUTHORS) +// SPDX-License-Identifier: Apache-2.0 (see LICENSE) + +//! Inbound side: HTTP submit endpoint and the inclusion lane that consumes its +//! queue. The submit API is the public-facing port; the lane is the only writer +//! of open batch/frame state in storage. + +pub mod api; +pub mod inclusion_lane; diff --git a/sequencer/src/input_reader/mod.rs b/sequencer/src/input_reader/mod.rs deleted file mode 100644 index 46fc0d9..0000000 --- a/sequencer/src/input_reader/mod.rs +++ /dev/null @@ -1,9 +0,0 @@ -// (c) Cartesi and individual authors (see AUTHORS) -// SPDX-License-Identifier: Apache-2.0 (see LICENSE) - -//! Reads safe InputBox inputs from a reference source (e.g. InputBox contract) and appends them -//! to sequencer storage. Minimal design: no epochs or consensus; flat contiguous indices only. - -mod reader; - -pub use reader::{InputReader, InputReaderConfig, InputReaderError}; diff --git a/sequencer/src/l1/mod.rs b/sequencer/src/l1/mod.rs new file mode 100644 index 0000000..9300410 --- /dev/null +++ b/sequencer/src/l1/mod.rs @@ -0,0 +1,11 @@ +// (c) Cartesi and individual authors (see AUTHORS) +// SPDX-License-Identifier: Apache-2.0 (see LICENSE) + +//! L1 client surface: reads InputBox events into storage (`reader`), submits +//! batches back out (`submitter`), and shares L1 utilities (`provider`, +//! `partition`). 
+ +pub mod partition; +pub mod provider; +pub mod reader; +pub mod submitter; diff --git a/sequencer/src/partition.rs b/sequencer/src/l1/partition.rs similarity index 100% rename from sequencer/src/partition.rs rename to sequencer/src/l1/partition.rs diff --git a/sequencer/src/l1/provider.rs b/sequencer/src/l1/provider.rs new file mode 100644 index 0000000..daef57a --- /dev/null +++ b/sequencer/src/l1/provider.rs @@ -0,0 +1,150 @@ +// (c) Cartesi and individual authors (see AUTHORS) +// SPDX-License-Identifier: Apache-2.0 (see LICENSE) + +use std::str::FromStr; +use std::time::Duration; + +use alloy::{ + providers::{DynProvider, Provider, ProviderBuilder}, + rpc::client::RpcClient, + signers::local::PrivateKeySigner, + transports::http::{Http, reqwest, reqwest::Url}, +}; +use alloy_transport::layers::RetryBackoffLayer; + +const REQUEST_TIMEOUT: Duration = Duration::from_secs(20); +const MAX_RATE_LIMIT_RETRIES: u32 = 5; +const INITIAL_BACKOFF_MS: u64 = 200; +const COMPUTE_UNITS_PER_SEC: u64 = 500; + +fn create_client(url: &str) -> Result { + let url = Url::parse(url).map_err(|e| format!("invalid RPC URL: {e}"))?; + + // Reject non-HTTPS for remote hosts to prevent accidental plaintext RPC. + // `url::Url::host_str` returns bracket-wrapped IPv6 literals (e.g. "[::1]"). + if url.scheme() != "https" && !is_loopback_host(url.host_str().unwrap_or("")) { + return Err(format!( + "remote RPC must use https, got {}://", + url.scheme() + )); + } + + let http_client = reqwest::Client::builder() + .timeout(REQUEST_TIMEOUT) + .build() + .map_err(|e| format!("failed to build HTTP client: {e}"))?; + + let transport = Http::with_client(http_client, url); + let is_local = transport.guess_local(); + + let retry = RetryBackoffLayer::new( + MAX_RATE_LIMIT_RETRIES, + INITIAL_BACKOFF_MS, + COMPUTE_UNITS_PER_SEC, + ); + + Ok(RpcClient::builder() + .layer(retry) + .transport(transport, is_local)) +} + +/// Check whether a URL host string refers to a loopback address. 
+/// +/// `url::Url::host_str` wraps IPv6 literals in brackets (e.g. `[::1]`), which +/// this helper normalizes alongside the IPv4 and DNS forms. +fn is_loopback_host(host: &str) -> bool { + matches!(host, "localhost" | "127.0.0.1" | "::1" | "[::1]") +} + +/// Create a read-only provider with retry and timeout. +pub fn create_provider(url: &str) -> Result { + let client = create_client(url)?; + let provider = ProviderBuilder::new().connect_client(client); + Ok(provider.erased()) +} + +/// Create a provider with a wallet signer, retry, and timeout. +pub fn create_signer_provider(url: &str, private_key: &str) -> Result { + let client = create_client(url)?; + let signer = + PrivateKeySigner::from_str(private_key).map_err(|_| "invalid private key".to_string())?; + let provider = ProviderBuilder::new().wallet(signer).connect_client(client); + Ok(provider.erased()) +} + +#[cfg(test)] +mod tests { + use super::*; + + // ── §8.5.2 / §8.5.3 — H4 regression: URL scheme enforcement ───────────── + + #[test] + fn create_client_rejects_http_for_remote_host() { + let err = create_client("http://mainnet.infura.io/v3/abc123") + .expect_err("http:// for remote host must be rejected"); + assert!( + err.contains("https"), + "error should explain https requirement, got: {err}" + ); + } + + #[test] + fn create_client_accepts_http_for_127_0_0_1() { + create_client("http://127.0.0.1:8545").expect("loopback http:// must be accepted"); + } + + #[test] + fn create_client_accepts_http_for_localhost() { + create_client("http://localhost:8545").expect("localhost http:// must be accepted"); + } + + #[test] + fn create_client_accepts_http_for_ipv6_loopback() { + create_client("http://[::1]:8545").expect("IPv6 loopback http:// must be accepted"); + } + + #[test] + fn create_client_accepts_https_for_remote_host() { + create_client("https://mainnet.infura.io/v3/abc123").expect("https:// must be accepted"); + } + + // ── §8.5.1 — H3 regression: private-key parse error must not echo bytes ─ + + 
#[test] + fn create_signer_provider_does_not_echo_key_bytes_on_invalid_hex() { + // A malformed key that would otherwise cause alloy's error Display to + // embed a character from the input. The fix replaced {e} with a fixed + // string. Assert the error is the fixed string exactly — not a prefix + // match — so a future change that re-adds interpolation is caught. + let bad_key = + "0xZZZZ_zzzz_ffff_ffff_ffff_ffff_ffff_ffff_ffff_ffff_ffff_ffff_ffff_ffff_ffff"; + let err = create_signer_provider("http://127.0.0.1:8545", bad_key) + .expect_err("malformed hex key must be rejected"); + assert_eq!( + err, "invalid private key", + "error message must be the fixed constant — no key bytes, no hex excerpt" + ); + // Belt-and-suspenders: no characters from the bad key should appear. + assert!( + !err.contains('Z') && !err.contains('z') && !err.contains('f'), + "error must not reflect any byte of the input key: {err}" + ); + } + + #[test] + fn create_signer_provider_does_not_echo_key_bytes_on_odd_length() { + // Odd-length hex would trigger a different error variant. Same + // invariant: fixed error message, no key bytes leaked. 
+ let bad_key = "0xabc"; + let err = create_signer_provider("http://127.0.0.1:8545", bad_key) + .expect_err("odd-length hex key must be rejected"); + assert_eq!(err, "invalid private key"); + } + + #[test] + fn create_signer_provider_accepts_valid_key() { + let good_key = "0xac0974bec39a17e36ba4a6b4d238ff944bacb478cbed5efcae784d7bf4f2ff80"; + create_signer_provider("http://127.0.0.1:8545", good_key) + .expect("valid key must be accepted"); + } +} diff --git a/sequencer/src/input_reader/reader.rs b/sequencer/src/l1/reader.rs similarity index 74% rename from sequencer/src/input_reader/reader.rs rename to sequencer/src/l1/reader.rs index b157f81..0dff6dd 100644 --- a/sequencer/src/input_reader/reader.rs +++ b/sequencer/src/l1/reader.rs @@ -1,6 +1,9 @@ // (c) Cartesi and individual authors (see AUTHORS) // SPDX-License-Identifier: Apache-2.0 (see LICENSE) +//! Reads safe InputBox events from L1 and appends them to sequencer storage. +//! Minimal design: no epochs or consensus; flat contiguous indices only. 
+ use std::time::Duration; use alloy::eips::BlockNumberOrTag::Safe; @@ -13,13 +16,12 @@ use cartesi_rollups_contracts::data_availability::DataAvailability::{ }; use cartesi_rollups_contracts::input_box::InputBox; use tokio::task::JoinHandle; -use tracing::{info, warn}; +use tracing::info; -use crate::partition::{decode_evm_advance_input, get_input_added_events}; -use crate::shutdown::ShutdownSignal; +use crate::l1::partition::{decode_evm_advance_input, get_input_added_events}; +use crate::runtime::shutdown::ShutdownSignal; use crate::storage::{Storage, StorageOpenError, StoredSafeInput}; - -const SQLITE_SYNCHRONOUS_PRAGMA: &str = "NORMAL"; +use sequencer_core::protocol::ProtocolConfig; #[derive(Debug, Clone)] pub struct InputReaderConfig { @@ -34,6 +36,8 @@ pub struct InputReaderConfig { pub enum InputReaderError { #[error("provider/transport: {0}")] Provider(String), + #[error("bootstrap: {0}")] + Bootstrap(String), #[error(transparent)] OpenStorage(#[from] StorageOpenError), #[error(transparent)] @@ -48,6 +52,9 @@ pub struct InputReader { genesis_block: u64, db_path: String, shutdown: ShutdownSignal, + /// Protocol config used to keep `safe_accepted_batches` consistent with + /// every `append_safe_inputs` write. 
+ protocol: ProtocolConfig, } impl InputReader { @@ -55,15 +62,16 @@ impl InputReader { db_path: impl Into, shutdown: ShutdownSignal, config: InputReaderConfig, + protocol: ProtocolConfig, ) -> Result { - let provider = crate::provider::create_provider(&config.rpc_url) - .map_err(InputReaderError::Provider)?; + let provider = crate::l1::provider::create_provider(&config.rpc_url) + .map_err(InputReaderError::Bootstrap)?; let application = Application::new(config.app_address, &provider); let data_availability = application .getDataAvailability() .call() .await - .map_err(|e| InputReaderError::Provider(e.to_string()))?; + .map_err(map_contract_bootstrap_error)?; let input_box_address = decode_input_box_address(&data_availability)?; let input_box = InputBox::new(input_box_address, &provider); @@ -71,10 +79,10 @@ impl InputReader { .getDeploymentBlockNumber() .call() .await - .map_err(|e| InputReaderError::Provider(e.to_string()))? + .map_err(map_contract_bootstrap_error)? .try_into() .map_err(|_| { - InputReaderError::Provider( + InputReaderError::Bootstrap( "input box deployment block number did not fit into u64".to_string(), ) })?; @@ -85,15 +93,17 @@ impl InputReader { genesis_block, db_path.into(), shutdown, + protocol, )) } - fn from_parts( + pub fn from_parts( config: InputReaderConfig, input_box_address: Address, genesis_block: u64, db_path: String, shutdown: ShutdownSignal, + protocol: ProtocolConfig, ) -> Self { Self { config, @@ -101,6 +111,7 @@ impl InputReader { genesis_block, db_path, shutdown, + protocol, } } @@ -113,23 +124,23 @@ impl InputReader { } pub fn start(self) -> Result>, StorageOpenError> { - let _ = Storage::open(self.db_path.as_str(), SQLITE_SYNCHRONOUS_PRAGMA)?; + let _ = Storage::open(self.db_path.as_str())?; Ok(tokio::spawn(async move { self.run_forever().await })) } pub async fn sync_to_current_safe_head(&mut self) -> Result<(), InputReaderError> { self.bootstrap_safe_head().await?; - let provider = 
crate::provider::create_provider(&self.config.rpc_url) - .map_err(InputReaderError::Provider)?; + let provider = crate::l1::provider::create_provider(&self.config.rpc_url) + .map_err(InputReaderError::Bootstrap)?; self.advance_once(&provider).await } async fn run_forever(mut self) -> Result<(), InputReaderError> { self.bootstrap_safe_head().await?; - let provider = crate::provider::create_provider(&self.config.rpc_url) - .map_err(InputReaderError::Provider)?; + let provider = crate::l1::provider::create_provider(&self.config.rpc_url) + .map_err(InputReaderError::Bootstrap)?; loop { if self.shutdown.is_shutdown_requested() { @@ -139,7 +150,7 @@ impl InputReader { match self.advance_once(&provider).await { Ok(()) => {} Err(InputReaderError::Provider(error)) => { - warn!(error, "input reader advance failed, will retry"); + tracing::error!(error, "L1 provider error in input reader — will retry"); } Err(err) => return Err(err), } @@ -159,8 +170,10 @@ impl InputReader { let previous_safe_block = self.current_safe_block().await?; // If our persisted safe head is already at the current safe frontier, - // there is nothing new to scan. + // there is nothing new to scan. We only seed the progress marker on the + // first real observation; subsequent same-head polls must not refresh it. 
if current_safe_block <= previous_safe_block { + self.initialize_safe_progress_if_unset().await?; return Ok(()); } @@ -217,7 +230,7 @@ impl InputReader { async fn current_safe_block(&self) -> Result { let db_path = self.db_path.clone(); tokio::task::spawn_blocking(move || { - let mut storage = Storage::open(&db_path, SQLITE_SYNCHRONOUS_PRAGMA)?; + let mut storage = Storage::open(&db_path)?; storage.current_safe_block().map_err(InputReaderError::from) }) .await @@ -228,7 +241,7 @@ impl InputReader { let db_path = self.db_path.clone(); let minimum_safe_block = self.genesis_block.saturating_sub(1); tokio::task::spawn_blocking(move || { - let mut storage = Storage::open(&db_path, SQLITE_SYNCHRONOUS_PRAGMA)?; + let mut storage = Storage::open(&db_path)?; storage .ensure_minimum_safe_block(minimum_safe_block) .map_err(InputReaderError::from) @@ -237,16 +250,29 @@ impl InputReader { .map_err(|err| InputReaderError::Join(err.to_string()))? } + async fn initialize_safe_progress_if_unset(&self) -> Result<(), InputReaderError> { + let db_path = self.db_path.clone(); + tokio::task::spawn_blocking(move || { + let mut storage = Storage::open(&db_path)?; + storage + .initialize_safe_progress_if_unset() + .map_err(InputReaderError::from) + }) + .await + .map_err(|err| InputReaderError::Join(err.to_string()))? 
+ } + async fn append_safe_inputs( &self, current_safe_block: u64, batch: Vec, ) -> Result<(), InputReaderError> { let db_path = self.db_path.clone(); + let protocol = self.protocol; tokio::task::spawn_blocking(move || { - let mut storage = Storage::open(&db_path, SQLITE_SYNCHRONOUS_PRAGMA)?; + let mut storage = Storage::open(&db_path)?; storage - .append_safe_inputs(current_safe_block, &batch) + .append_safe_inputs(current_safe_block, &batch, &protocol) .map_err(InputReaderError::from) }) .await @@ -256,7 +282,7 @@ impl InputReader { fn decode_input_box_address(data_availability: &[u8]) -> Result { let call = DataAvailabilityCalls::abi_decode(data_availability).map_err(|err| { - InputReaderError::Provider(format!( + InputReaderError::Bootstrap(format!( "application getDataAvailability returned invalid DataAvailability calldata: {err}" )) })?; @@ -267,12 +293,19 @@ fn decode_input_box_address(data_availability: &[u8]) -> Result Err(InputReaderError::Provider(format!( + }) => Err(InputReaderError::Bootstrap(format!( "application getDataAvailability returned unsupported DataAvailability.InputBoxAndEspresso(inputBox={inputBox}, fromBlock={fromBlock}, namespaceId={namespaceId})" ))), } } +fn map_contract_bootstrap_error(err: alloy::contract::Error) -> InputReaderError { + match err { + alloy::contract::Error::TransportError(_) => InputReaderError::Provider(err.to_string()), + _ => InputReaderError::Bootstrap(err.to_string()), + } +} + async fn latest_safe_block(provider: &impl Provider) -> Result { let block = provider .get_block(Safe.into()) @@ -289,6 +322,15 @@ mod tests { use alloy::sol_types::SolCall; use tempfile::NamedTempFile; + fn test_protocol() -> ProtocolConfig { + ProtocolConfig { + batch_submitter: Address::ZERO, + max_wait_blocks: sequencer_core::MAX_WAIT_BLOCKS, + preemptive_margin_blocks: 75, + seconds_per_block: 12, + } + } + fn test_reader( db_path: String, rpc_url: String, @@ -307,11 +349,21 @@ mod tests { genesis_block, db_path, shutdown, + 
test_protocol(), ) } - fn require_anvil_tests() -> bool { - std::env::var_os("RUN_ANVIL_TESTS").is_some() + /// Verify that `anvil` is available. Panics with a clear message if not found. + fn require_anvil() { + assert!( + std::process::Command::new("anvil") + .arg("--version") + .stdout(std::process::Stdio::null()) + .stderr(std::process::Stdio::null()) + .status() + .is_ok(), + "anvil not found on PATH — install Foundry (https://getfoundry.sh)" + ); } #[tokio::test] @@ -339,9 +391,7 @@ mod tests { #[tokio::test] async fn start_with_anvil_request_shutdown_then_join_returns_ok() { - if !require_anvil_tests() { - return; - } + require_anvil(); let anvil = Anvil::default().block_time(1).timeout(30_000).spawn(); let shutdown = ShutdownSignal::default(); @@ -369,9 +419,7 @@ mod tests { #[tokio::test] async fn advance_once_with_anvil_updates_safe_head_when_block_available() { - if !require_anvil_tests() { - return; - } + require_anvil(); let anvil = Anvil::default().block_time(1).timeout(30_000).spawn(); let db_file = NamedTempFile::new().expect("temp file"); @@ -390,11 +438,8 @@ mod tests { reader.advance_once(&provider).await.expect("advance_once"); let safe_block = reader.current_safe_block().await.expect("read safe block"); let safe_end = { - let mut storage = Storage::open( - db_file.path().to_string_lossy().as_ref(), - SQLITE_SYNCHRONOUS_PRAGMA, - ) - .expect("open storage"); + let mut storage = + Storage::open(db_file.path().to_string_lossy().as_ref()).expect("open storage"); storage.safe_input_end_exclusive().expect("safe end") }; assert_eq!(safe_end, 0, "no InputAdded contract so no direct inputs"); @@ -438,11 +483,8 @@ mod tests { assert!(matches!(result, Err(InputReaderError::Provider(_)))); - let mut storage = Storage::open( - db_file.path().to_string_lossy().as_ref(), - SQLITE_SYNCHRONOUS_PRAGMA, - ) - .expect("open storage"); + let mut storage = + Storage::open(db_file.path().to_string_lossy().as_ref()).expect("open storage"); assert_eq!( 
storage.current_safe_block().expect("read safe block"), genesis_block - 1 @@ -450,18 +492,49 @@ mod tests { } #[tokio::test] - async fn advance_once_when_safe_head_ahead_of_chain_is_no_op() { - if !require_anvil_tests() { - return; + async fn new_with_invalid_rpc_url_returns_bootstrap_error() { + let db_file = NamedTempFile::new().expect("temp file"); + + let result = InputReader::new( + db_file.path().to_string_lossy().into_owned(), + ShutdownSignal::default(), + InputReaderConfig { + rpc_url: "not-a-valid-url".to_string(), + app_address: Address::ZERO, + poll_interval: Duration::from_secs(1), + long_block_range_error_codes: Vec::new(), + }, + test_protocol(), + ) + .await; + + match result { + Err(InputReaderError::Bootstrap(_)) => {} + Err(other) => panic!("expected bootstrap error, got {other:?}"), + Ok(_) => panic!("invalid RPC URL should fail during bootstrap"), } + } + + #[tokio::test] + async fn advance_once_when_safe_head_ahead_of_chain_is_no_op() { + require_anvil(); let anvil = Anvil::default().block_time(1).timeout(30_000).spawn(); let db_file = NamedTempFile::new().expect("temp file"); let db_path = db_file.path().to_string_lossy().into_owned(); - let mut storage = Storage::open(&db_path, SQLITE_SYNCHRONOUS_PRAGMA).expect("open storage"); + let mut storage = Storage::open(&db_path).expect("open storage"); + let protocol = test_protocol(); storage - .append_safe_inputs(1000, &[]) + .append_safe_inputs(1000, &[], &protocol) .expect("set safe head ahead of chain"); + let recorded_sync = storage + .last_safe_progress_ms() + .expect("read safe-progress timestamp"); + assert!( + recorded_sync > 0, + "append_safe_inputs should stamp safe progress" + ); + drop(storage); let mut reader = test_reader( db_path, @@ -481,6 +554,16 @@ mod tests { 1000, "safe head should remain unchanged when already ahead of chain" ); + + let storage = + Storage::open(db_file.path().to_string_lossy().as_ref()).expect("re-open storage"); + assert_eq!( + storage + 
.last_safe_progress_ms() + .expect("read unchanged safe-progress timestamp"), + recorded_sync, + "same-head polls must not refresh the safe-progress marker" + ); } #[test] diff --git a/sequencer/src/batch_submitter/config.rs b/sequencer/src/l1/submitter/config.rs similarity index 53% rename from sequencer/src/batch_submitter/config.rs rename to sequencer/src/l1/submitter/config.rs index 6b0fd48..d7c2b74 100644 --- a/sequencer/src/batch_submitter/config.rs +++ b/sequencer/src/l1/submitter/config.rs @@ -3,9 +3,14 @@ use std::time::Duration; -/// Batch-submitter-specific options. L1 RPC URL and InputBox address are shared with the -/// input reader and come from the same discovery at startup (see `L1Config` in `config`). -/// These fields are parsed as part of `RunConfig` and passed through at runtime. +/// Batch-submitter-specific options. L1 RPC URL and InputBox address are shared +/// with the input reader and come from the same discovery at startup (see +/// `L1Config` in `config`). These fields are parsed as part of `RunConfig` and +/// passed through at runtime. +/// +/// Danger-zone tuning (`max_wait_blocks`, `preemptive_margin_blocks`, +/// `seconds_per_block`) lives in `ProtocolConfig`, not here — the submitter +/// doesn't read it. The [`crate::recovery::DangerDetector`] worker owns that. #[derive(Debug, Clone)] pub struct BatchSubmitterConfig { /// How often the submitter polls for new work when idle. diff --git a/sequencer/src/l1/submitter/mod.rs b/sequencer/src/l1/submitter/mod.rs new file mode 100644 index 0000000..7f53823 --- /dev/null +++ b/sequencer/src/l1/submitter/mod.rs @@ -0,0 +1,17 @@ +// (c) Cartesi and individual authors (see AUTHORS) +// SPDX-License-Identifier: Apache-2.0 (see LICENSE) + +//! Batch submitter: posts closed batches to L1 with at-least-once semantics. +//! +//! Each valid closed batch has a structural nonce (`batches.nonce`, set at +//! creation time as `parent.nonce + 1`). The scheduler checks that nonces are +//! 
strictly increasing and skips otherwise, so duplicates are deduplicated at +//! the scheduler level. See `worker` for the tick loop. + +mod config; +mod poster; +mod worker; + +pub use config::BatchSubmitterConfig; +pub use poster::{BatchPoster, BatchPosterConfig, BatchPosterError, EthereumBatchPoster, TxHash}; +pub use worker::{BatchSubmitter, BatchSubmitterError, SubmitterExit}; diff --git a/sequencer/src/l1/submitter/poster.rs b/sequencer/src/l1/submitter/poster.rs new file mode 100644 index 0000000..207d82a --- /dev/null +++ b/sequencer/src/l1/submitter/poster.rs @@ -0,0 +1,347 @@ +// (c) Cartesi and individual authors (see AUTHORS) +// SPDX-License-Identifier: Apache-2.0 (see LICENSE) + +use alloy::providers::{ + DynProvider, PendingTransactionBuilder, PendingTransactionConfig, PendingTransactionError, + Provider, +}; +use alloy::rpc::types::BlockNumberOrTag; +use async_trait::async_trait; +use cartesi_rollups_contracts::input_box::InputBox; +use sequencer_core::batch::Batch; +use thiserror::Error; +use tracing::{debug, info, warn}; + +use crate::l1::partition::{decode_evm_advance_input, get_input_added_events}; + +pub type TxHash = alloy_primitives::B256; + +#[derive(Debug, Clone)] +pub struct BatchPosterConfig { + pub l1_submit_address: alloy_primitives::Address, + pub app_address: alloy_primitives::Address, + pub batch_submitter_address: alloy_primitives::Address, + pub start_block: u64, + pub confirmation_depth: u64, + /// Assumed L1 block time in seconds, used to derive a conservative + /// confirmation timeout for watched batch-submission txs. + pub seconds_per_block: u64, + /// Error codes that trigger `get_logs` retries with a shorter block range. 
+ pub long_block_range_error_codes: Vec, +} + +#[derive(Debug, Error)] +pub enum BatchPosterError { + #[error("provider/transport: {0}")] + Provider(String), +} + +#[async_trait] +pub trait BatchPoster: Send + Sync { + async fn submit_batches(&self, payloads: Vec>) + -> Result, BatchPosterError>; + + async fn observed_submitted_batch_nonces( + &self, + from_block: u64, + ) -> Result, BatchPosterError>; +} + +#[derive(Clone)] +pub struct EthereumBatchPoster { + provider: DynProvider, + config: BatchPosterConfig, +} + +impl EthereumBatchPoster { + pub fn new(provider: DynProvider, config: BatchPosterConfig) -> Self { + Self { provider, config } + } + + /// Conservative upper-bound timeout for waiting on confirmations, derived + /// from the configured block time. Shorter block times on other chains just + /// make the watch complete sooner. + fn confirmation_timeout(&self) -> std::time::Duration { + derive_confirmation_timeout( + self.config.confirmation_depth, + self.config.seconds_per_block, + ) + } + + async fn latest_account_nonce(&self) -> Result { + self.provider + .get_transaction_count(self.config.batch_submitter_address) + .block_id(BlockNumberOrTag::Latest.into()) + .await + .map_err(|err| BatchPosterError::Provider(err.to_string())) + } + + async fn send_batch_at_nonce( + &self, + payload: Vec, + nonce: u64, + fees: &alloy::providers::utils::Eip1559Estimation, + ) -> Result, BatchPosterError> { + let input_box = InputBox::new(self.config.l1_submit_address, &self.provider); + input_box + .addInput(self.config.app_address, payload.into()) + .nonce(nonce) + .max_fee_per_gas(fees.max_fee_per_gas) + .max_priority_fee_per_gas(fees.max_priority_fee_per_gas) + .send() + .await + .map_err(|err| BatchPosterError::Provider(err.to_string())) + } + + /// Wait serially for each tx to reach `confirmation_depth + 1` confirmations. 
+ /// + /// **Serial is not a performance concession; it's correct.** Ethereum mines + /// transactions from a single EOA in strict wallet-nonce order: tx[k] cannot + /// land on-chain until tx[k-1] has landed. So: + /// + /// - If tx[0] times out, tx[1..] cannot have been mined either; watching + /// them is provably pointless. We return `Ok(())` early and let the next + /// tick retry the whole sequence. + /// - If tx[0] confirms, tx[1] was blocked only on tx[0] and is unblocked by + /// the time we start watching it. + /// + /// Timeouts return `Ok(())` rather than `Err` because the safe response is + /// "re-enter `submit_batches` on the next tick" — which re-estimates fees + /// (natural replacement bump) and re-submits at the same wallet nonces. The + /// wallet-nonce ordering invariant above guarantees we cannot accidentally + /// skip work by returning early here. + async fn wait_for_confirmations(&self, tx_hashes: &[TxHash]) -> Result<(), BatchPosterError> { + let timeout = self.confirmation_timeout(); + for tx_hash in tx_hashes { + let watch = PendingTransactionConfig::new(*tx_hash) + .with_required_confirmations(self.config.confirmation_depth.saturating_add(1)) + .with_timeout(Some(timeout)) + .with_provider(self.provider.root().clone()); + match watch.watch().await { + Ok(_) => { + info!( + %tx_hash, + confirmation_depth = self.config.confirmation_depth, + required_confirmations = self.config.confirmation_depth.saturating_add(1), + "batch submission confirmed on L1" + ); + } + Err(PendingTransactionError::TxWatcher( + alloy::providers::WatchTxError::Timeout, + )) => { + warn!( + %tx_hash, + confirmation_depth = self.config.confirmation_depth, + timeout_secs = timeout.as_secs(), + "timed out waiting for batch submission confirmations; next tick will retry under fresher state" + ); + return Ok(()); + } + Err(err) => return Err(BatchPosterError::Provider(err.to_string())), + } + } + + Ok(()) + } +} + +fn derive_confirmation_timeout( + confirmation_depth: 
u64, + seconds_per_block: u64, +) -> std::time::Duration { + let blocks_to_wait = confirmation_depth.saturating_add(1).saturating_mul(2); + std::time::Duration::from_secs(blocks_to_wait.saturating_mul(seconds_per_block)) +} + +#[async_trait] +impl BatchPoster for EthereumBatchPoster { + async fn submit_batches( + &self, + payloads: Vec>, + ) -> Result, BatchPosterError> { + if payloads.is_empty() { + return Ok(Vec::new()); + } + + let fees = self + .provider + .estimate_eip1559_fees() + .await + .map_err(|err| BatchPosterError::Provider(err.to_string()))?; + let mut next_nonce = self.latest_account_nonce().await?; + let mut tx_hashes = Vec::with_capacity(payloads.len()); + + for payload in payloads { + let pending = self.send_batch_at_nonce(payload, next_nonce, &fees).await?; + let tx_hash = *pending.tx_hash(); + debug!( + tx_nonce = next_nonce, + %tx_hash, + confirmation_depth = self.config.confirmation_depth, + "sent batch submission tx to L1" + ); + tx_hashes.push(tx_hash); + next_nonce = next_nonce.saturating_add(1); + } + + self.wait_for_confirmations(tx_hashes.as_slice()).await?; + Ok(tx_hashes) + } + + async fn observed_submitted_batch_nonces( + &self, + from_block: u64, + ) -> Result, BatchPosterError> { + let latest = self + .provider + .get_block_number() + .await + .map_err(|err| BatchPosterError::Provider(err.to_string()))?; + let start_block = from_block.max(self.config.start_block); + if start_block > latest { + return Ok(Vec::new()); + } + + let events = get_input_added_events( + &self.provider, + self.config.app_address, + &self.config.l1_submit_address, + start_block, + latest, + self.config.long_block_range_error_codes.as_slice(), + ) + .await + .map_err(|errs| { + BatchPosterError::Provider( + errs.into_iter() + .next() + .map(|e| e.to_string()) + .unwrap_or_default(), + ) + })?; + + let mut observed_nonces = Vec::new(); + for (event, _log) in events { + let evm_advance = decode_evm_advance_input(event.input.as_ref()) + 
.map_err(BatchPosterError::Provider)?; + if evm_advance.msgSender != self.config.batch_submitter_address { + continue; + } + let batch: Batch = ssz::Decode::from_ssz_bytes(evm_advance.payload.as_ref()) + .map_err(|err| BatchPosterError::Provider(format!("{err:?}")))?; + observed_nonces.push(batch.nonce); + } + + Ok(observed_nonces) + } +} + +#[cfg(test)] +pub(crate) mod mock { + use super::{Batch, BatchPoster, BatchPosterError, TxHash}; + use async_trait::async_trait; + use std::sync::Mutex; + + #[derive(Debug)] + pub struct MockBatchPoster { + pub submissions: Mutex>, + pub observed_submitted_nonces: Mutex>, + pub observed_submitted_error: Mutex>, + pub last_from_block: Mutex>, + } + + impl MockBatchPoster { + pub fn new() -> Self { + Self { + submissions: Mutex::new(Vec::new()), + observed_submitted_nonces: Mutex::new(Vec::new()), + observed_submitted_error: Mutex::new(None), + last_from_block: Mutex::new(None), + } + } + + pub fn submissions(&self) -> Vec<(u64, usize)> { + self.submissions.lock().expect("lock").clone() + } + + pub fn set_observed_submitted_nonces(&self, value: Vec) { + *self.observed_submitted_nonces.lock().expect("lock") = value; + } + + pub fn set_observed_submitted_error(&self, value: Option<&str>) { + *self.observed_submitted_error.lock().expect("lock") = value.map(str::to_string); + } + + pub fn last_from_block(&self) -> Option { + *self.last_from_block.lock().expect("lock") + } + } + + #[async_trait] + impl BatchPoster for MockBatchPoster { + async fn submit_batches( + &self, + payloads: Vec>, + ) -> Result, BatchPosterError> { + let mut tx_hashes = Vec::with_capacity(payloads.len()); + for payload in payloads { + let batch_index = ssz::Decode::from_ssz_bytes(payload.as_ref()) + .map(|b: Batch| b.nonce) + .unwrap_or(0); + self.submissions + .lock() + .expect("lock") + .push((batch_index, payload.len())); + tx_hashes.push(TxHash::ZERO); + } + Ok(tx_hashes) + } + + async fn observed_submitted_batch_nonces( + &self, + from_block: u64, + ) -> 
Result, BatchPosterError> { + *self.last_from_block.lock().expect("lock") = Some(from_block); + if let Some(err) = self.observed_submitted_error.lock().expect("lock").clone() { + return Err(BatchPosterError::Provider(err)); + } + let configured = self.observed_submitted_nonces.lock().expect("lock").clone(); + if !configured.is_empty() { + return Ok(configured); + } + Ok(self + .submissions + .lock() + .expect("lock") + .iter() + .map(|(idx, _)| *idx) + .collect()) + } + } +} + +#[cfg(test)] +mod tests { + use std::time::Duration; + + use super::{BatchPoster, derive_confirmation_timeout, mock::MockBatchPoster}; + + #[tokio::test] + async fn mock_poster_tracks_requested_suffix_start_block() { + let poster = MockBatchPoster::new(); + let observed = poster + .observed_submitted_batch_nonces(42) + .await + .expect("observe submitted batches"); + + assert!(observed.is_empty()); + assert_eq!(poster.last_from_block(), Some(42)); + } + + #[test] + fn confirmation_timeout_derives_from_seconds_per_block() { + assert_eq!(derive_confirmation_timeout(2, 12), Duration::from_secs(72)); + assert_eq!(derive_confirmation_timeout(2, 1), Duration::from_secs(6)); + assert_eq!(derive_confirmation_timeout(5, 3), Duration::from_secs(36)); + } +} diff --git a/sequencer/src/l1/submitter/worker.rs b/sequencer/src/l1/submitter/worker.rs new file mode 100644 index 0000000..d916ab8 --- /dev/null +++ b/sequencer/src/l1/submitter/worker.rs @@ -0,0 +1,524 @@ +// (c) Cartesi and individual authors (see AUTHORS) +// SPDX-License-Identifier: Apache-2.0 (see LICENSE) + +//! Batch submitter worker: stateless, at-least-once submission to L1. +//! +//! The submitter never observes danger — that is the [`crate::recovery::DangerDetector`] +//! worker's job. Each tick here is a pure "what pending work is left?" step: +//! +//! 1. Read the scheduler-accepted frontier (safe block + next-expected nonce) +//! from SQLite. Shared snapshot maintained by the input reader via +//! `append_safe_inputs`. +//! 2. 
Query L1 for batch submissions newer than the safe block; fold any +//! matching observed nonces to advance the local expected nonce past +//! already-mined submissions. +//! 3. Load every valid closed batch whose nonce is still past the advanced +//! frontier and submit them all in one shot. +//! +//! The outer loop is uniform: tick, maybe sleep, repeat. A tick that produced +//! submissions re-enters immediately (no sleep) so the suffix drains quickly; +//! an idle or transient-error tick sleeps `idle_poll_interval` before the next +//! attempt. +//! +//! Mid-tick cancellation is crash-safe: storage transactions either commit or +//! auto-roll-back on drop, and any already-sent L1 transaction is picked up by +//! the next startup's `observed_submitted_batch_nonces` scan. + +use std::sync::Arc; +use std::time::Duration; + +use thiserror::Error; +use tracing::{debug, error}; + +use crate::l1::submitter::{BatchPoster, BatchPosterError, BatchSubmitterConfig}; +use crate::runtime::shutdown::ShutdownSignal; +use crate::storage::{PendingBatch, Storage, StorageOpenError, SubmitterFrontier}; + +#[derive(Debug, Error)] +pub enum BatchSubmitterError { + #[error(transparent)] + OpenStorage(#[from] StorageOpenError), + #[error(transparent)] + Storage(#[from] rusqlite::Error), + #[error("batch submitter join error: {0}")] + Join(String), + #[error(transparent)] + Poster(#[from] BatchPosterError), +} + +/// How the submitter loop exited. +/// +/// There is only one deliberate exit path (shutdown). Danger detection lives +/// in the [`crate::recovery::DangerDetector`] worker; this type does not +/// concern itself with that signal. +#[derive(Debug)] +pub enum SubmitterExit { + /// Shutdown signal fired. + Shutdown, +} + +/// Outcome of one tick. Drives the outer loop's sleep cadence. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub(crate) enum TickOutcome { + /// Nothing pending; sleep before the next tick. 
+ Idle, + /// Submitted one or more batches; re-enter immediately so the suffix + /// drains without idle-sleep. + Submitted(usize), + /// Transient provider error; log and sleep before retrying. + Transient, +} + +/// Pure: given the current submitter frontier and the batch nonces we just +/// observed on L1 past that frontier, compute the nonce at which we should +/// start submitting the remaining suffix. When the observed list is empty +/// (nothing new on L1) the result is just `frontier.accepted_next_nonce`. +fn decide_submit_start(frontier: SubmitterFrontier, recently_observed_nonces: &[u64]) -> u64 { + // Fold observed nonces over the safe-accepted frontier to derive the next + // unresolved nonce. The scan starts at `safe_block + 1` (the submitter + // asks the poster for that), so wallet-nonce ordering guarantees the + // observed list mirrors our submission order. + advance_expected_batch_nonce( + frontier.accepted_next_nonce, + recently_observed_nonces.iter().copied(), + ) +} + +pub struct BatchSubmitter { + db_path: String, + poster: Arc

, + idle_poll_interval: Duration, + shutdown: ShutdownSignal, +} + +impl BatchSubmitter

{ + pub fn new( + db_path: impl Into, + poster: Arc

, + shutdown: ShutdownSignal, + config: BatchSubmitterConfig, + ) -> Self { + Self { + db_path: db_path.into(), + poster, + idle_poll_interval: config.idle_poll_interval(), + shutdown, + } + } + + pub fn start( + self, + ) -> Result>, StorageOpenError> + { + let _ = Storage::open_read_only(self.db_path.as_str())?; + Ok(tokio::spawn(async move { self.run_forever().await })) + } + + /// Top-level driver. Races the work loop against the shutdown signal. + async fn run_forever(self) -> Result { + tokio::select! { + biased; + _ = self.shutdown.wait_for_shutdown() => Ok(SubmitterExit::Shutdown), + result = self.run_loop() => result, + } + } + + /// Tick → sleep-if-idle → tick. Productive ticks re-enter immediately; + /// idle or transient-error ticks wait `idle_poll_interval`. Fatal errors + /// propagate. + async fn run_loop(&self) -> Result { + loop { + let outcome = match self.tick_once().await { + Ok(o) => o, + Err(BatchSubmitterError::Poster(source)) => { + error!(error = %source, "L1 provider error — will retry"); + TickOutcome::Transient + } + Err(fatal) => return Err(fatal), + }; + match outcome { + TickOutcome::Submitted(_) => continue, + TickOutcome::Idle | TickOutcome::Transient => { + tokio::time::sleep(self.idle_poll_interval).await; + } + } + } + } + + pub(crate) async fn tick_once(&self) -> Result { + let frontier = self.load_frontier().await?; + + // Must start scanning at `safe_block + 1`: after a danger-zone shutdown + // the flusher only returns once `Pending <= Safe`, so any wallet-nonce + // slots backed by blocks at or below the safe head are already + // resolved and folded into `accepted_next_nonce`. Re-scanning those + // blocks here would double-count the finalized prefix. 
+ let recent_observed = self + .poster + .observed_submitted_batch_nonces(frontier.safe_block.saturating_add(1)) + .await?; + + let from_nonce = decide_submit_start(frontier, &recent_observed); + let pending = self.pending_batches(from_nonce).await?; + if pending.is_empty() { + return Ok(TickOutcome::Idle); + } + + for batch in &pending { + debug!( + batch_index = batch.batch_index, + nonce = batch.nonce, + "queueing batch for L1 submission" + ); + } + let submitted_count = pending.len(); + let payloads: Vec> = pending.into_iter().map(|b| b.encoded).collect(); + let tx_hashes = self.poster.submit_batches(payloads).await?; + if tx_hashes.len() != submitted_count { + return Err(BatchSubmitterError::Poster(BatchPosterError::Provider( + format!( + "poster returned {} tx hashes for {submitted_count} submitted batches", + tx_hashes.len(), + ), + ))); + } + + Ok(TickOutcome::Submitted(submitted_count)) + } + + async fn load_frontier(&self) -> Result { + let db_path = self.db_path.clone(); + tokio::task::spawn_blocking(move || { + let mut storage = Storage::open_read_only(&db_path)?; + storage + .submitter_frontier() + .map_err(BatchSubmitterError::from) + }) + .await + .map_err(|err| BatchSubmitterError::Join(err.to_string()))? + } + + async fn pending_batches( + &self, + min_nonce: u64, + ) -> Result, BatchSubmitterError> { + let db_path = self.db_path.clone(); + tokio::task::spawn_blocking(move || { + let mut storage = Storage::open_read_only(&db_path)?; + storage + .pending_batches(min_nonce) + .map_err(BatchSubmitterError::from) + }) + .await + .map_err(|err| BatchSubmitterError::Join(err.to_string()))? + } +} + +/// Advance `expected` by greedily consuming any matching observed nonce. +/// +/// `observed_nonces` is the stream of **batch nonces** (from the SSZ payload) +/// decoded from `InputAdded` events sent by our batch-submitter EOA, in L1 +/// event order. 
Because L1 mines txs from a single EOA in strict wallet-nonce +/// order, this stream is naturally gap-less at the wallet-nonce level: +/// tx[k]'s event cannot appear on-chain without tx[k-1]'s event, and the +/// observed batch nonce sequence therefore mirrors our submission order. +/// +/// Batch nonces themselves (unlike wallet nonces) CAN repeat across recovery +/// generations — e.g., after a cascade, a fresh batch reuses its invalidated +/// predecessor's nonce. That's why we still match on equality rather than +/// trusting a sort: in a post-recovery window, the same batch nonce can be +/// observed twice (once from the invalidated generation, once from the new +/// one), and we only want to advance once. +/// +/// Under the wallet-nonce ordering above, once the next `expected` doesn't +/// appear in the stream the frontier naturally stops advancing — the gap +/// means the scheduler hasn't seen that nonce on-chain yet (or observed it at +/// a different wallet nonce from an earlier generation). +fn advance_expected_batch_nonce( + mut expected: u64, + observed_nonces: impl IntoIterator, +) -> u64 { + for nonce in observed_nonces { + if nonce == expected { + expected = expected.saturating_add(1); + } + } + expected +} + +#[cfg(test)] +mod tests { + use std::sync::Arc; + + use alloy_primitives::Address; + + use super::{TickOutcome, decide_submit_start}; + use crate::l1::submitter::{BatchSubmitterConfig, poster::mock::MockBatchPoster}; + use crate::runtime::shutdown::ShutdownSignal; + use crate::storage::test_helpers::{TestDb, temp_db}; + use crate::storage::{SafeInputRange, Storage, StoredSafeInput, SubmitterFrontier}; + use sequencer_core::protocol::ProtocolConfig; + + const BATCH_SUBMITTER_ADDRESS: Address = Address::repeat_byte(0x11); + + /// Protocol pinned to `BATCH_SUBMITTER_ADDRESS` — worker tests use that as + /// their test submitter, so populate sees the seeded safe_inputs. 
+ fn submitter_test_protocol() -> ProtocolConfig { + ProtocolConfig { + batch_submitter: BATCH_SUBMITTER_ADDRESS, + max_wait_blocks: sequencer_core::MAX_WAIT_BLOCKS, + preemptive_margin_blocks: 75, + seconds_per_block: 12, + } + } + + fn default_test_config() -> BatchSubmitterConfig { + BatchSubmitterConfig { + idle_poll_interval_ms: 1000, + } + } + + fn seed_two_closed_batches(db_path: &str) { + let mut storage = Storage::open(db_path).expect("open storage"); + let mut head = storage + .initialize_open_state(0, SafeInputRange::empty_at(0)) + .expect("initialize open state"); + let next_safe = head.safe_block; + storage + .close_frame_and_batch(&mut head, next_safe) + .expect("close batch 0"); + storage + .close_frame_and_batch(&mut head, next_safe) + .expect("close batch 1"); + storage + .close_frame_and_batch(&mut head, next_safe) + .expect("close batch 2"); + } + + fn seed_safe_submitted_batches(db_path: &str, safe_block: u64, nonces: &[u64]) { + let mut storage = Storage::open(db_path).expect("open storage"); + let inputs: Vec<_> = nonces + .iter() + .map(|nonce| StoredSafeInput { + sender: BATCH_SUBMITTER_ADDRESS, + payload: ssz::Encode::as_ssz_bytes(&sequencer_core::batch::Batch { + nonce: *nonce, + frames: Vec::new(), + }), + block_number: safe_block, + }) + .collect(); + storage + .append_safe_inputs(safe_block, inputs.as_slice(), &submitter_test_protocol()) + .expect("append safe submitted batches"); + } + + #[tokio::test] + async fn tick_once_submits_first_missing_closed_batch() { + let TestDb { _dir, path } = temp_db("tick-submits"); + seed_two_closed_batches(&path); + + let mock = Arc::new(MockBatchPoster::new()); + let submitter = super::BatchSubmitter::new( + path.clone(), + mock.clone(), + ShutdownSignal::default(), + default_test_config(), + ); + + let outcome = submitter.tick_once().await.expect("tick once"); + assert_eq!(outcome, TickOutcome::Submitted(3)); + + let submissions = mock.submissions(); + assert_eq!(submissions.len(), 3); + 
assert_eq!(submissions[0].0, 0); + assert_eq!(submissions[1].0, 1); + assert_eq!(submissions[2].0, 2); + } + + #[tokio::test] + async fn tick_once_submits_nothing_when_already_caught_up() { + let TestDb { _dir, path } = temp_db("tick-caught-up"); + seed_two_closed_batches(&path); + seed_safe_submitted_batches(&path, 10, &[0, 1]); + + let mock = Arc::new(MockBatchPoster::new()); + mock.set_observed_submitted_nonces(vec![2]); + let submitter = super::BatchSubmitter::new( + path.clone(), + mock.clone(), + ShutdownSignal::default(), + default_test_config(), + ); + + let outcome = submitter.tick_once().await.expect("tick once"); + assert_eq!(outcome, TickOutcome::Idle); + assert!(mock.submissions().is_empty()); + assert_eq!(mock.last_from_block(), Some(11)); + } + + #[tokio::test] + async fn tick_once_skips_already_submitted() { + let TestDb { _dir, path } = temp_db("tick-combines-prefix-and-suffix"); + seed_two_closed_batches(&path); + seed_safe_submitted_batches(&path, 10, &[0, 1, 2]); + + let mock = Arc::new(MockBatchPoster::new()); + let submitter = super::BatchSubmitter::new( + path.clone(), + mock.clone(), + ShutdownSignal::default(), + default_test_config(), + ); + + let outcome = submitter.tick_once().await.expect("tick once"); + assert_eq!(outcome, TickOutcome::Idle); + assert!(mock.submissions().is_empty()); + } + + #[tokio::test] + async fn tick_once_submits_only_missing_suffix_from_safe_frontier() { + let TestDb { _dir, path } = temp_db("tick-safe-frontier-suffix"); + seed_two_closed_batches(&path); + seed_safe_submitted_batches(&path, 10, &[0, 1]); + + let mock = Arc::new(MockBatchPoster::new()); + let submitter = super::BatchSubmitter::new( + path.clone(), + mock.clone(), + ShutdownSignal::default(), + default_test_config(), + ); + + let outcome = submitter.tick_once().await.expect("tick once"); + assert_eq!(outcome, TickOutcome::Submitted(1)); + assert_eq!(mock.last_from_block(), Some(11)); + + let submissions = mock.submissions(); + 
assert_eq!(submissions.len(), 1); + assert_eq!(submissions[0].0, 2); + } + + #[tokio::test] + async fn tick_once_replaces_from_latest_mined_prefix_not_safe_prefix() { + let TestDb { _dir, path } = temp_db("tick-latest-mined-prefix"); + seed_two_closed_batches(&path); + seed_safe_submitted_batches(&path, 10, &[0]); + + let mock = Arc::new(MockBatchPoster::new()); + mock.set_observed_submitted_nonces(vec![1]); + let submitter = super::BatchSubmitter::new( + path.clone(), + mock.clone(), + ShutdownSignal::default(), + default_test_config(), + ); + + let outcome = submitter.tick_once().await.expect("tick once"); + assert_eq!(outcome, TickOutcome::Submitted(1)); + assert_eq!(mock.last_from_block(), Some(11)); + + let submissions = mock.submissions(); + assert_eq!(submissions.len(), 1); + assert_eq!(submissions[0].0, 2); + } + + #[tokio::test] + async fn tick_once_propagates_poster_errors() { + let TestDb { _dir, path } = temp_db("tick-poster-error"); + seed_two_closed_batches(&path); + + let mock = Arc::new(MockBatchPoster::new()); + mock.set_observed_submitted_error(Some("rpc fail")); + let submitter = super::BatchSubmitter::new( + path, + mock, + ShutdownSignal::default(), + default_test_config(), + ); + + let err = submitter + .tick_once() + .await + .expect_err("poster error should propagate"); + assert!(matches!(err, super::BatchSubmitterError::Poster(_))); + } + + // ── decide_submit_start (pure) ──────────────────────────────────────── + + #[test] + fn decide_submit_start_advances_past_observed_prefix() { + let from_nonce = decide_submit_start( + SubmitterFrontier { + safe_block: 10, + accepted_next_nonce: 0, + }, + &[0, 1, 2], + ); + assert_eq!(from_nonce, 3); + } + + #[test] + fn decide_submit_start_stops_at_first_gap() { + let from_nonce = decide_submit_start( + SubmitterFrontier { + safe_block: 10, + accepted_next_nonce: 0, + }, + &[0, 2, 3], + ); + assert_eq!(from_nonce, 1); + } + + #[test] + fn decide_submit_start_handles_empty_observed_list() { + let 
from_nonce = decide_submit_start( + SubmitterFrontier { + safe_block: 10, + accepted_next_nonce: 5, + }, + &[], + ); + assert_eq!(from_nonce, 5); + } + + #[test] + fn decide_submit_start_advances_once_per_matching_nonce_across_recovery_generations() { + // Post-recovery scenario the `advance_expected_batch_nonce` doc calls + // out: batch nonces can repeat across recovery generations because a + // cascade re-uses the last valid ancestor's `nonce + 1`. The observed + // event stream can therefore contain the same batch nonce twice (once + // from the invalidated generation, once from the recovery generation). + // + // decide_submit_start must advance exactly ONCE per matching nonce — + // the second occurrence at a nonce that no longer equals `expected` is + // a no-op, as intended. The underlying fold is table-tested below; this + // pins the wrapper at the nonce-reuse case explicitly. + let from_nonce = decide_submit_start( + SubmitterFrontier { + safe_block: 10, + accepted_next_nonce: 2, + }, + // Two events reporting nonce=2 (one per generation), then nonce=3. + &[2, 2, 3], + ); + // 2 matches expected=2 → advance to 3. Second 2 doesn't match + // expected=3, skip. 3 matches → advance to 4. 
+ assert_eq!(from_nonce, 4); + } + + #[test] + fn advance_expected_batch_nonce_matches_scheduler_nonce_rule() { + assert_eq!(super::advance_expected_batch_nonce(0, Vec::::new()), 0); + assert_eq!(super::advance_expected_batch_nonce(0, vec![0, 1, 2]), 3); + assert_eq!(super::advance_expected_batch_nonce(0, vec![0, 2, 3]), 1); + assert_eq!(super::advance_expected_batch_nonce(0, vec![1, 2, 3]), 0); + assert_eq!(super::advance_expected_batch_nonce(0, vec![0, 1, 1, 2]), 3); + assert_eq!( + super::advance_expected_batch_nonce(0, vec![6, 4, 3, 2, 2, 0, 1]), + 2 + ); + assert_eq!(super::advance_expected_batch_nonce(0, vec![0, 2, 1]), 2); + assert_eq!(super::advance_expected_batch_nonce(2, vec![2, 3]), 4); + } +} diff --git a/sequencer/src/l2_tx_feed/mod.rs b/sequencer/src/l2_tx_feed/mod.rs deleted file mode 100644 index 7c78a45..0000000 --- a/sequencer/src/l2_tx_feed/mod.rs +++ /dev/null @@ -1,11 +0,0 @@ -// (c) Cartesi and individual authors (see AUTHORS) -// SPDX-License-Identifier: Apache-2.0 (see LICENSE) - -mod error; -mod feed; - -#[cfg(test)] -mod tests; - -pub use error::{SubscribeError, SubscriptionError}; -pub use feed::{BroadcastTxMessage, L2TxFeed, L2TxFeedConfig, Subscription}; diff --git a/sequencer/src/l2_tx_feed/tests.rs b/sequencer/src/l2_tx_feed/tests.rs deleted file mode 100644 index d93c8ed..0000000 --- a/sequencer/src/l2_tx_feed/tests.rs +++ /dev/null @@ -1,215 +0,0 @@ -// (c) Cartesi and individual authors (see AUTHORS) -// SPDX-License-Identifier: Apache-2.0 (see LICENSE) - -use std::time::{Duration, SystemTime}; - -use alloy_primitives::{Address, Signature}; -use tempfile::TempDir; -use tokio::sync::oneshot; - -use super::{BroadcastTxMessage, L2TxFeed, L2TxFeedConfig, SubscribeError}; -use crate::inclusion_lane::{PendingUserOp, SequencerError}; -use crate::shutdown::ShutdownSignal; -use crate::storage::{SafeInputRange, Storage, StoredSafeInput}; -use sequencer_core::l2_tx::{DirectInput, SequencedL2Tx, ValidUserOp}; -use 
sequencer_core::user_op::UserOp; - -#[test] -fn broadcast_user_op_serializes_with_hex_data() { - let msg = BroadcastTxMessage::from_offset_and_tx( - 7, - SequencedL2Tx::UserOp(ValidUserOp { - sender: Address::from_slice(&[0x11; 20]), - fee: 3, - data: vec![0xaa, 0xbb], - }), - ); - let json = serde_json::to_string(&msg).expect("serialize"); - assert!(json.contains("\"kind\":\"user_op\"")); - assert!(json.contains("\"offset\":7")); - assert!(json.contains("\"fee\":3")); - assert!(json.contains("\"data\":\"0xaabb\"")); -} - -#[test] -fn broadcast_direct_input_serializes_with_hex_payload() { - let msg = BroadcastTxMessage::from_offset_and_tx( - 9, - SequencedL2Tx::Direct(DirectInput { - sender: Address::ZERO, - block_number: 42, - payload: vec![0xcc, 0xdd], - }), - ); - let json = serde_json::to_string(&msg).expect("serialize"); - assert!(json.contains("\"kind\":\"direct_input\"")); - assert!(json.contains("\"offset\":9")); - assert!(json.contains("\"sender\":\"0x0000000000000000000000000000000000000000\"")); - assert!(json.contains("\"block_number\":42")); - assert!(json.contains("\"payload\":\"0xccdd\"")); -} - -#[tokio::test(flavor = "multi_thread", worker_threads = 2)] -async fn subscribe_from_rejects_catchup_window() { - let db = test_db("catchup-window"); - seed_ordered_txs(db.path.as_str()); - let feed = test_feed(db.path.as_str(), ShutdownSignal::default()); - - let result = feed.subscribe_from(0, 1); - - assert!(matches!( - result, - Err(SubscribeError::CatchUpWindowExceeded { - requested_offset: 0, - live_start_offset: 2, - max_catchup_events: 1, - }) - )); -} - -#[tokio::test(flavor = "multi_thread", worker_threads = 2)] -async fn subscription_replays_existing_rows_in_order() { - let db = test_db("replay-existing"); - seed_ordered_txs(db.path.as_str()); - let feed = test_feed(db.path.as_str(), ShutdownSignal::default()); - - let mut subscription = feed.subscribe_from(0, u64::MAX).expect("subscribe"); - - let first = 
tokio::time::timeout(Duration::from_secs(1), subscription.recv()) - .await - .expect("wait first event") - .expect("first event"); - let second = tokio::time::timeout(Duration::from_secs(1), subscription.recv()) - .await - .expect("wait second event") - .expect("second event"); - - assert_eq!(first.offset(), 0); - assert_eq!(second.offset(), 1); - - subscription.finish().await.expect("finish subscription"); -} - -#[tokio::test(flavor = "multi_thread", worker_threads = 2)] -async fn subscription_filters_batch_submitter_safe_inputs() { - let db = test_db("filters-batch-submitter-inputs"); - let batch_submitter_address = Address::from([0xfe; 20]); - seed_ordered_txs_with_sender(db.path.as_str(), batch_submitter_address); - let feed = L2TxFeed::new( - db.path.clone(), - ShutdownSignal::default(), - L2TxFeedConfig { - idle_poll_interval: Duration::from_millis(2), - page_size: 64, - batch_submitter_address: Some(batch_submitter_address), - }, - ); - - let mut subscription = feed.subscribe_from(0, u64::MAX).expect("subscribe"); - let first = tokio::time::timeout(Duration::from_secs(1), subscription.recv()) - .await - .expect("wait first event") - .expect("first event"); - - assert!(matches!( - first, - BroadcastTxMessage::UserOp { offset: 0, .. 
} - )); - - let no_second = tokio::time::timeout(Duration::from_millis(50), subscription.recv()).await; - assert!( - no_second.is_err(), - "filtered batch-submitter input should not be broadcast" - ); - - subscription.finish().await.expect("finish subscription"); -} - -#[tokio::test(flavor = "multi_thread", worker_threads = 2)] -async fn shutdown_signal_closes_subscription() { - let db = test_db("shutdown-closes"); - seed_ordered_txs(db.path.as_str()); - let shutdown = ShutdownSignal::default(); - let feed = test_feed(db.path.as_str(), shutdown.clone()); - - let mut subscription = feed.subscribe_from(u64::MAX, u64::MAX).expect("subscribe"); - - shutdown.request_shutdown(); - - assert!( - tokio::time::timeout(Duration::from_secs(1), subscription.recv()) - .await - .expect("wait for subscription close") - .is_none() - ); - subscription.finish().await.expect("clean shutdown"); -} - -fn test_feed(db_path: &str, shutdown: ShutdownSignal) -> L2TxFeed { - L2TxFeed::new( - db_path.to_string(), - shutdown, - L2TxFeedConfig { - idle_poll_interval: Duration::from_millis(2), - page_size: 64, - batch_submitter_address: None, - }, - ) -} - -fn test_db(label: &str) -> TestDb { - let dir = TempDir::new().expect("create temp dir"); - let path = dir.path().join(format!("{label}.db")); - TestDb { - _dir: dir, - path: path.to_string_lossy().into_owned(), - } -} - -fn seed_ordered_txs(db_path: &str) { - seed_ordered_txs_with_sender(db_path, Address::ZERO); -} - -fn seed_ordered_txs_with_sender(db_path: &str, direct_sender: Address) { - let mut storage = Storage::open(db_path, "NORMAL").expect("open storage"); - let mut head = storage - .initialize_open_state(0, SafeInputRange::empty_at(0)) - .expect("initialize open state"); - - let (respond_to, _recv) = oneshot::channel::>(); - let pending = PendingUserOp { - signed: sequencer_core::user_op::SignedUserOp { - sender: Address::from_slice(&[0x11; 20]), - signature: Signature::test_signature(), - user_op: UserOp { - nonce: 0, - max_fee: 
3, - data: vec![0x42].into(), - }, - }, - respond_to, - received_at: SystemTime::now(), - }; - - storage - .append_user_ops_chunk(&mut head, &[pending]) - .expect("append user-op chunk"); - storage - .append_safe_inputs( - 10, - &[StoredSafeInput { - sender: direct_sender, - payload: vec![0xaa], - block_number: 10, - }], - ) - .expect("append direct input"); - storage - .close_frame_only(&mut head, 10, SafeInputRange::new(0, 1)) - .expect("close frame with one drained direct input"); -} - -struct TestDb { - _dir: TempDir, - path: String, -} diff --git a/sequencer/src/lib.rs b/sequencer/src/lib.rs index edb38c0..a40c98d 100644 --- a/sequencer/src/lib.rs +++ b/sequencer/src/lib.rs @@ -3,19 +3,27 @@ //! Sequencer prototype focused on deterministic inclusion and replay. //! -//! Flow: API -> inclusion lane -> SQLite -> catch-up replay. -//! The inclusion lane is the single writer that defines execution order. -pub mod api; -pub mod batch_submitter; -pub mod config; -pub mod inclusion_lane; -pub mod input_reader; -pub mod l2_tx_feed; -pub mod partition; -pub mod provider; -mod runtime; -pub mod shutdown; +//! Top-level layout follows the system's data flow: +//! +//! - `ingress` — submit API + inclusion lane (write path from external clients) +//! - `egress` — subscribe API + L2-tx feed (read path to internal indexers) +//! - `l1` — input reader, batch submitter, L1 helpers +//! - `storage` — SQLite-backed persistence (organized by writer role) +//! - `recovery` — cascade invalidation + recovery batch +//! - `runtime` — orchestration, config, shutdown +//! - `http` — shared HTTP error type + axum::serve orchestration +//! +//! The inclusion lane is the single writer of open-batch state; this is the +//! invariant the storage layer relies on. 
+ +pub mod egress; +pub mod http; +pub mod ingress; +pub mod l1; +pub mod recovery; +pub mod runtime; pub mod storage; -pub use config::RunConfig; +pub use http::{ApiConfig, ApiError, WS_CATCHUP_WINDOW_EXCEEDED_REASON}; +pub use runtime::config::RunConfig; pub use runtime::{RunError, run}; diff --git a/sequencer/src/provider.rs b/sequencer/src/provider.rs deleted file mode 100644 index 40789d5..0000000 --- a/sequencer/src/provider.rs +++ /dev/null @@ -1,56 +0,0 @@ -// (c) Cartesi and individual authors (see AUTHORS) -// SPDX-License-Identifier: Apache-2.0 (see LICENSE) - -use std::str::FromStr; -use std::time::Duration; - -use alloy::{ - providers::{DynProvider, Provider, ProviderBuilder}, - rpc::client::RpcClient, - signers::local::PrivateKeySigner, - transports::http::{Http, reqwest, reqwest::Url}, -}; -use alloy_transport::layers::RetryBackoffLayer; - -const REQUEST_TIMEOUT: Duration = Duration::from_secs(20); -const MAX_RATE_LIMIT_RETRIES: u32 = 5; -const INITIAL_BACKOFF_MS: u64 = 200; -const COMPUTE_UNITS_PER_SEC: u64 = 500; - -fn create_client(url: &str) -> Result { - let url = Url::parse(url).map_err(|e| format!("invalid RPC URL: {e}"))?; - - let http_client = reqwest::Client::builder() - .timeout(REQUEST_TIMEOUT) - .build() - .map_err(|e| format!("failed to build HTTP client: {e}"))?; - - let transport = Http::with_client(http_client, url); - let is_local = transport.guess_local(); - - let retry = RetryBackoffLayer::new( - MAX_RATE_LIMIT_RETRIES, - INITIAL_BACKOFF_MS, - COMPUTE_UNITS_PER_SEC, - ); - - Ok(RpcClient::builder() - .layer(retry) - .transport(transport, is_local)) -} - -/// Create a read-only provider with retry and timeout. -pub fn create_provider(url: &str) -> Result { - let client = create_client(url)?; - let provider = ProviderBuilder::new().connect_client(client); - Ok(provider.erased()) -} - -/// Create a provider with a wallet signer, retry, and timeout. 
-pub fn create_signer_provider(url: &str, private_key: &str) -> Result { - let client = create_client(url)?; - let signer = - PrivateKeySigner::from_str(private_key).map_err(|e| format!("invalid private key: {e}"))?; - let provider = ProviderBuilder::new().wallet(signer).connect_client(client); - Ok(provider.erased()) -} diff --git a/sequencer/src/recovery/detector.rs b/sequencer/src/recovery/detector.rs new file mode 100644 index 0000000..a62a615 --- /dev/null +++ b/sequencer/src/recovery/detector.rs @@ -0,0 +1,313 @@ +// (c) Cartesi and individual authors (see AUTHORS) +// SPDX-License-Identifier: Apache-2.0 (see LICENSE) + +//! Runtime danger detector. +//! +//! A tiny background task that, every `poll_interval`, asks [`Storage::check_danger`] +//! whether any batch is past the preemptive threshold. If so, the task exits +//! with [`DetectorExit::DangerZone`] — the runtime turns that into a deliberate +//! non-error process shutdown, the orchestrator respawns, and +//! `run_preemptive_recovery` takes over on startup. +//! +//! This is its own worker (not part of the batch submitter) because the two +//! concerns are orthogonal: the submitter makes progress on L1, which involves +//! slow confirmations; the detector just reads the DB + wall clock at a fixed +//! cadence. Keeping them separate means one never delays the other, and each +//! stays a ~20-line state machine. +//! +//! Detection is eventually consistent with the input reader: a transition into +//! danger may lag by up to one `poll_interval`. The preemptive margin absorbs +//! this bounded lag. + +use std::time::Duration; + +use thiserror::Error; +use tracing::debug; + +use crate::runtime::clock::unix_now_ms; +use crate::runtime::shutdown::ShutdownSignal; +use crate::storage::{DangerStatus, Storage, StorageOpenError}; +use sequencer_core::protocol::ProtocolConfig; + +/// How the detector's loop exited. +/// +/// `DangerZone` is a *deliberate* exit — not an error. 
The runtime maps it to +/// a distinct `RunError` variant so operators can tell "time to recover" apart +/// from "something crashed". +#[derive(Debug)] +pub enum DetectorExit { + /// Shutdown signal fired before any danger was detected. + Shutdown, + /// The strict or wall-clock-adjusted check flagged a batch. Stop for + /// recovery. + DangerZone { batch_index: u64 }, +} + +#[derive(Debug, Error)] +pub enum DangerDetectorError { + #[error(transparent)] + OpenStorage(#[from] StorageOpenError), + #[error(transparent)] + Storage(#[from] rusqlite::Error), + #[error("danger detector join error: {0}")] + Join(String), +} + +pub struct DangerDetector { + db_path: String, + protocol: ProtocolConfig, + poll_interval: Duration, + shutdown: ShutdownSignal, +} + +impl DangerDetector { + pub fn new( + db_path: impl Into, + protocol: ProtocolConfig, + poll_interval: Duration, + shutdown: ShutdownSignal, + ) -> Self { + Self { + db_path: db_path.into(), + protocol, + poll_interval, + shutdown, + } + } + + pub fn start( + self, + ) -> Result>, StorageOpenError> + { + let _ = Storage::open_read_only(self.db_path.as_str())?; + Ok(tokio::spawn(async move { self.run_forever().await })) + } + + async fn run_forever(self) -> Result { + loop { + if self.shutdown.is_shutdown_requested() { + return Ok(DetectorExit::Shutdown); + } + + match self.check_once().await? { + DangerStatus::Safe => { + debug!("danger check: safe"); + } + DangerStatus::Strict(batch_index) | DangerStatus::Stalled(batch_index) => { + tracing::error!( + batch_index, + danger_threshold = self.protocol.danger_threshold(), + "danger zone detected — triggering shutdown for flush and recovery" + ); + return Ok(DetectorExit::DangerZone { batch_index }); + } + } + + tokio::select! 
{ + biased; + _ = self.shutdown.wait_for_shutdown() => return Ok(DetectorExit::Shutdown), + _ = tokio::time::sleep(self.poll_interval) => {} + } + } + } + + async fn check_once(&self) -> Result<DangerStatus, DangerDetectorError> { + let db_path = self.db_path.clone(); + let protocol = self.protocol; + let now_ms = unix_now_ms(); + tokio::task::spawn_blocking(move || { + let mut storage = Storage::open_read_only(&db_path)?; + storage + .check_danger(&protocol, now_ms) + .map_err(DangerDetectorError::from) + }) + .await + .map_err(|err| DangerDetectorError::Join(err.to_string()))? + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::storage::test_helpers::{SENDER_A, temp_db}; + use crate::storage::{SafeInputRange, Storage, StoredSafeInput}; + use std::time::Duration; + + fn test_protocol() -> ProtocolConfig { + ProtocolConfig { + batch_submitter: SENDER_A, + max_wait_blocks: 1200, + preemptive_margin_blocks: 75, + seconds_per_block: 12, + } + } + + fn make_stale_batch_payload(nonce: u64, safe_block: u64) -> Vec<u8> { + ssz::Encode::as_ssz_bytes(&sequencer_core::batch::Batch { + nonce, + frames: vec![sequencer_core::batch::Frame { + user_ops: Vec::new(), + safe_block, + fee_price: 0, + }], + }) + } + + #[tokio::test] + async fn exits_on_shutdown_when_safe() { + let db = temp_db("detector-shutdown"); + let mut storage = Storage::open(&db.path).expect("open storage"); + storage + .initialize_open_state(10, SafeInputRange::empty_at(0)) + .expect("initialize"); + drop(storage); + + let shutdown = ShutdownSignal::default(); + let detector = DangerDetector::new( + db.path.clone(), + test_protocol(), + Duration::from_millis(50), + shutdown.clone(), + ); + let handle = detector.start().expect("start detector"); + + tokio::time::sleep(Duration::from_millis(20)).await; + shutdown.request_shutdown(); + let exit = tokio::time::timeout(Duration::from_secs(2), handle) + .await + .expect("detector exits within timeout") + .expect("join") + .expect("detector result"); + assert!(matches!(exit,
DetectorExit::Shutdown)); + } + + #[tokio::test] + async fn exits_with_danger_zone_when_strict_check_fires() { + // Closed frontier batch is aged past `danger_threshold` against the + // observed safe block — the strict arm of `check_danger` trips. + let db = temp_db("detector-strict-danger"); + let mut storage = Storage::open(&db.path).expect("open storage"); + let mut head = storage + .initialize_open_state(10, SafeInputRange::empty_at(0)) + .expect("initialize"); + storage + .close_frame_and_batch(&mut head, 10) + .expect("close batch 0"); + storage + .close_frame_and_batch(&mut head, 10) + .expect("close batch 1"); + + let protocol = test_protocol(); + storage + .append_safe_inputs( + 1135, + &[StoredSafeInput { + sender: SENDER_A, + payload: make_stale_batch_payload(0, 10), + block_number: 20, + }], + &protocol, + ) + .expect("append"); + drop(storage); + + let shutdown = ShutdownSignal::default(); + let detector = DangerDetector::new( + db.path.clone(), + protocol, + Duration::from_millis(50), + shutdown, + ); + let handle = detector.start().expect("start detector"); + + let exit = tokio::time::timeout(Duration::from_secs(2), handle) + .await + .expect("detector exits within timeout") + .expect("join") + .expect("detector result"); + match exit { + DetectorExit::DangerZone { batch_index } => { + assert_eq!(batch_index, 1, "closed frontier batch 1 is in danger"); + } + other => panic!("expected DangerZone, got {other:?}"), + } + } + + #[tokio::test] + async fn exits_with_danger_zone_when_wall_clock_fallback_fires() { + // Safe head appears frozen — the strict block-based arm wouldn't trip + // (ages look fine against the last observed safe block), but the + // wall-clock-adjusted check infers extended L1 silence and lowers the + // effective threshold. 
+ // + // The detector treats Strict and Stalled identically (both exit with + // DangerZone), but the Stalled path goes through `wall_clock_adjusted_threshold` + // — a completely separate code path that deserves its own test. + let db = temp_db("detector-stalled-danger"); + let mut storage = Storage::open(&db.path).expect("open storage"); + let mut head = storage + .initialize_open_state(100, SafeInputRange::empty_at(0)) + .expect("initialize"); + storage + .close_frame_and_batch(&mut head, 100) + .expect("close batch 0"); + storage + .close_frame_and_batch(&mut head, 100) + .expect("close batch 1"); + + let protocol = test_protocol(); + storage + .append_safe_inputs( + 1200, + &[StoredSafeInput { + sender: SENDER_A, + payload: make_stale_batch_payload(0, 100), + block_number: 200, + }], + &protocol, + ) + .expect("append accepted batch 0"); + + // Strict check: batch 1's first_frame_safe_block = 100, current safe = 1200. + // age = 1100 < danger_threshold (1125). Strict would NOT fire. + // + // Rewind synced_at_ms by 25 blocks' worth of wall-clock time so the + // wall-clock arm shaves 25 off the threshold (1125 → 1100). At 1100, + // batch 1's age = 1100 trips `>=`. Stalled fires. 
+ let now_ms = crate::runtime::clock::unix_now_ms(); + drop(storage); + let rewind_conn = + Storage::open_connection(&db.path).expect("open raw connection to rewind synced_at_ms"); + rewind_conn + .execute( + "UPDATE l1_safe_head SET synced_at_ms = ?1 WHERE singleton_id = 0", + [i64::try_from(now_ms.saturating_sub(25 * 12 * 1000)).unwrap_or(i64::MAX)], + ) + .expect("rewind safe-progress timestamp"); + drop(rewind_conn); + + let shutdown = ShutdownSignal::default(); + let detector = DangerDetector::new( + db.path.clone(), + protocol, + Duration::from_millis(50), + shutdown, + ); + let handle = detector.start().expect("start detector"); + + let exit = tokio::time::timeout(Duration::from_secs(2), handle) + .await + .expect("detector exits within timeout") + .expect("join") + .expect("detector result"); + match exit { + DetectorExit::DangerZone { batch_index } => { + assert_eq!( + batch_index, 1, + "wall-clock-adjusted check must report the same batch the strict arm would", + ); + } + other => panic!("expected DangerZone from wall-clock fallback, got {other:?}"), + } + } +} diff --git a/sequencer/src/recovery/flusher.rs b/sequencer/src/recovery/flusher.rs new file mode 100644 index 0000000..3ce5624 --- /dev/null +++ b/sequencer/src/recovery/flusher.rs @@ -0,0 +1,676 @@ +// (c) Cartesi and individual authors (see AUTHORS) +// SPDX-License-Identifier: Apache-2.0 (see LICENSE) + +//! Mempool flusher: submits no-op transactions to resolve pending wallet-nonce slots +//! before recovery runs. +//! +//! After a danger-zone detection, the sequencer goes offline and calls +//! [`MempoolFlusher::flush_and_wait`] to ensure every `w_nonce` slot is consumed +//! (either by its original batch transaction or by a replacement no-op). Once all +//! slots reach safe finality, the recovery procedure can read fully-finalized L1 state. 
+ +use alloy::network::TransactionBuilder; +use alloy::providers::{ + DynProvider, PendingTransactionConfig, PendingTransactionError, Provider, WatchTxError, +}; +use alloy::rpc::types::BlockNumberOrTag; +use alloy_primitives::{Address, B256, U256}; +use std::time::Duration; +use thiserror::Error; +use tracing::{debug, error, info}; + +#[derive(Debug, Error)] +pub enum FlushError { + #[error("provider/transport: {0}")] + Provider(String), +} + +pub struct MempoolFlusher { + provider: DynProvider, + address: Address, + confirmation_timeout: Duration, + safe_poll_interval: Duration, +} + +/// Derive the flusher's watch/poll durations from the configured block time. +/// +/// `confirmation_timeout` is 10 blocks — long enough to survive one-off L1 +/// stalls but short enough to retry within a reasonable window. +/// `safe_poll_interval` is one block — matches the natural cadence for +/// `get_transaction_count(Safe)` to advance. +/// +/// H6 regression: both values must scale with `SEQ_SECONDS_PER_BLOCK`; a fixed +/// 12s assumption would mis-pace on non-mainnet chains. +fn derive_timeouts(seconds_per_block: u64) -> (Duration, Duration) { + ( + Duration::from_secs(10 * seconds_per_block), + Duration::from_secs(seconds_per_block), + ) +} + +/// Bump base 1559 fees to satisfy Ethereum's transaction replacement rule +/// (EIP-1559 §Replacement, ≥10% bump on both `max_fee_per_gas` and +/// `max_priority_fee_per_gas`). +/// +/// H5 regression: a replacement no-op must out-bid any pending batch tx at the +/// same nonce to guarantee slot consumption. The `+ 1` on `max_fee` handles the +/// edge case where `base * 11 / 10` equals `base` after integer +/// rounding (e.g. base = 1); the priority doubling is generous but preserves the invariant.
+fn bumped_replacement_fees(base_max_fee: u128, base_priority_fee: u128) -> (u128, u128) { + let new_max_fee = base_max_fee.saturating_mul(11) / 10 + 1; + let new_priority_fee = base_priority_fee.saturating_mul(2).max(1); + (new_max_fee, new_priority_fee) +} + +fn send_failures_error(failures: &[(u64, String)]) -> FlushError { + const MAX_SAMPLES: usize = 3; + + let samples = failures + .iter() + .take(MAX_SAMPLES) + .map(|(nonce, message)| format!("nonce {nonce}: {message}")) + .collect::<Vec<_>>() + .join("; "); + let remaining = failures.len().saturating_sub(MAX_SAMPLES); + let suffix = if remaining == 0 { + String::new() + } else { + format!("; ... and {remaining} more") + }; + + FlushError::Provider(format!( + "failed to submit {} flush no-op transaction(s): {samples}{suffix}", + failures.len() + )) +} + +fn map_watch_error(err: PendingTransactionError) -> Result<bool, FlushError> { + match err { + PendingTransactionError::TxWatcher(WatchTxError::Timeout) => Ok(false), + other => Err(FlushError::Provider(other.to_string())), + } +} + +impl MempoolFlusher { + pub fn new(provider: DynProvider, address: Address, seconds_per_block: u64) -> Self { + let (confirmation_timeout, safe_poll_interval) = derive_timeouts(seconds_per_block); + Self { + provider, + address, + confirmation_timeout, + safe_poll_interval, + } + } + + #[cfg(test)] + fn with_timeouts( + mut self, + confirmation_timeout: Duration, + safe_poll_interval: Duration, + ) -> Self { + self.confirmation_timeout = confirmation_timeout; + self.safe_poll_interval = safe_poll_interval; + self + } + + /// Flush the mempool by submitting no-op transactions for all pending nonce slots, + /// then waiting for safe finality on all of them. + /// + /// The loop runs until `get_transaction_count(Pending) <= get_transaction_count(Safe)`, + /// meaning every slot has reached safe finality. + /// + /// At each iteration: + /// 1. Submit 0-ETH self-transfers for nonces between `Latest` and `Pending`.
+ /// These compete with any batch transactions still in the mempool. + /// 2. Watch each submitted tx for L1 inclusion (same pattern as batch poster). + /// 3. Sleep to let the safe head advance, then re-check the loop condition. + /// 4. If any watch times out, retry the outer loop (tx may have been dropped). + pub async fn flush_and_wait(&self) -> Result<(), FlushError> { + let mut attempt = 0u32; + loop { + let safe_nonce = self.nonce_at(BlockNumberOrTag::Safe).await?; + let pending_nonce = self.nonce_at(BlockNumberOrTag::Pending).await?; + + if pending_nonce <= safe_nonce { + info!( + safe_nonce, + "mempool flush complete — all slots reached safe finality" + ); + return Ok(()); + } + + let unresolved = pending_nonce - safe_nonce; + + if attempt == 0 { + info!( + safe_nonce, + pending_nonce, + unresolved, + "flushing mempool: submitting no-ops for unresolved w_nonce slots" + ); + } else { + // Retry after a previous timeout — re-print status so operators + // see the current state without scrolling back. + error!( + attempt, + safe_nonce, + pending_nonce, + unresolved, + "flush retry: previous attempt timed out, resubmitting" + ); + } + attempt += 1; + + // Submit no-ops for nonces between Latest and Pending. + let latest_nonce = self.nonce_at(BlockNumberOrTag::Latest).await?; + let tx_hashes = self.submit_noops(latest_nonce, pending_nonce).await?; + + // Watch each submitted tx for L1 inclusion. + if !self.watch_txs(&tx_hashes).await? { + continue; + } + + // Sleep to let the safe head catch up before re-checking. + tokio::time::sleep(self.safe_poll_interval).await; + } + } + + /// Submit 0-ETH self-transfers for nonces `from_nonce..to_nonce`. + /// Returns the tx hashes of successfully submitted transactions. 
+ async fn submit_noops(&self, from_nonce: u64, to_nonce: u64) -> Result, FlushError> { + if from_nonce >= to_nonce { + return Ok(Vec::new()); + } + + let fees = self + .provider + .estimate_eip1559_fees() + .await + .map_err(|e| FlushError::Provider(e.to_string()))?; + + let (bumped_max_fee, bumped_priority_fee) = + bumped_replacement_fees(fees.max_fee_per_gas, fees.max_priority_fee_per_gas); + + debug!( + from_nonce, + to_nonce, + count = to_nonce - from_nonce, + max_fee_per_gas = bumped_max_fee, + max_priority_fee = bumped_priority_fee, + "submitting flush no-ops" + ); + + let mut tx_hashes = Vec::new(); + let mut send_failures = Vec::new(); + for nonce in from_nonce..to_nonce { + let tx = alloy::rpc::types::TransactionRequest::default() + .with_to(self.address) + .with_value(U256::ZERO) + .with_nonce(nonce) + .with_max_fee_per_gas(bumped_max_fee) + .with_max_priority_fee_per_gas(bumped_priority_fee); + + match self.provider.send_transaction(tx).await { + Ok(pending) => { + let tx_hash = *pending.tx_hash(); + debug!(nonce, %tx_hash, "flush no-op submitted"); + tx_hashes.push(tx_hash); + } + Err(e) => { + let message = e.to_string(); + error!(nonce, error = %message, "flush no-op send failed"); + send_failures.push((nonce, message)); + } + } + } + + if !send_failures.is_empty() { + return Err(send_failures_error(send_failures.as_slice())); + } + + Ok(tx_hashes) + } + + /// Watch submitted transactions for L1 inclusion. + /// Uses the same `PendingTransactionConfig::watch` pattern as the batch poster. + /// Returns `true` if all txs confirmed, `false` on timeout. 
+ async fn watch_txs(&self, tx_hashes: &[B256]) -> Result { + for tx_hash in tx_hashes { + let watch = PendingTransactionConfig::new(*tx_hash) + .with_required_confirmations(1) + .with_timeout(Some(self.confirmation_timeout)) + .with_provider(self.provider.root().clone()); + match watch.watch().await { + Ok(_) => { + debug!(%tx_hash, "flush no-op included on L1"); + } + Err(err @ PendingTransactionError::TxWatcher(WatchTxError::Timeout)) => { + // This should not happen during normal L1 operation. + // Possible causes: L1 congestion, tx dropped from mempool, + // gas price too low to compete. + error!( + %tx_hash, + timeout_secs = self.confirmation_timeout.as_secs(), + "flush no-op timed out waiting for L1 inclusion — will retry" + ); + return map_watch_error(err); + } + Err(err) => return map_watch_error(err), + } + } + Ok(true) + } + + async fn nonce_at(&self, block: BlockNumberOrTag) -> Result { + self.provider + .get_transaction_count(self.address) + .block_id(block.into()) + .await + .map_err(|e| FlushError::Provider(e.to_string())) + } +} + +#[cfg(test)] +mod tests { + use super::*; + use alloy::network::TransactionBuilder; + use alloy::node_bindings::Anvil; + use alloy::providers::Provider; + + // ── §7.7.4 H5: replacement-fee bump satisfies EIP-1559 rules ───────── + + #[test] + fn replacement_fee_bump_exceeds_ten_percent_for_max_fee() { + // `max_fee_per_gas` must strictly exceed base by ≥10% for any positive base. + for base in [1_u128, 10, 100, 1_000, 1_000_000, 1_000_000_000_000] { + let (new_max, _) = bumped_replacement_fees(base, 0); + assert!( + new_max.saturating_mul(10) >= base.saturating_mul(11), + "max_fee bump violates ≥10% rule: base={base}, new={new_max}", + ); + } + } + + #[test] + fn replacement_fee_bump_doubles_priority_fee() { + // `priority_fee` doubles (200%), easily clearing the 10% replacement threshold. 
+ for base in [1_u128, 10, 1_000, 1_000_000_000] { + let (_, new_prio) = bumped_replacement_fees(0, base); + assert_eq!(new_prio, base.saturating_mul(2)); + assert!( + new_prio.saturating_mul(10) >= base.saturating_mul(11), + "priority bump violates ≥10% rule: base={base}, new={new_prio}", + ); + } + } + + #[test] + fn replacement_fee_floor_is_positive_even_when_base_is_zero() { + // If the estimator returns zero, bumped values are still positive so the + // tx is actually broadcast rather than rejected by the node. + let (new_max, new_prio) = bumped_replacement_fees(0, 0); + assert!(new_max >= 1); + assert!(new_prio >= 1); + } + + #[test] + fn send_failure_error_summarizes_failed_slots() { + let err = send_failures_error(&[ + (7, "nonce too low".to_string()), + (8, "replacement transaction underpriced".to_string()), + (9, "insufficient funds".to_string()), + (10, "fee cap less than block base fee".to_string()), + ]); + + let message = err.to_string(); + assert!(message.contains("failed to submit 4 flush no-op transaction(s)")); + assert!(message.contains("nonce 7: nonce too low")); + assert!(message.contains("nonce 8: replacement transaction underpriced")); + assert!(message.contains("nonce 9: insufficient funds")); + assert!(message.contains("and 1 more")); + assert!(!message.contains("nonce 10")); + } + + #[test] + fn watch_error_mapping_retries_only_timeouts() { + let timeout = map_watch_error(PendingTransactionError::TxWatcher(WatchTxError::Timeout)) + .expect("timeout should be a retryable watch result"); + assert!(!timeout, "timeout should ask the caller to retry"); + + let err = map_watch_error(PendingTransactionError::FailedToRegister) + .expect_err("non-timeout watcher failures must surface"); + assert!(matches!(err, FlushError::Provider(_))); + } + + #[test] + fn replacement_fee_bump_saturates_at_u128_max() { + // Overflow safety: astronomical base fees must not wrap around. 
+ let (new_max, new_prio) = bumped_replacement_fees(u128::MAX, u128::MAX); + assert_eq!(new_max, u128::MAX / 10 + 1); + assert_eq!(new_prio, u128::MAX); + } + + // ── §7.7.5 H6: timeouts derive from seconds_per_block ──────────────── + + #[test] + fn timeouts_derive_from_seconds_per_block() { + assert_eq!( + derive_timeouts(12), + (Duration::from_secs(120), Duration::from_secs(12)), + "mainnet 12s block: 120s confirmation, 12s poll", + ); + assert_eq!( + derive_timeouts(2), + (Duration::from_secs(20), Duration::from_secs(2)), + "fast L2 2s block: scaled proportionally", + ); + assert_eq!( + derive_timeouts(1), + (Duration::from_secs(10), Duration::from_secs(1)), + "minimum accepted block time (H8: SEQ_SECONDS_PER_BLOCK >= 1)", + ); + } + + #[test] + fn confirmation_timeout_is_ten_times_safe_poll_interval() { + // Structural invariant: confirmation window == 10 × poll interval. + for spb in [1_u64, 2, 5, 12, 30] { + let (conf, poll) = derive_timeouts(spb); + assert_eq!(conf, poll * 10); + } + } + + /// Verify that `anvil` is available. Panics with a clear message if not found. + fn require_anvil() { + assert!( + std::process::Command::new("anvil") + .arg("--version") + .stdout(std::process::Stdio::null()) + .stderr(std::process::Stdio::null()) + .status() + .is_ok(), + "anvil not found on PATH — install Foundry (https://getfoundry.sh)" + ); + } + + /// Spawn Anvil with manual mining and fast safe-finality (2 slots/epoch). + fn spawn_anvil() -> alloy::node_bindings::AnvilInstance { + Anvil::default() + .arg("--no-mining") + .arg("--slots-in-an-epoch") + .arg("2") + .timeout(30_000) + .spawn() + } + + /// Create a signer provider from an Anvil private key. 
+ fn signer_provider(anvil: &alloy::node_bindings::AnvilInstance) -> DynProvider { + let key_hex = alloy_primitives::hex::encode(anvil.first_key().to_bytes()); + crate::l1::provider::create_signer_provider( + anvil.endpoint_url().as_str(), + &format!("0x{key_hex}"), + ) + .expect("create signer provider") + } + + /// Mine blocks at a fixed interval until the token is dropped. + fn start_miner(provider: DynProvider, interval: Duration) -> tokio::sync::oneshot::Sender<()> { + let (stop_tx, mut stop_rx) = tokio::sync::oneshot::channel(); + tokio::spawn(async move { + loop { + tokio::select! { + _ = &mut stop_rx => break, + _ = tokio::time::sleep(interval) => { + let _ = provider.raw_request::<_, serde_json::Value>( + "evm_mine".into(), ()).await; + } + } + } + }); + stop_tx + } + + /// Send a 0-ETH self-transfer at a specific nonce (without waiting for inclusion). + async fn send_tx_at_nonce(provider: &DynProvider, addr: Address, nonce: u64) { + let fees = provider + .estimate_eip1559_fees() + .await + .expect("estimate fees"); + let tx = alloy::rpc::types::TransactionRequest::default() + .with_to(addr) + .with_value(U256::ZERO) + .with_nonce(nonce) + .with_max_fee_per_gas(fees.max_fee_per_gas) + .with_max_priority_fee_per_gas(fees.max_priority_fee_per_gas); + let _ = provider.send_transaction(tx).await.expect("send tx"); + } + + #[tokio::test] + async fn flush_is_noop_when_no_pending_nonces() { + require_anvil(); + + let anvil = spawn_anvil(); + let provider = signer_provider(&anvil); + let addr = anvil.addresses()[0]; + + // Mine a few blocks so safe head advances past genesis. + for _ in 0..4 { + let _: serde_json::Value = provider + .raw_request("evm_mine".into(), ()) + .await + .expect("mine"); + } + + let flusher = MempoolFlusher::new(provider, addr, 12); + // No pending txs — should return immediately. 
+ flusher.flush_and_wait().await.expect("flush"); + } + + #[tokio::test] + async fn flush_resolves_pending_nonces_to_safe() { + require_anvil(); + + let anvil = spawn_anvil(); + let provider = signer_provider(&anvil); + let addr = anvil.addresses()[0]; + + // Send 3 txs into the mempool (unmined). + for nonce in 0..3 { + send_tx_at_nonce(&provider, addr, nonce).await; + } + + // Verify: pending=3, safe=0. + let pending = provider + .get_transaction_count(addr) + .block_id(BlockNumberOrTag::Pending.into()) + .await + .expect("pending nonce"); + assert_eq!(pending, 3); + + let safe = provider + .get_transaction_count(addr) + .block_id(BlockNumberOrTag::Safe.into()) + .await + .expect("safe nonce"); + assert_eq!(safe, 0); + + // Start a background miner so blocks are produced. + let _miner = start_miner(provider.clone(), Duration::from_millis(100)); + + // Run the flusher — it should resolve all 3 nonces to safe. + let flusher = MempoolFlusher::new(provider.clone(), addr, 12) + .with_timeouts(Duration::from_secs(5), Duration::from_millis(200)); + tokio::time::timeout(Duration::from_secs(10), flusher.flush_and_wait()) + .await + .expect("flush should complete within timeout") + .expect("flush should succeed"); + + // Verify: safe nonce caught up. + let safe_after = provider + .get_transaction_count(addr) + .block_id(BlockNumberOrTag::Safe.into()) + .await + .expect("safe nonce after flush"); + assert!( + safe_after >= 3, + "safe nonce should be >= 3 after flush, got {safe_after}" + ); + } + + #[tokio::test] + async fn flush_handles_already_mined_but_not_safe() { + require_anvil(); + + let anvil = spawn_anvil(); + let provider = signer_provider(&anvil); + let addr = anvil.addresses()[0]; + + // Send 2 txs and mine them (latest but not safe). 
+ for nonce in 0..2 { + send_tx_at_nonce(&provider, addr, nonce).await; + } + let _: serde_json::Value = provider + .raw_request("evm_mine".into(), ()) + .await + .expect("mine"); + + let latest = provider + .get_transaction_count(addr) + .block_id(BlockNumberOrTag::Latest.into()) + .await + .expect("latest nonce"); + assert_eq!(latest, 2, "txs should be mined"); + + let safe = provider + .get_transaction_count(addr) + .block_id(BlockNumberOrTag::Safe.into()) + .await + .expect("safe nonce"); + assert_eq!(safe, 0, "txs should not be safe yet"); + + // Start miner to advance safe head. + let _miner = start_miner(provider.clone(), Duration::from_millis(100)); + + // Flusher should wait for safe finality (no new txs to submit). + let flusher = MempoolFlusher::new(provider.clone(), addr, 12) + .with_timeouts(Duration::from_secs(5), Duration::from_millis(200)); + tokio::time::timeout(Duration::from_secs(10), flusher.flush_and_wait()) + .await + .expect("flush should complete within timeout") + .expect("flush should succeed"); + + let safe_after = provider + .get_transaction_count(addr) + .block_id(BlockNumberOrTag::Safe.into()) + .await + .expect("safe nonce after flush"); + assert!( + safe_after >= 2, + "safe nonce should be >= 2 after flush, got {safe_after}" + ); + } + + // ── §7.7.7 flusher under extended provider outage ──────────────────── + // + // Implementation note (matters for what this test pins): `flush_and_wait` + // does NOT retry internally on `Provider` errors — a failed `nonce_at` + // call propagates via `?` and the function returns. The "retry forever" + // language in TEST_PLAN §7.7.7 is really about the orchestrator restart + // loop: on each respawn a fresh flusher is constructed and tried, and + // this repeats until the provider becomes reachable again (covered at + // e2e by §11.2.2-followup / §11.1.5's `respawn_until_stable`). 
+ // + // This test pins the two ends of that contract: (a) a mid-flush + // disconnect surfaces as `FlushError::Provider` fast (no hang, no + // internal retry), and (b) a fresh flusher call after reconnect + // completes and consumes the pending wallet-nonce slot. + + #[tokio::test] + async fn flush_surfaces_provider_error_under_disconnect_and_completes_on_reconnect() { + use rollups_harness::TcpProxy; + + require_anvil(); + + let anvil = spawn_anvil(); + // Direct-to-Anvil provider: the test uses this to seed pending + // mempool state and inspect the chain. Bypasses the proxy so the + // seeding itself isn't affected by disconnect. + let direct_provider = signer_provider(&anvil); + let addr = anvil.addresses()[0]; + + // Proxy in front of Anvil — this is what the flusher dials. Anvil's + // endpoint uses `localhost` which the proxy's upstream parser rejects + // (it expects a literal IP). Swap for `127.0.0.1` so `parse` accepts. + let anvil_upstream = anvil.endpoint().replace("localhost", "127.0.0.1"); + let proxy = TcpProxy::spawn(anvil_upstream.as_str()) + .await + .expect("spawn proxy"); + + let key_hex = alloy_primitives::hex::encode(anvil.first_key().to_bytes()); + let proxied_provider = crate::l1::provider::create_signer_provider( + proxy.endpoint().as_str(), + &format!("0x{key_hex}"), + ) + .expect("create signer provider through proxy"); + + // Seed: submit a tx at wallet-nonce 0 into Anvil's mempool (auto- + // mining is off, so it stays pending). The flusher now has work. + send_tx_at_nonce(&direct_provider, addr, 0).await; + let pending = direct_provider + .get_transaction_count(addr) + .block_id(BlockNumberOrTag::Pending.into()) + .await + .expect("pending nonce"); + assert_eq!(pending, 1, "seed tx should be pending"); + + // Disconnect the proxy. The flusher's provider can no longer reach + // Anvil — any RPC call sees a torn-down TCP connection. 
+ proxy.disconnect(); + let flusher = MempoolFlusher::new(proxied_provider.clone(), addr, 12) + .with_timeouts(Duration::from_secs(2), Duration::from_millis(200)); + + // `flush_and_wait` must fail fast (no internal retry loop). Wrap in + // a generous outer timeout just to bound test flakiness if alloy's + // HTTP client has small internal retries. + let err = tokio::time::timeout(Duration::from_secs(5), flusher.flush_and_wait()) + .await + .expect("flush_and_wait must not hang under disconnect") + .expect_err("flush_and_wait must surface a Provider error under disconnect"); + assert!( + matches!(err, FlushError::Provider(_)), + "expected FlushError::Provider, got: {err:?}", + ); + + // Reconnect the proxy + start mining so the flusher can make forward + // progress. This models the orchestrator's next respawn succeeding + // after the provider returns. + proxy.reconnect(); + let _miner = start_miner(direct_provider.clone(), Duration::from_millis(100)); + + // A fresh flusher (a respawn would build a new one from scratch). + // It should now read nonces, replace the pending tx with a bumped- + // fee no-op (or let the original land), wait for safe, and return. + let flusher_after = MempoolFlusher::new(proxied_provider, addr, 12) + .with_timeouts(Duration::from_secs(5), Duration::from_millis(200)); + tokio::time::timeout(Duration::from_secs(15), flusher_after.flush_and_wait()) + .await + .expect("flush_and_wait should complete after reconnect") + .expect("flush should succeed once the provider is reachable"); + + // Forward progress: the nonce-0 slot was consumed (either by the + // flusher's no-op or by the original tx landing). `safe_nonce` is + // >= 1 only if something at nonce 0 reached safe finality — proof + // the flusher completed its job end-to-end. 
+ let safe_after = direct_provider + .get_transaction_count(addr) + .block_id(BlockNumberOrTag::Safe.into()) + .await + .expect("safe nonce after flush"); + assert!( + safe_after >= 1, + "nonce-0 slot must be consumed and safe after flush, got {safe_after}", + ); + + proxy.shutdown().await.expect("proxy shutdown"); + } +} diff --git a/sequencer/src/recovery/mod.rs b/sequencer/src/recovery/mod.rs new file mode 100644 index 0000000..028c11b --- /dev/null +++ b/sequencer/src/recovery/mod.rs @@ -0,0 +1,315 @@ +// (c) Cartesi and individual authors (see AUTHORS) +// SPDX-License-Identifier: Apache-2.0 (see LICENSE) + +//! Preemptive recovery: detect danger zone, flush mempool, cascade-invalidate stale batches. +//! +//! At startup the sequencer checks if any batch is approaching the staleness deadline. +//! If so, it flushes the L1 mempool (competing with pending batch transactions using +//! no-op replacements), re-syncs the safe head, and runs the atomic recovery procedure +//! (populate scheduler frontier, assign nonces, detect stale, cascade-invalidate, +//! open recovery batch). +//! +//! At runtime a dedicated [`DangerDetector`] worker performs the same danger-zone +//! check each tick. If it fires, the detector exits with `DetectorExit::DangerZone`, +//! the runtime treats that as a `RunError::DangerZoneDetected`, and the process exits. +//! External orchestration restarts the sequencer, and this startup path runs again. +//! +//! ## Fault model +//! +//! Recovery is designed to handle **submission and outage failures**: the sequencer +//! crashes, the L1 provider becomes unreachable, transactions are dropped from the +//! mempool, or the process is offline for an extended period. It is **not** designed +//! to handle arbitrarily malformed self-submissions. The scheduler frontier +//! reconstruction (`populate_safe_accepted_batches`) trusts that on-chain batches +//! from the sequencer's own address are structurally valid. This is a deliberate +//! 
system assumption, not a gap — the sequencer controls its own submissions. +//! +//! ## Lifecycle +//! +//! ```text +//! steady state danger +//! ┌──────────┐ ┌──────────┐ +//! │ running │──detector tick──▶ 🚨 │ exiting │ +//! └──────────┘ └─────┬────┘ +//! ▲ │ non-zero exit +//! │ ▼ +//! ┌────┴─────┐ ┌─────────────────┐ +//! │ normal │◀───────────────│ orchestrator │──respawn──▶ startup +//! │ ticks │ │ (systemd/k8s) │ │ +//! └──────────┘ └─────────────────┘ ▼ +//! ┌────────────────────┐ +//! │ run_preemptive_ │ +//! │ recovery │ +//! │ ├─ try L1 resync │ +//! │ ├─ decide action │ +//! │ ├─ flush + cascade│ +//! │ └─ open batch │ +//! └────────────────────┘ +//! ``` +//! +//! See `docs/recovery/` for the full design, TLA+ specs, and design history. + +mod detector; +mod flusher; + +use thiserror::Error; + +use crate::l1::reader::{InputReader, InputReaderError}; +use crate::runtime::config::L1Config; +use crate::storage::{self, DangerStatus, StorageOpenError}; +pub use detector::{DangerDetector, DangerDetectorError, DetectorExit}; +pub use flusher::MempoolFlusher; +use sequencer_core::protocol::ProtocolConfig; + +#[derive(Debug, Error)] +pub enum RecoveryError { + #[error(transparent)] + OpenStorage(#[from] StorageOpenError), + #[error(transparent)] + Storage(#[from] rusqlite::Error), + #[error("flush: {0}")] + Flush(#[from] flusher::FlushError), + #[error("input reader: {0}")] + InputReader(#[from] InputReaderError), + #[error("provider: {0}")] + Provider(String), + #[error("startup refused: {0:?}")] + Refuse(RefuseReason), +} + +/// Why startup cannot proceed safely. +/// +/// Each variant captures a distinct combination of L1 reachability and DB +/// state that makes the flush-then-cascade recovery either unsafe or +/// impossible. The operator sees the variant in logs and must intervene. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum RefuseReason { + /// No prior safe-head observation was ever recorded AND L1 is unreachable. 
+ /// We have no baseline to trust for the wall-clock estimate, and can't + /// refresh it either. First boot requires L1. + NeverSyncedAndUnreachable, + /// The wall-clock-adjusted check flagged a stale batch, which means the + /// safe head itself appears frozen. `flush_and_wait` would spin waiting + /// for a safe head that isn't advancing, so we refuse instead. + StalledSafeHead { batch_index: u64 }, + /// Strict danger detected but L1 is unreachable. We can't run the flush + /// step safely without a live L1 provider; refusing gives the operator a + /// chance to restore L1 before retrying. + StrictDangerButUnreachable { batch_index: u64 }, +} + +/// What a fresh startup must do, given the current (danger, L1-reachable, +/// ever-synced) state. +/// +/// Pure function output — no side effects. The `run_preemptive_recovery` +/// driver executes the chosen action. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum StartupAction { + /// No danger; proceed to run the recovery transaction (which is a no-op + /// on a healthy state apart from opening a Tip if one is missing). + Proceed, + /// Strict danger with a fresh L1 view. Flush the mempool, re-sync, then + /// run the recovery transaction. + FlushAndCascade { batch_index: u64 }, + /// Can't proceed safely; return the reason and let the operator decide. + Refuse(RefuseReason), +} + +/// Pure decision: given the danger status, whether L1 is reachable, and +/// whether we've ever recorded a safe-head observation, return what startup +/// should do. All the startup-policy complexity lives here, isolated from +/// storage and RPC side effects. +pub fn decide_startup_action( + danger: DangerStatus, + l1_reachable: bool, + last_safe_progress_ms: u64, +) -> StartupAction { + let ever_synced = last_safe_progress_ms != 0; + + // First-boot guard: if we've never seen a real safe-head observation AND + // we can't contact L1 to refresh it, we have nothing to base a safety + // decision on. Refuse. 
+ if !ever_synced && !l1_reachable { + return StartupAction::Refuse(RefuseReason::NeverSyncedAndUnreachable); + } + + match (danger, l1_reachable) { + (DangerStatus::Safe, _) => StartupAction::Proceed, + (DangerStatus::Strict(batch_index), true) => StartupAction::FlushAndCascade { batch_index }, + (DangerStatus::Strict(batch_index), false) => { + StartupAction::Refuse(RefuseReason::StrictDangerButUnreachable { batch_index }) + } + (DangerStatus::Stalled(batch_index), _) => { + StartupAction::Refuse(RefuseReason::StalledSafeHead { batch_index }) + } + } +} + +/// Run the full preemptive recovery procedure at startup. +/// +/// 1. Try to sync the safe head from L1. If L1 is unreachable, fall through +/// using the persisted safe head plus the wall-clock estimator. +/// 2. Consult [`decide_startup_action`] to pick what to do. +/// 3. If the decision is `FlushAndCascade`: flush the mempool, re-sync, then +/// continue. +/// 4. Run the atomic recovery transaction (cascade stale batches if any, +/// always re-open the Tip if missing). +/// +/// Returns the list of invalidated batch indices (empty if no stale batches). +pub async fn run_preemptive_recovery( + db_path: &str, + input_reader: &mut InputReader, + l1_config: &L1Config, + protocol: &ProtocolConfig, +) -> Result, RecoveryError> { + // ── Step 1: Sync safe head (tolerate L1 failure) ─────────────── + // + // `sync_to_current_safe_head` goes through `append_safe_inputs`, which + // maintains `safe_accepted_batches` atomically with each advance. After + // a successful sync, the scheduler-frontier view is consistent with + // l1_safe_head for every downstream reader. 
+ let l1_reachable = match input_reader.sync_to_current_safe_head().await { + Ok(()) => { + tracing::info!("L1 safe head synced"); + true + } + Err(e) => { + let InputReaderError::Provider(error) = e else { + return Err(RecoveryError::InputReader(e)); + }; + tracing::error!(error = %error, "L1 unreachable during startup safe-head sync"); + false + } + }; + + // ── Step 2: Read danger + last-progress, decide action ───────── + let (danger, last_safe_progress_ms) = { + let mut storage = storage::Storage::open(db_path)?; + let last = storage.last_safe_progress_ms()?; + let danger = storage.check_danger(protocol, crate::runtime::clock::unix_now_ms())?; + (danger, last) + }; + let action = decide_startup_action(danger, l1_reachable, last_safe_progress_ms); + + // ── Step 3: Execute decision ─────────────────────────────────── + match action { + StartupAction::Proceed => { + tracing::info!("no danger zone detected — skipping flush"); + } + StartupAction::FlushAndCascade { batch_index } => { + tracing::error!( + batch_index, + danger_threshold = protocol.danger_threshold(), + max_wait_blocks = protocol.max_wait_blocks, + "danger zone detected — entering preemptive recovery" + ); + + let flush_provider = crate::l1::provider::create_signer_provider( + &l1_config.eth_rpc_url, + &l1_config.batch_submitter_private_key, + ) + .map_err(|e| RecoveryError::Provider(e.to_string()))?; + let flusher = MempoolFlusher::new( + flush_provider, + l1_config.batch_submitter_address, + protocol.seconds_per_block, + ); + flusher.flush_and_wait().await?; + + tracing::info!("re-syncing L1 safe head after flush"); + input_reader.sync_to_current_safe_head().await?; + } + StartupAction::Refuse(reason) => { + tracing::error!( + ?reason, + reachable = l1_reachable, + "startup refused: flush cannot run safely" + ); + return Err(RecoveryError::Refuse(reason)); + } + } + + // ── Step 4: Atomic recovery ──────────────────────────────────── + // + // `safe_accepted_batches` is already caught up to 
`l1_safe_head` (step 1 + // and, if we flushed, step 3 re-synced it). The recovery transaction only + // needs to cascade + open. + tracing::info!("running startup recovery (detect stale, cascade-invalidate, open recovery)"); + let mut det_storage = storage::Storage::open(db_path)?; + let invalidated = det_storage.detect_and_recover(protocol.max_wait_blocks)?; + + if invalidated.is_empty() { + tracing::info!("no stale batches found — continuing normally"); + } else { + tracing::error!( + count = invalidated.len(), + batches = ?invalidated, + "stale batches invalidated — recovery batch opened" + ); + } + + Ok(invalidated) +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn proceed_on_safe_regardless_of_l1() { + assert_eq!( + decide_startup_action(DangerStatus::Safe, true, 0), + StartupAction::Proceed + ); + assert_eq!( + decide_startup_action(DangerStatus::Safe, false, 1_000_000), + StartupAction::Proceed + ); + } + + #[test] + fn flush_and_cascade_on_strict_plus_reachable() { + assert_eq!( + decide_startup_action(DangerStatus::Strict(42), true, 1_000_000), + StartupAction::FlushAndCascade { batch_index: 42 } + ); + } + + #[test] + fn refuse_on_strict_plus_unreachable() { + assert_eq!( + decide_startup_action(DangerStatus::Strict(42), false, 1_000_000), + StartupAction::Refuse(RefuseReason::StrictDangerButUnreachable { batch_index: 42 }) + ); + } + + #[test] + fn refuse_on_stalled_regardless_of_l1() { + assert_eq!( + decide_startup_action(DangerStatus::Stalled(7), true, 1_000_000), + StartupAction::Refuse(RefuseReason::StalledSafeHead { batch_index: 7 }) + ); + assert_eq!( + decide_startup_action(DangerStatus::Stalled(7), false, 1_000_000), + StartupAction::Refuse(RefuseReason::StalledSafeHead { batch_index: 7 }) + ); + } + + #[test] + fn refuse_when_never_synced_and_unreachable() { + assert_eq!( + decide_startup_action(DangerStatus::Safe, false, 0), + StartupAction::Refuse(RefuseReason::NeverSyncedAndUnreachable) + ); + } + + #[test] + fn 
never_synced_but_reachable_proceeds() { + // First-boot happy path: we've never observed the safe head before, + // but L1 is reachable so step 1 just did the first sync. + assert_eq!( + decide_startup_action(DangerStatus::Safe, true, 0), + StartupAction::Proceed + ); + } +} diff --git a/sequencer/src/runtime.rs b/sequencer/src/runtime.rs deleted file mode 100644 index 7016b78..0000000 --- a/sequencer/src/runtime.rs +++ /dev/null @@ -1,420 +0,0 @@ -// (c) Cartesi and individual authors (see AUTHORS) -// SPDX-License-Identifier: Apache-2.0 (see LICENSE) - -use thiserror::Error; -use tracing::warn; - -use crate::api::{self, ApiConfig}; -use crate::batch_submitter::{BatchPosterConfig, EthereumBatchPoster}; -use crate::batch_submitter::{BatchSubmitter, BatchSubmitterConfig, BatchSubmitterError}; -use crate::config::{L1Config, RunConfig}; -use crate::inclusion_lane::{InclusionLane, InclusionLaneConfig, InclusionLaneError}; -use crate::input_reader::{InputReader, InputReaderConfig, InputReaderError}; -use crate::l2_tx_feed::{L2TxFeed, L2TxFeedConfig}; -use crate::shutdown::ShutdownSignal; -use crate::storage::{self, StorageOpenError}; -use sequencer_core::application::Application; - -const SQLITE_SYNCHRONOUS_PRAGMA: &str = "NORMAL"; -const QUEUE_CAPACITY: usize = 8192; -const INPUT_READER_POLL_INTERVAL: std::time::Duration = std::time::Duration::from_secs(2); - -#[derive(Debug, Error)] -pub enum RunError { - #[error(transparent)] - OpenStorage(#[from] StorageOpenError), - #[error(transparent)] - Io(#[from] std::io::Error), - #[error("server stopped unexpectedly")] - ServerStoppedUnexpectedly, - #[error("server join error: {source}")] - ServerJoin { - #[source] - source: tokio::task::JoinError, - }, - #[error("inclusion lane stopped unexpectedly")] - InclusionLaneStoppedUnexpectedly, - #[error("inclusion lane exited: {source}")] - InclusionLane { - #[source] - source: InclusionLaneError, - }, - #[error("inclusion lane join error: {source}")] - InclusionLaneJoin { - 
#[source] - source: tokio::task::JoinError, - }, - #[error("input reader stopped unexpectedly")] - InputReaderStoppedUnexpectedly, - #[error("input reader exited: {source}")] - InputReader { - #[source] - source: InputReaderError, - }, - #[error("input reader join error: {source}")] - InputReaderJoin { - #[source] - source: tokio::task::JoinError, - }, - #[error("batch submitter stopped unexpectedly")] - BatchSubmitterStoppedUnexpectedly, - #[error("batch submitter exited: {source}")] - BatchSubmitter { - #[source] - source: BatchSubmitterError, - }, - #[error("batch submitter join error: {source}")] - BatchSubmitterJoin { - #[source] - source: tokio::task::JoinError, - }, -} - -enum FirstExit { - Signal(Option), - Server(RunError), - InclusionLane(RunError), - InputReader(RunError), - BatchSubmitter(RunError), -} - -pub async fn run(app: A, config: RunConfig) -> Result<(), RunError> -where - A: Application + 'static, -{ - let domain = config.build_domain(); - let shutdown = ShutdownSignal::default(); - - // Ensure the data directory exists before any component tries to open the DB. - std::fs::create_dir_all(&config.data_dir)?; - let db_path = config.db_path(); - - // Single L1/InputBox config shared by input reader and batch submitter (no duplicate RPC URL or addresses). - let batch_submitter_private_key = config.resolve_private_key()?; - - let batch_submitter_address = { - use alloy::signers::local::PrivateKeySigner; - use std::str::FromStr; - PrivateKeySigner::from_str(&batch_submitter_private_key) - .map_err(|e| RunError::Io(std::io::Error::other(e.to_string())))? 
- .address() - }; - let mut input_reader = InputReader::new( - db_path.clone(), - shutdown.clone(), - InputReaderConfig { - rpc_url: config.eth_rpc_url.clone(), - app_address: config.app_address, - poll_interval: INPUT_READER_POLL_INTERVAL, - long_block_range_error_codes: config.long_block_range_error_codes.clone(), - }, - ) - .await - .map_err(|source| RunError::InputReader { source })?; - let input_reader_genesis_block = input_reader.genesis_block(); - let l1_config = L1Config { - eth_rpc_url: config.eth_rpc_url.clone(), - input_box_address: input_reader.input_box_address(), - app_address: config.app_address, - batch_submitter_private_key, - batch_submitter_address, - }; - input_reader - .sync_to_current_safe_head() - .await - .map_err(|source| RunError::InputReader { source })?; - - tracing::info!( - http_addr = %config.http_addr, - data_dir = %config.data_dir, - eth_rpc_url = %l1_config.eth_rpc_url, - input_box_address = %l1_config.input_box_address, - input_reader_genesis_block, - chain_id = config.chain_id, - app_address = %l1_config.app_address, - "starting sequencer" - ); - - let storage = storage::Storage::open(&db_path, SQLITE_SYNCHRONOUS_PRAGMA)?; - let (tx, mut inclusion_lane_handle) = InclusionLane::start( - QUEUE_CAPACITY, - shutdown.clone(), - app, - storage, - InclusionLaneConfig::new(l1_config.batch_submitter_address), - ); - let mut input_reader_handle = input_reader.start()?; - - // Batch submitter uses the same L1 config (InputBox address and RPC URL) as the input reader. 
- let batch_submitter_config = BatchSubmitterConfig { - idle_poll_interval_ms: config.batch_submitter_idle_poll_interval_ms, - }; - let poster_config = BatchPosterConfig { - l1_submit_address: l1_config.input_box_address, - app_address: l1_config.app_address, - batch_submitter_address: l1_config.batch_submitter_address, - start_block: input_reader_genesis_block, - confirmation_depth: config.batch_submitter_confirmation_depth, - long_block_range_error_codes: config.long_block_range_error_codes, - }; - let provider = build_batch_submitter_provider(&l1_config)?; - - // Validate that the RPC chain ID matches --chain-id. - use alloy::providers::Provider; - let rpc_chain_id = provider - .get_chain_id() - .await - .map_err(|e| std::io::Error::other(format!("failed to query RPC chain ID: {e}")))?; - assert_eq!( - rpc_chain_id, config.chain_id, - "RPC chain ID {rpc_chain_id} does not match --chain-id {}", - config.chain_id - ); - - let poster = std::sync::Arc::new(EthereumBatchPoster::new(provider, poster_config)); - let submitter = BatchSubmitter::new( - db_path.clone(), - l1_config.batch_submitter_address, - poster, - shutdown.clone(), - batch_submitter_config, - ); - let mut batch_submitter_handle = submitter.start().map_err(RunError::OpenStorage)?; - - let tx_feed = L2TxFeed::new( - db_path.clone(), - shutdown.clone(), - L2TxFeedConfig { - batch_submitter_address: Some(l1_config.batch_submitter_address), - ..L2TxFeedConfig::default() - }, - ); - - let mut server_task = api::start( - &config.http_addr, - tx, - domain, - A::MAX_METHOD_PAYLOAD_BYTES, - shutdown.clone(), - tx_feed, - ApiConfig::default(), - ) - .await?; - - tracing::info!(address = %config.http_addr, "listening"); - - let shutdown_signal = tokio::signal::ctrl_c(); - tokio::pin!(shutdown_signal); - - let first_exit = tokio::select! 
{ - signal_result = &mut shutdown_signal => { - FirstExit::Signal(signal_result.err().map(RunError::from)) - } - server_result = &mut server_task => { - FirstExit::Server(map_server_exit(server_result)) - } - lane_result = &mut inclusion_lane_handle => { - FirstExit::InclusionLane(map_lane_exit(lane_result)) - } - reader_result = &mut input_reader_handle => { - FirstExit::InputReader(map_input_reader_exit(reader_result)) - } - submitter_result = &mut batch_submitter_handle => { - FirstExit::BatchSubmitter(map_batch_submitter_exit(submitter_result)) - } - }; - - begin_runtime_shutdown(&shutdown); - finish_runtime( - first_exit, - server_task, - inclusion_lane_handle, - input_reader_handle, - batch_submitter_handle, - ) - .await -} - -fn begin_runtime_shutdown(shutdown: &ShutdownSignal) { - shutdown.request_shutdown(); -} - -async fn wait_for_clean_shutdown( - server_task: tokio::task::JoinHandle>, - inclusion_lane_handle: tokio::task::JoinHandle>, - input_reader_handle: tokio::task::JoinHandle>, - batch_submitter_handle: tokio::task::JoinHandle>, -) -> Result<(), RunError> { - wait_for_server_shutdown(server_task).await?; - wait_for_lane_shutdown(inclusion_lane_handle).await?; - wait_for_input_reader_shutdown(input_reader_handle).await?; - wait_for_batch_submitter_shutdown(batch_submitter_handle).await?; - Ok(()) -} - -async fn finish_runtime( - first_exit: FirstExit, - server_task: tokio::task::JoinHandle>, - inclusion_lane_handle: tokio::task::JoinHandle>, - input_reader_handle: tokio::task::JoinHandle>, - batch_submitter_handle: tokio::task::JoinHandle>, -) -> Result<(), RunError> { - match first_exit { - FirstExit::Signal(signal_error) => { - let shutdown_result = wait_for_clean_shutdown( - server_task, - inclusion_lane_handle, - input_reader_handle, - batch_submitter_handle, - ) - .await; - match (signal_error, shutdown_result) { - (Some(err), _) => Err(err), - (None, Ok(())) => Ok(()), - (None, Err(err)) => Err(err), - } - } - FirstExit::Server(primary) => { - 
log_cleanup_result( - "inclusion lane", - wait_for_lane_shutdown(inclusion_lane_handle).await, - ); - log_cleanup_result( - "input reader", - wait_for_input_reader_shutdown(input_reader_handle).await, - ); - log_cleanup_result( - "batch submitter", - wait_for_batch_submitter_shutdown(batch_submitter_handle).await, - ); - Err(primary) - } - FirstExit::InclusionLane(primary) => { - log_cleanup_result("server", wait_for_server_shutdown(server_task).await); - log_cleanup_result( - "input reader", - wait_for_input_reader_shutdown(input_reader_handle).await, - ); - log_cleanup_result( - "batch submitter", - wait_for_batch_submitter_shutdown(batch_submitter_handle).await, - ); - Err(primary) - } - FirstExit::InputReader(primary) => { - log_cleanup_result("server", wait_for_server_shutdown(server_task).await); - log_cleanup_result( - "inclusion lane", - wait_for_lane_shutdown(inclusion_lane_handle).await, - ); - log_cleanup_result( - "batch submitter", - wait_for_batch_submitter_shutdown(batch_submitter_handle).await, - ); - Err(primary) - } - FirstExit::BatchSubmitter(primary) => { - log_cleanup_result("server", wait_for_server_shutdown(server_task).await); - log_cleanup_result( - "inclusion lane", - wait_for_lane_shutdown(inclusion_lane_handle).await, - ); - log_cleanup_result( - "input reader", - wait_for_input_reader_shutdown(input_reader_handle).await, - ); - Err(primary) - } - } -} - -async fn wait_for_server_shutdown( - server_task: tokio::task::JoinHandle>, -) -> Result<(), RunError> { - match server_task.await { - Ok(Ok(())) => Ok(()), - Ok(Err(source)) => Err(RunError::Io(source)), - Err(source) => Err(RunError::ServerJoin { source }), - } -} - -async fn wait_for_lane_shutdown( - inclusion_lane_handle: tokio::task::JoinHandle>, -) -> Result<(), RunError> { - match inclusion_lane_handle.await { - Ok(Ok(())) => Ok(()), - Ok(Err(source)) => Err(RunError::InclusionLane { source }), - Err(source) => Err(RunError::InclusionLaneJoin { source }), - } -} - -async fn 
wait_for_input_reader_shutdown( - input_reader_handle: tokio::task::JoinHandle>, -) -> Result<(), RunError> { - match input_reader_handle.await { - Ok(Ok(())) => Ok(()), - Ok(Err(source)) => Err(RunError::InputReader { source }), - Err(source) => Err(RunError::InputReaderJoin { source }), - } -} - -async fn wait_for_batch_submitter_shutdown( - batch_submitter_handle: tokio::task::JoinHandle>, -) -> Result<(), RunError> { - match batch_submitter_handle.await { - Ok(Ok(())) => Ok(()), - Ok(Err(source)) => Err(RunError::BatchSubmitter { source }), - Err(source) => Err(RunError::BatchSubmitterJoin { source }), - } -} - -fn map_server_exit(result: Result, tokio::task::JoinError>) -> RunError { - match result { - Ok(Ok(())) => RunError::ServerStoppedUnexpectedly, - Ok(Err(source)) => RunError::Io(source), - Err(source) => RunError::ServerJoin { source }, - } -} - -fn map_lane_exit( - result: Result, tokio::task::JoinError>, -) -> RunError { - match result { - Ok(Ok(())) => RunError::InclusionLaneStoppedUnexpectedly, - Ok(Err(source)) => RunError::InclusionLane { source }, - Err(source) => RunError::InclusionLaneJoin { source }, - } -} - -fn map_input_reader_exit( - result: Result, tokio::task::JoinError>, -) -> RunError { - match result { - Ok(Ok(())) => RunError::InputReaderStoppedUnexpectedly, - Ok(Err(source)) => RunError::InputReader { source }, - Err(source) => RunError::InputReaderJoin { source }, - } -} - -fn map_batch_submitter_exit( - result: Result, tokio::task::JoinError>, -) -> RunError { - match result { - Ok(Ok(())) => RunError::BatchSubmitterStoppedUnexpectedly, - Ok(Err(source)) => RunError::BatchSubmitter { source }, - Err(source) => RunError::BatchSubmitterJoin { source }, - } -} - -fn log_cleanup_result(component: &str, result: Result<(), RunError>) { - if let Err(err) = result { - warn!(component, error = %err, "component shutdown after primary failure also errored"); - } -} - -fn build_batch_submitter_provider( - l1: &L1Config, -) -> Result { - 
crate::provider::create_signer_provider(&l1.eth_rpc_url, &l1.batch_submitter_private_key) - .map_err(std::io::Error::other) -} diff --git a/sequencer/src/runtime/clock.rs b/sequencer/src/runtime/clock.rs new file mode 100644 index 0000000..a0874e8 --- /dev/null +++ b/sequencer/src/runtime/clock.rs @@ -0,0 +1,19 @@ +// (c) Cartesi and individual authors (see AUTHORS) +// SPDX-License-Identifier: Apache-2.0 (see LICENSE) + +//! Shared clock helper. +//! +//! Every callsite that needs "now in Unix-ms" goes through [`unix_now_ms`] so +//! the sequencer has a single place to swap in a test clock if needed. +//! `SystemTime::now()` pre-epoch is defended against via `unwrap_or_default()`. + +use std::time::SystemTime; + +/// Current wall-clock time as Unix-ms. Passed into +/// [`crate::storage::Storage::check_danger`] and friends. +pub fn unix_now_ms() -> u64 { + SystemTime::now() + .duration_since(std::time::UNIX_EPOCH) + .unwrap_or_default() + .as_millis() as u64 +} diff --git a/sequencer/src/config.rs b/sequencer/src/runtime/config.rs similarity index 69% rename from sequencer/src/config.rs rename to sequencer/src/runtime/config.rs index 1e4e1ad..616c3fb 100644 --- a/sequencer/src/config.rs +++ b/sequencer/src/runtime/config.rs @@ -1,13 +1,10 @@ // (c) Cartesi and individual authors (see AUTHORS) // SPDX-License-Identifier: Apache-2.0 (see LICENSE) -use alloy_primitives::{Address, U256}; +use alloy_primitives::Address; use alloy_sol_types::Eip712Domain; use clap::{ArgGroup, Parser}; -pub const DOMAIN_NAME: &str = "CartesiAppSequencer"; -pub const DOMAIN_VERSION: &str = "1"; - const DEFAULT_HTTP_ADDR: &str = "127.0.0.1:3000"; const DEFAULT_DATA_DIR: &str = "sequencer-data"; const DB_FILENAME: &str = "sequencer.db"; @@ -61,7 +58,7 @@ pub struct RunConfig { #[arg(long, env = "SEQ_ETH_RPC_URL", value_parser = parse_non_empty_string)] pub eth_rpc_url: String, /// Error codes that trigger `get_logs` retries with a shorter block range. 
- #[arg(long, env = "SEQ_LONG_BLOCK_RANGE_ERROR_CODES", value_delimiter = ',', default_values = crate::partition::DEFAULT_LONG_BLOCK_RANGE_ERROR_CODES)] + #[arg(long, env = "SEQ_LONG_BLOCK_RANGE_ERROR_CODES", value_delimiter = ',', default_values = crate::l1::partition::DEFAULT_LONG_BLOCK_RANGE_ERROR_CODES)] pub long_block_range_error_codes: Vec, /// Expected chain ID. Validated against the RPC at startup. #[arg(long, env = "SEQ_CHAIN_ID")] @@ -92,24 +89,29 @@ pub struct RunConfig { )] pub batch_submitter_idle_poll_interval_ms: u64, - /// Number of blocks behind Latest that the batch submitter treats as confirmed. + /// Additional confirmations to wait for after a batch-submission tx is included on L1. #[arg( long, env = "SEQ_BATCH_SUBMITTER_CONFIRMATION_DEPTH", - default_value = "0" + default_value = "2" )] pub batch_submitter_confirmation_depth: u64, + + /// Blocks before MAX_WAIT_BLOCKS to trigger preemptive recovery. + /// The danger threshold is MAX_WAIT_BLOCKS minus this margin. + /// Must be less than MAX_WAIT_BLOCKS (validated at startup). + #[arg(long, env = "SEQ_PREEMPTIVE_MARGIN_BLOCKS", default_value = "75")] + pub preemptive_margin_blocks: u64, + + /// Assumed L1 block time in seconds. Used to estimate block progression from + /// wall-clock time when the L1 provider is unreachable. + #[arg(long, env = "SEQ_SECONDS_PER_BLOCK", default_value = "12", value_parser = clap::value_parser!(u64).range(1..))] + pub seconds_per_block: u64, } impl RunConfig { pub fn build_domain(&self) -> Eip712Domain { - Eip712Domain { - name: Some(DOMAIN_NAME.into()), - version: Some(DOMAIN_VERSION.into()), - chain_id: Some(U256::from(self.chain_id)), - verifying_contract: Some(self.app_address), - salt: None, - } + sequencer_core::build_input_domain(self.chain_id, self.app_address) } /// Full path to the SQLite database file inside `data_dir`. 
@@ -157,9 +159,10 @@ fn parse_address(raw: &str) -> Result { #[cfg(test)] mod tests { - use super::{DOMAIN_NAME, DOMAIN_VERSION, RunConfig}; + use super::RunConfig; use alloy_primitives::{Address, U256}; use clap::Parser; + use sequencer_core::{DOMAIN_NAME, DOMAIN_VERSION}; const TEST_ARGS: [&str; 9] = [ "sequencer", @@ -203,6 +206,13 @@ mod tests { ); } + #[test] + fn run_config_defaults_batch_submitter_confirmation_depth_to_two() { + let config = RunConfig::try_parse_from(TEST_ARGS).expect("parse run config"); + + assert_eq!(config.batch_submitter_confirmation_depth, 2); + } + #[test] fn run_config_builds_domain_with_fixed_name_and_version() { let config = RunConfig::try_parse_from(TEST_ARGS).expect("parse run config"); @@ -216,4 +226,49 @@ mod tests { Some(Address::from_slice(&[0x11; 20])) ); } + + // ── §8.4.2 — H8 regression: SEQ_SECONDS_PER_BLOCK=0 is rejected by clap ── + // + // The H8 hardening added `value_parser = clap::value_parser!(u64).range(1..)` + // on `seconds_per_block` to prevent a divide-by-zero panic in the + // wall-clock fallback (`elapsed_secs / seconds_per_block`). Without the + // value parser, an operator typo would panic the process during the worst + // possible moment — an L1 outage. These tests lock the clap-level guard. + + fn args_with_seconds_per_block(value: &str) -> Vec<&str> { + let mut args: Vec<&str> = TEST_ARGS.to_vec(); + args.push("--seconds-per-block"); + args.push(value); + args + } + + #[test] + fn run_config_rejects_seconds_per_block_zero() { + let err = RunConfig::try_parse_from(args_with_seconds_per_block("0")) + .expect_err("seconds_per_block=0 must be rejected"); + let message = err.to_string(); + // The exact clap wording depends on the version; the specific field is + // what we want to pin. 
+ assert!( + message.contains("--seconds-per-block") || message.contains("seconds_per_block"), + "error must name the offending field, got: {message}" + ); + } + + #[test] + fn run_config_accepts_seconds_per_block_one() { + // One is the minimum allowed (1..). + let config = + RunConfig::try_parse_from(args_with_seconds_per_block("1")).expect("parse succeeds"); + assert_eq!(config.seconds_per_block, 1); + } + + #[test] + fn run_config_default_seconds_per_block_is_12() { + let config = RunConfig::try_parse_from(TEST_ARGS).expect("parse run config"); + assert_eq!( + config.seconds_per_block, 12, + "default should reflect Ethereum block time" + ); + } } diff --git a/sequencer/src/runtime/mod.rs b/sequencer/src/runtime/mod.rs new file mode 100644 index 0000000..eb77640 --- /dev/null +++ b/sequencer/src/runtime/mod.rs @@ -0,0 +1,712 @@ +// (c) Cartesi and individual authors (see AUTHORS) +// SPDX-License-Identifier: Apache-2.0 (see LICENSE) + +//! Process orchestration: bootstraps L1 state, opens storage, runs preemptive +//! recovery, then spawns the lane / input reader / batch submitter / +//! danger detector / feed / HTTP servers and awaits their completion. 
+ +pub mod clock; +pub mod config; +pub mod shutdown; + +use std::time::Duration; + +use thiserror::Error; +use tracing::warn; + +use crate::egress::l2_tx_feed::{L2TxFeed, L2TxFeedConfig}; +use crate::http::{self, ApiConfig}; +use crate::ingress::inclusion_lane::{InclusionLane, InclusionLaneConfig, InclusionLaneError}; +use crate::l1::reader::{InputReader, InputReaderConfig, InputReaderError}; +use crate::l1::submitter::{BatchPosterConfig, EthereumBatchPoster}; +use crate::l1::submitter::{ + BatchSubmitter, BatchSubmitterConfig, BatchSubmitterError, SubmitterExit, +}; +use crate::recovery::{DangerDetector, DangerDetectorError, DetectorExit}; +use crate::storage::{self, StorageOpenError}; +use alloy_primitives::Address; +use config::{L1Config, RunConfig}; +use sequencer_core::application::Application; +use sequencer_core::protocol::ProtocolConfig; +use shutdown::ShutdownSignal; + +const QUEUE_CAPACITY: usize = 8192; +const INPUT_READER_POLL_INTERVAL: Duration = Duration::from_secs(2); +/// Danger detector cadence. Cheap DB-only check; re-running quickly bounds the +/// lag on entering the danger zone. The preemptive margin absorbs bounded lag. 
+const DANGER_DETECTOR_POLL_INTERVAL: Duration = Duration::from_secs(2); + +#[derive(Debug, Error)] +pub enum RunError { + #[error(transparent)] + OpenStorage(#[from] StorageOpenError), + #[error(transparent)] + Io(#[from] std::io::Error), + #[error("server stopped unexpectedly")] + ServerStoppedUnexpectedly, + #[error("server join error: {source}")] + ServerJoin { + #[source] + source: tokio::task::JoinError, + }, + #[error("inclusion lane stopped unexpectedly")] + InclusionLaneStoppedUnexpectedly, + #[error("inclusion lane exited: {source}")] + InclusionLane { + #[source] + source: InclusionLaneError, + }, + #[error("inclusion lane join error: {source}")] + InclusionLaneJoin { + #[source] + source: tokio::task::JoinError, + }, + #[error("input reader stopped unexpectedly")] + InputReaderStoppedUnexpectedly, + #[error("input reader exited: {source}")] + InputReader { + #[source] + source: InputReaderError, + }, + #[error("input reader join error: {source}")] + InputReaderJoin { + #[source] + source: tokio::task::JoinError, + }, + #[error("batch submitter stopped unexpectedly")] + BatchSubmitterStoppedUnexpectedly, + #[error("batch submitter exited: {source}")] + BatchSubmitter { + #[source] + source: BatchSubmitterError, + }, + #[error("batch submitter join error: {source}")] + BatchSubmitterJoin { + #[source] + source: tokio::task::JoinError, + }, + #[error("danger detector exited: {source}")] + DangerDetector { + #[source] + source: DangerDetectorError, + }, + #[error("danger detector join error: {source}")] + DangerDetectorJoin { + #[source] + source: tokio::task::JoinError, + }, + #[error("danger detector stopped unexpectedly")] + DangerDetectorStoppedUnexpectedly, + /// Deliberate shutdown triggered by the danger detector. Not an error in + /// the usual sense — the orchestrator is expected to respawn, at which + /// point `run_preemptive_recovery` handles it. 
+ #[error("danger zone detected at batch {batch_index} — stopping for recovery")] + DangerZoneDetected { batch_index: u64 }, + #[error("RPC chain ID {rpc} does not match --chain-id {config}")] + ChainIdMismatch { rpc: u64, config: u64 }, +} + +enum FirstExit { + Signal(Option), + Server(RunError), + InclusionLane(RunError), + InputReader(RunError), + BatchSubmitter(RunError), + DangerDetector(RunError), +} + +pub async fn run(app: A, config: RunConfig) -> Result<(), RunError> +where + A: Application + 'static, +{ + let domain = config.build_domain(); + let shutdown = ShutdownSignal::default(); + + // Ensure the data directory exists before any component tries to open the DB. + std::fs::create_dir_all(&config.data_dir)?; + let db_path = config.db_path(); + + let batch_submitter_private_key = config.resolve_private_key()?; + + let batch_submitter_address = + batch_submitter_address_from_private_key(batch_submitter_private_key.as_str())?; + + // One ProtocolConfig shared across the whole process: the input reader, + // the danger detector, and startup recovery all mirror the same + // scheduler-acceptance rules. + let protocol = ProtocolConfig { + batch_submitter: batch_submitter_address, + max_wait_blocks: sequencer_core::MAX_WAIT_BLOCKS, + preemptive_margin_blocks: config.preemptive_margin_blocks, + seconds_per_block: config.seconds_per_block, + }; + + let input_reader_config = InputReaderConfig { + rpc_url: config.eth_rpc_url.clone(), + app_address: config.app_address, + poll_interval: INPUT_READER_POLL_INTERVAL, + long_block_range_error_codes: config.long_block_range_error_codes.clone(), + }; + + // Bootstrap L1 config: try L1 first, fall back to DB cache if unreachable. + // On first startup, L1 is required (no cache). On subsequent startups, the + // cache allows the sequencer to start without L1 (e.g., during provider outages). 
+ let (mut input_reader, input_reader_genesis_block, l1_config) = match InputReader::new( + db_path.clone(), + shutdown.clone(), + input_reader_config.clone(), + protocol, + ) + .await + { + Ok(reader) => { + let genesis = reader.genesis_block(); + let input_box = reader.input_box_address(); + + // Validate chain ID early — before any DB writes. + { + use alloy::providers::Provider; + let check_provider = crate::l1::provider::create_provider(&config.eth_rpc_url) + .map_err(|e| RunError::Io(std::io::Error::other(e)))?; + match check_provider.get_chain_id().await { + Ok(rpc_chain_id) if rpc_chain_id != config.chain_id => { + return Err(RunError::ChainIdMismatch { + rpc: rpc_chain_id, + config: config.chain_id, + }); + } + Ok(_) => {} // verified + Err(e) => { + tracing::warn!( + error = %e, + "could not validate RPC chain ID at bootstrap" + ); + } + } + } + + // Cache for future startups when L1 might be unreachable. + if let Ok(mut s) = storage::Storage::open(&db_path) { + let _ = s.save_l1_bootstrap_cache(input_box, genesis, config.chain_id); + } + + let l1 = L1Config { + eth_rpc_url: config.eth_rpc_url.clone(), + input_box_address: input_box, + app_address: config.app_address, + batch_submitter_private_key, + batch_submitter_address, + }; + (reader, genesis, l1) + } + Err(InputReaderError::Provider(e)) => { + // L1 unreachable. Try the DB cache. 
+ tracing::error!( + error = %e, + "L1 unreachable during bootstrap — checking DB cache" + ); + let cache_storage = storage::Storage::open(&db_path)?; + let cached = cache_storage + .l1_bootstrap_cache() + .map_err(|e| RunError::Io(std::io::Error::other(e.to_string())))?; + let Some((input_box, genesis, cached_chain_id)) = cached else { + return Err(RunError::Io(std::io::Error::other( + "L1 unreachable and no bootstrap cache — \ + L1 is required for first startup", + ))); + }; + if cached_chain_id != config.chain_id { + return Err(RunError::ChainIdMismatch { + rpc: cached_chain_id, + config: config.chain_id, + }); + } + + let reader = InputReader::from_parts( + input_reader_config, + input_box, + genesis, + db_path.clone(), + shutdown.clone(), + protocol, + ); + let l1 = L1Config { + eth_rpc_url: config.eth_rpc_url.clone(), + input_box_address: input_box, + app_address: config.app_address, + batch_submitter_private_key, + batch_submitter_address, + }; + (reader, genesis, l1) + } + Err(source) => return Err(RunError::InputReader { source }), + }; + + tracing::info!( + http_addr = %config.http_addr, + data_dir = %config.data_dir, + eth_rpc_url = %l1_config.eth_rpc_url, + input_box_address = %l1_config.input_box_address, + input_reader_genesis_block, + chain_id = config.chain_id, + app_address = %l1_config.app_address, + batch_submitter_address = %l1_config.batch_submitter_address, + max_wait_blocks = protocol.max_wait_blocks, + preemptive_margin_blocks = protocol.preemptive_margin_blocks, + danger_threshold = protocol.danger_threshold(), + "sequencer startup" + ); + + // ── Preemptive recovery ──────────────────────────────────────── + // See docs/recovery/ for the full design and TLA+ spec. 
+ crate::recovery::run_preemptive_recovery(&db_path, &mut input_reader, &l1_config, &protocol) + .await + .map_err(|e| RunError::Io(std::io::Error::other(e.to_string())))?; + + let storage = storage::Storage::open(&db_path)?; + let (tx, mut inclusion_lane_handle) = InclusionLane::start( + QUEUE_CAPACITY, + shutdown.clone(), + app, + storage, + InclusionLaneConfig::new(l1_config.batch_submitter_address), + ); + let mut input_reader_handle = input_reader.start()?; + + let batch_submitter_config = BatchSubmitterConfig { + idle_poll_interval_ms: config.batch_submitter_idle_poll_interval_ms, + }; + let poster_config = BatchPosterConfig { + l1_submit_address: l1_config.input_box_address, + app_address: l1_config.app_address, + batch_submitter_address: l1_config.batch_submitter_address, + start_block: input_reader_genesis_block, + confirmation_depth: config.batch_submitter_confirmation_depth, + seconds_per_block: config.seconds_per_block, + long_block_range_error_codes: config.long_block_range_error_codes, + }; + let provider = build_batch_submitter_provider(&l1_config)?; + + let poster = std::sync::Arc::new(EthereumBatchPoster::new(provider, poster_config)); + let submitter = BatchSubmitter::new( + db_path.clone(), + poster, + shutdown.clone(), + batch_submitter_config, + ); + let mut batch_submitter_handle = submitter.start().map_err(RunError::OpenStorage)?; + + let detector = DangerDetector::new( + db_path.clone(), + protocol, + DANGER_DETECTOR_POLL_INTERVAL, + shutdown.clone(), + ); + let mut danger_detector_handle = detector.start().map_err(RunError::OpenStorage)?; + + let tx_feed = L2TxFeed::new( + db_path.clone(), + shutdown.clone(), + L2TxFeedConfig { + batch_submitter_address: Some(l1_config.batch_submitter_address), + ..L2TxFeedConfig::default() + }, + ); + + let mut server_task = http::start( + &config.http_addr, + tx, + domain, + A::MAX_METHOD_PAYLOAD_BYTES, + shutdown.clone(), + tx_feed, + ApiConfig::default(), + ) + .await?; + + tracing::info!(address = 
%config.http_addr, "listening"); + + let shutdown_signal = tokio::signal::ctrl_c(); + tokio::pin!(shutdown_signal); + + let first_exit = tokio::select! { + signal_result = &mut shutdown_signal => { + FirstExit::Signal(signal_result.err().map(RunError::from)) + } + server_result = &mut server_task => { + FirstExit::Server(map_server_exit(server_result)) + } + lane_result = &mut inclusion_lane_handle => { + FirstExit::InclusionLane(map_lane_exit(lane_result)) + } + reader_result = &mut input_reader_handle => { + FirstExit::InputReader(map_input_reader_exit(reader_result)) + } + submitter_result = &mut batch_submitter_handle => { + FirstExit::BatchSubmitter(map_batch_submitter_exit(submitter_result)) + } + detector_result = &mut danger_detector_handle => { + FirstExit::DangerDetector(map_danger_detector_exit(detector_result)) + } + }; + + begin_runtime_shutdown(&shutdown); + finish_runtime( + first_exit, + server_task, + inclusion_lane_handle, + input_reader_handle, + batch_submitter_handle, + danger_detector_handle, + ) + .await +} + +fn batch_submitter_address_from_private_key(private_key: &str) -> Result { + use alloy::signers::local::PrivateKeySigner; + use std::str::FromStr; + + Ok(PrivateKeySigner::from_str(private_key) + .map_err(|_| RunError::Io(std::io::Error::other("invalid private key")))? 
+ .address()) +} + +fn begin_runtime_shutdown(shutdown: &ShutdownSignal) { + shutdown.request_shutdown(); +} + +async fn wait_for_clean_shutdown( + server_task: tokio::task::JoinHandle>, + inclusion_lane_handle: tokio::task::JoinHandle>, + input_reader_handle: tokio::task::JoinHandle>, + batch_submitter_handle: tokio::task::JoinHandle>, + danger_detector_handle: tokio::task::JoinHandle>, +) -> Result<(), RunError> { + wait_for_server_shutdown(server_task).await?; + wait_for_lane_shutdown(inclusion_lane_handle).await?; + wait_for_input_reader_shutdown(input_reader_handle).await?; + wait_for_batch_submitter_shutdown(batch_submitter_handle).await?; + wait_for_danger_detector_shutdown(danger_detector_handle).await?; + Ok(()) +} + +async fn finish_runtime( + first_exit: FirstExit, + server_task: tokio::task::JoinHandle>, + inclusion_lane_handle: tokio::task::JoinHandle>, + input_reader_handle: tokio::task::JoinHandle>, + batch_submitter_handle: tokio::task::JoinHandle>, + danger_detector_handle: tokio::task::JoinHandle>, +) -> Result<(), RunError> { + match first_exit { + FirstExit::Signal(signal_error) => { + let shutdown_result = wait_for_clean_shutdown( + server_task, + inclusion_lane_handle, + input_reader_handle, + batch_submitter_handle, + danger_detector_handle, + ) + .await; + match (signal_error, shutdown_result) { + (Some(err), _) => Err(err), + (None, Ok(())) => Ok(()), + (None, Err(err)) => Err(err), + } + } + FirstExit::Server(primary) => { + log_cleanup_result( + "inclusion lane", + wait_for_lane_shutdown(inclusion_lane_handle).await, + ); + log_cleanup_result( + "input reader", + wait_for_input_reader_shutdown(input_reader_handle).await, + ); + log_cleanup_result( + "batch submitter", + wait_for_batch_submitter_shutdown(batch_submitter_handle).await, + ); + log_cleanup_result( + "danger detector", + wait_for_danger_detector_shutdown(danger_detector_handle).await, + ); + Err(primary) + } + FirstExit::InclusionLane(primary) => { + 
log_cleanup_result("server", wait_for_server_shutdown(server_task).await); + log_cleanup_result( + "input reader", + wait_for_input_reader_shutdown(input_reader_handle).await, + ); + log_cleanup_result( + "batch submitter", + wait_for_batch_submitter_shutdown(batch_submitter_handle).await, + ); + log_cleanup_result( + "danger detector", + wait_for_danger_detector_shutdown(danger_detector_handle).await, + ); + Err(primary) + } + FirstExit::InputReader(primary) => { + log_cleanup_result("server", wait_for_server_shutdown(server_task).await); + log_cleanup_result( + "inclusion lane", + wait_for_lane_shutdown(inclusion_lane_handle).await, + ); + log_cleanup_result( + "batch submitter", + wait_for_batch_submitter_shutdown(batch_submitter_handle).await, + ); + log_cleanup_result( + "danger detector", + wait_for_danger_detector_shutdown(danger_detector_handle).await, + ); + Err(primary) + } + FirstExit::BatchSubmitter(primary) => { + log_cleanup_result("server", wait_for_server_shutdown(server_task).await); + log_cleanup_result( + "inclusion lane", + wait_for_lane_shutdown(inclusion_lane_handle).await, + ); + log_cleanup_result( + "input reader", + wait_for_input_reader_shutdown(input_reader_handle).await, + ); + log_cleanup_result( + "danger detector", + wait_for_danger_detector_shutdown(danger_detector_handle).await, + ); + Err(primary) + } + FirstExit::DangerDetector(primary) => { + log_cleanup_result("server", wait_for_server_shutdown(server_task).await); + log_cleanup_result( + "inclusion lane", + wait_for_lane_shutdown(inclusion_lane_handle).await, + ); + log_cleanup_result( + "input reader", + wait_for_input_reader_shutdown(input_reader_handle).await, + ); + log_cleanup_result( + "batch submitter", + wait_for_batch_submitter_shutdown(batch_submitter_handle).await, + ); + Err(primary) + } + } +} + +async fn wait_for_server_shutdown( + server_task: tokio::task::JoinHandle>, +) -> Result<(), RunError> { + match server_task.await { + Ok(Ok(())) => Ok(()), + 
Ok(Err(source)) => Err(RunError::Io(source)), + Err(source) => Err(RunError::ServerJoin { source }), + } +} + +async fn wait_for_lane_shutdown( + inclusion_lane_handle: tokio::task::JoinHandle>, +) -> Result<(), RunError> { + match inclusion_lane_handle.await { + Ok(Ok(())) => Ok(()), + Ok(Err(source)) => Err(RunError::InclusionLane { source }), + Err(source) => Err(RunError::InclusionLaneJoin { source }), + } +} + +async fn wait_for_input_reader_shutdown( + input_reader_handle: tokio::task::JoinHandle>, +) -> Result<(), RunError> { + match input_reader_handle.await { + Ok(Ok(())) => Ok(()), + Ok(Err(source)) => Err(RunError::InputReader { source }), + Err(source) => Err(RunError::InputReaderJoin { source }), + } +} + +async fn wait_for_batch_submitter_shutdown( + batch_submitter_handle: tokio::task::JoinHandle>, +) -> Result<(), RunError> { + match batch_submitter_handle.await { + Ok(Ok(SubmitterExit::Shutdown)) => Ok(()), + Ok(Err(source)) => Err(RunError::BatchSubmitter { source }), + Err(source) => Err(RunError::BatchSubmitterJoin { source }), + } +} + +async fn wait_for_danger_detector_shutdown( + danger_detector_handle: tokio::task::JoinHandle>, +) -> Result<(), RunError> { + match danger_detector_handle.await { + Ok(Ok(DetectorExit::Shutdown)) => Ok(()), + Ok(Ok(DetectorExit::DangerZone { batch_index })) => { + Err(RunError::DangerZoneDetected { batch_index }) + } + Ok(Err(source)) => Err(RunError::DangerDetector { source }), + Err(source) => Err(RunError::DangerDetectorJoin { source }), + } +} + +fn map_server_exit(result: Result, tokio::task::JoinError>) -> RunError { + match result { + Ok(Ok(())) => RunError::ServerStoppedUnexpectedly, + Ok(Err(source)) => RunError::Io(source), + Err(source) => RunError::ServerJoin { source }, + } +} + +fn map_lane_exit( + result: Result, tokio::task::JoinError>, +) -> RunError { + match result { + Ok(Ok(())) => RunError::InclusionLaneStoppedUnexpectedly, + Ok(Err(source)) => RunError::InclusionLane { source }, + 
Err(source) => RunError::InclusionLaneJoin { source }, + } +} + +fn map_input_reader_exit( + result: Result, tokio::task::JoinError>, +) -> RunError { + match result { + Ok(Ok(())) => RunError::InputReaderStoppedUnexpectedly, + Ok(Err(source)) => RunError::InputReader { source }, + Err(source) => RunError::InputReaderJoin { source }, + } +} + +fn map_batch_submitter_exit( + result: Result, tokio::task::JoinError>, +) -> RunError { + match result { + Ok(Ok(SubmitterExit::Shutdown)) => RunError::BatchSubmitterStoppedUnexpectedly, + Ok(Err(source)) => RunError::BatchSubmitter { source }, + Err(source) => RunError::BatchSubmitterJoin { source }, + } +} + +fn map_danger_detector_exit( + result: Result, tokio::task::JoinError>, +) -> RunError { + match result { + Ok(Ok(DetectorExit::Shutdown)) => { + // Shouldn't happen — detector Shutdown means its own shutdown signal + // fired, which only happens after someone else triggered + // runtime-wide shutdown. Treat this as a real exit only if nothing + // else did first. 
+ RunError::DangerDetectorStoppedUnexpectedly + } + Ok(Ok(DetectorExit::DangerZone { batch_index })) => { + RunError::DangerZoneDetected { batch_index } + } + Ok(Err(source)) => RunError::DangerDetector { source }, + Err(source) => RunError::DangerDetectorJoin { source }, + } +} + +fn log_cleanup_result(component: &str, result: Result<(), RunError>) { + if let Err(err) = result { + warn!(component, error = %err, "component shutdown after primary failure also errored"); + } +} + +fn build_batch_submitter_provider( + l1: &L1Config, +) -> Result { + crate::l1::provider::create_signer_provider(&l1.eth_rpc_url, &l1.batch_submitter_private_key) + .map_err(std::io::Error::other) +} + +#[cfg(test)] +mod tests { + use super::{RunError, batch_submitter_address_from_private_key, map_danger_detector_exit}; + use crate::recovery::{DangerDetectorError, DetectorExit}; + use sequencer_core::MAX_WAIT_BLOCKS; + use sequencer_core::protocol::ProtocolConfig; + + fn protocol_with_margin(preemptive_margin_blocks: u64) -> ProtocolConfig { + ProtocolConfig { + batch_submitter: alloy_primitives::Address::ZERO, + max_wait_blocks: MAX_WAIT_BLOCKS, + preemptive_margin_blocks, + seconds_per_block: 12, + } + } + + // ── §8.4.1 preemptive_margin_blocks validation ──────────────────── + + #[test] + #[should_panic(expected = "preemptive_margin_blocks")] + fn margin_equal_to_max_wait_panics() { + let _ = protocol_with_margin(MAX_WAIT_BLOCKS).danger_threshold(); + } + + #[test] + #[should_panic(expected = "preemptive_margin_blocks")] + fn margin_greater_than_max_wait_panics() { + let _ = protocol_with_margin(MAX_WAIT_BLOCKS + 1).danger_threshold(); + } + + #[test] + fn margin_one_below_max_wait_yields_threshold_one() { + assert_eq!( + protocol_with_margin(MAX_WAIT_BLOCKS - 1).danger_threshold(), + 1 + ); + } + + #[test] + fn zero_margin_yields_full_wait_window() { + assert_eq!(protocol_with_margin(0).danger_threshold(), MAX_WAIT_BLOCKS); + } + + #[test] + fn 
default_margin_matches_production_setting() { + // Default is 75 per `SEQ_PREEMPTIVE_MARGIN_BLOCKS`. + assert_eq!( + protocol_with_margin(75).danger_threshold(), + MAX_WAIT_BLOCKS - 75 + ); + } + + #[test] + fn invalid_private_key_error_does_not_echo_key_material() { + let secret = "0xabc123SECRET"; + let err = batch_submitter_address_from_private_key(secret) + .expect_err("invalid private key should be rejected"); + let message = err.to_string(); + + assert_eq!(message, "invalid private key"); + assert!( + !message.contains(secret), + "private key material must not be reflected in startup errors" + ); + } + + #[test] + fn danger_detector_shutdown_maps_to_detector_specific_unexpected_exit() { + let err = map_danger_detector_exit(Ok(Ok(DetectorExit::Shutdown))); + assert!(matches!(err, RunError::DangerDetectorStoppedUnexpectedly)); + } + + #[test] + fn danger_detector_danger_zone_maps_to_deliberate_runtime_exit() { + let err = map_danger_detector_exit(Ok(Ok(DetectorExit::DangerZone { batch_index: 7 }))); + assert!(matches!( + err, + RunError::DangerZoneDetected { batch_index: 7 } + )); + } + + #[test] + fn danger_detector_errors_preserve_source_category() { + let err = map_danger_detector_exit(Ok(Err(DangerDetectorError::Join("boom".into())))); + assert!(matches!(err, RunError::DangerDetector { .. })); + } +} diff --git a/sequencer/src/shutdown.rs b/sequencer/src/runtime/shutdown.rs similarity index 100% rename from sequencer/src/shutdown.rs rename to sequencer/src/runtime/shutdown.rs diff --git a/sequencer/src/storage/admin.rs b/sequencer/src/storage/admin.rs new file mode 100644 index 0000000..c1ef8d8 --- /dev/null +++ b/sequencer/src/storage/admin.rs @@ -0,0 +1,101 @@ +// (c) Cartesi and individual authors (see AUTHORS) +// SPDX-License-Identifier: Apache-2.0 (see LICENSE) + +//! Operator/admin writes: tune fee policy parameters (`set_log_gas_price`, +//! `set_alpha`). Used today by tests and ad-hoc operator commands; not on the +//! hot path. 
+ +use rusqlite::{Result, params}; + +use super::Storage; + +impl Storage { + pub fn set_log_gas_price(&mut self, log_gas_price: u16) -> Result<()> { + let changed = self.conn.execute( + "UPDATE batch_policy SET log_gas_price = ?1 WHERE singleton_id = 0", + params![i64::from(log_gas_price)], + )?; + if changed != 1 { + return Err(rusqlite::Error::StatementChangedRows(changed)); + } + Ok(()) + } + + /// Set the alpha knob from a `num/denom` rational. Computes both + /// `log_alpha` and `log_one_plus_alpha` (the policy-derived view needs + /// both). Panics if `num + denom` overflows `u64` — a misuse, not a + /// runtime condition. + pub fn set_alpha(&mut self, num: u64, denom: u64) -> Result<()> { + use sequencer_core::fee::log_fee_ratio; + + let log_alpha = log_fee_ratio(num, denom); + let one_plus_alpha_num = num.checked_add(denom).expect( + "set_alpha: num + denom overflows u64; use smaller values for the alpha fraction", + ); + let log_one_plus_alpha = log_fee_ratio(one_plus_alpha_num, denom); + + let changed = self.conn.execute( + "UPDATE batch_policy \ + SET log_alpha = ?1, log_one_plus_alpha = ?2 \ + WHERE singleton_id = 0", + params![i64::from(log_alpha), i64::from(log_one_plus_alpha)], + )?; + if changed != 1 { + return Err(rusqlite::Error::StatementChangedRows(changed)); + } + Ok(()) + } +} + +#[cfg(test)] +mod tests { + use crate::storage::{Storage, test_helpers::temp_db}; + + #[test] + fn high_gas_price_clamps_recommended_fee_to_max_exponent() { + let db = temp_db("clamp-fee"); + let mut storage = Storage::open(db.path.as_str()).expect("open storage"); + + // Set gas price high enough that log_recommended_fee > MAX_EXPONENT (17101). + // Default: log_recommended_fee = gas_price + 20 + 419 + 621. + // With gas_price = 17000: 17000 + 1060 = 18060 > 17101. 
+ storage + .set_log_gas_price(17000) + .expect("set high gas price"); + + let policy = storage.batch_policy().expect("read policy"); + assert_eq!( + policy.recommended_fee, + sequencer_core::fee::MAX_EXPONENT, + "recommended_fee should be clamped to MAX_EXPONENT" + ); + + // fee_to_linear must not panic with the clamped value. + let _ = sequencer_core::fee::fee_to_linear(policy.recommended_fee); + } + + #[test] + #[should_panic(expected = "num + denom overflows u64")] + fn set_alpha_rejects_overflow() { + let db = temp_db("alpha-overflow"); + let mut storage = Storage::open(db.path.as_str()).expect("open storage"); + storage.set_alpha(u64::MAX, 1).unwrap(); + } + + /// CHECK constraint guards against alpha values that would push the batch-size + /// target past `log_max_batch_bytes`. Migrated from the old `sql.rs` test suite. + #[test] + fn batch_policy_check_rejects_unsafe_alpha() { + let db = temp_db("unsafe-alpha"); + let storage = Storage::open(db.path.as_str()).expect("open storage"); + // log_alpha=-350 → log_batch_size_target = 1403-(-350)-419 = 1334 >= log_max_batch_bytes=1333 + let err = storage.conn.execute( + "UPDATE batch_policy SET log_alpha = ?1, log_one_plus_alpha = ?2 WHERE singleton_id = 0", + [-350_i64, 0_i64], + ); + assert!( + err.is_err(), + "CHECK should reject unsafe alpha (log_batch_size_target >= log_max_batch_bytes)" + ); + } +} diff --git a/sequencer/src/storage/convert.rs b/sequencer/src/storage/convert.rs new file mode 100644 index 0000000..be26c28 --- /dev/null +++ b/sequencer/src/storage/convert.rs @@ -0,0 +1,60 @@ +// (c) Cartesi and individual authors (see AUTHORS) +// SPDX-License-Identifier: Apache-2.0 (see LICENSE) + +//! Saturating width conversions between Rust and SQLite integer types, plus +//! `SystemTime` ↔ `i64` Unix-ms conversions. +//! +//! SQLite stores integers as `INTEGER` (signed 64-bit). Rust domain types use +//! narrower unsigned widths (`u16`, `u32`, `u64`). The conversions here are +//! 
load-bearing glue that the rest of the storage module calls pervasively. +//! +//! All conversions saturate rather than panic — the domain values we persist +//! are always non-negative and well within `i64::MAX`, but saturation keeps +//! corrupted or malicious DB rows from crashing the process. + +use std::time::{Duration, SystemTime, UNIX_EPOCH}; + +// ── Time helpers ────────────────────────────────────────────────────────── + +pub(super) fn to_unix_ms(time: SystemTime) -> i64 { + time.duration_since(UNIX_EPOCH) + .unwrap_or_default() + .as_millis() + .try_into() + .unwrap_or(i64::MAX) +} + +pub(super) fn from_unix_ms(ms: i64) -> SystemTime { + let clamped_ms = ms.max(0) as u64; + UNIX_EPOCH + Duration::from_millis(clamped_ms) +} + +/// Current wall-clock time as an `i64` SQLite timestamp. +/// +/// Delegates to [`crate::runtime::clock::unix_now_ms`] so the whole crate goes +/// through one clock entry point. +pub(super) fn now_unix_ms() -> i64 { + i64::try_from(crate::runtime::clock::unix_now_ms()).unwrap_or(i64::MAX) +} + +// ── Width conversions ───────────────────────────────────────────────────── + +pub(super) fn u64_to_i64(value: u64) -> i64 { + i64::try_from(value).unwrap_or(i64::MAX) +} + +pub(super) fn usize_to_i64(value: usize) -> i64 { + i64::try_from(value).unwrap_or(i64::MAX) +} + +pub(super) fn i64_to_u64(value: i64) -> u64 { + value.max(0) as u64 +} + +pub(super) fn i64_to_u16(value: i64) -> u16 { + u16::try_from(value.max(0)).unwrap_or(u16::MAX) +} + +pub(super) fn i64_to_u32(value: i64) -> u32 { + u32::try_from(value.max(0)).unwrap_or(u32::MAX) +} diff --git a/sequencer/src/storage/db.rs b/sequencer/src/storage/db.rs deleted file mode 100644 index 88f8084..0000000 --- a/sequencer/src/storage/db.rs +++ /dev/null @@ -1,1242 +0,0 @@ -// (c) Cartesi and individual authors (see AUTHORS) -// SPDX-License-Identifier: Apache-2.0 (see LICENSE) - -use rusqlite::{Connection, OpenFlags, Result, Transaction, TransactionBehavior}; -use 
rusqlite_migration::{M, Migrations}; -use std::time::{Duration, SystemTime, UNIX_EPOCH}; - -use super::sql::{ - sql_count_user_ops_for_frame, sql_insert_open_batch, sql_insert_open_batch_with_index, - sql_insert_open_frame, sql_insert_safe_inputs_batch, sql_insert_sequenced_direct_inputs, - sql_insert_user_ops_batch, sql_select_batch_policy, sql_select_frames_for_batch, - sql_select_latest_batch_index, sql_select_latest_batch_with_user_op_count, - sql_select_latest_frame_in_batch_for_batch, sql_select_max_safe_input_index, - sql_select_ordered_l2_tx_count, sql_select_ordered_l2_txs_for_batch, - sql_select_ordered_l2_txs_from_offset, sql_select_ordered_l2_txs_page_from_offset, - sql_select_safe_block, sql_select_safe_inputs_range, sql_select_total_drained_direct_inputs, - sql_select_user_ops_for_frame, sql_update_batch_policy_alpha, - sql_update_batch_policy_log_gas_price, sql_update_safe_block, -}; -use super::{ - BatchPolicy, FrameHeader, SafeFrontier, SafeInputRange, StorageOpenError, StoredSafeInput, - WriteHead, -}; -use crate::inclusion_lane::PendingUserOp; -use alloy_primitives::Address; -use sequencer_core::batch::{Batch, BatchForSubmission, Frame as BatchFrame, WireUserOp}; -use sequencer_core::l2_tx::{DirectInput, SequencedL2Tx, ValidUserOp}; - -const MIGRATION_0001_SCHEMA: &str = include_str!("migrations/0001_schema.sql"); - -pub struct Storage { - conn: Connection, -} - -impl Storage { - pub fn open(path: &str, synchronous: &str) -> std::result::Result { - let conn = Self::open_connection_with_migrations(path, synchronous)?; - Ok(Self { conn }) - } - - pub fn open_without_migrations( - path: &str, - synchronous: &str, - ) -> std::result::Result { - let conn = Self::open_connection(path, synchronous)?; - Ok(Self { conn }) - } - - pub fn open_read_only(path: &str) -> std::result::Result { - let conn = Self::open_connection_read_only(path)?; - Ok(Self { conn }) - } - - pub fn open_connection( - path: &str, - synchronous: &str, - ) -> std::result::Result { - 
let conn = Connection::open(path)?; - conn.pragma_update(None, "foreign_keys", "ON")?; - conn.pragma_update(None, "journal_mode", "WAL")?; - conn.pragma_update(None, "synchronous", synchronous)?; - conn.pragma_update(None, "busy_timeout", 5000)?; - Ok(conn) - } - - pub fn open_connection_read_only( - path: &str, - ) -> std::result::Result { - let conn = Connection::open_with_flags(path, OpenFlags::SQLITE_OPEN_READ_ONLY)?; - conn.pragma_update(None, "query_only", "ON")?; - // Readers should fail fast under write pressure to keep tail latency bounded. - conn.pragma_update(None, "busy_timeout", 50)?; - Ok(conn) - } - - pub fn open_connection_with_migrations( - path: &str, - synchronous: &str, - ) -> std::result::Result { - let mut conn = Self::open_connection(path, synchronous)?; - Self::run_migrations(&mut conn)?; - Ok(conn) - } - - pub fn run_migrations(conn: &mut Connection) -> std::result::Result<(), StorageOpenError> { - Migrations::from_slice(&[M::up(MIGRATION_0001_SCHEMA)]).to_latest(conn)?; - Ok(()) - } - - pub fn load_next_undrained_safe_input_index(&mut self) -> Result { - let value = sql_select_total_drained_direct_inputs(&self.conn)?; - Ok(i64_to_u64(value)) - } - - pub fn safe_input_end_exclusive(&mut self) -> Result { - let value = sql_select_max_safe_input_index(&self.conn)?; - Ok(match value { - Some(last_index) => i64_to_u64(last_index).saturating_add(1), - None => 0, - }) - } - - pub fn current_safe_block(&mut self) -> Result { - let value = sql_select_safe_block(&self.conn)?; - Ok(i64_to_u64(value)) - } - - pub fn ensure_minimum_safe_block(&mut self, minimum_safe_block: u64) -> Result<()> { - let tx = self - .conn - .transaction_with_behavior(TransactionBehavior::Immediate)?; - let current_safe_block = query_current_safe_block(&tx)?; - if current_safe_block < minimum_safe_block { - let changed_rows = sql_update_safe_block(&tx, u64_to_i64(minimum_safe_block))?; - if changed_rows != 1 { - return 
Err(rusqlite::Error::StatementChangedRows(changed_rows)); - } - } - tx.commit()?; - Ok(()) - } - - pub fn load_safe_frontier(&mut self) -> Result { - let tx = self - .conn - .transaction_with_behavior(TransactionBehavior::Deferred)?; - let safe_block = query_current_safe_block(&tx)?; - let end_exclusive = query_latest_safe_input_index_exclusive(&tx)?; - tx.commit()?; - Ok(SafeFrontier { - safe_block, - end_exclusive, - }) - } - - /// Scan safe-input payloads for `sender` in pages, SSZ-decode each payload - /// to extract the batch nonce, and compute the longest contiguous nonce - /// prefix starting from 0. Memory is bounded by `page_size` payloads per - /// iteration rather than the full table. - pub fn advance_safe_batch_nonce_for_sender( - &mut self, - sender: Address, - page_size: u64, - ) -> Result<(u64, u64)> { - let tx = self - .conn - .transaction_with_behavior(TransactionBehavior::Deferred)?; - let safe_block = query_current_safe_block(&tx)?; - - const SQL: &str = "SELECT safe_input_index, payload FROM safe_inputs \ - WHERE sender = ?1 AND safe_input_index >= ?2 \ - ORDER BY safe_input_index ASC LIMIT ?3"; - let mut expected: u64 = 0; - let mut offset: i64 = 0; - let limit = i64::try_from(page_size).unwrap_or(i64::MAX); - loop { - let mut stmt = tx.prepare_cached(SQL)?; - let mut rows = stmt.query(rusqlite::params![sender.as_slice(), offset, limit])?; - let mut fetched_rows: i64 = 0; - while let Some(row) = rows.next()? 
{ - fetched_rows += 1; - offset = row.get::<_, i64>(0)?.saturating_add(1); - let payload: Vec = row.get(1)?; - if let Ok(batch) = ::from_ssz_bytes(&payload) - && batch.nonce == expected - { - expected = expected.saturating_add(1); - } - } - if fetched_rows < limit { - break; - } - } - - tx.commit()?; - Ok((safe_block, expected)) - } - - pub fn fill_safe_inputs( - &mut self, - from_inclusive: u64, - to_exclusive: u64, - out: &mut Vec, - ) -> Result<()> { - assert!( - from_inclusive <= to_exclusive, - "invalid safe-input interval [{from_inclusive}, {to_exclusive})" - ); - - if from_inclusive == to_exclusive { - return Ok(()); - } - - let rows = sql_select_safe_inputs_range( - &self.conn, - u64_to_i64(from_inclusive), - u64_to_i64(to_exclusive), - )?; - - let mut fetched_count = 0_u64; - for (offset, row) in rows.into_iter().enumerate() { - let index = i64_to_u64(row.safe_input_index); - let expected = from_inclusive.saturating_add(offset as u64); - - assert_eq!( - index, expected, - "non-contiguous safe-input index: expected {expected}, found {index}" - ); - - out.push(StoredSafeInput { - sender: Address::from_slice(row.sender.as_slice()), - payload: row.payload, - block_number: i64_to_u64(row.block_number), - }); - fetched_count = fetched_count.saturating_add(1); - } - - assert_eq!( - from_inclusive.saturating_add(fetched_count), - to_exclusive, - "safe-input interval [{from_inclusive}, {to_exclusive}) not fully populated" - ); - - Ok(()) - } - - pub fn append_safe_inputs( - &mut self, - safe_block: u64, - inputs: &[StoredSafeInput], - ) -> Result<()> { - let tx = self - .conn - .transaction_with_behavior(TransactionBehavior::Immediate)?; - - let current_safe_block = query_current_safe_block(&tx)?; - assert!( - safe_block >= current_safe_block, - "safe block regressed: current={current_safe_block}, next={safe_block}" - ); - assert!( - safe_block > current_safe_block || inputs.is_empty(), - "safe block must advance when appending new safe inputs" - ); - - let 
next_expected = query_latest_safe_input_index_exclusive(&tx)?; - sql_insert_safe_inputs_batch(&tx, next_expected, inputs)?; - let changed_rows = sql_update_safe_block(&tx, u64_to_i64(safe_block))?; - if changed_rows != 1 { - return Err(rusqlite::Error::StatementChangedRows(changed_rows)); - } - - tx.commit()?; - Ok(()) - } - - pub fn load_open_state(&mut self) -> Result> { - let tx = self - .conn - .transaction_with_behavior(TransactionBehavior::Deferred)?; - let head = load_current_write_head(&tx)?; - tx.commit()?; - Ok(head) - } - - pub fn initialize_open_state( - &mut self, - safe_block: u64, - leading_direct_range: SafeInputRange, - ) -> Result { - let tx = self - .conn - .transaction_with_behavior(TransactionBehavior::Immediate)?; - assert!( - load_current_write_head(&tx)?.is_none(), - "open state already exists" - ); - - let now_ms = now_unix_ms(); - let policy = query_batch_policy(&tx)?; - insert_open_batch_with_index(&tx, 0, now_ms)?; - insert_open_frame(&tx, 0, 0, now_ms, policy.recommended_fee, safe_block)?; - persist_frame_direct_sequence(&tx, 0, 0, leading_direct_range)?; - tx.commit()?; - - Ok(WriteHead { - batch_index: 0, - batch_created_at: from_unix_ms(now_ms), - frame_fee: policy.recommended_fee, - safe_block, - batch_user_op_count: 0, - open_frame_user_op_count: 0, - frame_in_batch: 0, - max_batch_user_op_bytes: super::batch_size_target_bytes(policy), - }) - } - - pub fn batch_policy(&mut self) -> Result { - let (log_recommended_fee, log_batch_size_target) = sql_select_batch_policy(&self.conn)?; - let max_exp = sequencer_core::fee::MAX_EXPONENT; - Ok(BatchPolicy { - // Clamp to MAX_EXPONENT to prevent panics in fee_to_linear. 
- recommended_fee: i64_to_u16(log_recommended_fee).min(max_exp), - batch_size_target: i64_to_u16(log_batch_size_target).min(max_exp), - }) - } - - pub fn set_log_gas_price(&mut self, log_gas_price: u16) -> Result<()> { - let changed_rows = - sql_update_batch_policy_log_gas_price(&self.conn, i64::from(log_gas_price))?; - if changed_rows != 1 { - return Err(rusqlite::Error::StatementChangedRows(changed_rows)); - } - Ok(()) - } - - pub fn set_alpha(&mut self, num: u64, denom: u64) -> Result<()> { - use sequencer_core::fee::log_fee_ratio; - - let log_alpha = log_fee_ratio(num, denom); - let one_plus_alpha_num = num.checked_add(denom).expect( - "set_alpha: num + denom overflows u64; use smaller values for the alpha fraction", - ); - let log_one_plus_alpha = log_fee_ratio(one_plus_alpha_num, denom); - - let changed_rows = sql_update_batch_policy_alpha( - &self.conn, - i64::from(log_alpha), - i64::from(log_one_plus_alpha), - )?; - if changed_rows != 1 { - return Err(rusqlite::Error::StatementChangedRows(changed_rows)); - } - Ok(()) - } - - pub fn append_user_ops_chunk( - &mut self, - head: &mut WriteHead, - user_ops: &[PendingUserOp], - ) -> Result<()> { - if user_ops.is_empty() { - return Ok(()); - } - - let tx = self - .conn - .transaction_with_behavior(TransactionBehavior::Immediate)?; - // Keep the invariant check inside the write transaction so validation and writes - // observe the same database snapshot. 
- assert_write_head_matches_open_state(&tx, head)?; - - sql_insert_user_ops_batch( - &tx, - u64_to_i64(head.batch_index), - i64::from(head.frame_in_batch), - head.open_frame_user_op_count, - user_ops, - )?; - - tx.commit()?; - head.increment_batch_user_op_count(user_ops.len()); - Ok(()) - } - - pub fn close_frame_only( - &mut self, - head: &mut WriteHead, - next_safe_block: u64, - leading_direct_range: SafeInputRange, - ) -> Result<()> { - let tx = self - .conn - .transaction_with_behavior(TransactionBehavior::Immediate)?; - assert_write_head_matches_open_state(&tx, head)?; - let now_ms = now_unix_ms(); - let policy = query_batch_policy(&tx)?; - let next_frame_in_batch = head.frame_in_batch.saturating_add(1); - insert_open_frame( - &tx, - head.batch_index, - next_frame_in_batch, - now_ms, - policy.recommended_fee, - next_safe_block, - )?; - persist_frame_direct_sequence( - &tx, - head.batch_index, - next_frame_in_batch, - leading_direct_range, - )?; - tx.commit()?; - head.advance_frame(policy, next_safe_block); - Ok(()) - } - - pub fn close_frame_and_batch( - &mut self, - head: &mut WriteHead, - next_safe_block: u64, - ) -> Result<()> { - let tx = self - .conn - .transaction_with_behavior(TransactionBehavior::Immediate)?; - assert_write_head_matches_open_state(&tx, head)?; - let now_ms = now_unix_ms(); - // Batch policy is sampled here: the derived fee is committed to the newly - // opened frame, and the batch size target is stored on the write head. - let policy = query_batch_policy(&tx)?; - let next_batch_index = insert_open_batch(&tx, now_ms)?; - insert_open_frame( - &tx, - next_batch_index, - 0, - now_ms, - policy.recommended_fee, - next_safe_block, - )?; - tx.commit()?; - head.move_to_next_batch( - next_batch_index, - from_unix_ms(now_ms), - policy, - next_safe_block, - ); - Ok(()) - } - - pub fn load_ordered_l2_txs_from(&mut self, offset: u64) -> Result> { - // Read the persisted total order used by catch-up and downstream feed readers. 
- let rows = sql_select_ordered_l2_txs_from_offset(&self.conn, u64_to_i64(offset))?; - Ok(decode_ordered_l2_txs(rows)) - } - - pub fn load_ordered_l2_txs_page_from( - &mut self, - offset: u64, - limit: usize, - ) -> Result> { - if limit == 0 { - return Ok(Vec::new()); - } - - let rows = sql_select_ordered_l2_txs_page_from_offset( - &self.conn, - u64_to_i64(offset), - usize_to_i64(limit), - )?; - Ok(decode_ordered_l2_txs(rows)) - } - - pub fn ordered_l2_tx_count(&mut self) -> Result { - let value = sql_select_ordered_l2_tx_count(&self.conn)?; - Ok(i64_to_u64(value)) - } - - pub fn latest_batch_index(&mut self) -> Result> { - let value = sql_select_latest_batch_index(&self.conn)?; - Ok(value.map(i64_to_u64)) - } - - pub fn load_frames_for_batch(&mut self, batch_index: u64) -> Result> { - let rows = sql_select_frames_for_batch(&self.conn, u64_to_i64(batch_index))?; - Ok(rows - .into_iter() - .map(|row| FrameHeader { - frame_in_batch: i64_to_u32(row.frame_in_batch), - fee: i64_to_u16(row.fee), - safe_block: i64_to_u64(row.safe_block), - }) - .collect()) - } - - pub fn load_ordered_l2_txs_for_batch( - &mut self, - batch_index: u64, - ) -> Result> { - let rows = sql_select_ordered_l2_txs_for_batch(&self.conn, u64_to_i64(batch_index))?; - Ok(decode_ordered_l2_txs(rows)) - } - - pub fn load_batch_for_submission(&mut self, batch_index: u64) -> Result { - let created_at_ms: i64 = self.conn.query_row( - "SELECT created_at_ms FROM batches WHERE batch_index = ?1 LIMIT 1", - [u64_to_i64(batch_index)], - |row| row.get(0), - )?; - - let frame_headers = self.load_frames_for_batch(batch_index)?; - let mut frames = Vec::with_capacity(frame_headers.len()); - - for header in frame_headers { - let rows = sql_select_user_ops_for_frame( - &self.conn, - u64_to_i64(batch_index), - i64::from(header.frame_in_batch), - )?; - - let user_ops = rows - .into_iter() - .map(|row| WireUserOp { - nonce: i64_to_u32(row.nonce), - max_fee: i64_to_u16(row.max_fee), - data: row.data, - signature: row.sig, 
- }) - .collect(); - - frames.push(BatchFrame { - user_ops, - safe_block: header.safe_block, - fee_price: header.fee, - }); - } - - let batch = Batch { - nonce: batch_index, - frames, - }; - let created_at_ms_u64 = created_at_ms.max(0) as u64; - - Ok(BatchForSubmission { - batch_index, - created_at_ms: created_at_ms_u64, - batch, - }) - } -} - -fn decode_ordered_l2_txs(rows: Vec) -> Vec { - let mut out = Vec::new(); - - for row in rows { - if row.kind == 0 { - let sender_bytes = row.sender.expect("ordered replay row: missing sender"); - assert_eq!( - sender_bytes.len(), - 20, - "ordered replay row: sender must be 20 bytes" - ); - - let entry = ValidUserOp { - sender: Address::from_slice(sender_bytes.as_slice()), - // Replay uses the persisted frame fee (log-space exponent) to mirror canonical execution. - fee: i64_to_u16(row.fee.expect("ordered replay row: missing fee")), - data: row.data.expect("ordered replay row: missing data"), - }; - out.push(SequencedL2Tx::UserOp(entry)); - } else { - let direct = DirectInput { - sender: Address::from_slice( - row.sender - .expect("ordered replay row: missing sender") - .as_slice(), - ), - block_number: i64_to_u64( - row.block_number - .expect("ordered replay row: missing block_number"), - ), - payload: row.payload.expect("ordered replay row: missing payload"), - }; - out.push(SequencedL2Tx::Direct(direct)); - } - } - - out -} - -fn load_current_write_head(tx: &Transaction<'_>) -> Result> { - let Some((batch_index, batch_created_at, batch_user_op_count)) = query_latest_batch(tx)? 
else { - return Ok(None); - }; - let (frame_in_batch, frame_fee, safe_block) = query_latest_frame_in_batch(tx, batch_index)?; - let open_frame_user_op_count = query_frame_user_op_count(tx, batch_index, frame_in_batch)?; - let policy = query_batch_policy(tx)?; - Ok(Some(WriteHead { - batch_index, - batch_created_at, - frame_fee, - safe_block, - batch_user_op_count, - open_frame_user_op_count, - frame_in_batch, - max_batch_user_op_bytes: super::batch_size_target_bytes(policy), - })) -} - -fn assert_write_head_matches_open_state(tx: &Transaction<'_>, expected: &WriteHead) -> Result<()> { - let actual = load_current_write_head(tx)?.expect("stale WriteHead: storage has no open state"); - assert_eq!( - expected.batch_index, actual.batch_index, - "stale WriteHead: batch_index mismatch" - ); - assert_eq!( - expected.frame_in_batch, actual.frame_in_batch, - "stale WriteHead: frame_in_batch mismatch" - ); - assert_eq!( - expected.batch_user_op_count, actual.batch_user_op_count, - "stale WriteHead: batch_user_op_count mismatch" - ); - assert_eq!( - expected.open_frame_user_op_count, actual.open_frame_user_op_count, - "stale WriteHead: open_frame_user_op_count mismatch" - ); - assert_eq!( - expected.frame_fee, actual.frame_fee, - "stale WriteHead: frame_fee mismatch" - ); - assert_eq!( - expected.safe_block, actual.safe_block, - "stale WriteHead: safe_block mismatch" - ); - assert_eq!( - to_unix_ms(expected.batch_created_at), - to_unix_ms(actual.batch_created_at), - "stale WriteHead: batch_created_at mismatch" - ); - Ok(()) -} - -fn query_latest_batch(tx: &Transaction<'_>) -> Result> { - match sql_select_latest_batch_with_user_op_count(tx) { - Ok((batch_index, batch_created_at_ms, batch_user_op_count)) => Ok(Some(( - i64_to_u64(batch_index), - from_unix_ms(batch_created_at_ms), - i64_to_u64(batch_user_op_count), - ))), - Err(rusqlite::Error::QueryReturnedNoRows) => Ok(None), - Err(source) => Err(source), - } -} - -fn query_latest_frame_in_batch(tx: &Transaction<'_>, 
batch_index: u64) -> Result<(u32, u16, u64)> { - let (frame_in_batch, frame_fee, safe_block) = - sql_select_latest_frame_in_batch_for_batch(tx, u64_to_i64(batch_index))?; - Ok(( - i64_to_u32(frame_in_batch), - i64_to_u16(frame_fee), - i64_to_u64(safe_block), - )) -} - -fn query_frame_user_op_count( - tx: &Transaction<'_>, - batch_index: u64, - frame_in_batch: u32, -) -> Result { - let value = - sql_count_user_ops_for_frame(tx, u64_to_i64(batch_index), i64::from(frame_in_batch))?; - Ok(i64_to_u32(value)) -} - -fn query_latest_safe_input_index_exclusive(tx: &Connection) -> Result { - let value = sql_select_max_safe_input_index(tx)?; - Ok(match value { - Some(last_index) => i64_to_u64(last_index).saturating_add(1), - None => 0, - }) -} - -fn query_current_safe_block(tx: &Connection) -> Result { - let value = sql_select_safe_block(tx)?; - Ok(i64_to_u64(value)) -} - -fn query_batch_policy(tx: &Transaction<'_>) -> Result { - let (log_recommended_fee, log_batch_size_target) = sql_select_batch_policy(tx)?; - let max_exp = sequencer_core::fee::MAX_EXPONENT; - Ok(BatchPolicy { - // Clamp to MAX_EXPONENT to prevent panics in fee_to_linear. 
- recommended_fee: i64_to_u16(log_recommended_fee).min(max_exp), - batch_size_target: i64_to_u16(log_batch_size_target).min(max_exp), - }) -} - -fn persist_frame_direct_sequence( - tx: &Transaction<'_>, - batch_index: u64, - frame_in_batch: u32, - drained_direct_range: SafeInputRange, -) -> Result<()> { - sql_insert_sequenced_direct_inputs( - tx, - u64_to_i64(batch_index), - i64::from(frame_in_batch), - drained_direct_range, - ) -} - -fn insert_open_batch(tx: &Transaction<'_>, created_at_ms: i64) -> Result { - sql_insert_open_batch(tx, created_at_ms)?; - Ok(i64_to_u64(tx.last_insert_rowid())) -} - -fn insert_open_batch_with_index( - tx: &Transaction<'_>, - batch_index: u64, - created_at_ms: i64, -) -> Result<()> { - sql_insert_open_batch_with_index(tx, u64_to_i64(batch_index), created_at_ms)?; - Ok(()) -} - -fn insert_open_frame( - tx: &Transaction<'_>, - batch_index: u64, - frame_in_batch: u32, - created_at_ms: i64, - frame_fee: u16, - safe_block: u64, -) -> Result<()> { - sql_insert_open_frame( - tx, - u64_to_i64(batch_index), - i64::from(frame_in_batch), - created_at_ms, - i64::from(frame_fee), - u64_to_i64(safe_block), - )?; - Ok(()) -} - -fn to_unix_ms(time: SystemTime) -> i64 { - time.duration_since(UNIX_EPOCH) - .unwrap_or_default() - .as_millis() - .try_into() - .unwrap_or(i64::MAX) -} - -fn from_unix_ms(ms: i64) -> SystemTime { - let clamped_ms = ms.max(0) as u64; - UNIX_EPOCH + Duration::from_millis(clamped_ms) -} - -fn now_unix_ms() -> i64 { - to_unix_ms(SystemTime::now()) -} - -fn u64_to_i64(value: u64) -> i64 { - i64::try_from(value).unwrap_or(i64::MAX) -} - -fn usize_to_i64(value: usize) -> i64 { - i64::try_from(value).unwrap_or(i64::MAX) -} - -fn i64_to_u64(value: i64) -> u64 { - value.max(0) as u64 -} - -fn i64_to_u16(value: i64) -> u16 { - u16::try_from(value.max(0)).unwrap_or(u16::MAX) -} - -fn i64_to_u32(value: i64) -> u32 { - u32::try_from(value.max(0)).unwrap_or(u32::MAX) -} - -#[cfg(test)] -mod tests { - use alloy_primitives::Address; - - use 
super::Storage; - use crate::storage::{SafeInputRange, StoredSafeInput}; - use sequencer_core::l2_tx::SequencedL2Tx; - use tempfile::TempDir; - - struct TestDb { - _dir: TempDir, - path: String, - } - - fn temp_db(name: &str) -> TestDb { - let dir = tempfile::Builder::new() - .prefix(format!("sequencer-{name}-").as_str()) - .tempdir() - .expect("create temporary test directory"); - let path = dir.path().join("sequencer.sqlite"); - TestDb { - _dir: dir, - path: path.to_string_lossy().into_owned(), - } - } - - #[test] - fn open_state_is_idempotent_and_rotation_is_atomic() { - let db = temp_db("open-state"); - let mut storage = Storage::open(db.path.as_str(), "NORMAL").expect("open storage"); - - assert!( - storage - .load_open_state() - .expect("load open state") - .is_none(), - "fresh storage should not have an open frame yet" - ); - - let head_a = storage - .initialize_open_state(0, SafeInputRange::empty_at(0)) - .expect("initialize open state"); - let head_b = storage - .load_open_state() - .expect("load existing open state") - .expect("open state should now exist"); - - assert_eq!(head_a.batch_index, head_b.batch_index); - assert_eq!(head_a.frame_in_batch, head_b.frame_in_batch); - assert_eq!(head_a.frame_fee, head_b.frame_fee); - // Default log_recommended_fee = 0+20+419+621 = 1060 - assert_eq!(head_a.frame_fee, 1060); - - let mut head_c = head_b; - let next_safe_block = head_c.safe_block; - storage - .close_frame_only(&mut head_c, next_safe_block, SafeInputRange::empty_at(0)) - .expect("rotate within same batch"); - assert_eq!(head_c.batch_index, head_b.batch_index); - assert_eq!(head_c.frame_in_batch, 1); - - let mut head_d = head_c; - let next_safe_block = head_d.safe_block; - storage - .close_frame_and_batch(&mut head_d, next_safe_block) - .expect("close batch and rotate"); - assert!(head_d.batch_index > head_c.batch_index); - assert_eq!(head_d.frame_in_batch, 0); - } - - #[test] - fn next_frame_fee_comes_from_batch_policy() { - let db = 
temp_db("batch-policy-fee"); - let mut storage = Storage::open(db.path.as_str(), "NORMAL").expect("open storage"); - let policy = storage.batch_policy().expect("default policy"); - // Default: log_gas_price=0, log_recommended_fee = 0+20+419+621 = 1060 - assert_eq!(policy.recommended_fee, 1060); - - storage.set_log_gas_price(100).expect("set log gas price"); - - let mut head = storage - .initialize_open_state(0, SafeInputRange::empty_at(0)) - .expect("initialize open state"); - let next_safe_block = head.safe_block; - storage - .close_frame_and_batch(&mut head, next_safe_block) - .expect("rotate batch"); - - let policy = storage.batch_policy().expect("read policy"); - // log_recommended_fee = 100+20+419+621 = 1160 - assert_eq!(head.frame_fee, 1160); - assert_eq!(head.frame_fee, policy.recommended_fee); - assert!( - head.max_batch_user_op_bytes > 0, - "batch size target should be set" - ); - } - - #[test] - fn high_gas_price_clamps_recommended_fee_to_max_exponent() { - let db = temp_db("clamp-fee"); - let mut storage = Storage::open(db.path.as_str(), "NORMAL").expect("open storage"); - - // Set gas price high enough that log_recommended_fee > MAX_EXPONENT (17101). - // Default: log_recommended_fee = gas_price + 20 + 419 + 621. - // With gas_price = 17000: 17000 + 1060 = 18060 > 17101. - storage - .set_log_gas_price(17000) - .expect("set high gas price"); - - let policy = storage.batch_policy().expect("read policy"); - assert_eq!( - policy.recommended_fee, - sequencer_core::fee::MAX_EXPONENT, - "recommended_fee should be clamped to MAX_EXPONENT" - ); - - // fee_to_linear must not panic with the clamped value. 
- let _ = sequencer_core::fee::fee_to_linear(policy.recommended_fee); - } - - #[test] - #[should_panic(expected = "num + denom overflows u64")] - fn set_alpha_rejects_overflow() { - let db = temp_db("alpha-overflow"); - let mut storage = Storage::open(db.path.as_str(), "NORMAL").expect("open storage"); - storage.set_alpha(u64::MAX, 1).unwrap(); - } - - #[test] - fn replay_returns_direct_inputs_in_drain_order() { - let db = temp_db("replay-order"); - let mut storage = Storage::open(db.path.as_str(), "NORMAL").expect("open storage"); - let head = storage - .initialize_open_state(0, SafeInputRange::empty_at(0)) - .expect("initialize open state"); - - let drained = vec![ - StoredSafeInput { - sender: Address::ZERO, - payload: vec![0xaa], - block_number: 10, - }, - StoredSafeInput { - sender: Address::ZERO, - payload: vec![0xbb], - block_number: 10, - }, - ]; - storage - .append_safe_inputs(10, drained.as_slice()) - .expect("insert direct inputs"); - let mut head = head; - storage - .close_frame_only(&mut head, 10, SafeInputRange::new(0, drained.len() as u64)) - .expect("close frame with directs"); - - let replay = storage.load_ordered_l2_txs_from(0).expect("load replay"); - assert_eq!(replay.len(), 2); - match &replay[0] { - SequencedL2Tx::Direct(value) => assert_eq!(value.payload.as_slice(), &[0xaa]), - _ => panic!("expected direct input at position 0"), - } - match &replay[1] { - SequencedL2Tx::Direct(value) => assert_eq!(value.payload.as_slice(), &[0xbb]), - _ => panic!("expected direct input at position 1"), - } - } - - #[test] - fn next_undrained_safe_input_index_is_derived_from_sequenced_directs() { - let db = temp_db("safe-cursor"); - let mut storage = Storage::open(db.path.as_str(), "NORMAL").expect("open storage"); - assert_eq!( - storage - .load_next_undrained_safe_input_index() - .expect("empty cursor"), - 0 - ); - - let head = storage - .initialize_open_state(0, SafeInputRange::empty_at(0)) - .expect("initialize open state"); - let drained = vec![ - 
StoredSafeInput { - sender: Address::ZERO, - payload: vec![0x00], - block_number: 10, - }, - StoredSafeInput { - sender: Address::ZERO, - payload: vec![0x02], - block_number: 10, - }, - ]; - storage - .append_safe_inputs(10, drained.as_slice()) - .expect("insert direct inputs"); - let mut head = head; - storage - .close_frame_only(&mut head, 10, SafeInputRange::new(0, drained.len() as u64)) - .expect("close frame with directs"); - - assert_eq!( - storage - .load_next_undrained_safe_input_index() - .expect("derived cursor"), - 2 - ); - } - - #[test] - fn safe_input_api_uses_half_open_intervals() { - let db = temp_db("safe-input-api"); - let mut storage = Storage::open(db.path.as_str(), "NORMAL").expect("open storage"); - - assert_eq!(storage.safe_input_end_exclusive().expect("safe head"), 0); - let mut out = Vec::new(); - storage - .fill_safe_inputs(0, 0, &mut out) - .expect("query empty interval"); - assert!(out.is_empty()); - - let inserted = vec![ - StoredSafeInput { - sender: Address::ZERO, - payload: vec![0xa0], - block_number: 10, - }, - StoredSafeInput { - sender: Address::ZERO, - payload: vec![0xb1], - block_number: 10, - }, - ]; - storage - .append_safe_inputs(10, inserted.as_slice()) - .expect("insert safe directs"); - - assert_eq!(storage.safe_input_end_exclusive().expect("safe head"), 2); - - storage - .fill_safe_inputs(0, 2, &mut out) - .expect("query full interval"); - assert_eq!(out, inserted); - - out.clear(); - storage - .fill_safe_inputs(1, 1, &mut out) - .expect("query empty half-open interval"); - assert!(out.is_empty()); - } - - #[test] - fn ensure_minimum_safe_block_only_moves_forward() { - let db = temp_db("ensure-min-safe-block"); - let mut storage = Storage::open(db.path.as_str(), "NORMAL").expect("open storage"); - - storage - .ensure_minimum_safe_block(7) - .expect("advance bootstrap safe head"); - assert_eq!(storage.current_safe_block().expect("read advanced"), 7); - - storage - .ensure_minimum_safe_block(3) - .expect("do not regress 
bootstrap safe head"); - assert_eq!(storage.current_safe_block().expect("read unchanged"), 7); - } - - #[test] - fn initialize_open_state_creates_first_real_batch_and_frame() { - let db = temp_db("initialize-open-state"); - let mut storage = Storage::open(db.path.as_str(), "NORMAL").expect("open storage"); - - let head = storage - .initialize_open_state(12, SafeInputRange::empty_at(0)) - .expect("initialize open state"); - - assert_eq!(head.batch_index, 0); - assert_eq!(head.frame_in_batch, 0); - assert_eq!(head.safe_block, 12); - - let loaded = storage - .load_open_state() - .expect("load open state") - .expect("open state should exist"); - assert_eq!(loaded.batch_index, 0); - assert_eq!(loaded.frame_in_batch, 0); - assert_eq!(loaded.safe_block, 12); - } - - #[test] - fn batch_for_submission_builds_from_storage() { - let db = temp_db("batch-for-submission"); - let mut storage = Storage::open(db.path.as_str(), "NORMAL").expect("open storage"); - - let head = storage - .initialize_open_state(12, SafeInputRange::empty_at(0)) - .expect("initialize open state"); - assert_eq!(head.batch_index, 0); - - let batch = storage - .load_batch_for_submission(0) - .expect("load batch for submission"); - - assert_eq!(batch.batch_index, 0); - assert_eq!(batch.batch.frames.len(), 1); - let frame = &batch.batch.frames[0]; - assert!(frame.user_ops.is_empty()); - assert_eq!(frame.safe_block, 12); - // Default log_recommended_fee = 0+20+419+621 = 1060 - assert_eq!(frame.fee_price, 1060); - assert!(batch.created_at_ms > 0); - } - - #[test] - fn batch_level_helpers_expose_latest_index_frames_and_txs() { - let db = temp_db("batch-level-helpers"); - let mut storage = Storage::open(db.path.as_str(), "NORMAL").expect("open storage"); - - // Before initialization there should be no batches. - assert!( - storage - .latest_batch_index() - .expect("query latest batch nonce on empty db") - .is_none() - ); - - // Initialize first batch/frame and append some data. 
- let mut head = storage - .initialize_open_state(0, SafeInputRange::empty_at(0)) - .expect("initialize open state"); - - // Close current batch and move to next so batch 0 becomes closed. - let next_safe_block = head.safe_block; - storage - .close_frame_and_batch(&mut head, next_safe_block) - .expect("close batch and rotate"); - - // Latest batch nonce should now be 1 (open), with batch 0 closed. - let latest = storage - .latest_batch_index() - .expect("query latest batch nonce") - .expect("latest batch should exist"); - assert_eq!(latest, 1); - - // Batch 0 should still have at least one frame header. - let frames = storage - .load_frames_for_batch(0) - .expect("load frames for batch 0"); - assert!(!frames.is_empty()); - - // Ordered L2 txs for batch 0 should be queryable (even if empty). - let txs = storage - .load_ordered_l2_txs_for_batch(0) - .expect("load l2 txs for batch 0"); - assert!( - txs.is_empty(), - "fresh batch should not have sequenced txs yet" - ); - } - - /// Helper: insert safe inputs whose payloads are SSZ-encoded batches with - /// the given nonces, all attributed to `sender`. 
- fn seed_safe_inputs_with_batch_nonces( - storage: &mut Storage, - sender: Address, - safe_block: u64, - nonces: &[u64], - ) { - let inputs: Vec = nonces - .iter() - .map(|nonce| StoredSafeInput { - sender, - payload: ssz::Encode::as_ssz_bytes(&sequencer_core::batch::Batch { - nonce: *nonce, - frames: Vec::new(), - }), - block_number: safe_block, - }) - .collect(); - storage - .append_safe_inputs(safe_block, inputs.as_slice()) - .expect("append safe inputs"); - } - - const SENDER_A: Address = Address::repeat_byte(0xAA); - const SENDER_B: Address = Address::repeat_byte(0xBB); - - #[test] - fn advance_safe_batch_nonce_returns_zero_when_no_inputs_exist() { - let db = temp_db("advance-nonce-empty"); - let mut storage = Storage::open(db.path.as_str(), "NORMAL").expect("open storage"); - let (_, next) = storage - .advance_safe_batch_nonce_for_sender(SENDER_A, 256) - .expect("advance nonce"); - assert_eq!(next, 0); - } - - #[test] - fn advance_safe_batch_nonce_contiguous_prefix() { - let db = temp_db("advance-nonce-contiguous"); - let mut storage = Storage::open(db.path.as_str(), "NORMAL").expect("open storage"); - seed_safe_inputs_with_batch_nonces(&mut storage, SENDER_A, 10, &[0, 1, 2]); - - let (safe_block, next) = storage - .advance_safe_batch_nonce_for_sender(SENDER_A, 256) - .expect("advance nonce"); - assert_eq!(safe_block, 10); - assert_eq!(next, 3); - } - - #[test] - fn advance_safe_batch_nonce_stops_at_gap() { - let db = temp_db("advance-nonce-gap"); - let mut storage = Storage::open(db.path.as_str(), "NORMAL").expect("open storage"); - // nonces: 0, 1, 3, 4, 5 — gap at 2 - seed_safe_inputs_with_batch_nonces(&mut storage, SENDER_A, 10, &[0, 1, 3, 4, 5]); - - let (_, next) = storage - .advance_safe_batch_nonce_for_sender(SENDER_A, 256) - .expect("advance nonce"); - assert_eq!(next, 2); - } - - #[test] - fn advance_safe_batch_nonce_works_across_page_boundaries() { - let db = temp_db("advance-nonce-paged"); - let mut storage = Storage::open(db.path.as_str(), 
"NORMAL").expect("open storage"); - // 5 contiguous nonces with page_size=2 → 3 pages - seed_safe_inputs_with_batch_nonces(&mut storage, SENDER_A, 10, &[0, 1, 2, 3, 4]); - - let (_, next) = storage - .advance_safe_batch_nonce_for_sender(SENDER_A, 2) - .expect("advance nonce"); - assert_eq!(next, 5); - } - - #[test] - fn advance_safe_batch_nonce_gap_spans_page_boundary() { - let db = temp_db("advance-nonce-gap-across-page"); - let mut storage = Storage::open(db.path.as_str(), "NORMAL").expect("open storage"); - // page_size=2: page0=[0,1], page1=[3,4], page2=[5] - // gap at nonce 2 — should still detect it - seed_safe_inputs_with_batch_nonces(&mut storage, SENDER_A, 10, &[0, 1, 3, 4, 5]); - - let (_, next) = storage - .advance_safe_batch_nonce_for_sender(SENDER_A, 2) - .expect("advance nonce"); - assert_eq!(next, 2); - } - - #[test] - fn advance_safe_batch_nonce_filters_by_sender() { - let db = temp_db("advance-nonce-sender-filter"); - let mut storage = Storage::open(db.path.as_str(), "NORMAL").expect("open storage"); - seed_safe_inputs_with_batch_nonces(&mut storage, SENDER_A, 10, &[0, 1, 2]); - seed_safe_inputs_with_batch_nonces(&mut storage, SENDER_B, 11, &[0]); - - let (_, next_a) = storage - .advance_safe_batch_nonce_for_sender(SENDER_A, 2) - .expect("advance nonce A"); - let (_, next_b) = storage - .advance_safe_batch_nonce_for_sender(SENDER_B, 2) - .expect("advance nonce B"); - assert_eq!(next_a, 3); - assert_eq!(next_b, 1); - } - - #[test] - fn advance_safe_batch_nonce_page_size_one() { - let db = temp_db("advance-nonce-page-1"); - let mut storage = Storage::open(db.path.as_str(), "NORMAL").expect("open storage"); - seed_safe_inputs_with_batch_nonces(&mut storage, SENDER_A, 10, &[0, 1, 2]); - - let (_, next) = storage - .advance_safe_batch_nonce_for_sender(SENDER_A, 1) - .expect("advance nonce"); - assert_eq!(next, 3); - } -} diff --git a/sequencer/src/storage/egress.rs b/sequencer/src/storage/egress.rs new file mode 100644 index 0000000..dbf98e7 --- 
/dev/null +++ b/sequencer/src/storage/egress.rs @@ -0,0 +1,134 @@ +// (c) Cartesi and individual authors (see AUTHORS) +// SPDX-License-Identifier: Apache-2.0 (see LICENSE) + +//! Egress reader: ordered-L2-tx queries used by the WS feed and catch-up replay. +//! +//! Read-only — every method here either pages the `valid_sequenced_l2_txs` view +//! or counts over it. The view encapsulates the exclusion of invalidated batches +//! so callers don't repeat the filter. + +use alloy_primitives::Address; +use rusqlite::{Result, params}; + +use super::Storage; +use super::convert::{i64_to_u64, u64_to_i64, usize_to_i64}; +use super::queries::decode_l2_tx_row; +use sequencer_core::l2_tx::SequencedL2Tx; + +impl Storage { + /// Load a page of ordered L2 transactions starting after the given offset. + /// Returns `(db_offset, tx)` pairs. Callers should track `db_offset` of the + /// last item as their cursor, not increment a counter. + pub fn ordered_l2_txs_page_from( + &mut self, + offset: u64, + limit: usize, + ) -> Result> { + if limit == 0 { + return Ok(Vec::new()); + } + + const SQL: &str = " + SELECT + s.offset, + CASE WHEN s.user_op_pos_in_frame IS NOT NULL THEN 0 ELSE 1 END AS kind, + CASE + WHEN s.user_op_pos_in_frame IS NOT NULL THEN u.sender + WHEN s.safe_input_index IS NOT NULL THEN d.sender + ELSE NULL + END AS sender, + CASE WHEN s.user_op_pos_in_frame IS NOT NULL THEN u.data ELSE NULL END AS data, + CASE WHEN s.user_op_pos_in_frame IS NOT NULL THEN f.fee ELSE NULL END AS fee, + CASE WHEN s.safe_input_index IS NOT NULL THEN d.payload ELSE NULL END AS payload, + CASE WHEN s.safe_input_index IS NOT NULL THEN d.block_number ELSE NULL END AS block_number + FROM valid_sequenced_l2_txs s + LEFT JOIN user_ops u + ON u.batch_index = s.batch_index + AND u.frame_in_batch = s.frame_in_batch + AND u.pos_in_frame = s.user_op_pos_in_frame + LEFT JOIN frames f + ON f.batch_index = s.batch_index + AND f.frame_in_batch = s.frame_in_batch + LEFT JOIN safe_inputs d + ON 
d.safe_input_index = s.safe_input_index + WHERE s.offset > ?1 + ORDER BY s.offset ASC + LIMIT ?2 + "; + let mut stmt = self.conn.prepare_cached(SQL)?; + let rows = stmt.query_map(params![u64_to_i64(offset), usize_to_i64(limit)], |row| { + let db_offset: i64 = row.get(0)?; + let tx = decode_l2_tx_row( + row.get(1)?, + row.get(2)?, + row.get(3)?, + row.get(4)?, + row.get(5)?, + row.get(6)?, + ); + Ok((i64_to_u64(db_offset), tx)) + })?; + rows.collect::>>() + } + + /// Returns the maximum offset in `valid_sequenced_l2_txs`, or 0 if empty. + /// Used as the head cursor for feed subscribers. + pub fn ordered_l2_tx_head_offset(&mut self) -> Result { + let value: Option = self.conn.query_row( + "SELECT MAX(offset) FROM valid_sequenced_l2_txs", + [], + |row| row.get(0), + )?; + Ok(value.map(i64_to_u64).unwrap_or(0)) + } + + /// Count broadcastable events with offset > `from_offset`, capped at `limit`. + /// + /// Used for catch-up window checks. Excludes batch-submitter direct inputs + /// (which are filtered before WS delivery) so the count reflects what the + /// client actually receives. + pub fn count_broadcastable_events_after( + &mut self, + from_offset: u64, + limit: u64, + batch_submitter_address: Option

, + ) -> Result { + if limit == 0 { + return Ok(0); + } + + let value: i64 = match batch_submitter_address { + Some(addr) => { + const SQL: &str = " + SELECT COUNT(*) FROM ( + SELECT 1 FROM valid_sequenced_l2_txs s + WHERE s.offset > ?1 + AND NOT (s.safe_input_index IS NOT NULL + AND EXISTS (SELECT 1 FROM safe_inputs si + WHERE si.safe_input_index = s.safe_input_index + AND si.sender = ?2)) + LIMIT ?3 + )"; + self.conn.query_row( + SQL, + params![u64_to_i64(from_offset), addr.as_slice(), u64_to_i64(limit)], + |row| row.get(0), + )? + } + None => { + const SQL: &str = " + SELECT COUNT(*) FROM ( + SELECT 1 FROM valid_sequenced_l2_txs + WHERE offset > ?1 + LIMIT ?2 + )"; + self.conn.query_row( + SQL, + params![u64_to_i64(from_offset), u64_to_i64(limit)], + |row| row.get(0), + )? + } + }; + Ok(i64_to_u64(value)) + } +} diff --git a/sequencer/src/storage/ingress.rs b/sequencer/src/storage/ingress.rs new file mode 100644 index 0000000..a67c65c --- /dev/null +++ b/sequencer/src/storage/ingress.rs @@ -0,0 +1,543 @@ +// (c) Cartesi and individual authors (see AUTHORS) +// SPDX-License-Identifier: Apache-2.0 (see LICENSE) + +//! Inclusion-lane writer: opens the initial batch/frame, appends user-op chunks, +//! and rotates frame/batch boundaries on the hot path. +//! +//! The lane also reads `safe_inputs` (executed by the application) and the open +//! state (resumed on startup) — those reads live here too because they're driven +//! by the lane's flow, not by an L1 ingress event. 
+ +use alloy_primitives::Address; +use rusqlite::{Result, Transaction, params}; + +use super::convert::{from_unix_ms, i64_to_u64, now_unix_ms, to_unix_ms, u64_to_i64}; +use super::mutations::{ + insert_new_batch, insert_open_frame, persist_frame_direct_sequence, seal_batch, +}; +use super::queries::{ + load_current_write_head, query_batch_policy, query_current_safe_block, + query_latest_safe_input_index_exclusive, +}; +use super::{ + BatchPolicy, SafeInputFrontier, SafeInputRange, Storage, StoredSafeInput, WriteHead, + batch_size_target_bytes, +}; +use crate::ingress::inclusion_lane::PendingUserOp; + +impl Storage { + /// Cursor for the next safe input to drain into a frame. Reads the highest + /// already-drained `safe_input_index` from the valid (non-invalidated) + /// `sequenced_l2_txs` rows and returns `MAX + 1` (or 0 if none). + /// + /// Using `MAX + 1` instead of `COUNT(*)` makes this robust against gaps: + /// when a batch is invalidated, those rows drop out of the view and the + /// cursor naturally rewinds, allowing the recovery batch to re-drain. + pub fn next_undrained_safe_input_index(&mut self) -> Result { + const SQL: &str = " + SELECT COALESCE(MAX(safe_input_index) + 1, 0) + FROM valid_sequenced_l2_txs + WHERE safe_input_index IS NOT NULL + "; + let value: i64 = self.conn.query_row(SQL, [], |row| row.get(0))?; + Ok(i64_to_u64(value)) + } + + /// Resume the lane on startup. Returns `None` if storage is empty (caller + /// should follow up with [`Storage::initialize_open_state`]). + pub fn open_state(&mut self) -> Result> { + self.read(load_current_write_head) + } + + /// Bootstrap the very first batch + frame. Asserts that no open state + /// exists; call only when [`Storage::open_state`] returns `None`. 
+ pub fn initialize_open_state( + &mut self, + safe_block: u64, + leading_direct_range: SafeInputRange, + ) -> Result { + self.write(|tx| { + assert!( + load_current_write_head(tx)?.is_none(), + "open state already exists" + ); + + let now_ms = now_unix_ms(); + let policy = query_batch_policy(tx)?; + // Genesis: explicit batch_index = 0, parent = None, nonce = 0. + insert_new_batch(tx, Some(0), None, now_ms)?; + insert_open_frame(tx, 0, 0, now_ms, policy.recommended_fee, safe_block)?; + persist_frame_direct_sequence(tx, 0, 0, leading_direct_range)?; + + Ok(WriteHead { + batch_index: 0, + batch_created_at: from_unix_ms(now_ms), + frame_fee: policy.recommended_fee, + safe_block, + batch_user_op_count: 0, + open_frame_user_op_count: 0, + frame_in_batch: 0, + max_batch_user_op_bytes: batch_size_target_bytes(policy), + }) + }) + } + + /// Snapshot the current L1 view: safe block + exclusive safe-input cursor. + /// The lane uses this to decide whether to advance. + pub fn safe_input_frontier(&mut self) -> Result { + self.read(|tx| { + Ok(SafeInputFrontier { + safe_block: query_current_safe_block(tx)?, + end_exclusive: query_latest_safe_input_index_exclusive(tx)?, + }) + }) + } + + /// Replace `out`'s contents with the safe-input rows in `range`. Asserts + /// contiguity — gaps in `safe_input_index` are a bug, not a runtime + /// condition. 
+ pub fn fill_safe_inputs( + &mut self, + range: SafeInputRange, + out: &mut Vec, + ) -> Result<()> { + out.clear(); + if range.is_empty() { + return Ok(()); + } + + const SQL: &str = " + SELECT safe_input_index, sender, payload, block_number + FROM safe_inputs + WHERE safe_input_index >= ?1 AND safe_input_index < ?2 + ORDER BY safe_input_index ASC + "; + let mut stmt = self.conn.prepare_cached(SQL)?; + let rows = stmt.query_map( + params![u64_to_i64(range.start()), u64_to_i64(range.end())], + |row| { + Ok(( + row.get::<_, i64>(0)?, + row.get::<_, Vec>(1)?, + row.get::<_, Vec>(2)?, + row.get::<_, i64>(3)?, + )) + }, + )?; + + let mut fetched_count = 0_u64; + for (offset, row) in rows.enumerate() { + let (index_i64, sender, payload, block_number_i64) = row?; + let index = i64_to_u64(index_i64); + let expected = range.start().saturating_add(offset as u64); + + assert_eq!( + index, expected, + "non-contiguous safe-input index: expected {expected}, found {index}" + ); + + out.push(StoredSafeInput { + sender: Address::from_slice(sender.as_slice()), + payload, + block_number: i64_to_u64(block_number_i64), + }); + fetched_count = fetched_count.saturating_add(1); + } + + assert_eq!( + range.start().saturating_add(fetched_count), + range.end(), + "safe-input range {range:?} not fully populated" + ); + + Ok(()) + } + + /// Persist a chunk of user ops into the open frame and bump `head`'s + /// counters. + /// + /// `head` is treated as authoritative: the lane is the only writer of + /// open-frame state, so a stale `WriteHead` indicates a bug in the lane, + /// not a runtime condition. The schema's FK + PK constraints catch the + /// dangerous failure modes (write to a non-existent frame, duplicate + /// `pos_in_frame`) by failing the INSERT. 
+ pub fn append_user_ops_chunk( + &mut self, + head: &mut WriteHead, + user_ops: &[PendingUserOp], + ) -> Result<()> { + if user_ops.is_empty() { + return Ok(()); + } + self.write(|tx| { + insert_user_ops_batch( + tx, + head.batch_index, + head.frame_in_batch, + head.open_frame_user_op_count, + user_ops, + ) + })?; + head.increment_batch_user_op_count(user_ops.len()); + Ok(()) + } + + /// Rotate to the next frame inside the same batch. Used when the safe + /// block advances but batch policy hasn't triggered a batch close — the + /// new frame inherits the batch and gets a fresh fee/safe-block. + pub fn close_frame_only( + &mut self, + head: &mut WriteHead, + next_safe_block: u64, + leading_direct_range: SafeInputRange, + ) -> Result<()> { + let policy = self.write(|tx| { + let now_ms = now_unix_ms(); + let policy = query_batch_policy(tx)?; + let next_frame_in_batch = head.frame_in_batch.saturating_add(1); + insert_open_frame( + tx, + head.batch_index, + next_frame_in_batch, + now_ms, + policy.recommended_fee, + next_safe_block, + )?; + persist_frame_direct_sequence( + tx, + head.batch_index, + next_frame_in_batch, + leading_direct_range, + )?; + Ok(policy) + })?; + head.advance_frame(policy, next_safe_block); + Ok(()) + } + + /// Close the current batch and open a fresh one with its first frame. + /// Used when batch policy (size/deadline) triggers a batch close. + /// + /// Atomically: seal the current Tip (sets `sealed_at_ms`), insert the new + /// Tip with `parent_batch_index = head.batch_index`, open its first frame. + /// Order matters: sealing first removes the old row from the + /// `ux_single_valid_tip` partial index, making room for the new Tip. 
+ pub fn close_frame_and_batch( + &mut self, + head: &mut WriteHead, + next_safe_block: u64, + ) -> Result<()> { + let (next_batch_index, now_ms, policy) = self.write(|tx| { + let now_ms = now_unix_ms(); + // Batch policy is sampled here: the derived fee is committed to the newly + // opened frame, and the batch size target is stored on the write head. + let policy = query_batch_policy(tx)?; + seal_batch(tx, head.batch_index, now_ms)?; + let next_batch_index = insert_new_batch(tx, None, Some(head.batch_index), now_ms)?; + insert_open_frame( + tx, + next_batch_index, + 0, + now_ms, + policy.recommended_fee, + next_safe_block, + )?; + Ok((next_batch_index, now_ms, policy)) + })?; + head.move_to_next_batch( + next_batch_index, + from_unix_ms(now_ms), + policy, + next_safe_block, + ); + Ok(()) + } + + pub fn batch_policy(&mut self) -> Result { + query_batch_policy(&self.conn) + } +} + +/// Insert user ops into `user_ops`. The `trg_sequence_user_op` trigger then +/// appends the matching `sequenced_l2_txs` row for each insert. 
+fn insert_user_ops_batch( + tx: &Transaction<'_>, + batch_index: u64, + frame_in_batch: u32, + frame_pos_start: u32, + user_ops: &[PendingUserOp], +) -> Result<()> { + if user_ops.is_empty() { + return Ok(()); + } + let mut stmt = tx.prepare_cached( + "INSERT INTO user_ops ( + batch_index, frame_in_batch, pos_in_frame, + sender, nonce, max_fee, data, sig, received_at_ms + ) VALUES (?1, ?2, ?3, ?4, ?5, ?6, ?7, ?8, ?9)", + )?; + for (offset, item) in user_ops.iter().enumerate() { + let pos_in_frame = frame_pos_start.saturating_add(offset as u32); + let sig = item.signed.signature.as_bytes(); + stmt.execute(params![ + u64_to_i64(batch_index), + i64::from(frame_in_batch), + i64::from(pos_in_frame), + item.signed.sender.as_slice(), + i64::from(item.signed.user_op.nonce), + i64::from(item.signed.user_op.max_fee), + item.signed.user_op.data.as_ref(), + &sig[..], + to_unix_ms(item.received_at), + ])?; + } + Ok(()) +} + +#[cfg(test)] +mod tests { + use crate::storage::{ + SafeInputRange, Storage, StoredSafeInput, + test_helpers::{default_protocol_config, temp_db}, + }; + use alloy_primitives::Address; + use sequencer_core::l2_tx::SequencedL2Tx; + + #[test] + fn open_state_is_idempotent_and_rotation_is_atomic() { + let db = temp_db("open-state"); + let mut storage = Storage::open(db.path.as_str()).expect("open storage"); + + assert!( + storage.open_state().expect("load open state").is_none(), + "fresh storage should not have an open frame yet" + ); + + let head_a = storage + .initialize_open_state(0, SafeInputRange::empty_at(0)) + .expect("initialize open state"); + let head_b = storage + .open_state() + .expect("load existing open state") + .expect("open state should now exist"); + + assert_eq!(head_a.batch_index, head_b.batch_index); + assert_eq!(head_a.frame_in_batch, head_b.frame_in_batch); + assert_eq!(head_a.frame_fee, head_b.frame_fee); + // Default log_recommended_fee = 0+20+419+621 = 1060 + assert_eq!(head_a.frame_fee, 1060); + + let mut head_c = head_b; + let 
next_safe_block = head_c.safe_block; + storage + .close_frame_only(&mut head_c, next_safe_block, SafeInputRange::empty_at(0)) + .expect("rotate within same batch"); + assert_eq!(head_c.batch_index, head_b.batch_index); + assert_eq!(head_c.frame_in_batch, 1); + + let mut head_d = head_c; + let next_safe_block = head_d.safe_block; + storage + .close_frame_and_batch(&mut head_d, next_safe_block) + .expect("close batch and rotate"); + assert!(head_d.batch_index > head_c.batch_index); + assert_eq!(head_d.frame_in_batch, 0); + } + + #[test] + fn next_frame_fee_comes_from_batch_policy() { + let db = temp_db("batch-policy-fee"); + let mut storage = Storage::open(db.path.as_str()).expect("open storage"); + let policy = storage.batch_policy().expect("default policy"); + // Default: log_gas_price=0, log_recommended_fee = 0+20+419+621 = 1060 + assert_eq!(policy.recommended_fee, 1060); + + storage.set_log_gas_price(100).expect("set log gas price"); + + let mut head = storage + .initialize_open_state(0, SafeInputRange::empty_at(0)) + .expect("initialize open state"); + let next_safe_block = head.safe_block; + storage + .close_frame_and_batch(&mut head, next_safe_block) + .expect("rotate batch"); + + let policy = storage.batch_policy().expect("read policy"); + // log_recommended_fee = 100+20+419+621 = 1160 + assert_eq!(head.frame_fee, 1160); + assert_eq!(head.frame_fee, policy.recommended_fee); + assert!( + head.max_batch_user_op_bytes > 0, + "batch size target should be set" + ); + } + + #[test] + fn frame_fee_is_immutable_for_the_lifetime_of_the_frame() { + // §3.2.3: once a frame is opened at fee F, a policy update mid-frame + // must NOT change the open frame's committed fee. Only the *next* + // frame (after close) sees the new policy. This pins the write-once + // contract `frames.fee` relies on — users submitting against the open + // frame know the fee they're paying, regardless of upstream policy + // drift during their round-trip. 
+ let db = temp_db("frame-fee-immutable"); + let mut storage = Storage::open(db.path.as_str()).expect("open storage"); + + let mut head = storage + .initialize_open_state(0, SafeInputRange::empty_at(0)) + .expect("initialize open state"); + let original_batch_index = head.batch_index; + let original_frame_in_batch = head.frame_in_batch; + // Default: log_gas_price=0 → log_recommended_fee = 0+20+419+621 = 1060 + assert_eq!(head.frame_fee, 1060); + + // Simulate an operator policy update mid-frame: fee oracle reports a + // higher gas price. The derived view reflects the new fee immediately. + storage + .set_log_gas_price(100) + .expect("set higher log gas price"); + let new_policy = storage.batch_policy().expect("read updated policy"); + assert_eq!( + new_policy.recommended_fee, 1160, + "policy-derived fee should reflect the new gas price", + ); + + // Invariant: the already-open frame's persisted fee stays at 1060. + let persisted_frame_fee: i64 = storage + .conn + .query_row( + "SELECT fee FROM frames WHERE batch_index = ?1 AND frame_in_batch = ?2", + rusqlite::params![original_batch_index as i64, original_frame_in_batch as i64,], + |row| row.get(0), + ) + .expect("query open frame fee"); + assert_eq!( + persisted_frame_fee, 1060, + "open frame's committed fee must not change across policy updates", + ); + + // And the in-memory WriteHead mirror must also be stable — the lane + // submitting against this head should see a consistent fee. + assert_eq!( + head.frame_fee, 1060, + "WriteHead.frame_fee must stay stable until advance_frame runs", + ); + + // Closing the frame picks up the new policy — the *next* frame opens + // at 1160. This is the expected policy-flow boundary. 
+ let next_safe_block = head.safe_block; + storage + .close_frame_only(&mut head, next_safe_block, SafeInputRange::empty_at(0)) + .expect("rotate within same batch"); + assert_eq!( + head.frame_fee, 1160, + "the next frame must use the updated policy's fee (policy flows in at close)", + ); + } + + #[test] + fn next_undrained_safe_input_index_is_derived_from_sequenced_directs() { + let db = temp_db("safe-cursor"); + let mut storage = Storage::open(db.path.as_str()).expect("open storage"); + assert_eq!( + storage + .next_undrained_safe_input_index() + .expect("empty cursor"), + 0 + ); + + let head = storage + .initialize_open_state(0, SafeInputRange::empty_at(0)) + .expect("initialize open state"); + let drained = vec![ + StoredSafeInput { + sender: Address::ZERO, + payload: vec![0x00], + block_number: 10, + }, + StoredSafeInput { + sender: Address::ZERO, + payload: vec![0x02], + block_number: 10, + }, + ]; + storage + .append_safe_inputs(10, drained.as_slice(), &default_protocol_config()) + .expect("insert direct inputs"); + let mut head = head; + storage + .close_frame_only(&mut head, 10, SafeInputRange::new(0, drained.len() as u64)) + .expect("close frame with directs"); + + assert_eq!( + storage + .next_undrained_safe_input_index() + .expect("derived cursor"), + 2 + ); + } + + #[test] + fn initialize_open_state_creates_first_real_batch_and_frame() { + let db = temp_db("initialize-open-state"); + let mut storage = Storage::open(db.path.as_str()).expect("open storage"); + + let head = storage + .initialize_open_state(12, SafeInputRange::empty_at(0)) + .expect("initialize open state"); + + assert_eq!(head.batch_index, 0); + assert_eq!(head.frame_in_batch, 0); + assert_eq!(head.safe_block, 12); + + let loaded = storage + .open_state() + .expect("load open state") + .expect("open state should exist"); + assert_eq!(loaded.batch_index, 0); + assert_eq!(loaded.frame_in_batch, 0); + assert_eq!(loaded.safe_block, 12); + } + + #[test] + fn 
replay_returns_direct_inputs_in_drain_order() { + let db = temp_db("replay-order"); + let mut storage = Storage::open(db.path.as_str()).expect("open storage"); + let head = storage + .initialize_open_state(0, SafeInputRange::empty_at(0)) + .expect("initialize open state"); + + let drained = vec![ + StoredSafeInput { + sender: Address::ZERO, + payload: vec![0xaa], + block_number: 10, + }, + StoredSafeInput { + sender: Address::ZERO, + payload: vec![0xbb], + block_number: 10, + }, + ]; + storage + .append_safe_inputs(10, drained.as_slice(), &default_protocol_config()) + .expect("insert direct inputs"); + let mut head = head; + storage + .close_frame_only(&mut head, 10, SafeInputRange::new(0, drained.len() as u64)) + .expect("close frame with directs"); + + let replay = storage + .ordered_l2_txs_page_from(0, 100) + .expect("load replay"); + assert_eq!(replay.len(), 2); + match &replay[0].1 { + SequencedL2Tx::Direct(value) => assert_eq!(value.payload.as_slice(), &[0xaa]), + _ => panic!("expected direct input at position 0"), + } + match &replay[1].1 { + SequencedL2Tx::Direct(value) => assert_eq!(value.payload.as_slice(), &[0xbb]), + _ => panic!("expected direct input at position 1"), + } + } +} diff --git a/sequencer/src/storage/l1_inputs.rs b/sequencer/src/storage/l1_inputs.rs new file mode 100644 index 0000000..063f455 --- /dev/null +++ b/sequencer/src/storage/l1_inputs.rs @@ -0,0 +1,310 @@ +// (c) Cartesi and individual authors (see AUTHORS) +// SPDX-License-Identifier: Apache-2.0 (see LICENSE) + +//! Input reader writer: ingests L1 InputBox events into `safe_inputs`, +//! advances `l1_safe_head`, and maintains the L1 bootstrap cache. +//! +//! Also exposes the read-side queries the input reader and other callers need +//! (current safe block, safe-input bounds, last safe-progress timestamp). 
+ +use alloy_primitives::Address; +use rusqlite::{OptionalExtension, Result, Transaction, params}; + +use super::Storage; +use super::StoredSafeInput; +use super::convert::{i64_to_u64, now_unix_ms, u64_to_i64}; +use super::queries::{query_current_safe_block, query_latest_safe_input_index_exclusive}; +use super::safe_accepted_batches::populate_safe_accepted_batches; +use sequencer_core::protocol::ProtocolConfig; + +impl Storage { + /// `MAX(safe_input_index) + 1` (or 0 if empty). The exclusive bound on the + /// `safe_inputs` table — the next index a fresh row would receive. + pub fn safe_input_end_exclusive(&mut self) -> Result { + query_latest_safe_input_index_exclusive(&self.conn) + } + + pub fn current_safe_block(&mut self) -> Result { + query_current_safe_block(&self.conn) + } + + /// Advance `l1_safe_head.block_number` to `minimum_safe_block` if it is + /// behind. One-shot bootstrap helper — does NOT touch `synced_at_ms`, so + /// it doesn't masquerade as a real L1 sync to the wall-clock danger + /// estimator. + pub fn ensure_minimum_safe_block(&mut self, minimum_safe_block: u64) -> Result<()> { + self.write(|tx| { + let current = query_current_safe_block(tx)?; + if current < minimum_safe_block { + // `synced_at_ms` is intentionally NOT touched here: this is a + // bootstrap setup (genesis-block sync), not a real L1 read. + // Leaving it preserves the wall-clock danger estimate's "time + // since last real sync" semantics. + let changed = tx.execute( + "UPDATE l1_safe_head SET block_number = ?1 WHERE singleton_id = 0", + params![u64_to_i64(minimum_safe_block)], + )?; + if changed != 1 { + return Err(rusqlite::Error::StatementChangedRows(changed)); + } + } + Ok(()) + }) + } + + /// Record the first real safe-head observation if no prior observation was + /// persisted yet. 
+ /// + /// Used when the input reader successfully contacts L1 but the observed + /// safe block matches the bootstrap floor (for example, first startup on a + /// chain that has not advanced past genesis). This seeds the wall-clock + /// estimator once without repeatedly refreshing it while the safe head is + /// frozen. + pub fn initialize_safe_progress_if_unset(&mut self) -> Result<()> { + let now_ms = now_unix_ms(); + self.conn.execute( + "UPDATE l1_safe_head SET synced_at_ms = ?1 \ + WHERE singleton_id = 0 AND synced_at_ms = 0", + params![now_ms], + )?; + Ok(()) + } + + /// Atomically: insert `inputs` (assigned contiguous indexes starting from + /// the current MAX+1), advance `l1_safe_head.block_number` to `safe_block`, + /// stamp `synced_at_ms` as the wall-clock time when the safe frontier + /// advanced, and update `safe_accepted_batches` via `protocol` so the + /// scheduler-accepted frontier view stays consistent with the safe head. + /// + /// The materialized `safe_accepted_batches` view is an invariant of this + /// operation: after a successful `append_safe_inputs`, every safe input up + /// to `safe_block` has been evaluated against the scheduler's acceptance + /// rules and recorded in `safe_accepted_batches`. Readers (submitter, + /// recovery, danger checks) never need to populate separately. + /// + /// Asserts `safe_block` is monotonic and that it strictly advances when + /// `inputs` is non-empty. 
+ pub fn append_safe_inputs( + &mut self, + safe_block: u64, + inputs: &[StoredSafeInput], + protocol: &ProtocolConfig, + ) -> Result<()> { + self.write(|tx| { + let current = query_current_safe_block(tx)?; + assert!( + safe_block >= current, + "safe block regressed: current={current}, next={safe_block}" + ); + assert!( + safe_block > current || inputs.is_empty(), + "safe block must advance when appending new safe inputs" + ); + + let next_index = query_latest_safe_input_index_exclusive(tx)?; + insert_safe_inputs_batch(tx, next_index, inputs)?; + + let changed = tx.execute( + "UPDATE l1_safe_head SET block_number = ?1, synced_at_ms = ?2 WHERE singleton_id = 0", + params![u64_to_i64(safe_block), now_unix_ms()], + )?; + if changed != 1 { + return Err(rusqlite::Error::StatementChangedRows(changed)); + } + + populate_safe_accepted_batches(tx, protocol) + }) + } + + /// Wall-clock timestamp (Unix ms) of the last observed safe-head advance. + /// Returns 0 if no real safe-head observation has occurred yet. + pub fn last_safe_progress_ms(&self) -> Result { + let value: i64 = self.conn.query_row( + "SELECT synced_at_ms FROM l1_safe_head WHERE singleton_id = 0", + [], + |row| row.get(0), + )?; + Ok(i64_to_u64(value)) + } + + /// Read cached L1 bootstrap data (input_box_address, genesis_block, chain_id). + /// Returns `None` on first startup. + pub fn l1_bootstrap_cache(&self) -> Result> { + let row: Option<(Vec, i64, i64)> = self + .conn + .query_row( + "SELECT input_box_address, genesis_block, chain_id \ + FROM l1_bootstrap_cache WHERE singleton_id = 0", + [], + |row| Ok((row.get(0)?, row.get(1)?, row.get(2)?)), + ) + .optional()?; + Ok(row.map(|(addr_bytes, genesis, chain_id)| { + let addr = Address::from_slice(&addr_bytes); + (addr, i64_to_u64(genesis), i64_to_u64(chain_id)) + })) + } + + /// Cache L1 bootstrap data so future startups can boot without L1. 
+ pub fn save_l1_bootstrap_cache( + &mut self, + input_box_address: Address, + genesis_block: u64, + chain_id: u64, + ) -> Result<()> { + self.conn.execute( + "INSERT OR REPLACE INTO l1_bootstrap_cache \ + (singleton_id, input_box_address, genesis_block, chain_id) \ + VALUES (0, ?1, ?2, ?3)", + params![ + input_box_address.as_slice(), + u64_to_i64(genesis_block), + u64_to_i64(chain_id), + ], + )?; + Ok(()) + } +} + +fn insert_safe_inputs_batch( + tx: &Transaction<'_>, + start_index: u64, + inputs: &[StoredSafeInput], +) -> Result<()> { + if inputs.is_empty() { + return Ok(()); + } + let mut stmt = tx.prepare_cached( + "INSERT INTO safe_inputs (safe_input_index, sender, payload, block_number) \ + VALUES (?1, ?2, ?3, ?4)", + )?; + for (offset, input) in inputs.iter().enumerate() { + stmt.execute(params![ + u64_to_i64(start_index.saturating_add(offset as u64)), + input.sender.as_slice(), + input.payload.as_slice(), + u64_to_i64(input.block_number), + ])?; + } + Ok(()) +} + +#[cfg(test)] +mod tests { + use std::{thread, time::Duration}; + + use crate::storage::{ + SafeInputRange, Storage, StoredSafeInput, + test_helpers::{default_protocol_config, temp_db}, + }; + use alloy_primitives::Address; + + #[test] + fn safe_input_api_uses_half_open_intervals() { + let db = temp_db("safe-input-api"); + let mut storage = Storage::open(db.path.as_str()).expect("open storage"); + let protocol = default_protocol_config(); + + assert_eq!(storage.safe_input_end_exclusive().expect("safe head"), 0); + let mut out = Vec::new(); + storage + .fill_safe_inputs(SafeInputRange::new(0, 0), &mut out) + .expect("query empty interval"); + assert!(out.is_empty()); + + let inserted = vec![ + StoredSafeInput { + sender: Address::ZERO, + payload: vec![0xa0], + block_number: 10, + }, + StoredSafeInput { + sender: Address::ZERO, + payload: vec![0xb1], + block_number: 10, + }, + ]; + storage + .append_safe_inputs(10, inserted.as_slice(), &protocol) + .expect("insert safe directs"); + + 
assert_eq!(storage.safe_input_end_exclusive().expect("safe head"), 2); + + storage + .fill_safe_inputs(SafeInputRange::new(0, 2), &mut out) + .expect("query full interval"); + assert_eq!(out, inserted); + + storage + .fill_safe_inputs(SafeInputRange::new(1, 1), &mut out) + .expect("query empty half-open interval"); + assert!(out.is_empty()); + } + + #[test] + fn ensure_minimum_safe_block_only_moves_forward() { + let db = temp_db("ensure-min-safe-block"); + let mut storage = Storage::open(db.path.as_str()).expect("open storage"); + + storage + .ensure_minimum_safe_block(7) + .expect("advance bootstrap safe head"); + assert_eq!(storage.current_safe_block().expect("read advanced"), 7); + + storage + .ensure_minimum_safe_block(3) + .expect("do not regress bootstrap safe head"); + assert_eq!(storage.current_safe_block().expect("read unchanged"), 7); + } + + #[test] + fn ensure_minimum_safe_block_does_not_record_safe_progress() { + let db = temp_db("ensure-min-safe-block-no-sync"); + let mut storage = Storage::open(db.path.as_str()).expect("open storage"); + + storage + .ensure_minimum_safe_block(7) + .expect("advance bootstrap safe head"); + assert_eq!( + storage + .last_safe_progress_ms() + .expect("read sync timestamp"), + 0, + "bootstrap safe-head initialization must not count as safe progress" + ); + + storage + .initialize_safe_progress_if_unset() + .expect("record first real safe-head observation"); + let recorded_sync = storage + .last_safe_progress_ms() + .expect("read sync timestamp"); + assert!( + recorded_sync > 0, + "initial observation should record wall-clock time" + ); + + thread::sleep(Duration::from_millis(5)); + storage + .initialize_safe_progress_if_unset() + .expect("do not refresh unchanged safe head"); + assert_eq!( + storage + .last_safe_progress_ms() + .expect("read unchanged sync timestamp"), + recorded_sync, + "repeat observations of the same safe head must not refresh the marker" + ); + + storage + .ensure_minimum_safe_block(9) + 
.expect("advance bootstrap safe head again"); + assert_eq!( + storage + .last_safe_progress_ms() + .expect("read sync timestamp"), + recorded_sync, + "bootstrap safe-head updates must preserve the last real safe-progress timestamp" + ); + } +} diff --git a/sequencer/src/storage/l1_submission.rs b/sequencer/src/storage/l1_submission.rs new file mode 100644 index 0000000..c855957 --- /dev/null +++ b/sequencer/src/storage/l1_submission.rs @@ -0,0 +1,838 @@ +// (c) Cartesi and individual authors (see AUTHORS) +// SPDX-License-Identifier: Apache-2.0 (see LICENSE) + +//! Batch-aggregate reads: frontier lookup, per-batch frames + user ops, the +//! catch-up / per-batch replay reader, and the SSZ-encoded pending-batch list +//! the submitter pulls each tick. +//! +//! Despite the historical name, nothing in this file does writes — structural +//! nonces are assigned by the `batches.nonce` trigger at close time (see +//! `ingress`), and `safe_accepted_batches` is maintained by `append_safe_inputs` +//! (see `l1_inputs`). The reads here are shared between the batch submitter +//! (hot-path tick) and the egress replay path (catch-up reader); they live +//! together because they all aggregate at the batch level. + +use rusqlite::{Result, params}; + +use super::Storage; +use super::convert::{i64_to_u16, i64_to_u32, i64_to_u64, u64_to_i64}; +use super::queries::{decode_l2_tx_row, query_current_safe_block}; +use super::safe_accepted_batches::query_latest_safe_accepted_batch; +use super::{FrameHeader, PendingBatch, SubmitterFrontier}; +use sequencer_core::batch::{Batch, Frame as BatchFrame, WireUserOp}; +use sequencer_core::l2_tx::SequencedL2Tx; + +impl Storage { + /// Read-only frontier view used by the submitter each tick to derive the + /// next batch nonce. `accepted_next_nonce` is the next nonce the scheduler + /// is expected to accept, derived from `safe_accepted_batches`. 
+ /// + /// The scheduler-accepted frontier is maintained by + /// [`Storage::append_safe_inputs`], so this is a pure read. + pub fn submitter_frontier(&mut self) -> Result { + self.read(|tx| { + Ok(SubmitterFrontier { + safe_block: query_current_safe_block(tx)?, + accepted_next_nonce: query_latest_safe_accepted_batch(tx)? + .map(|row| i64_to_u64(row.nonce).saturating_add(1)) + .unwrap_or(0), + }) + }) + } + + /// Highest valid (non-invalidated) `batch_index`, or `None` if no valid + /// batches exist. The open batch is included. + pub fn latest_batch_index(&mut self) -> Result> { + let value: Option = + self.conn + .query_row("SELECT MAX(batch_index) FROM valid_batches", [], |row| { + row.get(0) + })?; + Ok(value.map(i64_to_u64)) + } + + /// Frame headers for `batch_index` in `frame_in_batch` order. Reads the + /// raw `frames` table — does NOT filter on validity, since callers only + /// reach this method after they already know the batch is valid. + pub fn frames_for_batch(&mut self, batch_index: u64) -> Result> { + let mut stmt = self.conn.prepare_cached( + "SELECT frame_in_batch, fee, safe_block FROM frames \ + WHERE batch_index = ?1 ORDER BY frame_in_batch ASC", + )?; + let rows = stmt.query_map(params![u64_to_i64(batch_index)], |row| { + Ok(FrameHeader { + frame_in_batch: i64_to_u32(row.get(0)?), + fee: i64_to_u16(row.get(1)?), + safe_block: i64_to_u64(row.get(2)?), + }) + })?; + rows.collect::>>() + } + + /// Materialize all sequenced L2 txs in one batch (used by the catch-up / + /// per-batch replay paths). Returns `[]` for invalidated batches. 
+ pub fn ordered_l2_txs_for_batch(&mut self, batch_index: u64) -> Result> { + const SQL: &str = " + SELECT + CASE WHEN s.user_op_pos_in_frame IS NOT NULL THEN 0 ELSE 1 END AS kind, + CASE + WHEN s.user_op_pos_in_frame IS NOT NULL THEN u.sender + WHEN s.safe_input_index IS NOT NULL THEN d.sender + ELSE NULL + END AS sender, + CASE WHEN s.user_op_pos_in_frame IS NOT NULL THEN u.data ELSE NULL END AS data, + CASE WHEN s.user_op_pos_in_frame IS NOT NULL THEN f.fee ELSE NULL END AS fee, + CASE WHEN s.safe_input_index IS NOT NULL THEN d.payload ELSE NULL END AS payload, + CASE WHEN s.safe_input_index IS NOT NULL THEN d.block_number ELSE NULL END AS block_number + FROM valid_sequenced_l2_txs s + LEFT JOIN user_ops u + ON u.batch_index = s.batch_index + AND u.frame_in_batch = s.frame_in_batch + AND u.pos_in_frame = s.user_op_pos_in_frame + LEFT JOIN frames f + ON f.batch_index = s.batch_index + AND f.frame_in_batch = s.frame_in_batch + LEFT JOIN safe_inputs d + ON d.safe_input_index = s.safe_input_index + WHERE s.batch_index = ?1 + ORDER BY s.offset ASC + "; + let mut stmt = self.conn.prepare_cached(SQL)?; + let rows = stmt.query_map(params![u64_to_i64(batch_index)], |row| { + Ok(decode_l2_tx_row( + row.get(0)?, + row.get(1)?, + row.get(2)?, + row.get(3)?, + row.get(4)?, + row.get(5)?, + )) + })?; + rows.collect::>>() + } + + /// Load all valid closed batches with nonce >= `min_nonce`, in nonce order, + /// each one fully assembled and SSZ-encoded with its authoritative nonce. + /// + /// Authoritative because the nonce stamped into the wire payload is the + /// one the DB persists on the batch row (via the `parent.nonce + 1` + /// structural invariant). The caller never sees an unstamped batch — + /// there is no way to accidentally encode with the wrong nonce. 
+ pub fn pending_batches(&mut self, min_nonce: u64) -> Result> { + const SQL: &str = "SELECT batch_index, nonce FROM valid_closed_batches \ + WHERE nonce >= ?1 ORDER BY nonce ASC"; + let pending_refs: Vec<(u64, u64)> = { + let mut stmt = self.conn.prepare_cached(SQL)?; + let rows = stmt.query_map(params![u64_to_i64(min_nonce)], |row| { + let bi: i64 = row.get(0)?; + let nonce: i64 = row.get(1)?; + Ok((i64_to_u64(bi), i64_to_u64(nonce))) + })?; + rows.collect::>>()? + }; + + let mut batches = Vec::with_capacity(pending_refs.len()); + for (batch_index, nonce) in pending_refs { + let frames = self.load_batch_frames(batch_index)?; + let batch = Batch { nonce, frames }; + let encoded = ssz::Encode::as_ssz_bytes(&batch); + batches.push(PendingBatch { + batch_index, + nonce, + encoded, + }); + } + Ok(batches) + } + + /// Load every frame (header + user ops) of `batch_index` in frame order. + /// Internal helper for [`Self::pending_batches`]; does NOT filter on + /// validity — callers only reach this after they know the batch is valid. 
+ fn load_batch_frames(&mut self, batch_index: u64) -> Result> { + let frame_headers = self.frames_for_batch(batch_index)?; + let mut frames = Vec::with_capacity(frame_headers.len()); + for header in frame_headers { + let mut stmt = self.conn.prepare_cached( + "SELECT nonce, max_fee, data, sig FROM user_ops \ + WHERE batch_index = ?1 AND frame_in_batch = ?2 \ + ORDER BY pos_in_frame ASC", + )?; + let rows = stmt.query_map( + params![u64_to_i64(batch_index), i64::from(header.frame_in_batch)], + |row| { + Ok(WireUserOp { + nonce: i64_to_u32(row.get(0)?), + max_fee: i64_to_u16(row.get(1)?), + data: row.get(2)?, + signature: row.get(3)?, + }) + }, + )?; + let user_ops: Vec = rows.collect::>()?; + frames.push(BatchFrame { + user_ops, + safe_block: header.safe_block, + fee_price: header.fee, + }); + } + Ok(frames) + } +} + +#[cfg(test)] +mod tests { + use super::super::test_helpers::{ + SENDER_A, SENDER_B, protocol_config_for, seed_closed_batches, + seed_safe_inputs_with_batch_nonces, temp_db, + }; + use crate::storage::{SafeInputRange, Storage, StoredSafeInput}; + use alloy_primitives::Address; + use sequencer_core::batch::{Batch, Frame as BatchFrame}; + use sequencer_core::protocol::ProtocolConfig; + + #[test] + fn pending_batches_stamps_authoritative_nonce_into_wire_bytes() { + // The landmine we removed: an earlier `batch_for_submission` returned a + // `Batch { nonce: 0, … }` placeholder, and callers had to remember to + // stamp the real nonce via `encode_for_scheduler_with_nonce`. The new + // `pending_batches` reads the DB-authoritative nonce from + // `valid_closed_batches` and bakes it straight into the SSZ bytes — so + // decoding the payload must round-trip back to that nonce, and the + // frame body must match what storage persisted. 
+ let db = temp_db("pending-batches-nonce-baked-in"); + let mut storage = Storage::open(db.path.as_str()).expect("open storage"); + + let mut head = storage + .initialize_open_state(12, SafeInputRange::empty_at(0)) + .expect("initialize open state"); + // Close batch 0 so it becomes eligible for submission. + storage + .close_frame_and_batch(&mut head, 12) + .expect("close batch 0"); + + let pending = storage.pending_batches(0).expect("load pending batches"); + assert_eq!(pending.len(), 1); + let entry = &pending[0]; + assert_eq!(entry.batch_index, 0); + assert_eq!(entry.nonce, 0, "genesis batch has nonce 0"); + + // The wire bytes must decode back to the authoritative nonce AND the + // frame body storage persisted. + let decoded: Batch = + ssz::Decode::from_ssz_bytes(&entry.encoded).expect("decode pending wire bytes"); + assert_eq!(decoded.nonce, entry.nonce); + assert_eq!(decoded.frames.len(), 1); + let frame = &decoded.frames[0]; + assert!(frame.user_ops.is_empty()); + assert_eq!(frame.safe_block, 12); + // Default log_recommended_fee = 0+20+419+621 = 1060. + assert_eq!(frame.fee_price, 1060); + } + + #[test] + fn batch_level_helpers_expose_latest_index_frames_and_txs() { + let db = temp_db("batch-level-helpers"); + let mut storage = Storage::open(db.path.as_str()).expect("open storage"); + + // Before initialization there should be no batches. + assert!( + storage + .latest_batch_index() + .expect("query latest batch nonce on empty db") + .is_none() + ); + + // Initialize first batch/frame and append some data. + let mut head = storage + .initialize_open_state(0, SafeInputRange::empty_at(0)) + .expect("initialize open state"); + + // Close current batch and move to next so batch 0 becomes closed. + let next_safe_block = head.safe_block; + storage + .close_frame_and_batch(&mut head, next_safe_block) + .expect("close batch and rotate"); + + // Latest batch nonce should now be 1 (open), with batch 0 closed. 
+ let latest = storage + .latest_batch_index() + .expect("query latest batch nonce") + .expect("latest batch should exist"); + assert_eq!(latest, 1); + + // Batch 0 should still have at least one frame header. + let frames = storage + .frames_for_batch(0) + .expect("load frames for batch 0"); + assert!(!frames.is_empty()); + + // Ordered L2 txs for batch 0 should be queryable (even if empty). + let txs = storage + .ordered_l2_txs_for_batch(0) + .expect("load l2 txs for batch 0"); + assert!( + txs.is_empty(), + "fresh batch should not have sequenced txs yet" + ); + } + + #[test] + fn closed_batch_becomes_eligible_for_submission_with_assigned_nonce() { + // §3.3.3: closing a batch transitions it from "open Tip" to "eligible + // for L1 submission" — it appears in `valid_closed_batches` with a + // nonce derived from its parent pointer. Pins the submitter's + // contract: open batches are NOT pulled into the submission pipeline, + // and closed batches ARE, at the schema-guaranteed nonce. + let db = temp_db("closed-batch-eligible"); + let mut storage = Storage::open(db.path.as_str()).expect("open storage"); + + let mut head = storage + .initialize_open_state(0, SafeInputRange::empty_at(0)) + .expect("initialize open state"); + + // Before close: the open batch must not appear in pending-batches. + let pending_before = storage + .pending_batches(0) + .expect("load pending batches (pre-close)"); + assert!( + pending_before.is_empty(), + "open batch must not be eligible for submission: {pending_before:?}", + ); + + // Close batch 0 — this rotates the Tip to batch 1 and seals batch 0. + let safe_block = head.safe_block; + storage + .close_frame_and_batch(&mut head, safe_block) + .expect("close batch 0"); + + // After close: batch 0 is eligible with nonce 0 (genesis, parent + // NULL → trigger assigns nonce 0). 
+ let pending_after = storage + .pending_batches(0) + .expect("load pending batches (post-close)"); + assert_eq!( + pending_after.len(), + 1, + "exactly one batch should be eligible after the first close", + ); + assert_eq!(pending_after[0].batch_index, 0); + assert_eq!( + pending_after[0].nonce, 0, + "closed batch 0 must carry nonce 0 (genesis, no parent)", + ); + // The new open Tip (batch 1) must NOT be eligible even though it + // exists — eligibility requires sealed_at_ms NOT NULL. + assert!( + pending_after.iter().all(|b| b.batch_index != 1), + "open batch 1 (the new Tip) must not be eligible: {pending_after:?}", + ); + } + + #[test] + fn submitter_frontier_returns_zero_when_no_batches_were_accepted() { + let db = temp_db("submitter-frontier-empty"); + let mut storage = Storage::open(db.path.as_str()).expect("open storage"); + let frontier = storage.submitter_frontier().expect("submitter frontier"); + assert_eq!(frontier.safe_block, 0); + assert_eq!(frontier.accepted_next_nonce, 0); + } + + #[test] + fn submitter_frontier_tracks_accepted_prefix() { + let db = temp_db("submitter-frontier-prefix"); + let mut storage = Storage::open(db.path.as_str()).expect("open storage"); + // seed_safe_inputs_with_batch_nonces already calls append_safe_inputs, + // which auto-populates safe_accepted_batches. 
+ seed_safe_inputs_with_batch_nonces(&mut storage, SENDER_A, 10, &[0, 1, 3, 4, 5]); + + let frontier = storage.submitter_frontier().expect("submitter frontier"); + assert_eq!(frontier.safe_block, 10); + assert_eq!(frontier.accepted_next_nonce, 2); + } + + fn default_test_protocol() -> ProtocolConfig { + ProtocolConfig { + batch_submitter: SENDER_A, + max_wait_blocks: 1200, + preemptive_margin_blocks: 75, + seconds_per_block: 12, + } + } + + fn unix_now_ms() -> u64 { + std::time::SystemTime::now() + .duration_since(std::time::UNIX_EPOCH) + .unwrap_or_default() + .as_millis() as u64 + } + + #[test] + fn check_danger_reports_strict_on_closed_frontier() { + let db = temp_db("check-danger-strict"); + let mut storage = Storage::open(db.path.as_str()).expect("open storage"); + let mut head = storage + .initialize_open_state(10, SafeInputRange::empty_at(0)) + .expect("initialize"); + storage + .close_frame_and_batch(&mut head, 10) + .expect("close batch 0"); + storage + .close_frame_and_batch(&mut head, 10) + .expect("close batch 1"); + + let protocol = default_test_protocol(); + storage + .append_safe_inputs( + 1135, + &[StoredSafeInput { + sender: SENDER_A, + payload: ssz::Encode::as_ssz_bytes(&Batch { + nonce: 0, + frames: vec![BatchFrame { + user_ops: vec![], + safe_block: 10, + fee_price: 0, + }], + }), + block_number: 20, + }], + &protocol, + ) + .expect("append accepted batch 0"); + + let status = storage + .check_danger(&protocol, unix_now_ms()) + .expect("check_danger"); + assert_eq!(status, crate::storage::DangerStatus::Strict(1)); + } + + #[test] + fn check_danger_reports_stalled_on_wall_clock_drift() { + // Strict block-based check wouldn't fire (batch 1 has first_frame_safe_block + // = 100 and safe_block = 200, age = 100 < 1125). But wall-clock says the + // safe head hasn't advanced in ~25 blocks — effective threshold drops to + // 1100, batch 1's age jumps past it via the wall-clock correction. 
+ let db = temp_db("check-danger-stalled"); + let mut storage = Storage::open(db.path.as_str()).expect("open storage"); + let mut head = storage + .initialize_open_state(100, SafeInputRange::empty_at(0)) + .expect("initialize"); + storage + .close_frame_and_batch(&mut head, 100) + .expect("close batch 0"); + storage + .close_frame_and_batch(&mut head, 100) + .expect("close batch 1"); + + let protocol = default_test_protocol(); + storage + .append_safe_inputs( + 1200, + &[StoredSafeInput { + sender: SENDER_A, + payload: ssz::Encode::as_ssz_bytes(&Batch { + nonce: 0, + frames: vec![BatchFrame { + user_ops: vec![], + safe_block: 100, + fee_price: 0, + }], + }), + block_number: 200, + }], + &protocol, + ) + .expect("append accepted batch 0"); + + // Pretend safe-progress was recorded 25 blocks' worth of wall-clock ago. + let now_ms = unix_now_ms(); + storage + .conn + .execute( + "UPDATE l1_safe_head SET synced_at_ms = ?1 WHERE singleton_id = 0", + [i64::try_from(now_ms.saturating_sub(25 * 12 * 1000)).unwrap_or(i64::MAX)], + ) + .expect("rewind safe-progress timestamp"); + + let status = storage + .check_danger(&protocol, now_ms) + .expect("check_danger"); + assert_eq!(status, crate::storage::DangerStatus::Stalled(1)); + } + + #[test] + fn check_danger_safe_when_never_synced() { + // Fresh DB, no prior safe-progress observation. check_danger reports + // Safe — never-synced is benign at this layer; callers that need + // "refuse on never-synced" (startup L1-unreachable) check explicitly. 
+ let db = temp_db("check-danger-never-synced"); + let mut storage = Storage::open(db.path.as_str()).expect("open storage"); + let status = storage + .check_danger(&default_test_protocol(), unix_now_ms()) + .expect("check_danger"); + assert_eq!(status, crate::storage::DangerStatus::Safe); + } + + #[test] + fn populate_safe_accepted_batches_resumes_from_latest_row() { + let db = temp_db("safe-accepted-frontier-resume"); + let mut storage = Storage::open(db.path.as_str()).expect("open storage"); + let protocol = protocol_config_for(SENDER_A); + + seed_safe_inputs_with_batch_nonces(&mut storage, SENDER_A, 10, &[0, 1]); + + // Mixed-sender wave: the SENDER_B row must be ignored, SENDER_A rows + // must resume from the cursor and advance the frontier. + let second_wave = vec![ + StoredSafeInput { + sender: SENDER_B, + payload: ssz::Encode::as_ssz_bytes(&sequencer_core::batch::Batch { + nonce: 99, + frames: Vec::new(), + }), + block_number: 11, + }, + StoredSafeInput { + sender: SENDER_A, + payload: ssz::Encode::as_ssz_bytes(&sequencer_core::batch::Batch { + nonce: 2, + frames: Vec::new(), + }), + block_number: 11, + }, + StoredSafeInput { + sender: SENDER_A, + payload: ssz::Encode::as_ssz_bytes(&sequencer_core::batch::Batch { + nonce: 3, + frames: Vec::new(), + }), + block_number: 11, + }, + ]; + storage + .append_safe_inputs(11, second_wave.as_slice(), &protocol) + .expect("append second wave"); + + let frontier = storage.submitter_frontier().expect("submitter frontier"); + assert_eq!(frontier.safe_block, 11); + assert_eq!(frontier.accepted_next_nonce, 4); + + let accepted_count: i64 = storage + .conn + .query_row("SELECT COUNT(*) FROM safe_accepted_batches", [], |row| { + row.get(0) + }) + .expect("count accepted rows"); + assert_eq!(accepted_count, 4); + } + + #[test] + fn safe_accepted_frontier_skips_stale_payloads() { + let db = temp_db("safe-accepted-frontier-skip-stale"); + let mut storage = Storage::open(db.path.as_str()).expect("open storage"); + let protocol = 
default_test_protocol(); + + // Seed a non-stale batch with nonce 0 (safe_block=100, block_number=200, max_wait=1200 → not stale) + let non_stale_payload = ssz::Encode::as_ssz_bytes(&sequencer_core::batch::Batch { + nonce: 0, + frames: vec![sequencer_core::batch::Frame { + user_ops: Vec::new(), + safe_block: 100, + fee_price: 0, + }], + }); + // Seed a stale batch with nonce 1 (safe_block=100, block_number=2000, max_wait=1200 → stale) + let stale_payload = ssz::Encode::as_ssz_bytes(&sequencer_core::batch::Batch { + nonce: 1, + frames: vec![sequencer_core::batch::Frame { + user_ops: Vec::new(), + safe_block: 100, + fee_price: 0, + }], + }); + // Seed a non-stale batch with nonce 1 (safe_block=1900, block_number=2000 → not stale) + let non_stale_payload_2 = ssz::Encode::as_ssz_bytes(&sequencer_core::batch::Batch { + nonce: 1, + frames: vec![sequencer_core::batch::Frame { + user_ops: Vec::new(), + safe_block: 1900, + fee_price: 0, + }], + }); + + let inputs = vec![ + StoredSafeInput { + sender: SENDER_A, + payload: non_stale_payload, + block_number: 200, + }, + StoredSafeInput { + sender: SENDER_A, + payload: stale_payload, + block_number: 2000, + }, + StoredSafeInput { + sender: SENDER_A, + payload: non_stale_payload_2, + block_number: 2000, + }, + ]; + storage + .append_safe_inputs(2000, inputs.as_slice(), &protocol) + .expect("append"); + + let frontier = storage.submitter_frontier().expect("submitter frontier"); + assert_eq!(frontier.accepted_next_nonce, 2); + } + + #[test] + fn frontier_accepts_future_safe_block_batch_by_design() { + // The scheduler rejects batches where frame safe_block > inclusion_block, + // but the sequencer trusts its own output and does not re-validate these + // invariants during recovery. This test documents the intentional design + // choice: populate_safe_accepted_batches accepts such batches because + // the sequencer would never produce them. 
+ let db = temp_db("frontier-future-safe-block"); + let mut storage = Storage::open(db.path.as_str()).expect("open storage"); + + let future_safe_block_payload = ssz::Encode::as_ssz_bytes(&sequencer_core::batch::Batch { + nonce: 0, + frames: vec![sequencer_core::batch::Frame { + user_ops: Vec::new(), + safe_block: 500, + fee_price: 0, + }], + }); + let non_monotonic_payload = ssz::Encode::as_ssz_bytes(&sequencer_core::batch::Batch { + nonce: 1, + frames: vec![ + sequencer_core::batch::Frame { + user_ops: Vec::new(), + safe_block: 200, + fee_price: 0, + }, + sequencer_core::batch::Frame { + user_ops: Vec::new(), + safe_block: 100, + fee_price: 0, + }, + ], + }); + + let batch_submitter = Address::repeat_byte(0xCC); + let protocol = ProtocolConfig { + batch_submitter, + max_wait_blocks: u64::MAX, + preemptive_margin_blocks: 75, + seconds_per_block: 12, + }; + let inputs = vec![ + StoredSafeInput { + sender: batch_submitter, + payload: future_safe_block_payload, + block_number: 100, + }, + StoredSafeInput { + sender: batch_submitter, + payload: non_monotonic_payload, + block_number: 200, + }, + ]; + storage + .append_safe_inputs(200, inputs.as_slice(), &protocol) + .expect("append"); + + let frontier = storage.submitter_frontier().expect("submitter frontier"); + assert_eq!( + frontier.accepted_next_nonce, 2, + "both batches should be in accepted frontier" + ); + } + + #[test] + fn pending_batches_skips_invalidated_and_respects_min_nonce() { + let db = temp_db("load-pending-batches-filter"); + let mut storage = Storage::open(db.path.as_str()).expect("open storage"); + seed_closed_batches(&mut storage, 3); + storage.insert_invalid_batch(1).expect("invalidate batch 1"); + + // From nonce 0: batches 0 and 2 remain valid. 
+ let from_zero = storage + .pending_batches(0) + .expect("load pending batches from 0"); + let nonces: Vec = from_zero.iter().map(|b| b.nonce).collect(); + assert_eq!(nonces, vec![0, 2], "batch 1 must be filtered out"); + + // From nonce 1: only batch 2 remains (batch 0 is below min_nonce). + let from_one = storage + .pending_batches(1) + .expect("load pending batches from 1"); + let nonces: Vec = from_one.iter().map(|b| b.nonce).collect(); + assert_eq!(nonces, vec![2]); + + // Past the suffix: empty. + let from_three = storage + .pending_batches(3) + .expect("load pending batches from 3"); + assert!( + from_three.is_empty(), + "no batch should remain at nonce >= 3" + ); + } + + #[test] + fn nonce_is_reused_after_torn_cascade() { + // After a torn cascade invalidates every batch (including genesis), + // the recovery batch has no valid ancestor. Its parent is NULL, + // so its nonce resets to 0 — effectively reusing the nonce of the + // original genesis. The scheduler's "expected next nonce" also + // resets to 0, since no accepted batches were ever submitted. + let db = temp_db("nonce-reuse-after-torn-cascade"); + let mut storage = Storage::open(db.path.as_str()).expect("open storage"); + + let mut head = storage + .initialize_open_state(10, SafeInputRange::empty_at(0)) + .expect("initialize"); + storage + .close_frame_and_batch(&mut head, 10) + .expect("close batch 0"); + + storage.insert_invalid_batch(0).expect("invalidate batch 0"); + storage.insert_invalid_batch(1).expect("invalidate batch 1"); + storage + .detect_and_recover(1200) + .expect("open recovery batch after torn invalidation"); + + let head = storage + .open_state() + .expect("load open state") + .expect("recovery batch"); + assert_eq!(head.batch_index, 2); + + // Recovery Tip has no valid ancestor → parent NULL → nonce 0. 
+ let recovery_nonce: i64 = storage + .conn + .query_row( + "SELECT nonce FROM batches WHERE batch_index = 2", + [], + |row| row.get(0), + ) + .expect("query recovery nonce"); + assert_eq!(recovery_nonce, 0, "recovery Tip reuses nonce 0"); + } + + #[test] + fn populate_safe_accepted_batches_skips_duplicate_nonces() { + let db = temp_db("populate-dup-nonces"); + let mut storage = Storage::open(db.path.as_str()).expect("open storage"); + let protocol = default_test_protocol(); + + let mut head = storage + .initialize_open_state(10, SafeInputRange::empty_at(0)) + .expect("init"); + storage.close_frame_and_batch(&mut head, 10).expect("close"); + + storage + .append_safe_inputs( + 20, + &[ + StoredSafeInput { + sender: SENDER_A, + payload: super::super::test_helpers::make_stale_batch_payload(0, 10), + block_number: 20, + }, + StoredSafeInput { + sender: SENDER_A, + payload: super::super::test_helpers::make_stale_batch_payload(0, 10), + block_number: 20, + }, + ], + &protocol, + ) + .expect("append"); + + let frontier = storage.submitter_frontier().expect("submitter frontier"); + assert_eq!( + frontier.accepted_next_nonce, 1, + "duplicate nonce must be skipped" + ); + } + + #[test] + fn populate_safe_accepted_batches_handles_large_nonce_gap() { + let db = temp_db("populate-nonce-gap"); + let mut storage = Storage::open(db.path.as_str()).expect("open storage"); + let protocol = default_test_protocol(); + + let mut head = storage + .initialize_open_state(10, SafeInputRange::empty_at(0)) + .expect("init"); + storage.close_frame_and_batch(&mut head, 10).expect("close"); + + storage + .append_safe_inputs( + 20, + &[StoredSafeInput { + sender: SENDER_A, + payload: super::super::test_helpers::make_stale_batch_payload(5, 10), + block_number: 20, + }], + &protocol, + ) + .expect("append"); + + let frontier = storage.submitter_frontier().expect("submitter frontier"); + assert_eq!(frontier.accepted_next_nonce, 0, "gap must stall frontier"); + } + + #[test] + fn 
populate_safe_accepted_batches_out_of_order_arrivals_stalls_frontier() { + let db = temp_db("populate-out-of-order"); + let mut storage = Storage::open(db.path.as_str()).expect("open storage"); + let protocol = default_test_protocol(); + + let mut head = storage + .initialize_open_state(10, SafeInputRange::empty_at(0)) + .expect("init"); + storage.close_frame_and_batch(&mut head, 10).expect("close"); + storage + .close_frame_and_batch(&mut head, 10) + .expect("close 2"); + + storage + .append_safe_inputs( + 20, + &[StoredSafeInput { + sender: SENDER_A, + payload: super::super::test_helpers::make_stale_batch_payload(1, 10), + block_number: 20, + }], + &protocol, + ) + .expect("append"); + + let frontier = storage.submitter_frontier().expect("submitter frontier"); + assert_eq!( + frontier.accepted_next_nonce, 0, + "out of order must stall frontier" + ); + + storage + .append_safe_inputs( + 21, + &[StoredSafeInput { + sender: SENDER_A, + payload: super::super::test_helpers::make_stale_batch_payload(0, 10), + block_number: 21, + }], + &protocol, + ) + .expect("append nonce 0"); + + let frontier2 = storage.submitter_frontier().expect("submitter frontier"); + assert_eq!( + frontier2.accepted_next_nonce, 1, + "frontier must remain stalled" + ); + } +} diff --git a/sequencer/src/storage/migrations/0001_schema.sql b/sequencer/src/storage/migrations/0001_schema.sql index 4dde509..e4e3108 100644 --- a/sequencer/src/storage/migrations/0001_schema.sql +++ b/sequencer/src/storage/migrations/0001_schema.sql @@ -1,11 +1,133 @@ +-- --------------------------------------------------------------------------- +-- Batch lifecycle +-- +-- A batch has two monotonic events in its lifetime, each stored as a nullable +-- write-once timestamp on the row: +-- +-- * `sealed_at_ms` — inclusion lane closed the batch (no more ops). +-- * `invalidated_at_ms` — recovery cascade-invalidated the batch. +-- +-- NULL means the event hasn't happened. 
Once set, triggers below make the +-- column write-once. The only "mutable" state on the row is these two NULL→value +-- transitions, each owned by exactly one writer (inclusion lane vs recovery). +-- +-- The **Tip** is the one batch currently accepting ops: sealed_at_ms IS NULL +-- AND invalidated_at_ms IS NULL. A partial unique index enforces at-most-one. +-- +-- `nonce` is structural: equal to `parent.nonce + 1`, or 0 for genesis (parent +-- NULL). Enforced by trigger on INSERT. The scheduler's view of a batch's +-- identity; reused across recovery cascades (new Tip forks from last valid +-- ancestor, inheriting nonce via the +1 rule). +-- --------------------------------------------------------------------------- CREATE TABLE IF NOT EXISTS batches ( - batch_index INTEGER PRIMARY KEY, - created_at_ms INTEGER NOT NULL + batch_index INTEGER PRIMARY KEY, + parent_batch_index INTEGER REFERENCES batches(batch_index), -- NULL only for genesis + nonce INTEGER NOT NULL CHECK (nonce >= 0), + created_at_ms INTEGER NOT NULL, + sealed_at_ms INTEGER + CHECK (sealed_at_ms IS NULL OR sealed_at_ms >= created_at_ms), + invalidated_at_ms INTEGER + CHECK (invalidated_at_ms IS NULL OR invalidated_at_ms >= created_at_ms) ); +-- "At most one valid Tip" — structural via partial unique index. The predicate +-- references only local columns of `batches`, so SQLite accepts it. +-- +-- We index on COALESCE(sealed_at_ms, 0) instead of sealed_at_ms directly +-- because SQLite UNIQUE indexes treat NULLs as distinct — so indexing directly +-- on `sealed_at_ms` would allow many NULL rows. COALESCE maps all matching +-- rows to the same non-NULL value (0), forcing real uniqueness. +CREATE UNIQUE INDEX IF NOT EXISTS ux_single_valid_tip + ON batches(COALESCE(sealed_at_ms, 0)) + WHERE sealed_at_ms IS NULL AND invalidated_at_ms IS NULL; + +-- Submitter hot path: "give me valid closed batches with nonce >= N", ordered. 
+CREATE INDEX IF NOT EXISTS idx_batches_valid_closed_by_nonce + ON batches(nonce) + WHERE invalidated_at_ms IS NULL AND sealed_at_ms IS NOT NULL; + +-- ── Views ────────────────────────────────────────────────────────────────── +CREATE VIEW IF NOT EXISTS valid_batches AS + SELECT * FROM batches WHERE invalidated_at_ms IS NULL; + +CREATE VIEW IF NOT EXISTS valid_closed_batches AS + SELECT * FROM valid_batches WHERE sealed_at_ms IS NOT NULL; + +-- At most one row by the partial unique index above. +CREATE VIEW IF NOT EXISTS valid_open_batch AS + SELECT * FROM valid_batches WHERE sealed_at_ms IS NULL; + +-- ── Triggers ─────────────────────────────────────────────────────────────── +-- +-- These enforce invariants the writer could otherwise violate with a bug. +-- Keep them declarative: each one names an invariant and refuses writes that +-- would break it. The Rust writer is still the source of truth for the +-- transition sequence — triggers just ensure the DB never reaches an +-- inconsistent state if the writer misbehaves. + +-- Nonce contiguity: `nonce = parent.nonce + 1`, or 0 for genesis. +CREATE TRIGGER IF NOT EXISTS trg_enforce_nonce_contiguity +AFTER INSERT ON batches +FOR EACH ROW +BEGIN + SELECT CASE + WHEN NEW.parent_batch_index IS NULL AND NEW.nonce != 0 + THEN RAISE(ABORT, 'genesis batch must have nonce 0') + WHEN NEW.parent_batch_index IS NOT NULL + AND NEW.nonce != (SELECT nonce + 1 FROM batches WHERE batch_index = NEW.parent_batch_index) + THEN RAISE(ABORT, 'batch nonce must equal parent.nonce + 1') + END; +END; + +-- Write-once: sealed_at_ms transitions only NULL → non-NULL. +CREATE TRIGGER IF NOT EXISTS trg_sealed_at_ms_write_once +BEFORE UPDATE OF sealed_at_ms ON batches +FOR EACH ROW +WHEN OLD.sealed_at_ms IS NOT NULL +BEGIN + SELECT RAISE(ABORT, 'sealed_at_ms is write-once'); +END; + +-- Write-once: invalidated_at_ms transitions only NULL → non-NULL. 
+CREATE TRIGGER IF NOT EXISTS trg_invalidated_at_ms_write_once +BEFORE UPDATE OF invalidated_at_ms ON batches +FOR EACH ROW +WHEN OLD.invalidated_at_ms IS NOT NULL +BEGIN + SELECT RAISE(ABORT, 'invalidated_at_ms is write-once'); +END; + +-- parent_batch_index is immutable after insert. +CREATE TRIGGER IF NOT EXISTS trg_parent_batch_index_immutable +BEFORE UPDATE OF parent_batch_index ON batches +FOR EACH ROW +WHEN (OLD.parent_batch_index IS NULL) != (NEW.parent_batch_index IS NULL) + OR OLD.parent_batch_index IS NOT NULL AND NEW.parent_batch_index IS NOT NULL + AND OLD.parent_batch_index != NEW.parent_batch_index +BEGIN + SELECT RAISE(ABORT, 'parent_batch_index is immutable'); +END; + +-- nonce is immutable after insert. +CREATE TRIGGER IF NOT EXISTS trg_nonce_immutable +BEFORE UPDATE OF nonce ON batches +FOR EACH ROW +WHEN OLD.nonce != NEW.nonce +BEGIN + SELECT RAISE(ABORT, 'nonce is immutable'); +END; + +-- --------------------------------------------------------------------------- +-- Frames and user ops: must target the current Tip. +-- +-- These catch "stale WriteHead" bugs — where a writer holds an in-memory +-- batch_index that's no longer the Tip (sealed or invalidated between reads). +-- A PK lookup per row: microseconds, negligible overhead even on hot paths. +-- --------------------------------------------------------------------------- + CREATE TABLE IF NOT EXISTS frames ( batch_index INTEGER NOT NULL REFERENCES batches(batch_index), - frame_in_batch INTEGER NOT NULL, + frame_in_batch INTEGER NOT NULL CHECK (frame_in_batch >= 0), created_at_ms INTEGER NOT NULL, -- Fee committed by the sequencer for this whole frame. 
fee INTEGER NOT NULL CHECK (fee >= 0), @@ -14,21 +136,46 @@ CREATE TABLE IF NOT EXISTS frames ( PRIMARY KEY(batch_index, frame_in_batch) ); +CREATE TRIGGER IF NOT EXISTS trg_frames_target_must_be_tip +BEFORE INSERT ON frames +FOR EACH ROW +WHEN NOT EXISTS ( + SELECT 1 FROM batches + WHERE batch_index = NEW.batch_index + AND sealed_at_ms IS NULL + AND invalidated_at_ms IS NULL +) +BEGIN + SELECT RAISE(ABORT, 'frames can only be inserted into the current Tip'); +END; + CREATE TABLE IF NOT EXISTS user_ops ( batch_index INTEGER NOT NULL, frame_in_batch INTEGER NOT NULL, - pos_in_frame INTEGER NOT NULL, - sender BLOB NOT NULL, - nonce INTEGER NOT NULL, - max_fee INTEGER NOT NULL, + pos_in_frame INTEGER NOT NULL CHECK (pos_in_frame >= 0), + sender BLOB NOT NULL CHECK (length(sender) = 20), + nonce INTEGER NOT NULL CHECK (nonce >= 0), + max_fee INTEGER NOT NULL CHECK (max_fee >= 0), data BLOB NOT NULL, - sig BLOB NOT NULL, + sig BLOB NOT NULL CHECK (length(sig) = 65), received_at_ms INTEGER NOT NULL, PRIMARY KEY(batch_index, frame_in_batch, pos_in_frame), - FOREIGN KEY(batch_index, frame_in_batch) REFERENCES frames(batch_index, frame_in_batch), - UNIQUE(sender, nonce) + FOREIGN KEY(batch_index, frame_in_batch) REFERENCES frames(batch_index, frame_in_batch) ); +CREATE TRIGGER IF NOT EXISTS trg_user_ops_target_must_be_tip +BEFORE INSERT ON user_ops +FOR EACH ROW +WHEN NOT EXISTS ( + SELECT 1 FROM batches + WHERE batch_index = NEW.batch_index + AND sealed_at_ms IS NULL + AND invalidated_at_ms IS NULL +) +BEGIN + SELECT RAISE(ABORT, 'user_ops can only be inserted into the current Tip'); +END; + -- Automatically sequence every user-op into the global replay order on insert. 
-- Note: safe_inputs do NOT have an analogous trigger because their -- batch_index/frame_in_batch are not known at INSERT time — safe inputs @@ -50,6 +197,9 @@ CREATE TABLE IF NOT EXISTS safe_inputs ( block_number INTEGER NOT NULL CHECK (block_number >= 0) ); +CREATE INDEX IF NOT EXISTS idx_safe_inputs_sender + ON safe_inputs(sender); + -- Global append-only replay order consumed by catch-up and feed readers. -- It is a cache, containing the merged and flattened txs of safe_inputs and user_ops. CREATE TABLE IF NOT EXISTS sequenced_l2_txs ( @@ -77,22 +227,72 @@ CREATE TABLE IF NOT EXISTS sequenced_l2_txs ( ), -- At most one sequenced user-op row for each user-op key. - UNIQUE(batch_index, frame_in_batch, user_op_pos_in_frame), - -- A direct input can only be sequenced once. - UNIQUE(safe_input_index) + UNIQUE(batch_index, frame_in_batch, user_op_pos_in_frame) + -- A direct input may be sequenced more than once if its original batch is + -- invalidated and a recovery batch re-drains it. The read-side query filters + -- out rows from invalid batches, so only the latest valid drain is visible. + -- (No UNIQUE constraint on safe_input_index.) ); +CREATE TRIGGER IF NOT EXISTS trg_sequenced_l2_txs_target_must_be_tip +BEFORE INSERT ON sequenced_l2_txs +FOR EACH ROW +WHEN NOT EXISTS ( + SELECT 1 FROM batches + WHERE batch_index = NEW.batch_index + AND sealed_at_ms IS NULL + AND invalidated_at_ms IS NULL +) +BEGIN + SELECT RAISE(ABORT, 'sequenced_l2_txs can only target the current Tip'); +END; + CREATE INDEX IF NOT EXISTS idx_sequenced_l2_txs_frame ON sequenced_l2_txs(batch_index, frame_in_batch); +-- Partial index for efficient MAX(safe_input_index) lookups used to compute +-- the next undrained direct-input cursor at frame-close time. 
+CREATE INDEX IF NOT EXISTS idx_sequenced_l2_txs_safe_input + ON sequenced_l2_txs(safe_input_index) WHERE safe_input_index IS NOT NULL; + +CREATE VIEW IF NOT EXISTS valid_sequenced_l2_txs AS +SELECT * FROM sequenced_l2_txs +WHERE batch_index NOT IN (SELECT batch_index FROM batches WHERE invalidated_at_ms IS NOT NULL); + +-- Derived log of batch submissions the scheduler would actually execute. +-- Unlike a raw log of all safe submissions, this only contains the accepted +-- prefix: batches whose nonce matched the expected sequence and were not stale. +-- Maintained atomically by Storage::append_safe_inputs (via +-- populate_safe_accepted_batches_inner), which simulates the scheduler's +-- acceptance logic over new safe_inputs rows. +CREATE TABLE IF NOT EXISTS safe_accepted_batches ( + safe_input_index INTEGER PRIMARY KEY REFERENCES safe_inputs(safe_input_index), + nonce INTEGER NOT NULL, + first_frame_safe_block INTEGER NOT NULL, + inclusion_block INTEGER NOT NULL +); + CREATE TABLE IF NOT EXISTS l1_safe_head ( singleton_id INTEGER PRIMARY KEY CHECK (singleton_id = 0), -- Highest L1 safe block the input reader has observed and atomically synced into storage. - block_number INTEGER NOT NULL CHECK (block_number >= 0) + block_number INTEGER NOT NULL CHECK (block_number >= 0), + -- Wall-clock time (Unix ms) of the last successful L1 sync. + -- Used for wall-clock danger estimation when L1 is unreachable. + synced_at_ms INTEGER NOT NULL DEFAULT 0 +); + +INSERT OR IGNORE INTO l1_safe_head (singleton_id, block_number, synced_at_ms) +VALUES (0, 0, 0); + +-- L1 bootstrap cache: discovered addresses and block numbers from on-chain contracts. +-- Allows the sequencer to start without L1 if it has run before. 
+CREATE TABLE IF NOT EXISTS l1_bootstrap_cache ( + singleton_id INTEGER PRIMARY KEY CHECK (singleton_id = 0), + input_box_address BLOB NOT NULL CHECK (length(input_box_address) = 20), + genesis_block INTEGER NOT NULL CHECK (genesis_block >= 0), + chain_id INTEGER NOT NULL CHECK (chain_id > 0) ); -INSERT OR IGNORE INTO l1_safe_head (singleton_id, block_number) -VALUES (0, 0); -- --------------------------------------------------------------------------- -- Batch policy singleton diff --git a/sequencer/src/storage/mod.rs b/sequencer/src/storage/mod.rs index c3fb30f..bce53e5 100644 --- a/sequencer/src/storage/mod.rs +++ b/sequencer/src/storage/mod.rs @@ -1,14 +1,52 @@ // (c) Cartesi and individual authors (see AUTHORS) // SPDX-License-Identifier: Apache-2.0 (see LICENSE) -mod db; -mod sql; +//! SQLite-backed storage for the sequencer. +//! +//! [`Storage`] is the single entry point. Methods are clustered by caller role +//! across sibling files — mostly "one file per writer", plus one read-only +//! batch-aggregate file that two roles share: +//! +//! - `ingress` — inclusion lane: user-op append, frame/batch close +//! - `egress` — WS feed and catch-up replay (read-only) +//! - `l1_inputs` — input reader: safe-input ingestion, L1 head, bootstrap cache +//! - `l1_submission` — batch-aggregate reads (submitter frontier, pending +//! batches, per-batch replay) shared between the submitter and egress +//! - `recovery` — cascade invalidation, recovery-batch open, danger checks +//! - `admin` — operator policy tunables (gas price, alpha) +//! +//! Cross-writer helpers are split by concern: +//! +//! - `convert` — int width + time conversions +//! - `queries` — shared read helpers (`query_*`, `load_current_write_head`) +//! - `mutations` — shared write helpers (`insert_new_batch`, `seal_batch`, …) +//! +//! The schema and `valid_*` views live in `migrations/0001_schema.sql`. See +//! `docs/recovery/README.md` for the recovery design and TLA+ specs. 
+ +mod admin; +mod convert; +mod egress; +mod ingress; +mod l1_inputs; +mod l1_submission; +mod mutations; +mod open; +mod queries; +mod recovery; +mod safe_accepted_batches; + +#[cfg(test)] +pub(crate) mod test_helpers; use std::time::SystemTime; use thiserror::Error; -pub use db::Storage; +pub use open::Storage; +pub use recovery::DangerStatus; +/// One safe input as stored on the L1 InputBox: sender, opaque payload, and +/// the L1 block where it was included. #[derive(Debug, Clone, PartialEq, Eq)] pub struct StoredSafeInput { pub sender: alloy_primitives::Address, @@ -17,10 +55,16 @@ pub struct StoredSafeInput { pub block_number: u64, } +/// Half-open range `[start, end)` over `safe_input_index` values. Used to +/// describe which safe inputs a frame drained. +/// +/// Fields are private so the `new`-time invariant (`end >= start`) can't be +/// broken by direct mutation. Read via [`start`](Self::start) / +/// [`end`](Self::end); construct via [`new`](Self::new) / [`empty_at`](Self::empty_at). #[derive(Debug, Clone, Copy, PartialEq, Eq)] pub struct SafeInputRange { - pub start_inclusive: u64, - pub end_exclusive: u64, + start_inclusive: u64, + end_exclusive: u64, } impl SafeInputRange { @@ -39,21 +83,76 @@ impl SafeInputRange { Self::new(index, index) } + /// Extend the range forward, producing `[self.end, new_end)`. Panics if + /// `new_end < self.end` — this is the "advance" direction only. pub fn advance_to(self, end_exclusive: u64) -> Self { Self::new(self.end_exclusive, end_exclusive) } + pub fn start(self) -> u64 { + self.start_inclusive + } + + pub fn end(self) -> u64 { + self.end_exclusive + } + pub fn is_empty(self) -> bool { self.start_inclusive == self.end_exclusive } + + /// Split the range into consecutive sub-ranges of at most `max_len` + /// elements. The last chunk may be shorter. Yields nothing if empty. 
+    pub fn chunks(self, max_len: u64) -> SafeInputRangeChunks { + assert!(max_len > 0, "chunk size must be positive"); + SafeInputRangeChunks { + cursor: self.start_inclusive, + end: self.end_exclusive, + max_len, + } + } +} + +/// Iterator returned by [`SafeInputRange::chunks`]. +pub struct SafeInputRangeChunks { + cursor: u64, + end: u64, + max_len: u64, +} + +impl Iterator for SafeInputRangeChunks { + type Item = SafeInputRange; + + fn next(&mut self) -> Option<Self::Item> { + if self.cursor >= self.end { + return None; + } + let chunk_end = self.end.min(self.cursor.saturating_add(self.max_len)); + let chunk = SafeInputRange::new(self.cursor, chunk_end); + self.cursor = chunk_end; + Some(chunk) + } } +/// Snapshot of the L1 view: current safe block plus the exclusive cursor into +/// `safe_inputs`. Read by the inclusion lane to decide when to advance. #[derive(Debug, Clone, Copy, PartialEq, Eq)] -pub struct SafeFrontier { +pub struct SafeInputFrontier { pub safe_block: u64, pub end_exclusive: u64, } +/// Snapshot of the scheduler-accepted frontier: current safe block plus the +/// next nonce the scheduler is expected to accept. Read by the batch submitter +/// each tick to derive the next unresolved nonce. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub struct SubmitterFrontier { + pub safe_block: u64, + pub accepted_next_nonce: u64, +} + +/// Per-frame metadata: position within batch, committed fee, and the +/// safe-block boundary the frame draws against. #[derive(Debug, Clone, Copy, PartialEq, Eq)] pub struct FrameHeader { pub frame_in_batch: u32, @@ -62,6 +161,16 @@ pub safe_block: u64, } +/// A batch ready for L1 submission: its local index, assigned nonce, and SSZ-encoded payload. +#[derive(Debug)] +pub struct PendingBatch { + pub batch_index: u64, + pub nonce: u64, + pub encoded: Vec<u8>, +} + +/// Returned by [`Storage::open`] and friends; either the SQLite handle failed +/// to open or migrations refused to apply.
#[derive(Debug, Error)] pub enum StorageOpenError { #[error(transparent)] @@ -80,6 +189,9 @@ pub struct BatchPolicy { pub batch_size_target: u16, } +/// In-memory mirror of the latest open batch + frame. Mutated by `Storage` +/// methods that change the open state (`append_user_ops_chunk`, `close_*`). +/// The lane keeps one `WriteHead` and threads it through every call. #[derive(Debug, Clone, Copy)] pub struct WriteHead { pub batch_index: u64, diff --git a/sequencer/src/storage/mutations.rs b/sequencer/src/storage/mutations.rs new file mode 100644 index 0000000..39203f1 --- /dev/null +++ b/sequencer/src/storage/mutations.rs @@ -0,0 +1,134 @@ +// (c) Cartesi and individual authors (see AUTHORS) +// SPDX-License-Identifier: Apache-2.0 (see LICENSE) + +//! Write-side helpers shared across writer-role files. +//! +//! Like [`super::queries`] these take `&Transaction` so they compose inside a +//! larger atomic unit. The two consumers today are ingress (batch/frame close +//! + re-drain) and recovery (opening a recovery batch after cascade). + +use rusqlite::{Result, Transaction, params}; + +use super::SafeInputRange; +use super::convert::{i64_to_u64, u64_to_i64}; + +/// Insert a new batch. Nonce is derived from `parent_batch_index`: +/// `parent.nonce + 1`, or 0 if `parent_batch_index` is None (genesis or +/// post-cascade torn-state new Tip). +/// +/// If `batch_index_opt` is None, SQLite auto-assigns (highest existing +1). +/// The explicit form is used only by `initialize_open_state` to pin the +/// very first genesis batch at `batch_index = 0`. +/// +/// The `trg_enforce_nonce_contiguity` trigger verifies the nonce matches +/// `parent.nonce + 1`, so caller and schema agree. 
+pub(super) fn insert_new_batch( + tx: &Transaction<'_>, + batch_index_opt: Option<u64>, + parent_batch_index: Option<u64>, + created_at_ms: i64, +) -> Result<u64> { + let nonce = compute_next_nonce(tx, parent_batch_index)?; + match batch_index_opt { + Some(bi) => { + tx.execute( + "INSERT INTO batches (batch_index, parent_batch_index, nonce, created_at_ms) \ + VALUES (?1, ?2, ?3, ?4)", + params![ + u64_to_i64(bi), + parent_batch_index.map(u64_to_i64), + u64_to_i64(nonce), + created_at_ms + ], + )?; + Ok(bi) + } + None => { + tx.execute( + "INSERT INTO batches (parent_batch_index, nonce, created_at_ms) \ + VALUES (?1, ?2, ?3)", + params![ + parent_batch_index.map(u64_to_i64), + u64_to_i64(nonce), + created_at_ms + ], + )?; + Ok(i64_to_u64(tx.last_insert_rowid())) + } + } +} + +fn compute_next_nonce(tx: &Transaction<'_>, parent_batch_index: Option<u64>) -> Result<u64> { + match parent_batch_index { + None => Ok(0), + Some(parent_bi) => { + let parent_nonce: i64 = tx.query_row( + "SELECT nonce FROM batches WHERE batch_index = ?1", + params![u64_to_i64(parent_bi)], + |row| row.get(0), + )?; + Ok(i64_to_u64(parent_nonce).saturating_add(1)) + } + } +} + +/// Mark a batch as sealed (inclusion lane closed it). Write-once per the +/// `trg_sealed_at_ms_write_once` trigger.
+pub(super) fn seal_batch(tx: &Transaction<'_>, batch_index: u64, sealed_at_ms: i64) -> Result<()> { + let changed = tx.execute( + "UPDATE batches SET sealed_at_ms = ?1 WHERE batch_index = ?2", + params![sealed_at_ms, u64_to_i64(batch_index)], + )?; + if changed != 1 { + return Err(rusqlite::Error::StatementChangedRows(changed)); + } + Ok(()) +} + +pub(super) fn insert_open_frame( + tx: &Transaction<'_>, + batch_index: u64, + frame_in_batch: u32, + created_at_ms: i64, + frame_fee: u16, + safe_block: u64, +) -> Result<()> { + tx.execute( + "INSERT INTO frames (batch_index, frame_in_batch, created_at_ms, fee, safe_block) \ + VALUES (?1, ?2, ?3, ?4, ?5)", + params![ + u64_to_i64(batch_index), + i64::from(frame_in_batch), + created_at_ms, + i64::from(frame_fee), + u64_to_i64(safe_block), + ], + )?; + Ok(()) +} + +/// Insert one `sequenced_l2_txs` row per safe-input index in `range` for the +/// given (batch, frame). Used by ingress (frame close) and recovery (re-drain +/// after cascade invalidation). +pub(super) fn persist_frame_direct_sequence( + tx: &Transaction<'_>, + batch_index: u64, + frame_in_batch: u32, + range: SafeInputRange, +) -> Result<()> { + if range.is_empty() { + return Ok(()); + } + let mut stmt = tx.prepare_cached( + "INSERT INTO sequenced_l2_txs (batch_index, frame_in_batch, user_op_pos_in_frame, safe_input_index) \ + VALUES (?1, ?2, NULL, ?3)", + )?; + for safe_input_index in range.start()..range.end() { + stmt.execute(params![ + u64_to_i64(batch_index), + i64::from(frame_in_batch), + u64_to_i64(safe_input_index), + ])?; + } + Ok(()) +} diff --git a/sequencer/src/storage/open.rs b/sequencer/src/storage/open.rs new file mode 100644 index 0000000..4a63514 --- /dev/null +++ b/sequencer/src/storage/open.rs @@ -0,0 +1,123 @@ +// (c) Cartesi and individual authors (see AUTHORS) +// SPDX-License-Identifier: Apache-2.0 (see LICENSE) + +//! `Storage` struct definition plus connection-open and migration entry points. +//! +//! 
Method clusters live in sibling files (`ingress`, `egress`, `l1_inputs`, +//! `l1_submission`, `recovery`, `admin`) — each adds its own `impl Storage`. + +use rusqlite::{Connection, OpenFlags, Result, Transaction, TransactionBehavior}; +use rusqlite_migration::{M, Migrations}; + +use super::StorageOpenError; + +const MIGRATION_0001_SCHEMA: &str = include_str!("migrations/0001_schema.sql"); + +/// SQLite `synchronous` pragma used by every production writer connection. +/// `NORMAL` is appropriate under WAL — fsyncs at checkpoint boundaries, not +/// per-transaction. Tests use the same value; if a future test needs +/// `FULL`/`OFF`, add a `#[cfg(test)]` override. +const SYNCHRONOUS_PRAGMA: &str = "NORMAL"; + +/// Sequencer storage backed by a single SQLite database. +/// +/// All methods take `&mut self` to enforce exclusive access at the Rust level, +/// matching SQLite's single-writer model. Read-only access uses a separate +/// `Storage` instance opened via [`Storage::open_read_only`]. +pub struct Storage { + pub(super) conn: Connection, +} + +impl Storage { + /// Production open: runs migrations, uses the canonical synchronous pragma. + pub fn open(path: &str) -> Result<Self> { + let mut conn = open_writer_connection(path)?; + run_migrations(&mut conn)?; + Ok(Self { conn }) + } + + /// Read-only handle. Uses a 50ms `busy_timeout` (vs. 5s for writers) so + /// readers fail fast under write pressure and don't block on hot paths. + pub fn open_read_only(path: &str) -> Result<Self> { + let conn = open_reader_connection(path)?; + Ok(Self { conn }) + } + + /// Test-only: open without running migrations. Lets tests pre-seed the + /// schema before the migration runner touches it. + #[cfg(test)] + pub fn open_without_migrations(path: &str) -> Result<Self> { + let conn = open_writer_connection(path)?; + Ok(Self { conn }) + } + + /// Test-only: return a raw `Connection` with the same pragmas as + /// [`Storage::open`].
Used by tests that need to reach past the typed API + /// (e.g., rewinding `synced_at_ms`, installing failure triggers). + #[cfg(test)] + pub fn open_connection(path: &str) -> std::result::Result<Connection, rusqlite::Error> { + open_writer_connection(path) + } + + /// Run `f` inside a Deferred transaction, commit on success. For pure reads. + /// + /// Using Deferred rather than Immediate matches SQLite's default — readers + /// don't hold a write lock and don't block writers. If `f` returns `Err` + /// the transaction is dropped unsent (auto-rollback); on success the + /// commit is issued before returning `Ok`. + pub fn read<T, F>(&mut self, f: F) -> Result<T> + where + F: FnOnce(&Transaction<'_>) -> Result<T>, + { + let tx = self + .conn + .transaction_with_behavior(TransactionBehavior::Deferred)?; + let out = f(&tx)?; + tx.commit()?; + Ok(out) + } + + /// Run `f` inside an Immediate transaction, commit on success. For any + /// mutation. + /// + /// Using Immediate acquires the write lock upfront so contending writers + /// see `SQLITE_BUSY` immediately rather than mid-transaction — this is + /// the right cadence under WAL + single-writer discipline. Same commit / + /// auto-rollback semantics as [`Storage::read`]. + pub fn write<T, F>(&mut self, f: F) -> Result<T> + where + F: FnOnce(&Transaction<'_>) -> Result<T>, + { + let tx = self + .conn + .transaction_with_behavior(TransactionBehavior::Immediate)?; + let out = f(&tx)?; + tx.commit()?; + Ok(out) + } +} + +/// Open a read-write connection with WAL + `NORMAL` sync + 5s busy timeout. +fn open_writer_connection(path: &str) -> Result<Connection> { + let conn = Connection::open(path)?; + conn.pragma_update(None, "foreign_keys", "ON")?; + conn.pragma_update(None, "journal_mode", "WAL")?; + conn.pragma_update(None, "synchronous", SYNCHRONOUS_PRAGMA)?; + conn.pragma_update(None, "busy_timeout", 5000)?; + Ok(conn) +} + +/// Open a read-only connection with `query_only` + 50ms busy timeout.
+fn open_reader_connection(path: &str) -> Result<Connection> { + let conn = Connection::open_with_flags(path, OpenFlags::SQLITE_OPEN_READ_ONLY)?; + conn.pragma_update(None, "query_only", "ON")?; + conn.pragma_update(None, "busy_timeout", 50)?; + Ok(conn) +} + +/// Apply all migrations. Package-private — callers use [`Storage::open`] +/// which runs this automatically. +pub(super) fn run_migrations(conn: &mut Connection) -> Result<(), StorageOpenError> { + Migrations::from_slice(&[M::up(MIGRATION_0001_SCHEMA)]).to_latest(conn)?; + Ok(()) +} diff --git a/sequencer/src/storage/queries.rs b/sequencer/src/storage/queries.rs new file mode 100644 index 0000000..c8ddd61 --- /dev/null +++ b/sequencer/src/storage/queries.rs @@ -0,0 +1,149 @@ +// (c) Cartesi and individual authors (see AUTHORS) +// SPDX-License-Identifier: Apache-2.0 (see LICENSE) + +//! Read-side helpers shared across writer-role files. +//! +//! These take a `&Connection` (or `&Transaction`, which derefs) rather than +//! `&mut Storage`, so they can compose inside a larger transaction built by +//! any writer role. Single-caller reads stay inline in the writer that owns +//! them; only the reads reused by two or more roles live here. + +use alloy_primitives::Address; +use rusqlite::{Connection, Result, Transaction, params}; + +use super::convert::{from_unix_ms, i64_to_u16, i64_to_u32, i64_to_u64}; +use super::{BatchPolicy, WriteHead}; +use sequencer_core::l2_tx::{DirectInput, SequencedL2Tx, ValidUserOp}; + +// ── Write-head loading ─────────────────────────────────────────────────── +// +// Used by ingress (initialize/resume open state) and recovery (open recovery +// batch after cascade). The WriteHead is the in-memory mirror of the latest +// open batch/frame and must always match what's persisted in `batches` and +// `frames`. + +pub(super) fn load_current_write_head(tx: &Transaction<'_>) -> Result<Option<WriteHead>> { + // The Tip is the single row in `valid_open_batch` (enforced by + // `ux_single_valid_tip`).
Returns None if there's no Tip (fresh DB, + // or torn state between cascade and recovery-batch open). + let latest_batch = match tx.query_row( + "SELECT + b.batch_index, + b.created_at_ms, + (SELECT COUNT(*) FROM user_ops u WHERE u.batch_index = b.batch_index) AS user_op_count + FROM valid_open_batch b", + [], + |row| { + Ok(( + row.get::<_, i64>(0)?, + row.get::<_, i64>(1)?, + row.get::<_, i64>(2)?, + )) + }, + ) { + Ok(row) => row, + Err(rusqlite::Error::QueryReturnedNoRows) => return Ok(None), + Err(other) => return Err(other), + }; + let (batch_index_i64, batch_created_at_ms, batch_user_op_count_i64) = latest_batch; + + let (frame_in_batch_i64, frame_fee_i64, safe_block_i64): (i64, i64, i64) = tx.query_row( + "SELECT frame_in_batch, fee, safe_block FROM frames \ + WHERE batch_index = ?1 ORDER BY frame_in_batch DESC LIMIT 1", + params![batch_index_i64], + |row| Ok((row.get(0)?, row.get(1)?, row.get(2)?)), + )?; + + let open_frame_user_op_count: i64 = tx.query_row( + "SELECT COUNT(*) FROM user_ops WHERE batch_index = ?1 AND frame_in_batch = ?2", + params![batch_index_i64, frame_in_batch_i64], + |row| row.get(0), + )?; + + let policy = query_batch_policy(tx)?; + Ok(Some(WriteHead { + batch_index: i64_to_u64(batch_index_i64), + batch_created_at: from_unix_ms(batch_created_at_ms), + frame_fee: i64_to_u16(frame_fee_i64), + safe_block: i64_to_u64(safe_block_i64), + batch_user_op_count: i64_to_u64(batch_user_op_count_i64), + open_frame_user_op_count: i64_to_u32(open_frame_user_op_count), + frame_in_batch: i64_to_u32(frame_in_batch_i64), + max_batch_user_op_bytes: super::batch_size_target_bytes(policy), + })) +} + +// ── Cross-writer scalar reads ───────────────────────────────────────────── + +pub(super) fn query_latest_safe_input_index_exclusive(conn: &Connection) -> Result<u64> { + let value: Option<i64> = + conn.query_row("SELECT MAX(safe_input_index) FROM safe_inputs", [], |row| { + row.get(0) + })?; + Ok(match value { + Some(last_index) =>
i64_to_u64(last_index).saturating_add(1), + None => 0, + }) +} + +pub(super) fn query_current_safe_block(conn: &Connection) -> Result<u64> { + let value: i64 = conn.query_row( + "SELECT block_number FROM l1_safe_head WHERE singleton_id = 0 LIMIT 1", + [], + |row| row.get(0), + )?; + Ok(i64_to_u64(value)) +} + +pub(super) fn query_batch_policy(conn: &Connection) -> Result<BatchPolicy> { + let (log_recommended_fee, log_batch_size_target): (i64, i64) = conn.query_row( + "SELECT log_recommended_fee, log_batch_size_target FROM batch_policy_derived \ + WHERE singleton_id = 0 LIMIT 1", + [], + |row| Ok((row.get(0)?, row.get(1)?)), + )?; + let max_exp = sequencer_core::fee::MAX_EXPONENT; + Ok(BatchPolicy { + // Clamp to MAX_EXPONENT to prevent panics in fee_to_linear. + recommended_fee: i64_to_u16(log_recommended_fee).min(max_exp), + batch_size_target: i64_to_u16(log_batch_size_target).min(max_exp), + }) +} + +// ── Ordered L2-tx row decoding ─────────────────────────────────────────── +// +// Used by egress paging and the per-batch replay reader. Each caller builds +// the row shape inside its own `query_map` closure and hands the fields to +// this decoder rather than defining an intermediate struct. + +pub(super) fn decode_l2_tx_row( + kind: i64, + sender: Option<Vec<u8>>, + data: Option<Vec<u8>>, + fee: Option<i64>, + payload: Option<Vec<u8>>, + block_number: Option<i64>, +) -> SequencedL2Tx { + let sender_bytes = sender.expect("ordered replay row: missing sender"); + assert_eq!( + sender_bytes.len(), + 20, + "ordered replay row: sender must be 20 bytes" + ); + if kind == 0 { + SequencedL2Tx::UserOp(ValidUserOp { + sender: Address::from_slice(sender_bytes.as_slice()), + // Replay uses the persisted frame fee (log-space exponent) to mirror canonical execution.
+ fee: i64_to_u16(fee.expect("ordered replay row: missing fee")), + data: data.expect("ordered replay row: missing data"), + }) + } else { + SequencedL2Tx::Direct(DirectInput { + sender: Address::from_slice(sender_bytes.as_slice()), + block_number: i64_to_u64( + block_number.expect("ordered replay row: missing block_number"), + ), + payload: payload.expect("ordered replay row: missing payload"), + }) + } +} diff --git a/sequencer/src/storage/queries/insert_sequenced_direct_input.sql b/sequencer/src/storage/queries/insert_sequenced_direct_input.sql deleted file mode 100644 index b382c5a..0000000 --- a/sequencer/src/storage/queries/insert_sequenced_direct_input.sql +++ /dev/null @@ -1,6 +0,0 @@ -INSERT INTO sequenced_l2_txs ( - batch_index, - frame_in_batch, - user_op_pos_in_frame, - safe_input_index -) VALUES (?1, ?2, NULL, ?3) diff --git a/sequencer/src/storage/queries/insert_user_op.sql b/sequencer/src/storage/queries/insert_user_op.sql deleted file mode 100644 index d86a72a..0000000 --- a/sequencer/src/storage/queries/insert_user_op.sql +++ /dev/null @@ -1,11 +0,0 @@ -INSERT INTO user_ops ( - batch_index, - frame_in_batch, - pos_in_frame, - sender, - nonce, - max_fee, - data, - sig, - received_at_ms -) VALUES (?1, ?2, ?3, ?4, ?5, ?6, ?7, ?8, ?9) diff --git a/sequencer/src/storage/queries/select_latest_batch_with_user_op_count.sql b/sequencer/src/storage/queries/select_latest_batch_with_user_op_count.sql deleted file mode 100644 index ca7f9d0..0000000 --- a/sequencer/src/storage/queries/select_latest_batch_with_user_op_count.sql +++ /dev/null @@ -1,11 +0,0 @@ -SELECT - b.batch_index, - b.created_at_ms, - ( - SELECT COUNT(*) - FROM user_ops u - WHERE u.batch_index = b.batch_index - ) AS user_op_count -FROM batches b -ORDER BY b.batch_index DESC -LIMIT 1 diff --git a/sequencer/src/storage/queries/select_latest_frame_in_batch_for_batch.sql b/sequencer/src/storage/queries/select_latest_frame_in_batch_for_batch.sql deleted file mode 100644 index c2b5a43..0000000 --- 
a/sequencer/src/storage/queries/select_latest_frame_in_batch_for_batch.sql +++ /dev/null @@ -1,8 +0,0 @@ -SELECT - f.frame_in_batch, - f.fee, - f.safe_block -FROM frames f -WHERE f.batch_index = ?1 -ORDER BY f.frame_in_batch DESC -LIMIT 1 diff --git a/sequencer/src/storage/queries/select_ordered_l2_txs_for_batch.sql b/sequencer/src/storage/queries/select_ordered_l2_txs_for_batch.sql deleted file mode 100644 index 3dd8361..0000000 --- a/sequencer/src/storage/queries/select_ordered_l2_txs_for_batch.sql +++ /dev/null @@ -1,23 +0,0 @@ -SELECT - CASE WHEN s.user_op_pos_in_frame IS NOT NULL THEN 0 ELSE 1 END AS kind, - CASE - WHEN s.user_op_pos_in_frame IS NOT NULL THEN u.sender - WHEN s.safe_input_index IS NOT NULL THEN d.sender - ELSE NULL - END AS sender, - CASE WHEN s.user_op_pos_in_frame IS NOT NULL THEN u.data ELSE NULL END AS data, - CASE WHEN s.user_op_pos_in_frame IS NOT NULL THEN f.fee ELSE NULL END AS fee, - CASE WHEN s.safe_input_index IS NOT NULL THEN d.payload ELSE NULL END AS payload, - CASE WHEN s.safe_input_index IS NOT NULL THEN d.block_number ELSE NULL END AS block_number -FROM sequenced_l2_txs s -LEFT JOIN user_ops u - ON u.batch_index = s.batch_index - AND u.frame_in_batch = s.frame_in_batch - AND u.pos_in_frame = s.user_op_pos_in_frame -LEFT JOIN frames f - ON f.batch_index = s.batch_index - AND f.frame_in_batch = s.frame_in_batch -LEFT JOIN safe_inputs d - ON d.safe_input_index = s.safe_input_index -WHERE s.batch_index = ?1 -ORDER BY s.offset ASC diff --git a/sequencer/src/storage/queries/select_ordered_l2_txs_from_offset.sql b/sequencer/src/storage/queries/select_ordered_l2_txs_from_offset.sql deleted file mode 100644 index 5c3d52a..0000000 --- a/sequencer/src/storage/queries/select_ordered_l2_txs_from_offset.sql +++ /dev/null @@ -1,23 +0,0 @@ -SELECT - CASE WHEN s.user_op_pos_in_frame IS NOT NULL THEN 0 ELSE 1 END AS kind, - CASE - WHEN s.user_op_pos_in_frame IS NOT NULL THEN u.sender - WHEN s.safe_input_index IS NOT NULL THEN d.sender - ELSE 
NULL - END AS sender, - CASE WHEN s.user_op_pos_in_frame IS NOT NULL THEN u.data ELSE NULL END AS data, - CASE WHEN s.user_op_pos_in_frame IS NOT NULL THEN f.fee ELSE NULL END AS fee, - CASE WHEN s.safe_input_index IS NOT NULL THEN d.payload ELSE NULL END AS payload, - CASE WHEN s.safe_input_index IS NOT NULL THEN d.block_number ELSE NULL END AS block_number -FROM sequenced_l2_txs s -LEFT JOIN user_ops u - ON u.batch_index = s.batch_index - AND u.frame_in_batch = s.frame_in_batch - AND u.pos_in_frame = s.user_op_pos_in_frame -LEFT JOIN frames f - ON f.batch_index = s.batch_index - AND f.frame_in_batch = s.frame_in_batch -LEFT JOIN safe_inputs d - ON d.safe_input_index = s.safe_input_index -WHERE s.offset > ?1 -ORDER BY s.offset ASC diff --git a/sequencer/src/storage/queries/select_ordered_l2_txs_page_from_offset.sql b/sequencer/src/storage/queries/select_ordered_l2_txs_page_from_offset.sql deleted file mode 100644 index 9b3d8a6..0000000 --- a/sequencer/src/storage/queries/select_ordered_l2_txs_page_from_offset.sql +++ /dev/null @@ -1,24 +0,0 @@ -SELECT - CASE WHEN s.user_op_pos_in_frame IS NOT NULL THEN 0 ELSE 1 END AS kind, - CASE - WHEN s.user_op_pos_in_frame IS NOT NULL THEN u.sender - WHEN s.safe_input_index IS NOT NULL THEN d.sender - ELSE NULL - END AS sender, - CASE WHEN s.user_op_pos_in_frame IS NOT NULL THEN u.data ELSE NULL END AS data, - CASE WHEN s.user_op_pos_in_frame IS NOT NULL THEN f.fee ELSE NULL END AS fee, - CASE WHEN s.safe_input_index IS NOT NULL THEN d.payload ELSE NULL END AS payload, - CASE WHEN s.safe_input_index IS NOT NULL THEN d.block_number ELSE NULL END AS block_number -FROM sequenced_l2_txs s -LEFT JOIN user_ops u - ON u.batch_index = s.batch_index - AND u.frame_in_batch = s.frame_in_batch - AND u.pos_in_frame = s.user_op_pos_in_frame -LEFT JOIN frames f - ON f.batch_index = s.batch_index - AND f.frame_in_batch = s.frame_in_batch -LEFT JOIN safe_inputs d - ON d.safe_input_index = s.safe_input_index -WHERE s.offset > ?1 -ORDER BY 
s.offset ASC -LIMIT ?2 diff --git a/sequencer/src/storage/queries/select_safe_inputs_range.sql b/sequencer/src/storage/queries/select_safe_inputs_range.sql deleted file mode 100644 index 3d82d7e..0000000 --- a/sequencer/src/storage/queries/select_safe_inputs_range.sql +++ /dev/null @@ -1,4 +0,0 @@ -SELECT safe_input_index, sender, payload, block_number -FROM safe_inputs -WHERE safe_input_index >= ?1 AND safe_input_index < ?2 -ORDER BY safe_input_index ASC diff --git a/sequencer/src/storage/queries/select_user_op_count_for_frame.sql b/sequencer/src/storage/queries/select_user_op_count_for_frame.sql deleted file mode 100644 index e28ada7..0000000 --- a/sequencer/src/storage/queries/select_user_op_count_for_frame.sql +++ /dev/null @@ -1,3 +0,0 @@ -SELECT COUNT(*) -FROM user_ops -WHERE batch_index = ?1 AND frame_in_batch = ?2 diff --git a/sequencer/src/storage/recovery.rs b/sequencer/src/storage/recovery.rs new file mode 100644 index 0000000..afef85c --- /dev/null +++ b/sequencer/src/storage/recovery.rs @@ -0,0 +1,389 @@ +// (c) Cartesi and individual authors (see AUTHORS) +// SPDX-License-Identifier: Apache-2.0 (see LICENSE) + +//! Recovery writer: cascade-invalidates stale batches, opens recovery batches, +//! and composes the startup-recovery transaction. +//! +//! See `docs/recovery/README.md` for the full design (batch tree, coloring, +//! nonce poisoning, TLA+ proof). This file's job is to enforce that design +//! locally — read the design first if you're touching this code. +//! +//! Free functions here are shared with the batch submitter +//! (`l1_submission.rs`); they take `&Connection` / `&Transaction` so the +//! startup path can compose them into one atomic transaction. +//! +//! ## Fault model +//! +//! Recovery is robust to submission and outage failures (crashes, network +//! errors, mempool drops, extended downtime). It is NOT designed to defend +//! against arbitrarily malformed self-submissions: the scheduler-frontier +//! 
materialization in [`super::safe_accepted_batches`] trusts that on-chain +//! batches from the sequencer's own address are structurally valid. The +//! sequencer controls its own submissions — this is a deliberate system +//! assumption, not a gap. + +use rusqlite::{Connection, OptionalExtension, Result, Transaction, params}; +use sequencer_core::protocol::{ProtocolConfig, age_exceeds}; + +use super::Storage; +use super::convert::{i64_to_u64, now_unix_ms, u64_to_i64}; +use super::mutations::{insert_new_batch, insert_open_frame, persist_frame_direct_sequence}; +use super::queries::{ + query_batch_policy, query_current_safe_block, query_latest_safe_input_index_exclusive, +}; +use super::safe_accepted_batches::query_latest_safe_accepted_batch; + +/// Outcome of a danger-zone check. +/// +/// Callers pattern-match on the variant to decide what action the condition +/// warrants. The runtime danger detector treats Strict and Stalled the same +/// (both trigger a crash-for-recovery); the startup recovery path distinguishes +/// because the two variants imply different responses (fresh-L1 +/// flush-and-cascade vs stalled-L1 refuse-boot). +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum DangerStatus { + /// No danger detected — neither check tripped. + Safe, + /// Strict, block-based check tripped: a closed batch past the accepted + /// frontier is aged beyond `protocol.danger_threshold()` against the + /// observed safe block. L1 view is fresh; flushing and cascading is + /// meaningful. + Strict(u64), + /// Wall-clock-adjusted check tripped: an unresolved batch is estimated + /// past the adjusted threshold because wall-clock time has elapsed past + /// our last safe-head observation. The safe-head view may be stale or + /// frozen — flushing against L1 may not terminate. + Stalled(u64), +} + +/// Wall-clock-adjusted danger threshold, if a correction applies. 
+/// +/// Returns `None` when either: +/// - `last_safe_progress_ms == 0` (no baseline — correction is undefined). +/// - Elapsed wall-clock hasn't reached at least one block interval yet (no +/// correction needed). +/// +/// Returns `Some(adjusted_threshold)` where +/// `adjusted = danger_threshold - (elapsed_secs / seconds_per_block)`, +/// saturating at 0. The caller picks which DB-view query to run against this +/// threshold. +pub(super) fn wall_clock_adjusted_threshold( + last_safe_progress_ms: u64, + now_ms: u64, + protocol: &ProtocolConfig, +) -> Option { + if last_safe_progress_ms == 0 { + return None; + } + let elapsed_secs = now_ms.saturating_sub(last_safe_progress_ms) / 1000; + let missed = elapsed_secs / protocol.seconds_per_block.max(1); + if missed == 0 { + return None; + } + Some(protocol.danger_threshold().saturating_sub(missed)) +} + +impl Storage { + /// Unified danger-zone detection. + /// + /// Runs two checks inside a single read transaction: + /// + /// 1. **Strict (block-based)**: `find_closed_frontier_batch_in_danger` + /// against `protocol.danger_threshold()`. Uses the observed safe block. + /// 2. **Wall-clock adjusted**: if a correction applies + /// ([`wall_clock_adjusted_threshold`] returns `Some`), widens to + /// `find_first_batch_in_danger` against `danger_threshold − missed_blocks`. + /// + /// Returns [`DangerStatus::Strict`] if (1) fires (stronger statement about + /// fresh data takes priority), [`DangerStatus::Stalled`] if only (2) fires, + /// [`DangerStatus::Safe`] otherwise. + /// + /// `now_ms` is passed in (rather than read from `SystemTime::now()` here) + /// so the storage layer stays testable without time mocking. Production + /// callers pass the current Unix-ms clock. + pub fn check_danger(&mut self, protocol: &ProtocolConfig, now_ms: u64) -> Result { + self.read(|tx| { + if let Some(idx) = + find_closed_frontier_batch_in_danger(tx, protocol.danger_threshold())? 
+ { + return Ok(DangerStatus::Strict(idx)); + } + + let last_safe_progress_ms: i64 = tx.query_row( + "SELECT synced_at_ms FROM l1_safe_head WHERE singleton_id = 0", + [], + |row| row.get(0), + )?; + let last_safe_progress_ms = i64_to_u64(last_safe_progress_ms); + + if let Some(adjusted) = + wall_clock_adjusted_threshold(last_safe_progress_ms, now_ms, protocol) + && let Some(idx) = find_first_batch_in_danger(tx, adjusted)? + { + return Ok(DangerStatus::Stalled(idx)); + } + + Ok(DangerStatus::Safe) + }) + } + + /// Test-only wrapper around the strict (closed-frontier) danger helper, + /// isolated so tests can target it directly without also running the + /// wall-clock arm inside `check_danger`. + #[cfg(test)] + pub(crate) fn check_danger_zone(&mut self, danger_threshold: u64) -> Result> { + find_closed_frontier_batch_in_danger(&self.conn, danger_threshold) + } + + /// Test-only wrapper around the broader (any-unresolved) danger helper. + /// Same role as `check_danger_zone`: targeted testing of one arm in + /// isolation. + #[cfg(test)] + pub(crate) fn check_any_unresolved_batch_in_danger( + &mut self, + threshold: u64, + ) -> Result> { + find_first_batch_in_danger(&self.conn, threshold) + } + + /// Mark a single batch as invalid. Test-only seeder — production code goes + /// through [`Storage::detect_and_recover`]. + #[cfg(test)] + pub(crate) fn insert_invalid_batch(&mut self, batch_index: u64) -> Result<()> { + let now_ms = now_unix_ms(); + // Only set if currently NULL — leaves already-invalid rows alone so this + // remains idempotent (matching the previous `INSERT OR IGNORE` semantic). + self.conn.execute( + "UPDATE batches SET invalidated_at_ms = ?1 \ + WHERE batch_index = ?2 AND invalidated_at_ms IS NULL", + params![now_ms, u64_to_i64(batch_index)], + )?; + Ok(()) + } + + /// Detect stale batches, cascade-invalidate, and restore the open-batch + /// invariant. Called once per boot and by direct tests. 
+ /// + /// Runs detection, cascade invalidation, and recovery-batch opening inside + /// a single `Immediate` transaction so the operation is crash-safe and + /// atomic. + /// + /// Handles the edge case where a previous boot invalidated the suffix but + /// crashed before opening the fresh batch: if no new invalidations are + /// found but no valid open batch exists, a recovery batch is opened. + /// + /// Does NOT populate `safe_accepted_batches` — the caller is expected to + /// have already synced L1 state via [`Storage::append_safe_inputs`], which + /// maintains the frontier view atomically with each sync. + /// + /// Returns the newly invalidated batch indices (empty if none). + pub fn detect_and_recover(&mut self, max_wait_blocks: u64) -> Result> { + self.write(|tx| detect_and_recover_inner(tx, max_wait_blocks)) + } +} + +// ── Free functions used by both recovery and the batch submitter ────────── + +/// Detect stale batches, cascade-invalidate, and restore the open-batch invariant. +/// See `Storage::detect_and_recover` for full doc. +fn detect_and_recover_inner(tx: &Transaction<'_>, max_wait_blocks: u64) -> Result> { + let invalidated = match find_first_batch_in_danger(tx, max_wait_blocks)? { + Some(bi) => cascade_invalidate_from(tx, bi)?, + None => Vec::new(), + }; + + if !invalidated.is_empty() || !has_valid_open_batch(tx)? { + open_recovery_batch_in_tx(tx)?; + } + Ok(invalidated) +} + +/// The oldest unresolved batch (closed-unaccepted OR open) whose first frame is +/// older than `current_safe_block - threshold`, or `None` if no such batch. +/// +/// "Unresolved" means either: +/// (a) a closed batch past the accepted frontier, or +/// (b) the current Tip (still at risk of aging into danger). +/// +/// Closed-unaccepted batches are strictly older than the Tip (the sequencer +/// opens new batches at monotonically non-decreasing `safe_block`), so the +/// closed-frontier check takes precedence. 
Cascading from that batch covers +/// the Tip automatically via `batch_index >= N`. +/// +/// Used by: +/// - [`Storage::check_danger`]'s wall-clock-adjusted arm. +/// - [`detect_and_recover_inner`] — atomic cascade-invalidation path. +/// +/// Keeping both call sites behind this single helper keeps the "any unresolved +/// batch may already be too old" logic symmetric between the startup fallback +/// and the recovery transaction. +/// +/// Reads `safe_accepted_batches`, which is maintained atomically with each +/// [`Storage::append_safe_inputs`] call. +pub(super) fn find_first_batch_in_danger(conn: &Connection, threshold: u64) -> Result> { + if let Some(bi) = find_closed_frontier_batch_in_danger(conn, threshold)? { + return Ok(Some(bi)); + } + find_tip_batch_in_danger(conn, threshold) +} + +/// First valid closed batch past the accepted frontier whose `first_frame_safe_block` +/// is older than `current_safe_block - threshold`. Returns `None` if no such +/// batch matches. +/// +/// Does not consider the Tip — the submitter's zombie-detection check must +/// NOT flag the Tip (it has no L1 tx to become a zombie). The unified +/// entrypoint `find_first_batch_in_danger` falls through to +/// `find_tip_batch_in_danger` for that case. +pub(super) fn find_closed_frontier_batch_in_danger( + conn: &Connection, + threshold: u64, +) -> Result> { + let frontier_nonce = query_latest_safe_accepted_batch(conn)? 
+ .map(|row| i64_to_u64(row.nonce).saturating_add(1)) + .unwrap_or(0); + + let batch_ref: Option<(i64, i64)> = conn + .query_row( + "SELECT batch_index, nonce FROM valid_closed_batches \ + WHERE nonce >= ?1 ORDER BY nonce ASC LIMIT 1", + rusqlite::params![u64_to_i64(frontier_nonce)], + |row| Ok((row.get(0)?, row.get(1)?)), + ) + .optional()?; + let Some((batch_index, batch_nonce)) = batch_ref else { + return Ok(None); + }; + if i64_to_u64(batch_nonce) != frontier_nonce { + return Ok(None); + } + + let first_frame_safe_block = first_frame_safe_block_of(conn, batch_index)?; + let safe_block = query_current_safe_block(conn)?; + if age_exceeds(safe_block, first_frame_safe_block, threshold) { + Ok(Some(i64_to_u64(batch_index))) + } else { + Ok(None) + } +} + +/// The Tip (if any) whose `first_frame_safe_block` is older than +/// `current_safe_block - threshold`. Returns `None` if no Tip exists or it's +/// not yet in danger. +fn find_tip_batch_in_danger(conn: &Connection, threshold: u64) -> Result> { + let tip_bi: Option = conn + .query_row("SELECT batch_index FROM valid_open_batch", [], |row| { + row.get(0) + }) + .optional()?; + let Some(tip_bi) = tip_bi else { + return Ok(None); + }; + + let first_frame_safe_block = first_frame_safe_block_of(conn, tip_bi)?; + let safe_block = query_current_safe_block(conn)?; + if age_exceeds(safe_block, first_frame_safe_block, threshold) { + Ok(Some(i64_to_u64(tip_bi))) + } else { + Ok(None) + } +} + +/// `frames.safe_block` of the lowest `frame_in_batch` in `batch_index`. +/// Returns 0 if the batch has no frames yet. +fn first_frame_safe_block_of(conn: &Connection, batch_index: i64) -> Result { + let value: Option = conn + .query_row( + "SELECT safe_block FROM frames \ + WHERE batch_index = ?1 ORDER BY frame_in_batch ASC LIMIT 1", + params![batch_index], + |row| row.get(0), + ) + .optional()?; + Ok(i64_to_u64(value.unwrap_or(0))) +} + +/// Cascade-invalidate all valid batches with `batch_index >= from_batch_index`. 
+/// +/// Reads the list BEFORE mutating — the SELECT must see the rows the UPDATE +/// will then mark invalid. The `invalidated_at_ms IS NULL` guard on the UPDATE +/// keeps this idempotent: rows already invalid are untouched. +fn cascade_invalidate_from(tx: &Transaction<'_>, from_batch_index: u64) -> Result> { + let from_i64 = u64_to_i64(from_batch_index); + + let invalidated: Vec = { + let mut stmt = tx.prepare( + "SELECT batch_index FROM valid_batches \ + WHERE batch_index >= ?1 ORDER BY batch_index ASC", + )?; + stmt.query_map(params![from_i64], |row| { + row.get::<_, i64>(0).map(i64_to_u64) + })? + .collect::>()? + }; + + if !invalidated.is_empty() { + let now_ms = now_unix_ms(); + tx.execute( + "UPDATE batches SET invalidated_at_ms = ?1 \ + WHERE batch_index >= ?2 AND invalidated_at_ms IS NULL", + params![now_ms, from_i64], + )?; + } + + Ok(invalidated) +} + +/// Check whether the DB has a valid Tip (`sealed_at_ms IS NULL AND +/// `invalidated_at_ms IS NULL`). +fn has_valid_open_batch(tx: &Connection) -> Result { + let count: i64 = tx.query_row("SELECT COUNT(*) FROM valid_open_batch", [], |row| { + row.get(0) + })?; + Ok(count > 0) +} + +/// Open a fresh recovery batch inside an existing transaction. +/// +/// The new Tip's parent is the highest-indexed valid batch (the last valid +/// ancestor after the cascade). If none exists — the torn-state case where +/// every batch has been invalidated — the new Tip has no parent (nonce 0, +/// like a fresh genesis). +fn open_recovery_batch_in_tx(tx: &Transaction<'_>) -> Result<()> { + let now_ms = now_unix_ms(); + let safe_block = query_current_safe_block(tx)?; + + let parent_batch_index: Option = tx + .query_row("SELECT MAX(batch_index) FROM valid_batches", [], |row| { + row.get::<_, Option>(0) + })? 
+ .map(i64_to_u64); + + let policy = query_batch_policy(tx)?; + let next_bi = insert_new_batch(tx, None, parent_batch_index, now_ms)?; + insert_open_frame(tx, next_bi, 0, now_ms, policy.recommended_fee, safe_block)?; + + // Drain leading directs into the new batch's first frame. + // Direct inputs from invalidated batches are re-drained into the recovery batch + // (the UNIQUE(safe_input_index) constraint was removed to allow this). + let next_undrained: u64 = { + // MAX(safe_input_index) + 1 over the valid drained rows. Cursor rewinds + // when a batch is invalidated, so the recovery batch sees the same + // undrained range its invalidated predecessor was working from. + let value: i64 = tx.query_row( + "SELECT COALESCE(MAX(safe_input_index) + 1, 0) FROM valid_sequenced_l2_txs \ + WHERE safe_input_index IS NOT NULL", + [], + |row| row.get(0), + )?; + i64_to_u64(value) + }; + let safe_input_end = query_latest_safe_input_index_exclusive(tx)?; + let leading_range = super::SafeInputRange::new(next_undrained, safe_input_end); + persist_frame_direct_sequence(tx, next_bi, 0, leading_range)?; + Ok(()) +} + +#[cfg(test)] +#[path = "recovery_tests.rs"] +mod tests; diff --git a/sequencer/src/storage/recovery_tests.rs b/sequencer/src/storage/recovery_tests.rs new file mode 100644 index 0000000..a90cf08 --- /dev/null +++ b/sequencer/src/storage/recovery_tests.rs @@ -0,0 +1,1996 @@ +use super::super::test_helpers::{ + SENDER_A, all_ordered_l2_txs, default_protocol_config, make_stale_batch_payload, + seed_closed_batches, temp_db, +}; +use crate::storage::{SafeInputRange, Storage, StoredSafeInput}; +use alloy_primitives::Address; +use sequencer_core::l2_tx::SequencedL2Tx; + +mod invalid_batches { + use super::*; + + // ── invalid_batches filtering ────────────────────────────────────── + + #[test] + fn invalid_batches_excluded_from_latest_batch_index() { + let db = temp_db("invalid-latest-batch"); + let mut storage = Storage::open(db.path.as_str()).expect("open storage"); + 
seed_closed_batches(&mut storage, 3); + assert_eq!( + storage.latest_batch_index().expect("latest").unwrap(), + 3, + "open batch should be 3" + ); + + storage.insert_invalid_batch(3).expect("mark invalid"); + assert_eq!(storage.latest_batch_index().expect("latest").unwrap(), 2,); + + storage.insert_invalid_batch(2).expect("mark invalid"); + assert_eq!(storage.latest_batch_index().expect("latest").unwrap(), 1,); + } + + #[test] + fn invalid_batches_excluded_from_ordered_l2_txs() { + let db = temp_db("invalid-ordered-txs"); + let mut storage = Storage::open(db.path.as_str()).expect("open storage"); + + let mut head = storage + .initialize_open_state(0, SafeInputRange::empty_at(0)) + .expect("initialize"); + let directs_0 = vec![StoredSafeInput { + sender: Address::ZERO, + payload: vec![0xaa], + block_number: 10, + }]; + storage + .append_safe_inputs(10, directs_0.as_slice(), &default_protocol_config()) + .expect("append"); + storage + .close_frame_only(&mut head, 10, SafeInputRange::new(0, 1)) + .expect("close frame"); + storage + .close_frame_and_batch(&mut head, 10) + .expect("close batch 0"); + + let directs_1 = vec![StoredSafeInput { + sender: Address::ZERO, + payload: vec![0xbb], + block_number: 20, + }]; + storage + .append_safe_inputs(20, directs_1.as_slice(), &default_protocol_config()) + .expect("append"); + storage + .close_frame_only(&mut head, 20, SafeInputRange::new(1, 2)) + .expect("close frame"); + + let all = all_ordered_l2_txs(&mut storage); + assert_eq!(all.len(), 2); + + storage.insert_invalid_batch(0).expect("mark invalid"); + + let filtered = all_ordered_l2_txs(&mut storage); + assert_eq!(filtered.len(), 1); + match &filtered[0] { + SequencedL2Tx::Direct(d) => assert_eq!(d.payload.as_slice(), &[0xbb]), + _ => panic!("expected direct input"), + } + } + + #[test] + fn invalid_batches_excluded_from_ordered_l2_txs_for_batch() { + let db = temp_db("invalid-ordered-for-batch"); + let mut storage = Storage::open(db.path.as_str()).expect("open storage"); 
+ + let mut head = storage + .initialize_open_state(0, SafeInputRange::empty_at(0)) + .expect("initialize"); + let directs = vec![StoredSafeInput { + sender: Address::ZERO, + payload: vec![0xaa], + block_number: 10, + }]; + storage + .append_safe_inputs(10, directs.as_slice(), &default_protocol_config()) + .expect("append"); + storage + .close_frame_only(&mut head, 10, SafeInputRange::new(0, 1)) + .expect("close frame"); + storage + .close_frame_and_batch(&mut head, 10) + .expect("close batch 0"); + + let txs = storage.ordered_l2_txs_for_batch(0).expect("load batch 0"); + assert_eq!(txs.len(), 1); + + storage.insert_invalid_batch(0).expect("mark invalid"); + let txs = storage + .ordered_l2_txs_for_batch(0) + .expect("load batch 0 after invalidation"); + assert!(txs.is_empty(), "invalid batch should return no txs"); + } + + #[test] + fn invalid_batches_excluded_from_drained_direct_count() { + let db = temp_db("invalid-drained-count"); + let mut storage = Storage::open(db.path.as_str()).expect("open storage"); + + let mut head = storage + .initialize_open_state(0, SafeInputRange::empty_at(0)) + .expect("initialize"); + let directs = vec![ + StoredSafeInput { + sender: Address::ZERO, + payload: vec![0xaa], + block_number: 10, + }, + StoredSafeInput { + sender: Address::ZERO, + payload: vec![0xbb], + block_number: 10, + }, + ]; + storage + .append_safe_inputs(10, directs.as_slice(), &default_protocol_config()) + .expect("append"); + storage + .close_frame_only(&mut head, 10, SafeInputRange::new(0, 2)) + .expect("close frame"); + assert_eq!( + storage.next_undrained_safe_input_index().expect("cursor"), + 2 + ); + + storage.insert_invalid_batch(0).expect("mark invalid"); + assert_eq!( + storage + .next_undrained_safe_input_index() + .expect("cursor after invalidation"), + 0 + ); + } +} + +mod detect_and_recover { + use super::*; + + // ── detect_and_recover ───────────────────────────────────────────── + + #[test] + fn detect_and_recover_cascades_from_stale() { + let db 
= temp_db("detect-stale"); + let mut storage = Storage::open(db.path.as_str()).expect("open storage"); + + let mut head = storage + .initialize_open_state(10, SafeInputRange::empty_at(0)) + .expect("initialize"); + for _ in 0..3 { + storage + .close_frame_and_batch(&mut head, 10) + .expect("close batch"); + } + + let batch_submitter = Address::repeat_byte(0xAA); + storage + .append_safe_inputs( + 1210, + &[StoredSafeInput { + sender: batch_submitter, + payload: make_stale_batch_payload(0, 10), + block_number: 1210, + }], + &default_protocol_config(), + ) + .expect("append safe input"); + let invalidated = storage + .detect_and_recover(1200) + .expect("detect and recover"); + assert_eq!(invalidated, vec![0, 1, 2, 3]); + + let head = storage.open_state().expect("load open state"); + assert!(head.is_some(), "recovery should have opened a fresh batch"); + assert_eq!(head.unwrap().batch_index, 4); + } + + #[test] + fn detect_and_recover_is_idempotent() { + let db = temp_db("detect-idempotent"); + let mut storage = Storage::open(db.path.as_str()).expect("open storage"); + + let mut head = storage + .initialize_open_state(10, SafeInputRange::empty_at(0)) + .expect("initialize"); + storage + .close_frame_and_batch(&mut head, 10) + .expect("close batch"); + + let batch_submitter = Address::repeat_byte(0xAA); + storage + .append_safe_inputs( + 1210, + &[StoredSafeInput { + sender: batch_submitter, + payload: make_stale_batch_payload(0, 10), + block_number: 1210, + }], + &default_protocol_config(), + ) + .expect("append safe input"); + let first = storage.detect_and_recover(1200).expect("first detect"); + assert_eq!(first, vec![0, 1]); + + let second = storage.detect_and_recover(1200).expect("second detect"); + assert!(second.is_empty()); + } + + #[test] + fn detect_and_recover_does_not_false_match_after_nonce_reuse() { + let db = temp_db("detect-nonce-reuse"); + let mut storage = Storage::open(db.path.as_str()).expect("open storage"); + + let mut head = storage + 
.initialize_open_state(10, SafeInputRange::empty_at(0)) + .expect("initialize"); + storage + .close_frame_and_batch(&mut head, 10) + .expect("close batch 0"); + + let batch_submitter = Address::repeat_byte(0xAA); + storage + .append_safe_inputs( + 1210, + &[StoredSafeInput { + sender: batch_submitter, + payload: make_stale_batch_payload(0, 10), + block_number: 1210, + }], + &default_protocol_config(), + ) + .expect("append stale safe input"); + let first = storage.detect_and_recover(1200).expect("first recovery"); + assert_eq!(first, vec![0, 1]); + + let mut head = storage.open_state().expect("load").unwrap(); + storage + .close_frame_and_batch(&mut head, 100) + .expect("close recovery batch"); + + let second = storage.detect_and_recover(1200).expect("second recovery"); + assert!( + second.is_empty(), + "old stale row must not false-match new-generation batch with reused nonce" + ); + } + + #[test] + fn detect_and_recover_detects_stale_reused_nonce_in_new_generation() { + let db = temp_db("detect-reused-stale"); + let mut storage = Storage::open(db.path.as_str()).expect("open storage"); + + let mut head = storage + .initialize_open_state(10, SafeInputRange::empty_at(0)) + .expect("initialize"); + storage + .close_frame_and_batch(&mut head, 10) + .expect("close batch 0"); + + let batch_submitter = Address::repeat_byte(0xAA); + storage + .append_safe_inputs( + 1210, + &[StoredSafeInput { + sender: batch_submitter, + payload: make_stale_batch_payload(0, 10), + block_number: 1210, + }], + &default_protocol_config(), + ) + .expect("append gen1 stale safe input"); + let first = storage.detect_and_recover(1200).expect("gen1 recovery"); + assert_eq!(first, vec![0, 1]); + + let mut head = storage.open_state().expect("load").unwrap(); + storage + .close_frame_and_batch(&mut head, 100) + .expect("close gen2 batch"); + + storage + .append_safe_inputs( + 2410, + &[StoredSafeInput { + sender: batch_submitter, + payload: make_stale_batch_payload(0, 100), + block_number: 2410, + 
}], + &default_protocol_config(), + ) + .expect("append gen2 stale safe input"); + let second = storage.detect_and_recover(1200).expect("gen2 recovery"); + assert_eq!( + second, + vec![2, 3], + "stale reused nonce in gen2 must still be detected" + ); + } +} + +mod tip_staleness { + use super::*; + + // ── §7.3 — Tip staleness regression ─────────────────────────────────── + // + // Original bug: a Tip (unsealed) whose first frame was pinned to an old + // safe_block escaped detection. The frontier lookup only considered + // closed batches, leaving the Tip out of scope. + // + // Fix: `find_first_batch_in_danger` first tries the closed-frontier + // check, then falls through to `find_tip_batch_in_danger`. Both the + // preemptive danger check and the reactive cascade path go through this + // helper, so they can never diverge on what counts as "in danger". + // + // Below covers four cases: + // - positive: Tip IS stale → invalidated + // - negative: Tip is fresh → NOT invalidated (no false positives) + // - combined: closed+stale AND tip+stale → both invalidated in one cascade + // - no-batch: empty DB with no Tip → no-op, no panic + + #[test] + fn open_batch_stale_by_current_safe_block_is_invalidated() { + // Scenario: sequencer opened batch 0 at safe_block=10, never closed it, + // then stayed down until safe advanced to 1500 (>1200 past safe_block). + // Recovery must invalidate the open batch. + let db = temp_db("open-batch-stale"); + let mut storage = Storage::open(db.path.as_str()).expect("open storage"); + + storage + .initialize_open_state(10, SafeInputRange::empty_at(0)) + .expect("initialize open state at safe_block=10"); + + // Advance the safe head so the open batch's first frame (safe_block=10) + // is now stale: 1500 - 10 >= 1200. 
+ storage + .append_safe_inputs(1500, &[], &default_protocol_config()) + .expect("advance safe head past MAX_WAIT_BLOCKS"); + + let invalidated = storage + .detect_and_recover(1200) + .expect("recover from stale open batch"); + assert_eq!( + invalidated, + vec![0], + "open batch 0 should be invalidated by current staleness" + ); + + // A fresh recovery batch must be opened at batch_index=1. + let head = storage.open_state().expect("load").expect("head"); + assert_eq!(head.batch_index, 1, "recovery batch is the next index"); + } + + #[test] + fn open_batch_not_yet_stale_is_not_invalidated() { + // Negative: open batch's first frame safe_block=10 with current safe=1100. + // 1100 - 10 = 1090 < 1200. Must NOT cascade. + // Catches false-positive regressions in the open-batch arm of + // `find_first_batch_in_danger`. + let db = temp_db("open-batch-fresh"); + let mut storage = Storage::open(db.path.as_str()).expect("open storage"); + + storage + .initialize_open_state(10, SafeInputRange::empty_at(0)) + .expect("initialize open state at safe_block=10"); + + storage + .append_safe_inputs(1100, &[], &default_protocol_config()) + .expect("advance safe head below threshold"); + + let invalidated = storage + .detect_and_recover(1200) + .expect("recover with non-stale open batch"); + assert!( + invalidated.is_empty(), + "fresh open batch must not be cascade-invalidated, got: {invalidated:?}" + ); + + // The open batch must still be the live one (no recovery batch opened). + let head = storage.open_state().expect("load").expect("head"); + assert_eq!( + head.batch_index, 0, + "original open batch 0 must still be the head" + ); + } + + #[test] + fn open_batch_exactly_at_threshold_is_invalidated() { + // Boundary: 1210 - 10 = 1200, which is >= MAX_WAIT_BLOCKS. + // The staleness comparison is `>=`, so this must invalidate. 
+ let db = temp_db("open-batch-boundary"); + let mut storage = Storage::open(db.path.as_str()).expect("open storage"); + + storage + .initialize_open_state(10, SafeInputRange::empty_at(0)) + .expect("initialize"); + + storage + .append_safe_inputs(1210, &[], &default_protocol_config()) + .expect("advance safe head to exact threshold"); + + let invalidated = storage.detect_and_recover(1200).expect("recover"); + assert_eq!(invalidated, vec![0], "boundary (>= threshold) invalidates"); + } + + #[test] + fn open_batch_one_block_below_threshold_is_not_invalidated() { + // Boundary: 1209 - 10 = 1199 < 1200. One-block margin must NOT invalidate. + let db = temp_db("open-batch-below-boundary"); + let mut storage = Storage::open(db.path.as_str()).expect("open storage"); + + storage + .initialize_open_state(10, SafeInputRange::empty_at(0)) + .expect("initialize"); + + storage + .append_safe_inputs(1209, &[], &default_protocol_config()) + .expect("advance safe head to one block below threshold"); + + let invalidated = storage.detect_and_recover(1200).expect("recover"); + assert!( + invalidated.is_empty(), + "one-block-below-threshold must not invalidate, got: {invalidated:?}" + ); + } + + #[test] + fn closed_unsubmitted_stale_and_open_stale_both_cascade() { + // Scenario: batch 0 is closed and nonced but never submitted to L1 + // (safe_accepted_batches is empty). Batch 1 is open and also stale. + // `find_first_batch_in_danger` should return closed batch 0 at the + // frontier (nonce 0, no acceptance yet) and cascade through batch 1. + let db = temp_db("closed-unsubmitted-and-open-stale"); + let mut storage = Storage::open(db.path.as_str()).expect("open storage"); + + let mut head = storage + .initialize_open_state(10, SafeInputRange::empty_at(0)) + .expect("initialize at safe_block=10"); + storage + .close_frame_and_batch(&mut head, 10) + .expect("close batch 0"); + + // Advance safe head so batch 0's first frame (safe_block=10) is stale. 
+ storage + .append_safe_inputs(1500, &[], &default_protocol_config()) + .expect("advance safe head past staleness"); + + let invalidated = storage.detect_and_recover(1200).expect("recover"); + assert_eq!( + invalidated, + vec![0, 1], + "closed unsubmitted batch 0 and subsequent open batch 1 cascade together" + ); + } + + #[test] + fn detect_and_recover_opens_batch_after_torn_invalidation() { + let db = temp_db("detect-torn"); + let mut storage = Storage::open(db.path.as_str()).expect("open storage"); + + let mut head = storage + .initialize_open_state(10, SafeInputRange::empty_at(0)) + .expect("initialize"); + storage + .close_frame_and_batch(&mut head, 10) + .expect("close batch 0"); + + storage.insert_invalid_batch(0).expect("invalidate 0"); + storage.insert_invalid_batch(1).expect("invalidate 1"); + + let invalidated = storage + .detect_and_recover(1200) + .expect("recover from torn state"); + assert!(invalidated.is_empty(), "no new invalidations"); + + let head = storage.open_state().expect("load open state"); + assert!(head.is_some(), "recovery should have opened a fresh batch"); + assert_eq!(head.unwrap().batch_index, 2); + } + + #[test] + fn detect_and_recover_rolls_back_when_cascade_update_aborts() { + let db = temp_db("detect-cascade-abort"); + let mut storage = Storage::open(db.path.as_str()).expect("open storage"); + + let mut head = storage + .initialize_open_state(10, SafeInputRange::empty_at(0)) + .expect("initialize"); + storage + .close_frame_and_batch(&mut head, 10) + .expect("close batch 0"); + + // Advance safe head so batch 0's first frame (safe_block=10) is stale. 
+ storage + .append_safe_inputs(1500, &[], &default_protocol_config()) + .expect("advance safe head past staleness"); + + storage + .conn + .execute_batch( + "CREATE TRIGGER fail_cascade_invalidation + AFTER UPDATE OF invalidated_at_ms ON batches + WHEN NEW.invalidated_at_ms IS NOT NULL + AND OLD.invalidated_at_ms IS NULL + BEGIN + SELECT RAISE(ABORT, 'injected cascade failure'); + END;", + ) + .expect("install failure trigger"); + + let err = storage + .detect_and_recover(1200) + .expect_err("trigger should abort recovery transaction"); + assert!( + err.to_string().contains("injected cascade failure"), + "unexpected error: {err:?}" + ); + drop(storage); + + let conn = Storage::open_connection(db.path.as_str()).expect("open read conn"); + let invalidated_count: i64 = conn + .query_row( + "SELECT COUNT(*) FROM batches WHERE invalidated_at_ms IS NOT NULL", + [], + |row| row.get(0), + ) + .expect("count invalidated"); + assert_eq!( + invalidated_count, 0, + "failed cascade must not persist torn invalidation state" + ); + + let batch_count: i64 = conn + .query_row("SELECT COUNT(*) FROM batches", [], |row| row.get(0)) + .expect("count batches"); + assert_eq!( + batch_count, 2, + "failed recovery must not open an extra batch" + ); + + let open_batch_index: i64 = conn + .query_row("SELECT batch_index FROM valid_open_batch", [], |row| { + row.get(0) + }) + .expect("query valid open batch"); + assert_eq!( + open_batch_index, 1, + "failed recovery must leave the original Tip in place" + ); + } + + #[test] + fn recovery_redrains_direct_inputs_and_replay_sees_them_once() { + let db = temp_db("recovery-redrain-e2e"); + let mut storage = Storage::open(db.path.as_str()).expect("open storage"); + + let mut head = storage + .initialize_open_state(10, SafeInputRange::empty_at(0)) + .expect("initialize"); + let deposits = vec![ + StoredSafeInput { + sender: Address::ZERO, + payload: vec![0xd1], + block_number: 10, + }, + StoredSafeInput { + sender: Address::ZERO, + payload: 
vec![0xd2], + block_number: 10, + }, + ]; + storage + .append_safe_inputs(10, deposits.as_slice(), &default_protocol_config()) + .expect("append deposits"); + storage + .close_frame_only(&mut head, 10, SafeInputRange::new(0, 2)) + .expect("close frame with deposits"); + storage + .close_frame_and_batch(&mut head, 10) + .expect("close batch 0"); + + let before = all_ordered_l2_txs(&mut storage); + assert_eq!(before.len(), 2, "both deposits should be visible"); + + let batch_submitter = Address::repeat_byte(0xAA); + storage + .append_safe_inputs( + 1210, + &[StoredSafeInput { + sender: batch_submitter, + payload: make_stale_batch_payload(0, 10), + block_number: 1210, + }], + &default_protocol_config(), + ) + .expect("append stale batch submission"); + let invalidated = storage + .detect_and_recover(1200) + .expect("detect and recover"); + assert!(!invalidated.is_empty(), "should have invalidated batches"); + + let after = all_ordered_l2_txs(&mut storage); + let direct_payloads: Vec<&[u8]> = after + .iter() + .filter_map(|tx| match tx { + SequencedL2Tx::Direct(d) if d.sender != batch_submitter => { + Some(d.payload.as_slice()) + } + _ => None, + }) + .collect(); + assert_eq!( + direct_payloads, + vec![&[0xd1][..], &[0xd2][..]], + "deposits must appear exactly once in replay after recovery" + ); + + let recovery_batch = storage.open_state().expect("load").unwrap(); + let recovery_txs = storage + .ordered_l2_txs_for_batch(recovery_batch.batch_index) + .expect("load recovery batch txs"); + let recovery_direct_count = recovery_txs + .iter() + .filter(|tx| matches!(tx, SequencedL2Tx::Direct(d) if d.sender != batch_submitter)) + .count(); + assert_eq!( + recovery_direct_count, 2, + "both deposits should be in the recovery batch" + ); + } + + #[test] + fn undrained_safe_input_appears_in_recovery_batch_first_frame() { + // §7.4.2: a deposit ingested into safe_inputs but not yet drained + // into any frame must be sequenced into the recovery batch's first + // frame after 
cascade. Complements §7.4.1 (re-drain from + // invalidated) with the never-drained case. + let db = temp_db("recovery-includes-undrained"); + let mut storage = Storage::open(db.path.as_str()).expect("open storage"); + + let mut head = storage + .initialize_open_state(10, SafeInputRange::empty_at(0)) + .expect("initialize"); + storage + .close_frame_and_batch(&mut head, 10) + .expect("close batch 0 with no deposits"); + + let non_submitter = Address::repeat_byte(0xCC); + storage + .append_safe_inputs( + 20, + &[StoredSafeInput { + sender: non_submitter, + payload: vec![0xde, 0xad], + block_number: 20, + }], + &default_protocol_config(), + ) + .expect("append undrained deposit"); + let before = all_ordered_l2_txs(&mut storage); + assert!( + before.iter().all(|tx| !matches!( + tx, + SequencedL2Tx::Direct(d) if d.sender == non_submitter + )), + "undrained deposit must not be sequenced before drain", + ); + + let batch_submitter = SENDER_A; + storage + .append_safe_inputs( + 1210, + &[StoredSafeInput { + sender: batch_submitter, + payload: make_stale_batch_payload(0, 10), + block_number: 1210, + }], + &default_protocol_config(), + ) + .expect("append stale batch submission"); + let invalidated = storage.detect_and_recover(1200).expect("recover"); + assert!(!invalidated.is_empty(), "stale batch must cascade"); + + let recovery = storage.open_state().expect("load").unwrap(); + let recovery_txs = storage + .ordered_l2_txs_for_batch(recovery.batch_index) + .expect("load recovery batch txs"); + let deposit_payloads: Vec<&[u8]> = recovery_txs + .iter() + .filter_map(|tx| match tx { + SequencedL2Tx::Direct(d) if d.sender == non_submitter => Some(d.payload.as_slice()), + _ => None, + }) + .collect(); + assert_eq!( + deposit_payloads, + vec![&[0xde, 0xad][..]], + "undrained deposit must land in the recovery batch's first frame", + ); + } + + #[test] + fn recovery_batch_opens_empty_when_no_direct_inputs_pending() { + // §7.4.3: no drained-into-invalidated inputs AND no undrained 
safe + // inputs → recovery batch opens with an empty first frame (aside + // from the batch-submitter's own self-submission, which is drained + // but carries no user-visible payload). + let db = temp_db("recovery-empty-first-frame"); + let mut storage = Storage::open(db.path.as_str()).expect("open storage"); + + let mut head = storage + .initialize_open_state(10, SafeInputRange::empty_at(0)) + .expect("initialize"); + storage + .close_frame_and_batch(&mut head, 10) + .expect("close batch 0"); + + let batch_submitter = SENDER_A; + storage + .append_safe_inputs( + 1210, + &[StoredSafeInput { + sender: batch_submitter, + payload: make_stale_batch_payload(0, 10), + block_number: 1210, + }], + &default_protocol_config(), + ) + .expect("append stale batch submission"); + let invalidated = storage.detect_and_recover(1200).expect("recover"); + assert_eq!(invalidated, vec![0, 1]); + + let recovery = storage.open_state().expect("load").unwrap(); + let recovery_txs = storage + .ordered_l2_txs_for_batch(recovery.batch_index) + .expect("load recovery batch txs"); + let user_visible: Vec<_> = recovery_txs + .iter() + .filter(|tx| match tx { + SequencedL2Tx::Direct(d) => d.sender != batch_submitter, + SequencedL2Tx::UserOp(_) => true, + }) + .collect(); + assert!( + user_visible.is_empty(), + "recovery batch must have no deposits or user-ops when none were pending: {user_visible:?}", + ); + } + + #[test] + fn first_batch_stale_recovery_reuses_nonce_zero() { + // §7.5.1: first-ever batch (nonce 0) goes stale before reaching + // Gold. Cascade invalidates it; recovery opens a fresh batch that + // reuses nonce 0 (no valid ancestor exists to advance the nonce). 
+ let db = temp_db("first-batch-stale-nonce-zero");
+ let mut storage = Storage::open(db.path.as_str()).expect("open storage");
+
+ let mut head = storage
+ .initialize_open_state(10, SafeInputRange::empty_at(0))
+ .expect("initialize");
+ storage
+ .close_frame_and_batch(&mut head, 10)
+ .expect("close batch 0 (nonce 0)");
+
+ let batch_submitter = SENDER_A;
+ storage
+ .append_safe_inputs(
+ 1210,
+ &[StoredSafeInput {
+ sender: batch_submitter,
+ payload: make_stale_batch_payload(0, 10),
+ block_number: 1210,
+ }],
+ &default_protocol_config(),
+ )
+ .expect("append stale batch submission");
+ let invalidated = storage.detect_and_recover(1200).expect("recover");
+ assert_eq!(
+ invalidated,
+ vec![0, 1],
+ "closed batch 0 and open batch 1 must both invalidate",
+ );
+
+ let recovery = storage.open_state().expect("load").unwrap();
+ assert_eq!(recovery.batch_index, 2, "batch_index is monotonic (PK)");
+ drop(storage);
+
+ // Read the new Tip's nonce and parent pointer via raw SQL — no
+ // public accessor surfaces them.
+ let conn = Storage::open_connection(db.path.as_str()).expect("open read conn");
+ let recovery_i64 = recovery.batch_index as i64;
+ let nonce: i64 = conn
+ .query_row(
+ "SELECT nonce FROM batches WHERE batch_index = ?1",
+ [recovery_i64],
+ |row| row.get(0),
+ )
+ .expect("query nonce");
+ assert_eq!(
+ nonce, 0,
+ "recovery batch must reuse nonce 0 after torn cascade",
+ );
+ let parent: Option<i64> = conn
+ .query_row(
+ "SELECT parent_batch_index FROM batches WHERE batch_index = ?1",
+ [recovery_i64],
+ |row| row.get(0),
+ )
+ .expect("query parent");
+ assert_eq!(
+ parent, None,
+ "torn recovery has no valid ancestor; parent_batch_index is NULL",
+ );
+ }
+
+ #[test]
+ fn detect_and_recover_after_post_recovery_crash_is_no_op() {
+ // §7.6.3: simulate a crash AFTER open_recovery_batch has run. On
+ // restart, the state contains a valid open recovery batch (no stale
+ // tail remains). 
A fresh `detect_and_recover` call must be a no-op: + // no new invalidations, and the same recovery batch remains the Tip. + // + // Distinct from §7.6.1 (idempotent back-to-back call on the same + // Storage handle) — this test drops and reopens Storage to model a + // full restart over the persisted DB. + let db = temp_db("post-recovery-crash-idempotent"); + let mut storage = Storage::open(db.path.as_str()).expect("open storage"); + + let mut head = storage + .initialize_open_state(10, SafeInputRange::empty_at(0)) + .expect("initialize"); + storage + .close_frame_and_batch(&mut head, 10) + .expect("close batch 0"); + + let batch_submitter = SENDER_A; + storage + .append_safe_inputs( + 1210, + &[StoredSafeInput { + sender: batch_submitter, + payload: make_stale_batch_payload(0, 10), + block_number: 1210, + }], + &default_protocol_config(), + ) + .expect("append stale submission"); + // First call: full recovery runs to completion and opens a new Tip. + let invalidated = storage.detect_and_recover(1200).expect("recover"); + assert_eq!(invalidated, vec![0, 1]); + let recovery_index = storage + .open_state() + .expect("load open") + .expect("recovery batch exists") + .batch_index; + + // Simulate "crash immediately after open_recovery_batch" by + // dropping Storage (mimics process exit) and reopening against the + // same on-disk DB. 
+ drop(storage); + let mut storage = Storage::open(db.path.as_str()).expect("reopen storage"); + + let second = storage.detect_and_recover(1200).expect("second detect"); + assert!( + second.is_empty(), + "post-recovery restart must be a no-op, got invalidations: {second:?}", + ); + let after = storage + .open_state() + .expect("load after restart") + .expect("recovery batch still Tip after restart"); + assert_eq!( + after.batch_index, recovery_index, + "the same recovery batch must remain the Tip after restart", + ); + } +} + +mod check_danger_zone { + use super::*; + + // ── check_danger_zone ────────────────────────────────────────────── + + #[test] + fn check_danger_zone_ignores_old_gold_batches() { + // Batch 0 is Gold (accepted, first_frame_safe_block=10). Batch 1 is + // the open tip at first_frame_safe_block=100. Advance safe head to + // 1200 so batch 0 is age=1190 > 1125 (past threshold, but it's Gold + // and therefore excluded) while batch 1 is age=1100 < 1125 (fresh). + // + // `check_danger_zone` must return None: no unresolved batch is in + // danger. Gold batches (accepted past the frontier) never participate, + // and the open tip isn't old enough to trip the threshold. + let db = temp_db("danger-zone-gold"); + let mut storage = Storage::open(db.path.as_str()).expect("open storage"); + let batch_submitter = Address::repeat_byte(0xAA); + + let mut head = storage + .initialize_open_state(10, SafeInputRange::empty_at(0)) + .expect("initialize"); + storage + .close_frame_and_batch(&mut head, 100) + .expect("close batch 0"); + + storage + .append_safe_inputs( + 20, + &[StoredSafeInput { + sender: batch_submitter, + payload: make_stale_batch_payload(0, 10), + block_number: 20, + }], + &default_protocol_config(), + ) + .expect("append safe input"); + // Advance to a current safe block where batch 0 (safe_block=10) is + // past threshold (1200-10=1190>=1125) but batch 1 (safe_block=100) + // is still fresh (1200-100=1100<1125). 
+ storage + .append_safe_inputs(1200, &[], &default_protocol_config()) + .expect("advance safe block"); + + let result = storage.check_danger_zone(1125).expect("check danger zone"); + assert!( + result.is_none(), + "old Gold batches should not trigger danger zone; got batch_index={result:?}" + ); + } + + #[test] + fn check_danger_zone_does_not_flag_open_batch_zombie() { + // `check_danger_zone` is for zombie detection: it must NOT flag the + // open batch (which has no L1 tx to become a zombie). Flagging open + // batches here would put the live submitter into a shutdown/restart + // loop when an open batch ages into the danger zone without any + // pending wallet-nonce slots to flush. + // + // Scenario: only an open batch exists, aged past the danger + // threshold. `check_danger_zone` returns None. + let db = temp_db("danger-zone-open-no-zombie"); + let mut storage = Storage::open(db.path.as_str()).expect("open storage"); + + storage + .initialize_open_state(10, SafeInputRange::empty_at(0)) + .expect("initialize open batch at safe_block=10"); + + storage + .append_safe_inputs(1200, &[], &default_protocol_config()) + .expect("advance safe head past danger threshold"); + + let result = storage.check_danger_zone(1125).expect("check danger zone"); + assert!( + result.is_none(), + "open batch (no zombie) must not trigger check_danger_zone; got batch_index={result:?}" + ); + } +} + +mod check_any_unresolved { + use super::*; + + // ── check_any_unresolved_batch_in_danger ─────────────────────────────── + + #[test] + fn check_any_unresolved_flags_stale_open_batch() { + // Wall-clock fallback regression: `check_any_unresolved_batch_in_danger` + // MUST flag a stale open batch. This is the semantic the wall-clock + // fallback relies on — if L1 is unreachable and an open batch may be + // past the threshold, refuse to boot rather than accept user ops + // into a batch that can't land. 
+ let db = temp_db("any-unresolved-open-stale"); + let mut storage = Storage::open(db.path.as_str()).expect("open storage"); + + storage + .initialize_open_state(10, SafeInputRange::empty_at(0)) + .expect("initialize open batch at safe_block=10"); + + storage + .append_safe_inputs(1200, &[], &default_protocol_config()) + .expect("advance safe head past threshold"); + + let result = storage + .check_any_unresolved_batch_in_danger(1125) + .expect("check any unresolved in danger"); + assert_eq!( + result, + Some(0), + "stale open batch (batch 0) must be flagged by the unified check" + ); + } + + #[test] + fn check_any_unresolved_does_not_flag_fresh_open_batch() { + // Negative counterpart. Fresh open batch below threshold must not + // trigger false positives in the unified check. + let db = temp_db("any-unresolved-open-fresh"); + let mut storage = Storage::open(db.path.as_str()).expect("open storage"); + + storage + .initialize_open_state(10, SafeInputRange::empty_at(0)) + .expect("initialize open batch at safe_block=10"); + + storage + .append_safe_inputs(1100, &[], &default_protocol_config()) + .expect("advance safe head below threshold"); + + let result = storage + .check_any_unresolved_batch_in_danger(1125) + .expect("check any unresolved in danger"); + assert!( + result.is_none(), + "fresh open batch must not trigger the unified check; got batch_index={result:?}" + ); + } + + #[test] + fn check_danger_zone_triggers_on_frontier_batch() { + let db = temp_db("danger-zone-frontier"); + let mut storage = Storage::open(db.path.as_str()).expect("open storage"); + let batch_submitter = Address::repeat_byte(0xAA); + + let mut head = storage + .initialize_open_state(10, SafeInputRange::empty_at(0)) + .expect("initialize"); + storage + .close_frame_and_batch(&mut head, 10) + .expect("close batch 0"); + storage + .close_frame_and_batch(&mut head, 100) + .expect("close batch 1"); + + storage + .append_safe_inputs( + 20, + &[StoredSafeInput { + sender: batch_submitter, + 
payload: make_stale_batch_payload(0, 10), + block_number: 20, + }], + &default_protocol_config(), + ) + .expect("append safe input"); + storage + .append_safe_inputs(1200, &[], &default_protocol_config()) + .expect("advance safe block"); + + let result = storage.check_danger_zone(1125).expect("check danger zone"); + assert_eq!(result, Some(1), "frontier batch should trigger danger zone"); + } + + #[test] + fn check_danger_zone_does_not_trigger_below_threshold() { + let db = temp_db("danger-zone-below"); + let mut storage = Storage::open(db.path.as_str()).expect("open storage"); + let batch_submitter = Address::repeat_byte(0xAA); + + let mut head = storage + .initialize_open_state(10, SafeInputRange::empty_at(0)) + .expect("initialize"); + storage + .close_frame_and_batch(&mut head, 10) + .expect("close batch 0"); + storage + .close_frame_and_batch(&mut head, 100) + .expect("close batch 1"); + + storage + .append_safe_inputs( + 20, + &[StoredSafeInput { + sender: batch_submitter, + payload: make_stale_batch_payload(0, 10), + block_number: 20, + }], + &default_protocol_config(), + ) + .expect("append safe input"); + storage + .append_safe_inputs(1134, &[], &default_protocol_config()) + .expect("advance safe block"); + + let result = storage.check_danger_zone(1125).expect("check danger zone"); + assert!( + result.is_none(), + "should not trigger below threshold; got batch_index={result:?}" + ); + } +} + +mod boundary { + use super::*; + + // ── boundary tests ───────────────────────────────────────────────── + + #[test] + fn detect_and_recover_boundary_exactly_max_wait_is_stale() { + let db = temp_db("detect-boundary-exact"); + let mut storage = Storage::open(db.path.as_str()).expect("open storage"); + let max_wait: u64 = 1200; + + let mut head = storage + .initialize_open_state(100, SafeInputRange::empty_at(0)) + .expect("initialize"); + storage + .close_frame_and_batch(&mut head, 100) + .expect("close batch"); + + storage + .append_safe_inputs( + 1300, + 
&[StoredSafeInput { + sender: SENDER_A, + payload: make_stale_batch_payload(0, 100), + block_number: 1300, + }], + &default_protocol_config(), + ) + .expect("append safe input"); + let invalidated = storage.detect_and_recover(max_wait).expect("detect"); + assert_eq!(invalidated, vec![0, 1], "exactly at max_wait must be stale"); + assert_eq!(storage.open_state().expect("load").unwrap().batch_index, 2); + } + + #[test] + fn detect_and_recover_boundary_one_below_max_wait_is_not_stale() { + let db = temp_db("detect-boundary-one-below"); + let mut storage = Storage::open(db.path.as_str()).expect("open storage"); + let max_wait: u64 = 1200; + + let mut head = storage + .initialize_open_state(100, SafeInputRange::empty_at(0)) + .expect("initialize"); + storage + .close_frame_and_batch(&mut head, 100) + .expect("close batch"); + + storage + .append_safe_inputs( + 1299, + &[StoredSafeInput { + sender: SENDER_A, + payload: make_stale_batch_payload(0, 100), + block_number: 1299, + }], + &default_protocol_config(), + ) + .expect("append safe input"); + let invalidated = storage.detect_and_recover(max_wait).expect("detect"); + assert!( + invalidated.is_empty(), + "one below max_wait must not be stale" + ); + } + + #[test] + fn detect_and_recover_all_batches_invalidated_frontier_zero() { + let db = temp_db("detect-frontier-zero"); + let mut storage = Storage::open(db.path.as_str()).expect("open storage"); + let max_wait: u64 = 1200; + + let mut head = storage + .initialize_open_state(10, SafeInputRange::empty_at(0)) + .expect("initialize"); + for _ in 0..3 { + storage.close_frame_and_batch(&mut head, 10).expect("close"); + } + + storage + .append_safe_inputs( + 1210, + &[StoredSafeInput { + sender: SENDER_A, + payload: make_stale_batch_payload(0, 10), + block_number: 1210, + }], + &default_protocol_config(), + ) + .expect("append"); + let inv = storage.detect_and_recover(max_wait).expect("detect"); + assert_eq!(inv, vec![0, 1, 2, 3]); + 
assert!(storage.open_state().expect("open").is_some()); + } + + #[test] + fn detect_and_recover_recovery_batch_itself_becomes_stale() { + let db = temp_db("detect-recovery-stale"); + let mut storage = Storage::open(db.path.as_str()).expect("open storage"); + let max_wait: u64 = 1200; + + let mut head = storage + .initialize_open_state(10, SafeInputRange::empty_at(0)) + .expect("initialize"); + storage.close_frame_and_batch(&mut head, 10).expect("close"); + + storage + .append_safe_inputs( + 1210, + &[StoredSafeInput { + sender: SENDER_A, + payload: make_stale_batch_payload(0, 10), + block_number: 1210, + }], + &default_protocol_config(), + ) + .expect("append gen1"); + let inv1 = storage.detect_and_recover(max_wait).expect("recover gen1"); + assert_eq!(inv1, vec![0, 1]); + + let mut head2 = storage.open_state().expect("load").unwrap(); + storage + .close_frame_and_batch(&mut head2, 1210) + .expect("close gen2"); + + storage + .append_safe_inputs( + 2410, + &[StoredSafeInput { + sender: SENDER_A, + payload: make_stale_batch_payload(0, 1210), + block_number: 2410, + }], + &default_protocol_config(), + ) + .expect("append gen2"); + let inv2 = storage.detect_and_recover(max_wait).expect("recover gen2"); + assert_eq!(inv2, vec![2, 3]); + assert!(storage.open_state().expect("open").is_some()); + } + + #[test] + fn detect_and_recover_multi_round_gen3_recovery() { + let db = temp_db("detect-gen3"); + let mut storage = Storage::open(db.path.as_str()).expect("open storage"); + let max_wait: u64 = 1200; + + let mut head = storage + .initialize_open_state(10, SafeInputRange::empty_at(0)) + .expect("init"); + storage.close_frame_and_batch(&mut head, 10).expect("close"); + storage + .append_safe_inputs( + 1210, + &[StoredSafeInput { + sender: SENDER_A, + payload: make_stale_batch_payload(0, 10), + block_number: 1210, + }], + &default_protocol_config(), + ) + .expect("append"); + storage.detect_and_recover(max_wait).expect("recover gen1"); + + let mut head2 = 
storage.open_state().expect("load").unwrap(); + storage + .close_frame_and_batch(&mut head2, 1210) + .expect("close gen2"); + storage + .append_safe_inputs( + 2410, + &[StoredSafeInput { + sender: SENDER_A, + payload: make_stale_batch_payload(0, 1210), + block_number: 2410, + }], + &default_protocol_config(), + ) + .expect("append gen2"); + storage.detect_and_recover(max_wait).expect("recover gen2"); + + let mut head3 = storage.open_state().expect("load").unwrap(); + storage + .close_frame_and_batch(&mut head3, 2410) + .expect("close gen3"); + storage + .append_safe_inputs( + 2420, + &[StoredSafeInput { + sender: SENDER_A, + payload: make_stale_batch_payload(0, 2410), + block_number: 2420, + }], + &default_protocol_config(), + ) + .expect("append gen3"); + let inv3 = storage.detect_and_recover(max_wait).expect("recover gen3"); + assert!(inv3.is_empty(), "gen3 should be healthy"); + } + + #[test] + fn detect_and_recover_large_cascade_50_batches() { + let db = temp_db("detect-large-cascade"); + let mut storage = Storage::open(db.path.as_str()).expect("open storage"); + let max_wait: u64 = 1200; + + let mut head = storage + .initialize_open_state(10, SafeInputRange::empty_at(0)) + .expect("initialize"); + for _ in 0..50 { + storage.close_frame_and_batch(&mut head, 10).expect("close"); + } + + storage + .append_safe_inputs( + 1210, + &[StoredSafeInput { + sender: SENDER_A, + payload: make_stale_batch_payload(0, 10), + block_number: 1210, + }], + &default_protocol_config(), + ) + .expect("append"); + let inv = storage.detect_and_recover(max_wait).expect("detect"); + assert_eq!(inv.len(), 51); + } +} + +mod schema_invariants { + use super::*; + use rusqlite::params; + + // ── Schema-invariant regression tests ───────────────────────────────── + // + // These exercise the triggers + partial unique index in the schema + // directly. Each one checks a specific invariant that previously lived + // in writer discipline and now has a schema-level tripwire. 
+ // + // They're here (rather than in a dedicated file) because they share the + // recovery tests' setup: same helpers, same fixture. Failures here mean + // the schema guard regressed, which is the whole point of making the + // invariants declarative. + + #[test] + fn schema_rejects_second_valid_tip() { + // The partial unique index `ux_single_valid_tip` catches a writer that + // opens a new Tip without sealing the old one first. + let db = temp_db("schema-second-tip"); + let mut storage = Storage::open(db.path.as_str()).expect("open storage"); + storage + .initialize_open_state(0, SafeInputRange::empty_at(0)) + .expect("initialize"); + + // Try to bypass the lane and insert a second valid Tip directly. + let err = storage.conn.execute( + "INSERT INTO batches (batch_index, parent_batch_index, nonce, created_at_ms) \ + VALUES (99, 0, 1, 1000)", + [], + ); + let msg = format!("{err:?}"); + assert!( + msg.contains("UNIQUE constraint failed") && msg.contains("ux_single_valid_tip"), + "expected ux_single_valid_tip violation, got: {msg}" + ); + } + + #[test] + fn schema_rejects_bad_nonce_contiguity() { + // Nonce must equal parent.nonce + 1 — trigger enforces it. + // Insert the bad-nonce batch as already-sealed so it doesn't collide + // with the existing Tip on `ux_single_valid_tip`. + let db = temp_db("schema-bad-nonce"); + let mut storage = Storage::open(db.path.as_str()).expect("open storage"); + let mut head = storage + .initialize_open_state(0, SafeInputRange::empty_at(0)) + .expect("initialize"); + storage + .close_frame_and_batch(&mut head, 0) + .expect("close batch 0; batch 1 is now Tip"); + // Batch 1 has nonce 1 (0 + 1). Insert child with nonce 99 (should be 2). 
+ let err = storage.conn.execute( + "INSERT INTO batches (batch_index, parent_batch_index, nonce, created_at_ms, sealed_at_ms) \ + VALUES (999, 1, 99, \ + (SELECT created_at_ms FROM batches WHERE batch_index = 1), \ + (SELECT created_at_ms FROM batches WHERE batch_index = 1))", + [], + ); + assert!( + format!("{err:?}").contains("batch nonce must equal parent.nonce + 1"), + "expected nonce trigger, got: {err:?}" + ); + } + + #[test] + fn schema_rejects_genesis_with_nonzero_nonce() { + let db = temp_db("schema-genesis-nonzero"); + let storage = Storage::open(db.path.as_str()).expect("open storage"); + let err = storage.conn.execute( + "INSERT INTO batches (batch_index, parent_batch_index, nonce, created_at_ms) \ + VALUES (0, NULL, 7, 100)", + [], + ); + assert!( + format!("{err:?}").contains("genesis batch must have nonce 0"), + "expected genesis-nonce trigger, got: {err:?}" + ); + } + + #[test] + fn schema_rejects_re_seal() { + let db = temp_db("schema-re-seal"); + let mut storage = Storage::open(db.path.as_str()).expect("open storage"); + let mut head = storage + .initialize_open_state(0, SafeInputRange::empty_at(0)) + .expect("initialize"); + storage + .close_frame_and_batch(&mut head, 0) + .expect("close batch 0 (seals it)"); + // Batch 0 is sealed. Attempt to re-seal with a different timestamp. + let err = storage.conn.execute( + "UPDATE batches SET sealed_at_ms = sealed_at_ms + 1 WHERE batch_index = 0", + [], + ); + assert!( + format!("{err:?}").contains("sealed_at_ms is write-once"), + "expected write-once trigger, got: {err:?}" + ); + } + + #[test] + fn schema_rejects_re_invalidate() { + let db = temp_db("schema-re-invalidate"); + let mut storage = Storage::open(db.path.as_str()).expect("open storage"); + storage + .initialize_open_state(0, SafeInputRange::empty_at(0)) + .expect("initialize"); + // Seed via test helper (uses now_unix_ms internally). 
+ storage.insert_invalid_batch(0).expect("first invalidate"); + let err = storage.conn.execute( + "UPDATE batches SET invalidated_at_ms = invalidated_at_ms + 1 \ + WHERE batch_index = 0", + [], + ); + assert!( + format!("{err:?}").contains("invalidated_at_ms is write-once"), + "expected write-once trigger, got: {err:?}" + ); + } + + #[test] + fn schema_rejects_frame_insert_into_sealed_batch() { + // This is the bug class we've been fighting: writer holds a stale + // WriteHead and writes to a batch that's no longer the Tip. + let db = temp_db("schema-frame-into-sealed"); + let mut storage = Storage::open(db.path.as_str()).expect("open storage"); + let mut head = storage + .initialize_open_state(0, SafeInputRange::empty_at(0)) + .expect("initialize"); + storage + .close_frame_and_batch(&mut head, 0) + .expect("close batch 0; batch 0 is now sealed"); + // Batch 0 is sealed. Any direct insert into its frames must fail. + let err = storage.conn.execute( + "INSERT INTO frames (batch_index, frame_in_batch, created_at_ms, fee, safe_block) \ + VALUES (0, 1, 100, 1060, 0)", + [], + ); + assert!( + format!("{err:?}").contains("frames can only be inserted into the current Tip"), + "expected tip-only-frames trigger, got: {err:?}" + ); + } + + #[test] + fn schema_rejects_frame_insert_into_invalidated_batch() { + let db = temp_db("schema-frame-into-invalid"); + let mut storage = Storage::open(db.path.as_str()).expect("open storage"); + storage + .initialize_open_state(0, SafeInputRange::empty_at(0)) + .expect("initialize"); + // Invalidate (without sealing) — Tip that never closed, now dead. 
+ storage.insert_invalid_batch(0).expect("invalidate tip"); + let err = storage.conn.execute( + "INSERT INTO frames (batch_index, frame_in_batch, created_at_ms, fee, safe_block) \ + VALUES (0, 1, 100, 1060, 0)", + [], + ); + assert!( + format!("{err:?}").contains("frames can only be inserted into the current Tip"), + "expected tip-only-frames trigger, got: {err:?}" + ); + } + + #[test] + fn schema_rejects_parent_batch_index_mutation() { + let db = temp_db("schema-parent-immutable"); + let mut storage = Storage::open(db.path.as_str()).expect("open storage"); + let mut head = storage + .initialize_open_state(0, SafeInputRange::empty_at(0)) + .expect("initialize"); + storage + .close_frame_and_batch(&mut head, 0) + .expect("close batch 0"); + // Try to change parent of batch 1 — should be rejected. + let err = storage.conn.execute( + "UPDATE batches SET parent_batch_index = NULL WHERE batch_index = 1", + [], + ); + assert!( + format!("{err:?}").contains("parent_batch_index is immutable"), + "expected parent-immutable trigger, got: {err:?}" + ); + } + + #[test] + fn nonce_reuse_after_cascade_with_valid_ancestor() { + // Beautiful part of parent-pointer + structural nonce: after a cascade + // that invalidates only the suffix (keeping an ancestor valid), the + // new Tip's parent is the last valid ancestor, so its nonce is + // `ancestor.nonce + 1` — the same nonce the invalidated suffix's + // first batch had. Nonce reuse is automatic. + // + // Scenario: batch 0 is accepted (safe_accepted_batches advances past + // nonce 0). Batch 1 is stale and triggers cascade. Batches 1, 2, 3 + // invalidated; batch 0 remains valid. 
+ let db = temp_db("nonce-reuse-with-ancestor"); + let mut storage = Storage::open(db.path.as_str()).expect("open storage"); + let batch_submitter = SENDER_A; + + let mut head = storage + .initialize_open_state(10, SafeInputRange::empty_at(0)) + .expect("initialize at safe_block=10"); + storage + .close_frame_and_batch(&mut head, 100) + .expect("close batch 0 (nonce 0)"); + storage + .close_frame_and_batch(&mut head, 100) + .expect("close batch 1 (nonce 1)"); + storage + .close_frame_and_batch(&mut head, 100) + .expect("close batch 2 (nonce 2)"); + // Head is now batch 3 (nonce 3, first_frame_safe_block=100). + + // Batch 0 lands on L1 (accepted): safe_input at block 20 with nonce 0. + storage + .append_safe_inputs( + 20, + &[StoredSafeInput { + sender: batch_submitter, + payload: make_stale_batch_payload(0, 10), + block_number: 20, + }], + &default_protocol_config(), + ) + .expect("append batch 0 submission"); + // Advance safe head so batches 1, 2, 3 (first_frame=100) are stale. + // current_safe=1400 → 1400-100=1300 >= 1200. + storage + .append_safe_inputs(1400, &[], &default_protocol_config()) + .expect("advance past threshold"); + + let inv = storage.detect_and_recover(1200).expect("recover"); + // Batches 1, 2, 3 invalidated; batch 0 (accepted) stays valid. + assert_eq!(inv, vec![1, 2, 3], "only the suffix cascades, got {inv:?}"); + + // The NEW Tip has parent=0 (the last valid ancestor), nonce=1. + // This is what nonce reuse looks like: the invalidated batch 1 had + // nonce 1; the recovery batch gets the same nonce via +1-from-parent. 
+ let (tip_nonce, tip_parent): (i64, i64) = storage + .conn + .query_row( + "SELECT nonce, parent_batch_index FROM valid_open_batch", + [], + |row| Ok((row.get(0)?, row.get(1)?)), + ) + .expect("query recovery tip"); + assert_eq!(tip_nonce, 1, "recovery Tip reuses nonce 1"); + assert_eq!(tip_parent, 0, "recovery Tip's parent is batch 0"); + } + + // ── §12.1.1 CHECK-constraint regressions ────────────────────────── + // + // These differ from the trigger-based tests above: they exercise raw + // `CHECK` clauses declared in `migrations/0001_schema.sql`. The + // type-safe `Storage` API would reject these values Rust-side; we go + // through `storage.conn.execute` to prove the schema itself refuses. + + #[test] + fn schema_rejects_safe_input_with_wrong_sender_length() { + // §12.1.1: `safe_inputs.sender` must be exactly 20 bytes (an + // Ethereum address). A shorter or longer blob must be refused + // by the schema even if it bypasses the Rust API. + let db = temp_db("schema-safe-input-sender-len"); + let storage = Storage::open(db.path.as_str()).expect("open storage"); + let err = storage.conn.execute( + "INSERT INTO safe_inputs (safe_input_index, sender, payload, block_number) \ + VALUES (0, X'DEADBEEF', X'00', 10)", + [], + ); + assert!( + format!("{err:?}").contains("CHECK constraint failed"), + "expected CHECK constraint error on safe_inputs.sender, got: {err:?}", + ); + } + + #[test] + fn schema_rejects_user_op_with_wrong_sender_length() { + // §12.1.1: `user_ops.sender` must be 20 bytes. + let db = temp_db("schema-user-op-sender-len"); + let storage = Storage::open(db.path.as_str()).expect("open storage"); + // Seed a frame to satisfy the composite FK — initialize_open_state + // creates batch 0 frame 0 as the Tip. 
+ let mut storage = storage; + storage + .initialize_open_state(0, SafeInputRange::empty_at(0)) + .expect("initialize"); + let err = storage.conn.execute( + "INSERT INTO user_ops \ + (batch_index, frame_in_batch, pos_in_frame, sender, nonce, max_fee, data, sig, received_at_ms) \ + VALUES (0, 0, 0, X'010203', 0, 0, X'', ?1, 0)", + params![vec![0u8; 65]], + ); + assert!( + format!("{err:?}").contains("CHECK constraint failed"), + "expected CHECK constraint error on user_ops.sender length, got: {err:?}", + ); + } + + #[test] + fn schema_rejects_user_op_with_wrong_signature_length() { + // §12.1.1: `user_ops.sig` must be exactly 65 bytes (secp256k1 + // r || s || v). Regression for "accidentally accepted a non-65 + // signature and crashed a downstream consumer." + let db = temp_db("schema-user-op-sig-len"); + let mut storage = Storage::open(db.path.as_str()).expect("open storage"); + storage + .initialize_open_state(0, SafeInputRange::empty_at(0)) + .expect("initialize"); + let valid_sender = vec![0u8; 20]; + let short_sig = vec![0u8; 32]; // Should be 65. + let err = storage.conn.execute( + "INSERT INTO user_ops \ + (batch_index, frame_in_batch, pos_in_frame, sender, nonce, max_fee, data, sig, received_at_ms) \ + VALUES (0, 0, 0, ?1, 0, 0, X'', ?2, 0)", + params![valid_sender, short_sig], + ); + assert!( + format!("{err:?}").contains("CHECK constraint failed"), + "expected CHECK constraint error on user_ops.sig length, got: {err:?}", + ); + } + + #[test] + fn schema_rejects_sequenced_l2_tx_with_neither_xor_branch() { + // §12.1.1: `sequenced_l2_txs` must be either a user-op row + // (user_op_pos_in_frame IS NOT NULL) or a direct-input row + // (safe_input_index IS NOT NULL), never both and never neither. + // Setting both to NULL is the clean XOR violation to test — + // FKs are only triggered on non-NULL values so we isolate the + // CHECK constraint. 
+ let db = temp_db("schema-sequenced-l2-tx-xor-neither"); + let mut storage = Storage::open(db.path.as_str()).expect("open storage"); + storage + .initialize_open_state(0, SafeInputRange::empty_at(0)) + .expect("initialize"); + let err = storage.conn.execute( + "INSERT INTO sequenced_l2_txs \ + (offset, batch_index, frame_in_batch, user_op_pos_in_frame, safe_input_index) \ + VALUES (0, 0, 0, NULL, NULL)", + [], + ); + assert!( + format!("{err:?}").contains("CHECK constraint failed"), + "expected CHECK constraint error on sequenced_l2_txs XOR, got: {err:?}", + ); + } + + #[test] + fn schema_rejects_l1_bootstrap_cache_with_zero_chain_id() { + // §12.1.1: `l1_bootstrap_cache.chain_id > 0`. chain_id = 0 would + // collide with the EIP-712 domain's unspecified-chain sentinel + // and break signature recovery; the CHECK refuses to persist it + // in the first place. + let db = temp_db("schema-bootstrap-chain-id-zero"); + let storage = Storage::open(db.path.as_str()).expect("open storage"); + let input_box = vec![0u8; 20]; + let err = storage.conn.execute( + "INSERT INTO l1_bootstrap_cache \ + (singleton_id, input_box_address, genesis_block, chain_id) \ + VALUES (0, ?1, 0, 0)", + params![input_box], + ); + assert!( + format!("{err:?}").contains("CHECK constraint failed"), + "expected CHECK constraint error on chain_id > 0, got: {err:?}", + ); + } + + #[test] + fn schema_rejects_safe_input_with_negative_block_number() { + // §12.1.1: `safe_inputs.block_number >= 0`. Catches a regression + // that would let a negative block number slip through — the rest + // of the system assumes non-negative and could panic on cast. 
+ let db = temp_db("schema-safe-input-neg-block"); + let storage = Storage::open(db.path.as_str()).expect("open storage"); + let sender = vec![0u8; 20]; + let err = storage.conn.execute( + "INSERT INTO safe_inputs (safe_input_index, sender, payload, block_number) \ + VALUES (0, ?1, X'00', -1)", + params![sender], + ); + assert!( + format!("{err:?}").contains("CHECK constraint failed"), + "expected CHECK constraint error on block_number >= 0, got: {err:?}", + ); + } +} + +mod tree_invariants { + use super::*; + + // ── §12.5 Parent-pointer tree invariants ────────────────────────────── + use crate::storage::convert::{i64_to_u64, u64_to_i64}; + use rusqlite::params; + + /// Check the tree invariants that should hold at every quiescent state: + /// - Every valid batch has `nonce = parent.nonce + 1`, or `nonce = 0` + /// with `parent_batch_index IS NULL` (genesis/post-torn-cascade). + /// - Every `parent_batch_index` either is NULL or references an + /// existing batch (FK handles this, but we assert explicitly). + /// - Walking up `parent_batch_index` from any valid batch terminates + /// at a NULL-parent row within `batch_index` hops (no cycles). + /// - The valid path is strictly contiguous in `nonce`: the set of + /// nonces among valid batches is `{0, 1, ..., max_valid_nonce}`. + /// - At most one `valid_open_batch` row exists. + fn assert_tree_invariants(storage: &mut Storage) { + // 1. Nonce = parent.nonce + 1 (or nonce=0 for NULL parent). 
+ let mut stmt = storage + .conn + .prepare( + "SELECT b.batch_index, b.parent_batch_index, b.nonce, p.nonce \ + FROM batches b LEFT JOIN batches p ON p.batch_index = b.parent_batch_index", + ) + .expect("prepare"); + let rows: Vec<(i64, Option<i64>, i64, Option<i64>)> = stmt + .query_map([], |row| { + Ok((row.get(0)?, row.get(1)?, row.get(2)?, row.get(3)?)) + }) + .expect("query") + .collect::<Result<Vec<_>, _>>() + .expect("collect"); + drop(stmt); + for (bi, parent, nonce, parent_nonce) in &rows { + match (parent, parent_nonce) { + (None, _) => assert_eq!( + *nonce, 0, + "batch {bi}: NULL parent must have nonce 0, got {nonce}" + ), + (Some(_), None) => panic!("batch {bi}: parent exists but parent row missing"), + (Some(_), Some(pn)) => assert_eq!( + *nonce, + pn + 1, + "batch {bi}: nonce={nonce}, expected parent.nonce+1 = {}", + pn + 1 + ), + } + } + + // 2. At most one valid open batch. + let open_count: i64 = storage + .conn + .query_row("SELECT COUNT(*) FROM valid_open_batch", [], |row| { + row.get(0) + }) + .expect("count open"); + assert!(open_count <= 1, "more than one valid Tip: {open_count}"); + + // 3. Valid-path nonce contiguity: nonces on the valid chain are 0..N. + let mut valid_nonces: Vec<i64> = storage + .conn + .prepare("SELECT nonce FROM valid_batches ORDER BY nonce ASC") + .expect("prepare") + .query_map([], |row| row.get::<_, i64>(0)) + .expect("query") + .collect::<Result<Vec<_>, _>>() + .expect("collect"); + // There can be multiple valid batches with the SAME nonce only if + // they live on different branches — but we don't allow that; valid + // batches form a strict chain. So dedup-and-equal means contiguous. + valid_nonces.sort(); + valid_nonces.dedup(); + for (i, &n) in valid_nonces.iter().enumerate() { + assert_eq!( + n, i as i64, + "valid nonces not contiguous: got {valid_nonces:?}" + ); + } + + // 4. Parent walk terminates at NULL in ≤ batch_index hops for every valid row. 
+ for (bi, _, _, _) in &rows { + let mut cur: i64 = *bi; + let bi_u = i64_to_u64(*bi); + for _ in 0..=bi_u { + let parent: Option<i64> = storage + .conn + .query_row( + "SELECT parent_batch_index FROM batches WHERE batch_index = ?1", + params![cur], + |row| row.get(0), + ) + .expect("parent lookup"); + match parent { + None => break, + Some(p) => { + assert!( + p < cur, + "batch {bi}: parent-walk went backward ({p} >= {cur}) — cycle?" + ); + cur = p; + } + } + } + } + } + + #[test] + fn tree_invariants_hold_across_mixed_workload() { + // Exercises every mutating code path: genesis, rotations, partial + // cascades (ancestor survives), cascades across accepted frontier, + // torn cascades (no valid ancestor), and back-to-back generations. + // Asserts tree invariants after each step. + let db = temp_db("tree-invariants-workload"); + let mut storage = Storage::open(db.path.as_str()).expect("open storage"); + let batch_submitter = SENDER_A; + + // Phase 1: genesis + 4 rotations. Simple chain. + let mut head = storage + .initialize_open_state(10, SafeInputRange::empty_at(0)) + .expect("initialize"); + assert_tree_invariants(&mut storage); + for _ in 0..4 { + storage + .close_frame_and_batch(&mut head, 100) + .expect("close"); + assert_tree_invariants(&mut storage); + } + // Tree: 0(Gold sentinel in concept)→1→2→3→4 (Tip) + + // Phase 2: cascade with a valid ancestor. Batch 0 is accepted first. + storage + .append_safe_inputs( + 20, + &[StoredSafeInput { + sender: batch_submitter, + payload: make_stale_batch_payload(0, 10), + block_number: 20, + }], + &default_protocol_config(), + ) + .expect("append accepted"); + storage + .append_safe_inputs(1400, &[], &default_protocol_config()) + .expect("advance past threshold"); + let inv = storage.detect_and_recover(1200).expect("recover"); + assert!(!inv.is_empty(), "partial cascade should invalidate"); + assert_tree_invariants(&mut storage); + + // Phase 3: more rotations after partial cascade. 
+ let mut head = storage.open_state().expect("load").unwrap(); + for _ in 0..3 { + storage + .close_frame_and_batch(&mut head, 1500) + .expect("close gen2"); + assert_tree_invariants(&mut storage); + } + + // Phase 4: torn cascade — invalidate everything including batch 0. + let latest = storage.latest_batch_index().expect("latest").unwrap(); + for bi in 0..=latest { + storage.insert_invalid_batch(bi).expect("invalidate"); + } + storage.detect_and_recover(1200).expect("recover from torn"); + assert_tree_invariants(&mut storage); + + // Phase 5: rotations after torn cascade — new Tip has parent=NULL, nonce=0. + let mut head = storage.open_state().expect("load").unwrap(); + for _ in 0..5 { + storage + .close_frame_and_batch(&mut head, 2000) + .expect("close gen3"); + assert_tree_invariants(&mut storage); + } + } + + #[test] + fn subtree_by_batch_index_equals_subtree_by_parent_walk() { + // §12.5.2: cascade queries use `batch_index >= N` as a shortcut for + // "subtree rooted at N". This test asserts the equivalence on a + // realistic scenario with multiple cascade generations. + let db = temp_db("subtree-equivalence"); + let mut storage = Storage::open(db.path.as_str()).expect("open storage"); + let batch_submitter = SENDER_A; + + // Build: 5 batches, cascade from 2 (partial), 3 more, cascade from 1 (torn-ish). 
+ let mut head = storage + .initialize_open_state(10, SafeInputRange::empty_at(0)) + .expect("initialize"); + for _ in 0..4 { + storage + .close_frame_and_batch(&mut head, 100) + .expect("close"); + } + storage + .append_safe_inputs( + 20, + &[StoredSafeInput { + sender: batch_submitter, + payload: make_stale_batch_payload(0, 10), + block_number: 20, + }], + &default_protocol_config(), + ) + .expect("append accepted"); + storage + .append_safe_inputs(1400, &[], &default_protocol_config()) + .expect("advance"); + let _ = storage.detect_and_recover(1200).expect("cascade 1"); + + let mut head = storage.open_state().expect("load").unwrap(); + for _ in 0..2 { + storage + .close_frame_and_batch(&mut head, 1500) + .expect("close"); + } + + // Assert equivalence among VALID batches for every valid N. + // Restricting both sides to `valid_batches` is the invariant cascade + // relies on: its WHERE filters invalidated rows, so the two sets need + // only agree on the valid subset. + let valid_bi: Vec<u64> = { + let mut stmt = storage + .conn + .prepare("SELECT batch_index FROM valid_batches ORDER BY batch_index") + .expect("prepare"); + stmt.query_map([], |row| row.get::<_, i64>(0).map(i64_to_u64)) + .expect("query") + .collect::<Result<Vec<_>, _>>() + .expect("collect") + }; + for &n in &valid_bi { + let by_index: Vec<u64> = { + let mut stmt = storage + .conn + .prepare( + "SELECT batch_index FROM valid_batches \ + WHERE batch_index >= ?1 ORDER BY batch_index", + ) + .expect("prepare"); + stmt.query_map(params![u64_to_i64(n)], |row| { + row.get::<_, i64>(0).map(i64_to_u64) + }) + .expect("query") + .collect::<Result<Vec<_>, _>>() + .expect("collect") + }; + let by_subtree: Vec<u64> = { + let mut stmt = storage + .conn + .prepare( + "WITH RECURSIVE subtree(batch_index) AS ( \ + SELECT batch_index FROM valid_batches WHERE batch_index = ?1 \ + UNION ALL \ + SELECT b.batch_index FROM valid_batches b \ + JOIN subtree s ON b.parent_batch_index = s.batch_index \ + ) \ + SELECT batch_index FROM subtree ORDER BY batch_index", + ) + 
.expect("prepare"); + stmt.query_map(params![u64_to_i64(n)], |row| { + row.get::<_, i64>(0).map(i64_to_u64) + }) + .expect("query") + .collect::<Result<Vec<_>, _>>() + .expect("collect") + }; + assert_eq!( + by_index, by_subtree, + "cascade root {n}: valid batch_index >= N diverged from valid parent-walk subtree" + ); + } + } +} diff --git a/sequencer/src/storage/safe_accepted_batches.rs b/sequencer/src/storage/safe_accepted_batches.rs new file mode 100644 index 0000000..751726d --- /dev/null +++ b/sequencer/src/storage/safe_accepted_batches.rs @@ -0,0 +1,137 @@ +// (c) Cartesi and individual authors (see AUTHORS) +// SPDX-License-Identifier: Apache-2.0 (see LICENSE) + +//! Materialized view of the scheduler-accepted batches. +//! +//! `safe_accepted_batches` caches the prefix of submitted batches that the +//! on-chain scheduler would accept, based on an off-chain simulation of its +//! acceptance rules (see [`sequencer_core::protocol::ProtocolConfig`]). +//! +//! Maintenance contract: the view is advanced atomically with each +//! [`super::Storage::append_safe_inputs`] write, so any reader that sees +//! `l1_safe_head` at block B also sees every acceptance decision up to B. No +//! caller should populate this view directly. +//! +//! Readers: +//! - batch submitter frontier / danger reads (`submitter_frontier`, +//! `check_danger`) +//! - recovery cascade (`find_closed_frontier_batch_in_danger`) +//! - wall-clock and stalled-safe-head danger estimates +//! +//! The only writer is [`populate_safe_accepted_batches`], invoked from +//! `append_safe_inputs` inside its transaction. + +use rusqlite::{Connection, OptionalExtension, Result, params}; + +use super::convert::{i64_to_u64, u64_to_i64}; + +use sequencer_core::protocol::{ProtocolConfig, SafeInputView}; + +/// One row of `safe_accepted_batches`, exposing just the columns the +/// frontier-read code paths need. 
+#[derive(Debug, Clone, Copy)] +pub(super) struct SafeAcceptedBatchRow { + pub safe_input_index: i64, + pub nonce: i64, +} + +/// The most recently accepted row, or `None` if the view is empty. +pub(super) fn query_latest_safe_accepted_batch( + conn: &Connection, +) -> Result<Option<SafeAcceptedBatchRow>> { + conn.query_row( + "SELECT safe_input_index, nonce FROM safe_accepted_batches \ + ORDER BY safe_input_index DESC LIMIT 1", + [], + |row| { + Ok(SafeAcceptedBatchRow { + safe_input_index: row.get(0)?, + nonce: row.get(1)?, + }) + }, + ) + .optional() +} + +/// Simulate the scheduler's acceptance logic over new safe inputs and append +/// matches to `safe_accepted_batches`. +/// +/// Paginates through `safe_inputs` rows newer than the cursor (latest accepted +/// row), pre-filtered at SQL to the batch-submitter's sender. For each row, +/// delegates to [`ProtocolConfig::scheduler_accepts`] with the currently-expected +/// nonce — on `Some`, inserts the accepted row and advances expected; on +/// `None`, moves on. The SQL sender filter is an optimization; `scheduler_accepts` +/// re-checks defensively, so the filter is correctness-neutral. +/// +/// Paginated to bound memory. The cursor tracks the scan regardless of +/// acceptance, so a long run of rejected rows between acceptances still +/// makes forward progress. 
+pub(super) fn populate_safe_accepted_batches( + conn: &Connection, + protocol: &ProtocolConfig, +) -> Result<()> { + const PAGE_SIZE: i64 = 256; + const SELECT_SQL: &str = "SELECT safe_input_index, payload, block_number \ + FROM safe_inputs \ + WHERE sender = ?1 AND safe_input_index > ?2 \ + ORDER BY safe_input_index ASC LIMIT ?3"; + const INSERT_SQL: &str = "INSERT OR IGNORE INTO safe_accepted_batches \ + (safe_input_index, nonce, first_frame_safe_block, inclusion_block) \ + VALUES (?1, ?2, ?3, ?4)"; + + let latest_accepted = query_latest_safe_accepted_batch(conn)?; + let mut cursor = latest_accepted + .map(|row| row.safe_input_index) + .unwrap_or(-1); + let mut expected = latest_accepted + .map(|row| i64_to_u64(row.nonce).saturating_add(1)) + .unwrap_or(0); + + loop { + // Materialize one page before executing any INSERTs. rusqlite's row + // iterator borrows the prepared statement, so we can't INSERT on the + // same connection while iterating. Once the page is collected and the + // statement is dropped, the connection is free for inserts. + let page: Vec<(i64, Vec<u8>, i64)> = { + let mut stmt = conn.prepare_cached(SELECT_SQL)?; + stmt.query_map( + params![protocol.batch_submitter.as_slice(), cursor, PAGE_SIZE,], + |row| Ok((row.get(0)?, row.get(1)?, row.get(2)?)), + )? + .collect::<Result<Vec<_>, _>>()? 
+ }; + + if page.is_empty() { + break; + } + let page_len = page.len() as i64; + + for (safe_input_index, payload, block_number) in &page { + cursor = *safe_input_index; + let input = SafeInputView { + safe_input_index: i64_to_u64(*safe_input_index), + sender: protocol.batch_submitter, + payload: payload.as_slice(), + inclusion_block: i64_to_u64(*block_number), + }; + let Some(accepted) = protocol.scheduler_accepts(input, expected) else { + continue; + }; + conn.execute( + INSERT_SQL, + params![ + u64_to_i64(accepted.safe_input_index), + u64_to_i64(accepted.nonce), + u64_to_i64(accepted.first_frame_safe_block), + u64_to_i64(accepted.inclusion_block), + ], + )?; + expected = expected.saturating_add(1); + } + + if page_len < PAGE_SIZE { + break; + } + } + + Ok(()) +} diff --git a/sequencer/src/storage/sql.rs b/sequencer/src/storage/sql.rs deleted file mode 100644 index 556fdbb..0000000 --- a/sequencer/src/storage/sql.rs +++ /dev/null @@ -1,836 +0,0 @@ -// (c) Cartesi and individual authors (see AUTHORS) -// SPDX-License-Identifier: Apache-2.0 (see LICENSE) - -use rusqlite::{Connection, Result, Row, Transaction, params}; -use std::time::{SystemTime, UNIX_EPOCH}; - -use super::{SafeInputRange, StoredSafeInput}; -use crate::inclusion_lane::PendingUserOp; - -const SQL_SELECT_SAFE_INPUTS_RANGE: &str = include_str!("queries/select_safe_inputs_range.sql"); -const SQL_SELECT_ORDERED_L2_TXS_FROM_OFFSET: &str = - include_str!("queries/select_ordered_l2_txs_from_offset.sql"); -const SQL_SELECT_ORDERED_L2_TXS_PAGE_FROM_OFFSET: &str = - include_str!("queries/select_ordered_l2_txs_page_from_offset.sql"); -const SQL_SELECT_LATEST_BATCH_WITH_USER_OP_COUNT: &str = - include_str!("queries/select_latest_batch_with_user_op_count.sql"); -const SQL_SELECT_LATEST_FRAME_IN_BATCH_FOR_BATCH: &str = - include_str!("queries/select_latest_frame_in_batch_for_batch.sql"); -const SQL_SELECT_USER_OP_COUNT_FOR_FRAME: &str = - include_str!("queries/select_user_op_count_for_frame.sql"); -const 
SQL_SELECT_ORDERED_L2_TXS_FOR_BATCH: &str = - include_str!("queries/select_ordered_l2_txs_for_batch.sql"); -const SQL_SELECT_LATEST_BATCH_INDEX: &str = "SELECT MAX(batch_index) FROM batches"; -const SQL_SELECT_USER_OPS_FOR_FRAME: &str = "SELECT nonce, max_fee, data, sig FROM user_ops WHERE batch_index = ?1 AND frame_in_batch = ?2 ORDER BY pos_in_frame ASC"; -const SQL_SELECT_MAX_SAFE_INPUT_INDEX: &str = "SELECT MAX(safe_input_index) FROM safe_inputs"; -const SQL_SELECT_ORDERED_L2_TX_COUNT: &str = "SELECT COUNT(*) FROM sequenced_l2_txs"; -const SQL_SELECT_BATCH_POLICY: &str = "SELECT log_recommended_fee, log_batch_size_target FROM batch_policy_derived WHERE singleton_id = 0 LIMIT 1"; -const SQL_SELECT_SAFE_BLOCK: &str = - "SELECT block_number FROM l1_safe_head WHERE singleton_id = 0 LIMIT 1"; -const SQL_INSERT_SAFE_INPUT: &str = "INSERT INTO safe_inputs (safe_input_index, sender, payload, block_number) VALUES (?1, ?2, ?3, ?4)"; -const SQL_INSERT_USER_OP: &str = include_str!("queries/insert_user_op.sql"); -const SQL_INSERT_SEQUENCED_DIRECT_INPUT: &str = - include_str!("queries/insert_sequenced_direct_input.sql"); -const SQL_UPDATE_BATCH_POLICY_LOG_GAS_PRICE: &str = - "UPDATE batch_policy SET log_gas_price = ?1 WHERE singleton_id = 0"; -const SQL_UPDATE_BATCH_POLICY_ALPHA: &str = - "UPDATE batch_policy SET log_alpha = ?1, log_one_plus_alpha = ?2 WHERE singleton_id = 0"; -const SQL_UPDATE_SAFE_BLOCK: &str = - "UPDATE l1_safe_head SET block_number = ?1 WHERE singleton_id = 0"; -#[derive(Debug, Clone)] -pub(super) struct OrderedL2TxRow { - pub kind: i64, - pub sender: Option>, - pub data: Option>, - pub fee: Option, - pub payload: Option>, - pub block_number: Option, -} - -#[derive(Debug, Clone)] -pub(super) struct SafeInputRow { - pub safe_input_index: i64, - pub sender: Vec, - pub payload: Vec, - pub block_number: i64, -} - -#[derive(Debug, Clone)] -pub(super) struct FrameHeaderRow { - pub frame_in_batch: i64, - pub fee: i64, - pub safe_block: i64, -} - 
-#[derive(Debug, Clone)] -pub(super) struct FrameUserOpRow { - pub nonce: i64, - pub max_fee: i64, - pub data: Vec, - pub sig: Vec, -} - -pub(super) fn sql_select_total_drained_direct_inputs(conn: &Connection) -> Result { - const SQL: &str = "SELECT COUNT(*) FROM sequenced_l2_txs WHERE safe_input_index IS NOT NULL"; - conn.query_row(SQL, [], |row| row.get(0)) -} - -pub(super) fn sql_select_max_safe_input_index(conn: &Connection) -> Result> { - conn.query_row( - SQL_SELECT_MAX_SAFE_INPUT_INDEX, - [], - convert_row_to_optional_i64, - ) -} - -pub(super) fn sql_select_latest_batch_index(conn: &Connection) -> Result> { - conn.query_row( - SQL_SELECT_LATEST_BATCH_INDEX, - [], - convert_row_to_optional_i64, - ) -} - -/// Derived batch policy: (log_recommended_fee, log_batch_size_target). -pub(super) fn sql_select_batch_policy(conn: &Connection) -> Result<(i64, i64)> { - conn.query_row(SQL_SELECT_BATCH_POLICY, [], |row| { - Ok((row.get(0)?, row.get(1)?)) - }) -} - -pub(super) fn sql_update_batch_policy_log_gas_price( - conn: &Connection, - log_gas_price: i64, -) -> Result { - conn.execute( - SQL_UPDATE_BATCH_POLICY_LOG_GAS_PRICE, - params![log_gas_price], - ) -} - -pub(super) fn sql_update_batch_policy_alpha( - conn: &Connection, - log_alpha: i64, - log_one_plus_alpha: i64, -) -> Result { - conn.execute( - SQL_UPDATE_BATCH_POLICY_ALPHA, - params![log_alpha, log_one_plus_alpha], - ) -} - -pub(super) fn sql_select_safe_block(conn: &Connection) -> Result { - conn.query_row(SQL_SELECT_SAFE_BLOCK, [], |row| row.get(0)) -} - -pub(super) fn sql_update_safe_block(conn: &Connection, safe_block: i64) -> Result { - conn.execute(SQL_UPDATE_SAFE_BLOCK, params![safe_block]) -} - -pub(super) fn sql_select_safe_inputs_range( - conn: &Connection, - from_inclusive: i64, - to_exclusive: i64, -) -> Result> { - let mut stmt = conn.prepare_cached(SQL_SELECT_SAFE_INPUTS_RANGE)?; - let mapped = stmt.query_map( - params![from_inclusive, to_exclusive], - convert_row_to_safe_input_row, - )?; - 
mapped.collect() -} - -pub(super) fn sql_select_frames_for_batch( - conn: &Connection, - batch_index: i64, -) -> Result> { - const SQL: &str = "SELECT frame_in_batch, fee, safe_block FROM frames WHERE batch_index = ?1 ORDER BY frame_in_batch ASC"; - let mut stmt = conn.prepare_cached(SQL)?; - let mapped = stmt.query_map(params![batch_index], convert_row_to_frame_header_row)?; - mapped.collect() -} - -pub(super) fn sql_select_user_ops_for_frame( - conn: &Connection, - batch_index: i64, - frame_in_batch: i64, -) -> Result> { - let mut stmt = conn.prepare_cached(SQL_SELECT_USER_OPS_FOR_FRAME)?; - let mapped = stmt.query_map( - params![batch_index, frame_in_batch], - convert_row_to_frame_user_op_row, - )?; - mapped.collect() -} - -pub(super) fn sql_insert_safe_inputs_batch( - tx: &Transaction<'_>, - start_index: u64, - safe_inputs: &[StoredSafeInput], -) -> Result<()> { - if safe_inputs.is_empty() { - return Ok(()); - } - - let mut stmt = tx.prepare_cached(SQL_INSERT_SAFE_INPUT)?; - for (offset, input) in safe_inputs.iter().enumerate() { - stmt.execute(params![ - u64_to_i64(start_index.saturating_add(offset as u64)), - input.sender.as_slice(), - input.payload.as_slice(), - u64_to_i64(input.block_number) - ])?; - } - Ok(()) -} - -/// Insert user-ops into the `user_ops` table. -/// The `trg_sequence_user_op` trigger automatically appends a corresponding row -/// to `sequenced_l2_txs` for each inserted user-op. 
-pub(super) fn sql_insert_user_ops_batch( - tx: &Transaction<'_>, - batch_index: i64, - frame_in_batch: i64, - frame_pos_start: u32, - user_ops: &[PendingUserOp], -) -> Result<()> { - if user_ops.is_empty() { - return Ok(()); - } - - let mut stmt = tx.prepare_cached(SQL_INSERT_USER_OP)?; - for (offset, item) in user_ops.iter().enumerate() { - let pos_in_frame = frame_pos_start.saturating_add(offset as u32); - let sig = item.signed.signature.as_bytes(); - stmt.execute(params![ - batch_index, - frame_in_batch, - i64::from(pos_in_frame), - item.signed.sender.as_slice(), - i64::from(item.signed.user_op.nonce), - i64::from(item.signed.user_op.max_fee), - item.signed.user_op.data.as_ref(), - &sig[..], - to_unix_ms(item.received_at), - ])?; - } - Ok(()) -} - -pub(super) fn sql_insert_sequenced_direct_inputs( - tx: &Transaction<'_>, - batch_index: i64, - frame_in_batch: i64, - direct_range: SafeInputRange, -) -> Result<()> { - if direct_range.is_empty() { - return Ok(()); - } - - let mut stmt = tx.prepare_cached(SQL_INSERT_SEQUENCED_DIRECT_INPUT)?; - for safe_input_index in direct_range.start_inclusive..direct_range.end_exclusive { - stmt.execute(params![ - batch_index, - frame_in_batch, - u64_to_i64(safe_input_index), - ])?; - } - Ok(()) -} - -pub(super) fn sql_select_ordered_l2_txs_from_offset( - conn: &Connection, - offset: i64, -) -> Result> { - let mut stmt = conn.prepare_cached(SQL_SELECT_ORDERED_L2_TXS_FROM_OFFSET)?; - let mapped = stmt.query_map(params![offset], convert_row_to_ordered_l2_tx_row)?; - mapped.collect() -} - -pub(super) fn sql_select_ordered_l2_txs_for_batch( - conn: &Connection, - batch_index: i64, -) -> Result> { - let mut stmt = conn.prepare_cached(SQL_SELECT_ORDERED_L2_TXS_FOR_BATCH)?; - let mapped = stmt.query_map(params![batch_index], convert_row_to_ordered_l2_tx_row)?; - mapped.collect() -} - -pub(super) fn sql_select_ordered_l2_txs_page_from_offset( - conn: &Connection, - offset: i64, - limit: i64, -) -> Result> { - let mut stmt = 
conn.prepare_cached(SQL_SELECT_ORDERED_L2_TXS_PAGE_FROM_OFFSET)?; - let mapped = stmt.query_map(params![offset, limit], convert_row_to_ordered_l2_tx_row)?; - mapped.collect() -} - -pub(super) fn sql_select_ordered_l2_tx_count(conn: &Connection) -> Result { - conn.query_row(SQL_SELECT_ORDERED_L2_TX_COUNT, [], |row| row.get(0)) -} - -pub(super) fn sql_select_latest_batch_with_user_op_count( - tx: &Transaction<'_>, -) -> Result<(i64, i64, i64)> { - tx.query_row( - SQL_SELECT_LATEST_BATCH_WITH_USER_OP_COUNT, - [], - convert_row_to_latest_batch_with_user_op_count, - ) -} - -pub(super) fn sql_select_latest_frame_in_batch_for_batch( - tx: &Transaction<'_>, - batch_index: i64, -) -> Result<(i64, i64, i64)> { - tx.query_row( - SQL_SELECT_LATEST_FRAME_IN_BATCH_FOR_BATCH, - params![batch_index], - |row| Ok((row.get(0)?, row.get(1)?, row.get(2)?)), - ) -} - -pub(super) fn sql_count_user_ops_for_frame( - tx: &Transaction<'_>, - batch_index: i64, - frame_in_batch: i64, -) -> Result { - tx.query_row( - SQL_SELECT_USER_OP_COUNT_FOR_FRAME, - params![batch_index, frame_in_batch], - |row| row.get(0), - ) -} - -pub(super) fn sql_insert_open_batch(tx: &Transaction<'_>, created_at_ms: i64) -> Result { - const SQL: &str = "INSERT INTO batches (created_at_ms) VALUES (?1)"; - tx.execute(SQL, params![created_at_ms]) -} - -pub(super) fn sql_insert_open_batch_with_index( - tx: &Transaction<'_>, - batch_index: i64, - created_at_ms: i64, -) -> Result { - const SQL: &str = "INSERT INTO batches (batch_index, created_at_ms) VALUES (?1, ?2)"; - tx.execute(SQL, params![batch_index, created_at_ms]) -} - -pub(super) fn sql_insert_open_frame( - tx: &Transaction<'_>, - batch_index: i64, - frame_in_batch: i64, - created_at_ms: i64, - fee: i64, - safe_block: i64, -) -> Result { - const SQL: &str = "INSERT INTO frames (batch_index, frame_in_batch, created_at_ms, fee, safe_block) VALUES (?1, ?2, ?3, ?4, ?5)"; - tx.execute( - SQL, - params![batch_index, frame_in_batch, created_at_ms, fee, safe_block], - ) -} 
- -fn convert_row_to_optional_i64(row: &Row<'_>) -> Result> { - row.get(0) -} - -fn convert_row_to_safe_input_row(row: &Row<'_>) -> Result { - Ok(SafeInputRow { - safe_input_index: row.get(0)?, - sender: row.get(1)?, - payload: row.get(2)?, - block_number: row.get(3)?, - }) -} - -fn convert_row_to_frame_header_row(row: &Row<'_>) -> Result { - Ok(FrameHeaderRow { - frame_in_batch: row.get(0)?, - fee: row.get(1)?, - safe_block: row.get(2)?, - }) -} - -fn convert_row_to_frame_user_op_row(row: &Row<'_>) -> Result { - Ok(FrameUserOpRow { - nonce: row.get(0)?, - max_fee: row.get(1)?, - data: row.get(2)?, - sig: row.get(3)?, - }) -} - -fn convert_row_to_ordered_l2_tx_row(row: &Row<'_>) -> Result { - Ok(OrderedL2TxRow { - kind: row.get(0)?, - sender: row.get(1)?, - data: row.get(2)?, - fee: row.get(3)?, - payload: row.get(4)?, - block_number: row.get(5)?, - }) -} - -fn convert_row_to_latest_batch_with_user_op_count(row: &Row<'_>) -> Result<(i64, i64, i64)> { - Ok((row.get(0)?, row.get(1)?, row.get(2)?)) -} - -fn to_unix_ms(time: SystemTime) -> i64 { - time.duration_since(UNIX_EPOCH) - .unwrap_or_default() - .as_millis() - .try_into() - .unwrap_or(i64::MAX) -} - -fn u64_to_i64(value: u64) -> i64 { - i64::try_from(value).unwrap_or(i64::MAX) -} - -#[cfg(test)] -mod tests { - use super::{ - FrameHeaderRow, SQL_INSERT_SAFE_INPUT, SQL_INSERT_SEQUENCED_DIRECT_INPUT, - SQL_INSERT_USER_OP, sql_insert_open_batch, sql_insert_open_batch_with_index, - sql_insert_open_frame, sql_insert_safe_inputs_batch, sql_insert_sequenced_direct_inputs, - sql_insert_user_ops_batch, sql_select_batch_policy, sql_select_frames_for_batch, - sql_select_latest_batch_index, sql_select_latest_batch_with_user_op_count, - sql_select_max_safe_input_index, sql_select_ordered_l2_tx_count, - sql_select_ordered_l2_txs_from_offset, sql_select_ordered_l2_txs_page_from_offset, - sql_select_safe_block, sql_select_safe_inputs_range, - sql_select_total_drained_direct_inputs, sql_select_user_ops_for_frame, - 
sql_update_batch_policy_alpha, sql_update_batch_policy_log_gas_price, - sql_update_safe_block, - }; - use crate::inclusion_lane::PendingUserOp; - use crate::storage::db::Storage; - use crate::storage::{SafeInputRange, StoredSafeInput}; - use alloy_primitives::{Address, Signature}; - use rusqlite::{Connection, params}; - use sequencer_core::user_op::{SignedUserOp, UserOp}; - use std::time::SystemTime; - use tokio::sync::oneshot; - - fn setup_conn() -> Connection { - let mut conn = Connection::open_in_memory().expect("open in-memory sqlite"); - Storage::run_migrations(&mut conn).expect("run migrations"); - conn - } - - fn sample_pending_user_op(seed: u8, nonce: u32, max_fee: u16) -> PendingUserOp { - let sender = Address::from_slice(&[seed; 20]); - let signature = Signature::test_signature(); - let (respond_to, _recv) = oneshot::channel(); - PendingUserOp { - signed: SignedUserOp { - sender, - signature, - user_op: UserOp { - nonce, - max_fee, - data: vec![seed].into(), - }, - }, - respond_to, - received_at: SystemTime::now(), - } - } - - fn seed_open_batch0_frame0(conn: &mut Connection) { - let tx = conn.transaction().expect("start tx"); - sql_insert_open_batch_with_index(&tx, 0, 123).expect("insert batch 0"); - sql_insert_open_frame(&tx, 0, 0, 123, 0, 0).expect("insert frame 0"); - tx.commit().expect("commit tx"); - } - - #[test] - fn max_index_helpers_work_for_empty_and_non_empty_tables() { - let mut conn = setup_conn(); - - assert_eq!( - sql_select_total_drained_direct_inputs(&conn).expect("total drained"), - 0 - ); - assert_eq!( - sql_select_max_safe_input_index(&conn).expect("query max direct input"), - None - ); - - conn.execute( - SQL_INSERT_SAFE_INPUT, - params![0_i64, vec![0x11_u8; 20], vec![0xaa_u8], 10_i64], - ) - .expect("insert direct input 0"); - conn.execute( - SQL_INSERT_SAFE_INPUT, - params![1_i64, vec![0x22_u8; 20], vec![0xbb_u8], 11_i64], - ) - .expect("insert direct input 1"); - assert_eq!( - sql_select_max_safe_input_index(&conn).expect("query 
max direct input"), - Some(1) - ); - - seed_open_batch0_frame0(&mut conn); - let tx = conn.transaction().expect("start tx"); - tx.execute( - SQL_INSERT_SEQUENCED_DIRECT_INPUT, - params![0_i64, 0_i64, 0_i64], - ) - .expect("insert sequenced direct input"); - tx.commit().expect("commit tx"); - - assert_eq!( - sql_select_total_drained_direct_inputs(&conn).expect("total drained"), - 1 - ); - - let tx = conn.transaction().expect("start tx"); - assert_eq!( - sql_select_max_safe_input_index(&tx).expect("query max direct input in tx"), - Some(1) - ); - } - - #[test] - fn safe_inputs_range_is_half_open_and_ordered() { - let conn = setup_conn(); - - conn.execute( - SQL_INSERT_SAFE_INPUT, - params![0_i64, vec![0x11_u8; 20], vec![0xaa_u8], 10_i64], - ) - .expect("insert direct input 0"); - conn.execute( - SQL_INSERT_SAFE_INPUT, - params![1_i64, vec![0x22_u8; 20], vec![0xbb_u8], 11_i64], - ) - .expect("insert direct input 1"); - conn.execute( - SQL_INSERT_SAFE_INPUT, - params![2_i64, vec![0x33_u8; 20], vec![0xcc_u8], 12_i64], - ) - .expect("insert direct input 2"); - - let empty = sql_select_safe_inputs_range(&conn, 1, 1).expect("query empty interval"); - assert!(empty.is_empty()); - - let rows = sql_select_safe_inputs_range(&conn, 0, 2).expect("query non-empty interval"); - assert_eq!(rows.len(), 2); - assert_eq!(rows[0].safe_input_index, 0); - assert_eq!(rows[1].safe_input_index, 1); - } - - #[test] - fn ordered_l2_query_follows_sequenced_offset_order() { - let mut conn = setup_conn(); - seed_open_batch0_frame0(&mut conn); - - conn.execute( - SQL_INSERT_USER_OP, - params![ - 0_i64, - 0_i64, - 0_i64, - vec![0x20_u8; 20], - 0_i64, - 1_i64, - vec![0x30_u8], - vec![0x40_u8; 65], - 0_i64 - ], - ) - .expect("insert user op"); - // The trg_sequence_user_op trigger automatically inserts the sequenced row. 
- conn.execute( - SQL_INSERT_SAFE_INPUT, - params![0_i64, vec![0x11_u8; 20], vec![0xaa_u8], 10_i64], - ) - .expect("insert direct input"); - conn.execute( - SQL_INSERT_SEQUENCED_DIRECT_INPUT, - params![0_i64, 0_i64, 0_i64], - ) - .expect("insert sequenced direct input"); - - let rows = sql_select_ordered_l2_txs_from_offset(&conn, 0).expect("query ordered l2"); - assert_eq!(rows.len(), 2); - assert_eq!(rows[0].kind, 0); - assert_eq!(rows[0].fee, Some(0)); - assert_eq!(rows[1].kind, 1); - assert_eq!(rows[1].fee, None); - - let paged = sql_select_ordered_l2_txs_page_from_offset(&conn, 1, 1).expect("query page"); - assert_eq!(paged.len(), 1); - assert_eq!(paged[0].kind, 1); - assert_eq!( - sql_select_ordered_l2_tx_count(&conn).expect("query ordered count"), - 2 - ); - } - - #[test] - fn batch_and_frame_helpers_start_empty_before_lane_initialization() { - let mut conn = setup_conn(); - let tx = conn.transaction().expect("start tx"); - - let err = sql_select_latest_batch_with_user_op_count(&tx).expect_err("no batch yet"); - assert!(matches!(err, rusqlite::Error::QueryReturnedNoRows)); - } - - #[test] - fn latest_batch_index_and_frames_for_batch_helpers_work() { - let mut conn = setup_conn(); - // No batches yet. - assert_eq!( - sql_select_latest_batch_index(&conn).expect("query latest batch nonce"), - None - ); - - // Seed batch 0 / frame 0, then batch 1 / frame 0. 
- seed_open_batch0_frame0(&mut conn); - { - let tx = conn.transaction().expect("start tx"); - sql_insert_open_batch(&tx, 456).expect("insert batch 1"); - let next_batch = tx.last_insert_rowid(); - sql_insert_open_frame(&tx, next_batch, 0, 456, 3, 5) - .expect("insert frame 0 for batch 1"); - tx.commit().expect("commit tx"); - } - - let latest = sql_select_latest_batch_index(&conn) - .expect("query latest batch nonce") - .expect("latest batch should exist"); - assert_eq!(latest, 1); - - let frames = sql_select_frames_for_batch(&conn, 1).expect("query frames for batch 1"); - assert_eq!(frames.len(), 1); - let FrameHeaderRow { - frame_in_batch, - fee, - safe_block, - } = frames[0].clone(); - assert_eq!(frame_in_batch, 0); - assert_eq!(fee, 3); - assert_eq!(safe_block, 5); - } - - #[test] - fn user_ops_for_frame_helper_returns_ordered_rows() { - let mut conn = setup_conn(); - seed_open_batch0_frame0(&mut conn); - - // Insert two user-ops with different pos_in_frame values. - conn.execute( - SQL_INSERT_USER_OP, - params![ - 0_i64, - 0_i64, - 1_i64, - vec![0x10_u8; 20], - 0_i64, - 1_i64, - vec![0x01_u8], - vec![0x55_u8; 65], - 0_i64 - ], - ) - .expect("insert first user op"); - conn.execute( - SQL_INSERT_USER_OP, - params![ - 0_i64, - 0_i64, - 0_i64, - vec![0x20_u8; 20], - 1_i64, - 2_i64, - vec![0x02_u8], - vec![0x66_u8; 65], - 0_i64 - ], - ) - .expect("insert second user op"); - - let rows = sql_select_user_ops_for_frame(&conn, 0, 0).expect("query user ops for frame"); - assert_eq!(rows.len(), 2); - // Ordered by pos_in_frame ASC: nonce 1 comes from pos 1, then nonce 0 from pos 0. 
- assert_eq!(rows[0].nonce, 1); - assert_eq!(rows[1].nonce, 0); - } - - #[test] - fn open_batch_and_frame_insert_helpers_work() { - let mut conn = setup_conn(); - let tx = conn.transaction().expect("start tx"); - - sql_insert_open_batch(&tx, 123).expect("insert open batch"); - let new_batch = tx.last_insert_rowid(); - sql_insert_open_frame(&tx, new_batch, 0, 123, 7, 9).expect("insert open frame"); - tx.commit().expect("commit tx"); - - let batch_count: i64 = conn - .query_row("SELECT COUNT(*) FROM batches", [], |row| row.get(0)) - .expect("count batches"); - let frame_count: i64 = conn - .query_row("SELECT COUNT(*) FROM frames", [], |row| row.get(0)) - .expect("count frames"); - assert_eq!(batch_count, 1); - assert_eq!(frame_count, 1); - } - - #[test] - fn batch_policy_helpers_read_defaults_and_update_knobs() { - let conn = setup_conn(); - // Default: log_gas_price=0 → log_recommended_fee=0+20+419+621=1060 - // log_batch_size_target = 1403 - (-229) - 419 = 1213 - let (log_fee, log_target) = sql_select_batch_policy(&conn).expect("read policy"); - assert_eq!(log_fee, 20 + 419 + 621); // 1060 - assert_eq!(log_target, 1403 - (-229) - 419); // 1213 - - sql_update_batch_policy_log_gas_price(&conn, 100).expect("update log gas price"); - let (log_fee, _) = sql_select_batch_policy(&conn).expect("read updated policy"); - assert_eq!(log_fee, 100 + 20 + 419 + 621); // 1160 - - // Update alpha: num=200, denom=1000 → log_alpha=-207, log_one_plus_alpha=23 - // View derives: log_batch_size_target = 1403 - (-207) - 419 = 1191 - sql_update_batch_policy_alpha(&conn, -207, 23).expect("update alpha"); - let (log_fee, log_target) = sql_select_batch_policy(&conn).expect("read updated target"); - assert_eq!(log_target, 1403 - (-207) - 419); // 1191 - assert_eq!(log_fee, 100 + 23 + 419 + 621); // 1163 - } - - #[test] - fn batch_policy_check_rejects_unsafe_alpha() { - let conn = setup_conn(); - // log_alpha=-350 → log_batch_size_target = 1403-(-350)-419 = 1334 >= log_max_batch_bytes=1333 - 
let err = sql_update_batch_policy_alpha(&conn, -350, 0); - assert!( - err.is_err(), - "CHECK should reject unsafe alpha (log_batch_size_target >= log_max_batch_bytes)" - ); - } - - #[test] - fn l1_safe_head_helpers_read_and_update_singleton() { - let conn = setup_conn(); - assert_eq!(sql_select_safe_block(&conn).expect("read safe block"), 0); - sql_update_safe_block(&conn, 12).expect("update safe block"); - assert_eq!(sql_select_safe_block(&conn).expect("read updated"), 12); - } - - #[test] - fn batch_insert_helpers_insert_multiple_rows() { - let mut conn = setup_conn(); - seed_open_batch0_frame0(&mut conn); - let tx = conn.transaction().expect("start tx"); - - let safe_inputs = vec![ - StoredSafeInput { - sender: Address::ZERO, - payload: vec![0xaa_u8], - block_number: 10, - }, - StoredSafeInput { - sender: Address::ZERO, - payload: vec![0xbb_u8], - block_number: 11, - }, - ]; - sql_insert_safe_inputs_batch(&tx, 0, safe_inputs.as_slice()) - .expect("insert direct inputs batch"); - - let user_ops = vec![ - sample_pending_user_op(0x20, 0, 1), - sample_pending_user_op(0x21, 1, 1), - ]; - sql_insert_user_ops_batch(&tx, 0, 0, 0, user_ops.as_slice()) - .expect("insert user ops + sequenced batch"); - - sql_insert_sequenced_direct_inputs( - &tx, - 0, - 0, - SafeInputRange::new(0, safe_inputs.len() as u64), - ) - .expect("insert sequenced direct inputs batch"); - - tx.commit().expect("commit tx"); - - let direct_inputs_count: i64 = conn - .query_row("SELECT COUNT(*) FROM safe_inputs", [], |row| row.get(0)) - .expect("count direct inputs"); - let user_ops_count: i64 = conn - .query_row("SELECT COUNT(*) FROM user_ops", [], |row| row.get(0)) - .expect("count user ops"); - let sequenced_count: i64 = conn - .query_row("SELECT COUNT(*) FROM sequenced_l2_txs", [], |row| { - row.get(0) - }) - .expect("count sequenced l2 txs"); - - assert_eq!(direct_inputs_count, 2); - assert_eq!(user_ops_count, 2); - assert_eq!(sequenced_count, 4); - } - - #[test] - fn 
user_op_uniqueness_is_sender_nonce() { - let mut conn = setup_conn(); - seed_open_batch0_frame0(&mut conn); - - // Same nonce with different senders should be accepted. - conn.execute( - SQL_INSERT_USER_OP, - params![ - 0_i64, - 0_i64, - 0_i64, - vec![0x11_u8; 20], - 0_i64, - 0_i64, - vec![0x01_u8], - vec![0x55_u8; 65], - 0_i64 - ], - ) - .expect("insert first user op"); - conn.execute( - SQL_INSERT_USER_OP, - params![ - 0_i64, - 0_i64, - 1_i64, - vec![0x22_u8; 20], - 0_i64, - 0_i64, - vec![0x02_u8], - vec![0x66_u8; 65], - 0_i64 - ], - ) - .expect("insert second user op with same nonce and different sender"); - - // Same sender + nonce should violate uniqueness. - let duplicate_sender_nonce = conn.execute( - SQL_INSERT_USER_OP, - params![ - 0_i64, - 0_i64, - 2_i64, - vec![0x11_u8; 20], - 0_i64, - 0_i64, - vec![0x03_u8], - vec![0x77_u8; 65], - 0_i64 - ], - ); - assert!( - duplicate_sender_nonce.is_err(), - "duplicate (sender, nonce) should fail" - ); - } -} diff --git a/sequencer/src/storage/test_helpers.rs b/sequencer/src/storage/test_helpers.rs new file mode 100644 index 0000000..52f3db2 --- /dev/null +++ b/sequencer/src/storage/test_helpers.rs @@ -0,0 +1,111 @@ +// (c) Cartesi and individual authors (see AUTHORS) +// SPDX-License-Identifier: Apache-2.0 (see LICENSE) + +//! Shared test fixtures used by `#[cfg(test)]` modules in `storage/`. + +use alloy_primitives::Address; +use sequencer_core::l2_tx::SequencedL2Tx; +use sequencer_core::protocol::ProtocolConfig; +use tempfile::TempDir; + +use super::{SafeInputRange, Storage, StoredSafeInput}; + +pub(crate) const SENDER_A: Address = Address::repeat_byte(0xAA); +pub(crate) const SENDER_B: Address = Address::repeat_byte(0xBB); + +/// Default protocol config for tests that don't care about the specific +/// submitter address or margin. Uses `SENDER_A` as the submitter. 
+pub(crate) fn default_protocol_config() -> ProtocolConfig { + protocol_config_for(SENDER_A) +} + +/// Protocol config with a specific submitter address and the default +/// `MAX_WAIT_BLOCKS`. Common test shape: seed via this sender, assert against +/// it. For explicit `max_wait_blocks` tuning build `ProtocolConfig` directly. +pub(crate) fn protocol_config_for(sender: Address) -> ProtocolConfig { + ProtocolConfig { + batch_submitter: sender, + max_wait_blocks: sequencer_core::MAX_WAIT_BLOCKS, + preemptive_margin_blocks: 75, + seconds_per_block: 12, + } +} + +pub(crate) struct TestDb { + pub _dir: TempDir, + pub path: String, +} + +pub(crate) fn temp_db(name: &str) -> TestDb { + let dir = tempfile::Builder::new() + .prefix(format!("sequencer-{name}-").as_str()) + .tempdir() + .expect("create temporary test directory"); + let path = dir.path().join("sequencer.sqlite"); + TestDb { + _dir: dir, + path: path.to_string_lossy().into_owned(), + } +} + +/// Insert safe inputs whose payloads are SSZ-encoded batches with the given nonces, +/// all attributed to `sender`. Uses `protocol_config_for(sender)` so the +/// populated `safe_accepted_batches` view matches this sender. +pub(crate) fn seed_safe_inputs_with_batch_nonces( + storage: &mut Storage, + sender: Address, + safe_block: u64, + nonces: &[u64], +) { + let inputs: Vec = nonces + .iter() + .map(|nonce| StoredSafeInput { + sender, + payload: ssz::Encode::as_ssz_bytes(&sequencer_core::batch::Batch { + nonce: *nonce, + frames: Vec::new(), + }), + block_number: safe_block, + }) + .collect(); + let protocol = protocol_config_for(sender); + storage + .append_safe_inputs(safe_block, inputs.as_slice(), &protocol) + .expect("append safe inputs"); +} + +/// Create N closed batches (batch indices `0..count-1`) plus one open batch (index `count`). 
+pub(crate) fn seed_closed_batches(storage: &mut Storage, count: u64) { + let mut head = storage + .initialize_open_state(0, SafeInputRange::empty_at(0)) + .expect("initialize open state"); + for _ in 0..count { + let safe_block = head.safe_block; + storage + .close_frame_and_batch(&mut head, safe_block) + .expect("close batch"); + } +} + +/// Pull every valid sequenced L2 tx out of storage, dropping the offset. +/// Test-only convenience around `ordered_l2_txs_page_from`. +pub(crate) fn all_ordered_l2_txs(storage: &mut Storage) -> Vec { + storage + .ordered_l2_txs_page_from(0, 1_000_000) + .expect("load all ordered l2 txs") + .into_iter() + .map(|(_offset, tx)| tx) + .collect() +} + +/// SSZ-encoded single-frame batch payload at the given (nonce, safe_block). +pub(crate) fn make_stale_batch_payload(nonce: u64, safe_block: u64) -> Vec { + ssz::Encode::as_ssz_bytes(&sequencer_core::batch::Batch { + nonce, + frames: vec![sequencer_core::batch::Frame { + safe_block, + fee_price: 0, + user_ops: vec![], + }], + }) +} diff --git a/sequencer/tests/batch_submitter_integration.rs b/sequencer/tests/batch_submitter_integration.rs index 945ab7a..cf0fd6b 100644 --- a/sequencer/tests/batch_submitter_integration.rs +++ b/sequencer/tests/batch_submitter_integration.rs @@ -4,46 +4,92 @@ //! Integration tests for the batch submitter: worker loop with real storage and mock poster. 
use std::sync::Arc; +use std::sync::Mutex; use std::time::Duration; -use alloy_primitives::Address; use async_trait::async_trait; -use sequencer::batch_submitter::{BatchPoster, BatchPosterError, TxHash}; -use sequencer::batch_submitter::{BatchSubmitter, BatchSubmitterConfig}; -use sequencer::shutdown::ShutdownSignal; +use sequencer::l1::submitter::{BatchPoster, BatchPosterError, TxHash}; +use sequencer::l1::submitter::{BatchSubmitter, BatchSubmitterConfig}; +use sequencer::runtime::shutdown::ShutdownSignal; use sequencer::storage::{SafeInputRange, Storage}; use sequencer_core::batch::Batch; -use tempfile::TempDir; -const BATCH_SUBMITTER_ADDRESS: Address = Address::repeat_byte(0x11); +mod common; +use common::{TestDb, temp_db}; -/// Minimal mock for integration tests: records submissions. +/// Minimal mock for integration tests. +/// +/// Records submissions. Optionally delays each `submit_batches` call (to race +/// a concurrent writer against the submitter loop), and can fail a configurable +/// number of times before succeeding (to exercise the transient-error retry +/// path). struct TestMock { - submissions: std::sync::Mutex>, + submissions: Mutex>, + /// Per-call delay applied inside `submit_batches`. + submit_delay: Mutex, + /// Remaining `submit_batches` calls that should return a Provider error + /// before the real submission path runs. 
+ fail_next_n_submits: Mutex, } impl TestMock { fn new() -> Arc { Arc::new(Self { - submissions: std::sync::Mutex::new(Vec::new()), + submissions: Mutex::new(Vec::new()), + submit_delay: Mutex::new(Duration::ZERO), + fail_next_n_submits: Mutex::new(0), }) } + fn submissions(&self) -> Vec<(u64, usize)> { self.submissions.lock().expect("lock").clone() } + + fn set_submit_delay(&self, delay: Duration) { + *self.submit_delay.lock().expect("lock") = delay; + } + + fn fail_next_n_submits(&self, n: u32) { + *self.fail_next_n_submits.lock().expect("lock") = n; + } } #[async_trait] impl BatchPoster for TestMock { - async fn submit_batch(&self, payload: Vec) -> Result { - let batch_index = ssz::Decode::from_ssz_bytes(payload.as_slice()) - .map(|b: Batch| b.nonce) - .unwrap_or(0); - self.submissions - .lock() - .expect("lock") - .push((batch_index, payload.len())); - Ok(TxHash::ZERO) + async fn submit_batches( + &self, + payloads: Vec>, + ) -> Result, BatchPosterError> { + // Transient-failure hook: consume one of the configured failures + // before anything else, so the tick outcome maps to `Transient` and + // the loop must sleep + retry. 
+ { + let mut slot = self.fail_next_n_submits.lock().expect("lock"); + if *slot > 0 { + *slot -= 1; + return Err(BatchPosterError::Provider( + "injected transient failure".into(), + )); + } + } + + let delay = *self.submit_delay.lock().expect("lock"); + if !delay.is_zero() { + tokio::time::sleep(delay).await; + } + + let mut tx_hashes = Vec::with_capacity(payloads.len()); + for payload in payloads { + let batch_index = ssz::Decode::from_ssz_bytes(payload.as_slice()) + .map(|b: Batch| b.nonce) + .unwrap_or(0); + self.submissions + .lock() + .expect("lock") + .push((batch_index, payload.len())); + tx_hashes.push(TxHash::ZERO); + } + Ok(tx_hashes) } async fn observed_submitted_batch_nonces( @@ -60,20 +106,9 @@ impl BatchPoster for TestMock { } } -const SQLITE_SYNCHRONOUS_PRAGMA: &str = "NORMAL"; - -fn temp_db(name: &str) -> (TempDir, String) { - let dir = tempfile::Builder::new() - .prefix(format!("sequencer-batch-submitter-it-{name}-").as_str()) - .tempdir() - .expect("create temporary test directory"); - let path = dir.path().join("sequencer.sqlite"); - (dir, path.to_string_lossy().into_owned()) -} - /// Seeds storage so batches 1 and 2 are closed and batch 3 is open. fn seed_two_closed_batches(db_path: &str) { - let mut storage = Storage::open(db_path, SQLITE_SYNCHRONOUS_PRAGMA).expect("open storage"); + let mut storage = Storage::open(db_path).expect("open storage"); let mut head = storage .initialize_open_state(0, SafeInputRange::empty_at(0)) .expect("initialize open state"); @@ -89,9 +124,34 @@ fn seed_two_closed_batches(db_path: &str) { .expect("close batch 2"); } +/// Seeds storage so batch 0 is closed and batch 1 is the open Tip. 
+fn seed_one_closed_batch(db_path: &str) { + let mut storage = Storage::open(db_path).expect("open storage"); + let mut head = storage + .initialize_open_state(0, SafeInputRange::empty_at(0)) + .expect("initialize open state"); + let next_safe = head.safe_block; + storage + .close_frame_and_batch(&mut head, next_safe) + .expect("close batch 0"); +} + +/// Close the current open Tip so it becomes eligible for submission. +fn close_current_tip(db_path: &str) { + let mut storage = Storage::open(db_path).expect("open storage"); + let mut head = storage + .open_state() + .expect("load open state") + .expect("open Tip exists"); + let next_safe = head.safe_block; + storage + .close_frame_and_batch(&mut head, next_safe) + .expect("close current Tip"); +} + #[tokio::test(flavor = "multi_thread", worker_threads = 2)] async fn submitter_loop_submits_closed_batches_then_exits_on_shutdown() { - let (_dir, path) = temp_db("loop-submits"); + let TestDb { _dir, path } = temp_db("loop-submits"); seed_two_closed_batches(&path); let mock = TestMock::new(); @@ -99,13 +159,7 @@ async fn submitter_loop_submits_closed_batches_then_exits_on_shutdown() { let config = BatchSubmitterConfig { idle_poll_interval_ms: 5000, }; - let submitter = BatchSubmitter::new( - path, - BATCH_SUBMITTER_ADDRESS, - mock.clone(), - shutdown.clone(), - config, - ); + let submitter = BatchSubmitter::new(path, mock.clone(), shutdown.clone(), config); let handle = submitter.start().expect("start batch submitter"); // Allow at least one tick to run (worker may submit batch 1 and 2 in one tick). 
@@ -124,3 +178,112 @@ async fn submitter_loop_submits_closed_batches_then_exits_on_shutdown() { assert_eq!(submissions[1].0, 1, "second submission should be batch 1"); assert_eq!(submissions[2].0, 2, "third submission should be batch 2"); } + +// ── Loop cadence invariants ─────────────────────────────────────────────── +// +// These pin the behavior the two-worker refactor unlocked: +// - Submitted → re-enter IMMEDIATELY (no sleep). +// - Transient (Poster error) → log + sleep + retry (loop must NOT exit). +// +// Both are loop-level properties that aren't visible from `tick_once`. + +#[tokio::test(flavor = "multi_thread", worker_threads = 2)] +async fn submitter_re_enters_immediately_after_productive_tick() { + // Design of the test: + // + // t=0ms Submitter starts. Tick 1 loads batch 0, enters submit_batches + // which sleeps for `submit_delay` (400ms) before recording. + // t=100 A concurrent writer closes the Tip, making batch 1 eligible. + // t~400 submit_batches returns. Tick 1 outcome is Submitted(1). + // Loop must re-enter IMMEDIATELY (Submitted branch → `continue`). + // t~400 Tick 2 observes the new batch 1, submits it (another 400ms). + // t~800 submit_batches returns again, Submitted(1). + // t=1200 Test asserts: two submissions landed inside the window. + // + // If `Submitted → sleep idle_poll` ever regresses, tick 2 would wait 10s + // and the second submission would not appear in the 1.2s budget. + let TestDb { _dir, path } = temp_db("loop-immediate-retry"); + seed_one_closed_batch(&path); + + let mock = TestMock::new(); + mock.set_submit_delay(Duration::from_millis(400)); + let shutdown = ShutdownSignal::default(); + let config = BatchSubmitterConfig { + // Ten seconds — anything above ~2s would be enough to fail if the + // immediate-retry cadence regressed to always-sleep. 
+ idle_poll_interval_ms: 10_000, + }; + let submitter = BatchSubmitter::new(path.clone(), mock.clone(), shutdown.clone(), config); + let handle = submitter.start().expect("start batch submitter"); + + // Let tick 1 enter `submit_batches` (which is now blocking on the delay), + // then close the Tip so batch 1 is eligible by the time tick 2 runs. + tokio::time::sleep(Duration::from_millis(100)).await; + close_current_tip(&path); + + // Budget: ~2x the submit delay. With immediate-retry this is plenty. + tokio::time::sleep(Duration::from_millis(1100)).await; + + shutdown.request_shutdown(); + let _ = tokio::time::timeout(Duration::from_secs(2), handle).await; + + let submissions = mock.submissions(); + assert_eq!( + submissions.len(), + 2, + "Submitted-then-new-work must re-enter without sleeping idle_poll=10s; \ + got submissions {submissions:?}" + ); + assert_eq!(submissions[0].0, 0); + assert_eq!(submissions[1].0, 1); +} + +#[tokio::test(flavor = "multi_thread", worker_threads = 2)] +async fn submitter_recovers_from_transient_poster_error_without_exiting() { + // Design of the test: + // + // t=0ms Submitter starts. Tick 1 calls submit_batches, which returns + // a Provider error (the first of N injected failures). + // t=0ms Loop maps Err(Poster) → TickOutcome::Transient → sleep idle_poll. + // t~80ms Tick 2 runs. submit_batches succeeds, batch 0 recorded. + // t=250ms Test asserts: exactly 1 submission AND loop is still alive. + // + // Regressions this catches: + // - Propagating Poster errors as fatal (loop would exit; handle would + // resolve with BatchSubmitterError before shutdown fires). + // - Forgetting the sleep on Transient (would work, but could busy-loop + // on a persistent error — not tested here, but the retry-count path + // documents the intended cadence). 
+ let TestDb { _dir, path } = temp_db("loop-transient-retry"); + seed_one_closed_batch(&path); + + let mock = TestMock::new(); + mock.fail_next_n_submits(1); + let shutdown = ShutdownSignal::default(); + let config = BatchSubmitterConfig { + // Short poll interval so the retry sleep completes well within the + // test window. Still long enough that accidentally always-sleeping + // would delay the single submission past the assertion. + idle_poll_interval_ms: 50, + }; + let submitter = BatchSubmitter::new(path.clone(), mock.clone(), shutdown.clone(), config); + let handle = submitter.start().expect("start batch submitter"); + + tokio::time::sleep(Duration::from_millis(250)).await; + + assert!( + !handle.is_finished(), + "loop must not exit on a transient Poster error — it should log and retry", + ); + + let submissions = mock.submissions(); + assert_eq!( + submissions.len(), + 1, + "transient failure followed by success should land exactly one submission; got {submissions:?}", + ); + assert_eq!(submissions[0].0, 0); + + shutdown.request_shutdown(); + let _ = tokio::time::timeout(Duration::from_secs(2), handle).await; +} diff --git a/sequencer/tests/chain_id_validation.rs b/sequencer/tests/chain_id_validation.rs new file mode 100644 index 0000000..bbeb272 --- /dev/null +++ b/sequencer/tests/chain_id_validation.rs @@ -0,0 +1,153 @@ +// (c) Cartesi and individual authors (see AUTHORS) +// SPDX-License-Identifier: Apache-2.0 (see LICENSE) + +//! §8.3 — H7 regression: chain-id mismatch is caught early in bootstrap. +//! +//! The H7 hardening moved the chain-id check before any DB writes and replaced +//! `assert_eq!` with a typed `RunError::ChainIdMismatch`. This file locks two +//! of the three code paths where the check matters: +//! +//! - §8.3.2 Cache path: L1 is unreachable but a cache exists with a different +//! chain_id. Check fires before `InputReader::from_parts`. +//! - Positive control: with a matched chain_id, `ChainIdMismatch` does NOT +//! 
fire, so the check doesn't misfire on the happy path. +//! +//! §8.3.1 (RPC path: L1 reachable, chain_id from `eth_chainId` mismatches) is +//! NOT covered here because `InputReader::new` needs a real InputBox contract +//! deployed at `config.app_address` before the chain-id check fires. That +//! setup only exists in the full rollups-e2e harness (after `just setup`). +//! Tracking in `tests/TEST_PLAN.md` §8.3.1. + +use std::time::Duration; + +use alloy_primitives::Address; +use app_core::application::{WalletApp, WalletConfig}; +use clap::Parser; +use sequencer::RunConfig; +use sequencer::runtime::RunError; +use tempfile::TempDir; + +// Anvil's default devnet private key #0. +const ANVIL_KEY: &str = "0xac0974bec39a17e36ba4a6b4d238ff944bacb478cbed5efcae784d7bf4f2ff80"; +const TEST_APP_ADDR: &str = "0x1111111111111111111111111111111111111111"; + +/// Verify that `anvil` is available. Panics with a clear message if not found. +fn require_anvil() { + assert!( + std::process::Command::new("anvil") + .arg("--version") + .stdout(std::process::Stdio::null()) + .stderr(std::process::Stdio::null()) + .status() + .is_ok(), + "anvil not found on PATH — install Foundry (https://getfoundry.sh)" + ); +} + +fn build_config( + data_dir: &str, + eth_rpc_url: &str, + chain_id: u64, +) -> Result { + RunConfig::try_parse_from([ + "sequencer", + "--http-addr", + "127.0.0.1:0", + "--data-dir", + data_dir, + "--eth-rpc-url", + eth_rpc_url, + "--chain-id", + &chain_id.to_string(), + "--app-address", + TEST_APP_ADDR, + "--batch-submitter-private-key", + ANVIL_KEY, + ]) +} + +fn build_app() -> WalletApp { + WalletApp::new(WalletConfig::default()) +} + +// ── §8.3.2 — Cache path ────────────────────────────────────────────────────── + +#[tokio::test(flavor = "multi_thread", worker_threads = 2)] +async fn chain_id_mismatch_from_cache_returns_typed_error() { + // Scenario: L1 is unreachable, but a bootstrap cache exists from a previous + // successful run. 
The cached chain_id does NOT match the current config. + // The cache-fallback arm must return ChainIdMismatch (was `assert_eq!` before H7). + + let dir = TempDir::new().expect("tempdir"); + let data_dir = dir.path().to_str().unwrap(); + + // Pre-populate the bootstrap cache with chain_id=31337. + let db_path = format!("{data_dir}/sequencer.db"); + { + let mut storage = sequencer::storage::Storage::open(&db_path).expect("open db for seed"); + storage + .save_l1_bootstrap_cache( + Address::from_slice(&[0x22; 20]), // input_box + 100, // genesis + 31_337, // chain_id + ) + .expect("seed cache"); + } + + // Point the sequencer at an unreachable RPC (port 1, reliably refused) and + // a MISMATCHED chain_id=1. L1 is unreachable → cache-fallback path runs + // → cached chain_id (31337) mismatches config (1) → ChainIdMismatch. + let config = build_config(data_dir, "http://127.0.0.1:1", 1).expect("parse config"); + + let result = tokio::time::timeout(Duration::from_secs(30), sequencer::run(build_app(), config)) + .await + .expect("run() must return quickly on mismatch"); + + match result { + Err(RunError::ChainIdMismatch { rpc, config }) => { + assert_eq!(rpc, 31_337, "rpc field carries the cached value"); + assert_eq!(config, 1, "config field carries the configured value"); + } + other => panic!("expected RunError::ChainIdMismatch, got: {other:?}"), + } +} + +// ── Positive: matched chain_id does NOT trigger ChainIdMismatch ────────────── + +#[tokio::test(flavor = "multi_thread", worker_threads = 2)] +async fn chain_id_match_does_not_produce_mismatch_error() { + // Positive control: when chain_id matches, we should NOT get ChainIdMismatch. + // (The sequencer then tries to start the full stack. We don't care about + // that — a timeout counts as "didn't return ChainIdMismatch early", which + // is what we want to verify.) 
+ require_anvil(); + + let anvil = alloy::node_bindings::Anvil::default().spawn(); + let rpc_url = anvil.endpoint(); + let dir = TempDir::new().expect("tempdir"); + let config = build_config(dir.path().to_str().unwrap(), &rpc_url, 31_337) + .expect("parse config with matching chain_id"); + + // Short timeout: if ChainIdMismatch is going to fire, it fires fast. + // A timeout means the check passed and the sequencer is running normally. + let result = + tokio::time::timeout(Duration::from_secs(3), sequencer::run(build_app(), config)).await; + + match result { + Err(_timeout) => {} // expected — sequencer is running + Ok(Err(RunError::ChainIdMismatch { rpc, config })) => { + panic!( + "matched chain_id must not produce ChainIdMismatch, got rpc={rpc} config={config}" + ); + } + Ok(Err(other)) => { + // Some other error is fine — we only care that it's not ChainIdMismatch. + eprintln!( + "sequencer returned non-mismatch error (expected under test conditions): {other:?}" + ); + } + Ok(Ok(())) => { + panic!("sequencer should not complete run() in a short test window"); + } + } +} diff --git a/sequencer/tests/common/mod.rs b/sequencer/tests/common/mod.rs new file mode 100644 index 0000000..45b9afa --- /dev/null +++ b/sequencer/tests/common/mod.rs @@ -0,0 +1,27 @@ +// (c) Cartesi and individual authors (see AUTHORS) +// SPDX-License-Identifier: Apache-2.0 (see LICENSE) + +//! Shared fixtures for `sequencer/tests/*.rs` integration tests. +//! +//! Integration tests compile as separate crates and cannot reach the +//! `#[cfg(test)]` helpers inside `sequencer/src/`. This module keeps the same +//! `TestDb` shape so callers work identically on both sides. 
+ +use tempfile::TempDir; + +pub struct TestDb { + pub _dir: TempDir, + pub path: String, +} + +pub fn temp_db(name: &str) -> TestDb { + let dir = tempfile::Builder::new() + .prefix(format!("sequencer-{name}-").as_str()) + .tempdir() + .expect("create temporary test directory"); + let path = dir.path().join("sequencer.sqlite"); + TestDb { + _dir: dir, + path: path.to_string_lossy().into_owned(), + } +} diff --git a/sequencer/tests/e2e_sequencer.rs b/sequencer/tests/e2e_sequencer.rs index 2869f2f..b78e235 100644 --- a/sequencer/tests/e2e_sequencer.rs +++ b/sequencer/tests/e2e_sequencer.rs @@ -12,23 +12,186 @@ use app_core::application::{ use futures_util::StreamExt; use k256::ecdsa::SigningKey; use k256::ecdsa::signature::hazmat::PrehashSigner; -use sequencer::api::{self, ApiConfig}; -use sequencer::inclusion_lane::{ +use sequencer::egress::l2_tx_feed::{L2TxFeed, L2TxFeedConfig}; +use sequencer::http::{self, ApiConfig}; +use sequencer::ingress::inclusion_lane::{ InclusionLane, InclusionLaneConfig, InclusionLaneError, PendingUserOp, }; -use sequencer::l2_tx_feed::{L2TxFeed, L2TxFeedConfig}; -use sequencer::shutdown::ShutdownSignal; +use sequencer::runtime::shutdown::ShutdownSignal; use sequencer::storage::{SafeInputRange, Storage, StoredSafeInput}; use sequencer_core::api::{TxRequest, TxResponse, WsTxMessage}; use sequencer_core::l2_tx::SequencedL2Tx; use sequencer_core::user_op::UserOp; use sequencer_rust_client::SequencerClient; -use tempfile::TempDir; use tokio::io::{AsyncReadExt, AsyncWriteExt}; use tokio::sync::mpsc; use tokio_tungstenite::connect_async; use tokio_tungstenite::tungstenite::Message; +mod common; +use common::temp_db; + +// ── §1.1 — V1 regression: cross-boundary signature domain consistency ──────── +// +// The sequencer signs user-ops with `sequencer_core::build_input_domain`. The +// scheduler (canonical-app) recovers senders with the same function. 
If the +// two sides ever drift (the V1 bug: scheduler had `name: None`, sequencer had +// `name: Some("CartesiAppSequencer")`), every signature recovers a different +// address on each side, structurally breaking the rollup. +// +// These tests lock the invariant at two levels: +// 1. A signature built via the shared constructor recovers the signer's +// address (positive). +// 2. A signature built with ANY domain that differs from the shared +// constructor recovers a DIFFERENT address (negative — proves the domain +// actually affects recovery). + +#[test] +fn v1_regression_shared_domain_recovers_signer() { + use alloy_sol_types::SolStruct; + + let signing_key = SigningKey::from_bytes((&[42_u8; 32]).into()).expect("signing key"); + let signer_address = address_from_signing_key(&signing_key); + + let chain_id = 31_337_u64; + let app = Address::from_slice(&[0xaa; 20]); + let domain = sequencer_core::build_input_domain(chain_id, app); + + let user_op = UserOp { + nonce: 0, + max_fee: 1_200, + data: vec![0x01, 0x02, 0x03].into(), + }; + + // Sign with the shared domain. + let hash = user_op.eip712_signing_hash(&domain); + let k256_sig = signing_key.sign_prehash(hash.as_slice()).expect("sign"); + let signature = [false, true] + .into_iter() + .map(|parity| Signature::from_signature_and_parity(k256_sig, parity)) + .find(|s| { + s.recover_address_from_prehash(&hash) + .ok() + .is_some_and(|r| r == signer_address) + }) + .expect("recoverable parity"); + + // Recover with the shared domain — must equal signer. 
+ let hash_again = user_op.eip712_signing_hash(&domain); + let recovered = signature + .recover_address_from_prehash(&hash_again) + .expect("recover"); + assert_eq!( + recovered, signer_address, + "shared domain must recover signer" + ); +} + +#[test] +fn v1_regression_name_none_domain_recovers_different_address() { + use alloy_sol_types::{Eip712Domain, SolStruct}; + + let signing_key = SigningKey::from_bytes((&[42_u8; 32]).into()).expect("signing key"); + let signer_address = address_from_signing_key(&signing_key); + + let chain_id = 31_337_u64; + let app = Address::from_slice(&[0xaa; 20]); + let correct_domain = sequencer_core::build_input_domain(chain_id, app); + + // The exact buggy domain the scheduler used pre-V1 fix. + let buggy_domain = Eip712Domain { + name: None, + version: None, + chain_id: Some(U256::from(chain_id)), + verifying_contract: Some(app), + salt: None, + }; + + let user_op = UserOp { + nonce: 0, + max_fee: 1_200, + data: vec![0x01, 0x02, 0x03].into(), + }; + + // Sign with the correct (shared) domain. + let hash = user_op.eip712_signing_hash(&correct_domain); + let k256_sig = signing_key.sign_prehash(hash.as_slice()).expect("sign"); + let signature = [false, true] + .into_iter() + .map(|parity| Signature::from_signature_and_parity(k256_sig, parity)) + .find(|s| { + s.recover_address_from_prehash(&hash) + .ok() + .is_some_and(|r| r == signer_address) + }) + .expect("recoverable parity"); + + // Recover with the buggy domain — must NOT recover the signer. + // (This is what would silently fail at the scheduler under the V1 bug.) 
+ let buggy_hash = user_op.eip712_signing_hash(&buggy_domain); + let recovered_under_buggy = signature + .recover_address_from_prehash(&buggy_hash) + .expect("recovery succeeds but returns the wrong address"); + assert_ne!( + recovered_under_buggy, signer_address, + "a name:None domain must not recover the signer — if this fails, \ + the shared domain constructor is bit-identical to the buggy one, \ + meaning the V1 fix regressed" + ); +} + +#[test] +fn v1_regression_domain_fields_all_affect_recovery() { + use alloy_sol_types::SolStruct; + + let signing_key = SigningKey::from_bytes((&[42_u8; 32]).into()).expect("signing key"); + let signer_address = address_from_signing_key(&signing_key); + + let app = Address::from_slice(&[0xaa; 20]); + let user_op = UserOp { + nonce: 0, + max_fee: 1_200, + data: vec![0x01].into(), + }; + + // Sign with chain_id = 1. + let chain_a = sequencer_core::build_input_domain(1, app); + let hash_a = user_op.eip712_signing_hash(&chain_a); + let k256_sig = signing_key.sign_prehash(hash_a.as_slice()).expect("sign"); + let signature = [false, true] + .into_iter() + .map(|parity| Signature::from_signature_and_parity(k256_sig, parity)) + .find(|s| { + s.recover_address_from_prehash(&hash_a) + .ok() + .is_some_and(|r| r == signer_address) + }) + .expect("recoverable parity"); + + // Cross-chain replay must fail: recover under chain_id=2 with the same app. + let chain_b = sequencer_core::build_input_domain(2, app); + let hash_b = user_op.eip712_signing_hash(&chain_b); + let recovered_b = signature + .recover_address_from_prehash(&hash_b) + .expect("recovery returns some address"); + assert_ne!( + recovered_b, signer_address, + "cross-chain replay must not recover signer" + ); + + // Cross-app replay must fail: recover under same chain but different app. 
+ let other_app = Address::from_slice(&[0xbb; 20]); + let chain_a_app_other = sequencer_core::build_input_domain(1, other_app); + let hash_app_other = user_op.eip712_signing_hash(&chain_a_app_other); + let recovered_app_other = signature + .recover_address_from_prehash(&hash_app_other) + .expect("recovery returns some address"); + assert_ne!( + recovered_app_other, signer_address, + "cross-app replay must not recover signer" + ); +} + #[tokio::test(flavor = "multi_thread", worker_threads = 2)] async fn e2e_submit_tx_ack_and_broadcast() { let db = temp_db("full-e2e"); @@ -54,7 +217,7 @@ async fn e2e_submit_tx_ack_and_broadcast() { // The deposit is broadcast first. let deposit_message = recv_ws_message(&mut ws).await; match deposit_message { - WsTxMessage::DirectInput { offset, .. } => assert_eq!(offset, 0), + WsTxMessage::DirectInput { offset, .. } => assert_eq!(offset, 1), other => panic!("expected deposit direct input as first WS message, got {other:?}"), } let method = Method::Withdrawal(Withdrawal { @@ -96,7 +259,7 @@ async fn e2e_submit_tx_ack_and_broadcast() { fee, data, } => { - assert_eq!(offset, 1); + assert_eq!(offset, 2); assert_eq!(ws_sender, sender.to_string()); // Frame fee is the default log_recommended_fee = 1060. assert_eq!(fee, 1060); @@ -228,6 +391,43 @@ async fn api_rejects_malformed_json_as_bad_request() { "expected bad-request error code, got: {body}" ); + // §2.10 / H2 regression: the message must come from the fixed taxonomy + // ("invalid JSON"), NOT reflect serde's line/column/token excerpt. The + // malformed input contains the token `0x1234` — assert it doesn't appear + // in the response body so no attacker-submitted bytes are echoed. 
+ assert!( + body.contains("\"message\":\"invalid JSON\""), + "expected fixed message 'invalid JSON' in body, got: {body}" + ); + assert!( + !body.contains("0x1234"), + "body must not reflect attacker-submitted input bytes, got: {body}" + ); + + shutdown_runtime(runtime).await; +} + +#[tokio::test(flavor = "multi_thread", worker_threads = 2)] +async fn api_rejects_missing_content_type_with_fixed_message() { + // §2.10 / H2 regression: missing Content-Type must produce a fixed + // `"missing content type"` message, not reflect any part of the request. + let db = temp_db("missing-content-type"); + let domain = test_domain(); + bootstrap_open_frame(db.path.as_str()); + + let Some(runtime) = start_full_server_with_max_body(db.path.as_str(), domain, 128 * 1024).await + else { + return; + }; + + // Valid JSON body, but sent without Content-Type: application/json. + let (status, body) = post_raw_body_no_content_type(runtime.addr, "{}").await; + assert_eq!(status, 400, "missing content-type: {body}"); + assert!( + body.contains("\"message\":\"missing content type\""), + "expected fixed 'missing content type' message, got: {body}" + ); + shutdown_runtime(runtime).await; } @@ -315,6 +515,453 @@ async fn api_rejects_user_op_payloads_above_application_limit() { shutdown_runtime(runtime).await; } +#[tokio::test(flavor = "multi_thread", worker_threads = 2)] +async fn api_rejects_json_with_missing_fields_using_fixed_envelope() { + // §2.3.3 / H2 regression: a body that is valid JSON but missing required + // fields must respond with the fixed `"invalid JSON"` envelope. The + // response must not echo serde's deserialization error text — that would + // leak our internal field names and parser internals to callers. 
+ let db = temp_db("missing-fields-json"); + let domain = test_domain(); + bootstrap_open_frame(db.path.as_str()); + + let Some(runtime) = start_full_server_with_max_body(db.path.as_str(), domain, 128 * 1024).await + else { + return; + }; + + // Empty object — valid JSON, missing every required field. + let (status, body) = post_raw_json(runtime.addr, "{}").await; + assert_eq!(status, 400, "missing fields: {body}"); + + // Parse the response envelope and assert the message is exactly the fixed + // taxonomy string. Anything else implies serde leaked internals into the + // body — that's the regression this test pins. + let envelope: serde_json::Value = serde_json::from_str(&body).expect("response is JSON"); + let message = envelope + .get("message") + .and_then(|m| m.as_str()) + .expect("envelope has string `message` field"); + assert_eq!( + message, "invalid JSON", + "response message must be the fixed taxonomy string, got: {message:?} (full body: {body})", + ); + let code = envelope + .get("code") + .and_then(|c| c.as_str()) + .expect("envelope has string `code` field"); + assert_eq!(code, "BAD_REQUEST", "unexpected error code: {body}"); + + // Sanity: serde's typical leak vocabulary must not appear anywhere. + for needle in [ + "missing field", + "expected", + "deserializ", + "line ", + "column ", + ] { + assert!( + !body.contains(needle), + "potential serde leak — body contains {needle:?}: {body}", + ); + } + + shutdown_runtime(runtime).await; +} + +#[tokio::test(flavor = "multi_thread", worker_threads = 2)] +async fn api_payload_size_check_fires_before_signature_recovery() { + // §2.3.5 sharpening: oversized `data` must be rejected by + // `validate_payload_size` BEFORE any cryptographic work. 
We submit an + // oversized payload paired with a garbage-but-correctly-shaped signature: + // if the size check is enforced first, the response says "user op payload + // too large"; if signature recovery ran first the response would mention a + // signature/sender mismatch instead. Catches a regression that re-orders + // signature verification ahead of size validation, which would open a DoS + // vector (huge body × secp256k1 recovery cost). + let db = temp_db("size-before-sig"); + let domain = test_domain(); + bootstrap_open_frame(db.path.as_str()); + + let Some(runtime) = start_full_server(db.path.as_str(), domain).await else { + return; + }; + + // Hand-craft a request: oversized data + correctly-shaped but garbage + // signature. The 65-byte signature passes `validate_hex_lengths`, so the + // next gate is `validate_payload_size`. If anyone moves signature recovery + // ahead of it, the response message changes and this assertion fails. + let oversized_data_hex = "00".repeat(MAX_METHOD_PAYLOAD_BYTES + 1); + let bogus_sig_hex = format!("0x{}", "00".repeat(65)); + let body = format!( + "{{\"message\":{{\"nonce\":0,\"max_fee\":0,\"data\":\"0x{oversized_data_hex}\"}},\ + \"signature\":\"{bogus_sig_hex}\",\ + \"sender\":\"0x0000000000000000000000000000000000000001\"}}", + ); + // Confirm the body fits under the default 4 KB body limit so we exercise + // the payload-size gate, not the upstream body-too-large gate. + assert!( + body.len() < 4 * 1024, + "test body must stay under default max_body_bytes (got {} bytes)", + body.len(), + ); + + let (status, response_body) = post_raw_json(runtime.addr, body.as_str()).await; + assert_eq!(status, 400, "oversized + bogus sig: {response_body}"); + assert!( + response_body.contains("user op payload too large"), + "size check must fire before signature verification — \ + expected 'user op payload too large' message, got: {response_body}", + ); + // Defensive: ensure the rejection is NOT a signature-class error. 
Any of + // these would mean signature recovery ran on the oversized payload. + for sig_marker in [ + "signature", + "sender mismatch", + "recover", + "INVALID_SIGNATURE", + ] { + assert!( + !response_body.contains(sig_marker), + "response mentions {sig_marker:?} — signature recovery may have run \ + before the size check: {response_body}", + ); + } + + shutdown_runtime(runtime).await; +} + +#[tokio::test(flavor = "multi_thread", worker_threads = 2)] +async fn api_rejects_signature_with_invalid_parity_byte() { + // §2.2.3: signature with correct length (65 bytes) but a parity byte + // outside the valid set (0/1 or 27/28) must be rejected at the crypto + // boundary with 422. Catches regressions where a new signature codec + // accepts arbitrary parity values and silently drifts recovery. + let db = temp_db("bad-parity-byte"); + let domain = test_domain(); + bootstrap_open_frame(db.path.as_str()); + + let Some(runtime) = start_full_server(db.path.as_str(), domain.clone()).await else { + return; + }; + + let endpoint = format!("http://{}", runtime.addr); + let client = SequencerClient::new_with_timeout(endpoint, Duration::from_secs(2)) + .expect("build sequencer client"); + + // Correct-length signature (65 bytes) with a non-recoverable parity byte. + let mut bogus_sig = [0_u8; 65]; + bogus_sig[64] = 0xFF; + let bogus_sig_hex = format!("0x{}", alloy_primitives::hex::encode(bogus_sig)); + + let mut request = make_valid_request(&domain); + request.signature = bogus_sig_hex; + + let (status, body) = client + .submit_tx_with_status(&request) + .await + .expect("submit tx"); + // Observed: 400 with `INVALID_SIGNATURE` code. (TEST_PLAN originally said + // 422; the code returns 400 for all signature-class rejections, same as + // §2.2.1 `forged_signature_rejected_test`. This test pins the actual + // contract.) 
+ assert_eq!( + status, 400, + "invalid parity byte must produce 400 (signature-class error), got {status}: {body}", + ); + assert!( + body.contains("INVALID_SIGNATURE"), + "expected INVALID_SIGNATURE code, got: {body}", + ); + // Defensive: make sure the rejection is from the signature layer, not the + // hex-length gate (§2.2.2 covers that) and not the payload-size gate. + assert!( + !body.contains("signature must be") && !body.contains("payload too large"), + "expected sig-recovery class error, not hex-length or size: {body}", + ); + + shutdown_runtime(runtime).await; +} + +#[tokio::test(flavor = "multi_thread", worker_threads = 2)] +async fn api_rejects_sender_claim_that_mismatches_signature_recovery() { + // §2.2.4: `sender` field in the request must equal the address recovered + // from the signature. A valid signature over a user-op paired with a + // different claimed `sender` must be rejected — can't accept someone + // else's signed op as if it came from ourselves. Complements the + // integration-level forged_signature_rejected_test (which asserts the + // end-to-end shape); this one pins the direct API response. + let db = temp_db("sender-mismatch-explicit"); + let domain = test_domain(); + bootstrap_open_frame(db.path.as_str()); + + let Some(runtime) = start_full_server(db.path.as_str(), domain.clone()).await else { + return; + }; + + let endpoint = format!("http://{}", runtime.addr); + let client = SequencerClient::new_with_timeout(endpoint, Duration::from_secs(2)) + .expect("build sequencer client"); + + // Key A signs the user op; we claim the sender is address B. 
+ let signing_key_a = SigningKey::from_bytes((&[1_u8; 32]).into()).expect("create signing key a"); + let signing_key_b = SigningKey::from_bytes((&[2_u8; 32]).into()).expect("create signing key b"); + let address_a = address_from_signing_key(&signing_key_a); + let address_b = address_from_signing_key(&signing_key_b); + assert_ne!(address_a, address_b, "test setup: A and B must differ"); + + let user_op = UserOp { + nonce: 0, + max_fee: TEST_MAX_FEE, + data: Vec::new().into(), + }; + let request = TxRequest { + signature: sign_user_op_hex(&domain, &user_op, &signing_key_a), + sender: address_b.to_string(), + message: user_op, + }; + + let (status, body) = client + .submit_tx_with_status(&request) + .await + .expect("submit tx"); + // Observed: 400 `INVALID_SIGNATURE` `"sender mismatch"`. See parity-byte + // test above for the TEST_PLAN-vs-reality note on the status code. + assert_eq!( + status, 400, + "sender-mismatch must produce 400 (signature-class error), got {status}: {body}", + ); + assert!( + body.contains("sender mismatch"), + "expected `sender mismatch` message, got: {body}", + ); + assert!( + body.contains("INVALID_SIGNATURE"), + "expected INVALID_SIGNATURE code, got: {body}", + ); + + shutdown_runtime(runtime).await; +} + +#[tokio::test(flavor = "multi_thread", worker_threads = 2)] +async fn api_rejects_user_op_with_nonce_gap() { + // §2.4.3: submitting a user-op with a nonce above the next expected one + // (i.e., a gap) must return 422 `InvalidNonce` and leave state + // unchanged. Complement to §2.4.2 (nonce too low / replay) — together + // they pin the strict-equality requirement on `current_user_nonce`. 
+ let db = temp_db("nonce-gap-too-high"); + let domain = test_domain(); + let signing_key = SigningKey::from_bytes((&[7_u8; 32]).into()).expect("create signing key"); + let sender = address_from_signing_key(&signing_key); + bootstrap_open_frame_with_deposits(db.path.as_str(), &[(sender, U256::from(1_000_000_u64))]); + + let Some(runtime) = start_full_server(db.path.as_str(), domain.clone()).await else { + return; + }; + + let endpoint = format!("http://{}", runtime.addr); + let client = SequencerClient::new_with_timeout(endpoint, Duration::from_secs(2)) + .expect("build sequencer client"); + + // Current user nonce is 0 — a fresh sender has never submitted. Nonce 7 + // leaves a six-slot gap. + let user_op = UserOp { + nonce: 7, + max_fee: TEST_MAX_FEE, + data: ssz::Encode::as_ssz_bytes(&Method::Withdrawal(Withdrawal { + amount: U256::from(0_u64), + })) + .into(), + }; + let request = TxRequest { + signature: sign_user_op_hex(&domain, &user_op, &signing_key), + sender: sender.to_string(), + message: user_op, + }; + + let (status, body) = client + .submit_tx_with_status(&request) + .await + .expect("submit tx"); + assert_eq!( + status, 422, + "nonce gap must produce 422, got {status}: {body}", + ); + assert!( + body.contains("nonce") || body.contains("NONCE"), + "expected nonce-class error, got: {body}", + ); + + shutdown_runtime(runtime).await; +} + +#[tokio::test(flavor = "multi_thread", worker_threads = 2)] +async fn api_accepts_user_op_with_max_fee_equal_to_current_frame_fee() { + // §2.5.2 boundary: the check is `max_fee >= current_frame_fee` (strict + // less-than rejects). An op with `max_fee == current_frame_fee` must be + // accepted. Pairs with §2.5.1 (`fee_below_minimum_rejected_test`) — the + // two together pin the comparator. 
+ let db = temp_db("fee-boundary-equal"); + let domain = test_domain(); + let signing_key = SigningKey::from_bytes((&[9_u8; 32]).into()).expect("create signing key"); + let sender = address_from_signing_key(&signing_key); + // Fund with enough to cover gas at the frame fee. + bootstrap_open_frame_with_deposits(db.path.as_str(), &[(sender, U256::from(1_000_000_u64))]); + + // `bootstrap_open_frame` asserts frame_fee == 1060; use that exact value + // for the boundary case. + const FRAME_FEE_BOUNDARY: u16 = 1060; + + let Some(runtime) = start_full_server(db.path.as_str(), domain.clone()).await else { + return; + }; + + let endpoint = format!("http://{}", runtime.addr); + let client = SequencerClient::new_with_timeout(endpoint, Duration::from_secs(2)) + .expect("build sequencer client"); + + let user_op = UserOp { + nonce: 0, + max_fee: FRAME_FEE_BOUNDARY, + data: ssz::Encode::as_ssz_bytes(&Method::Withdrawal(Withdrawal { + amount: U256::from(0_u64), + })) + .into(), + }; + let request = TxRequest { + signature: sign_user_op_hex(&domain, &user_op, &signing_key), + sender: sender.to_string(), + message: user_op, + }; + + let (status, body) = client + .submit_tx_with_status(&request) + .await + .expect("submit tx"); + assert_eq!( + status, 200, + "max_fee == current_frame_fee boundary must be accepted (comparator is `<`, not `<=`), got {status}: {body}", + ); + + shutdown_runtime(runtime).await; +} + +#[tokio::test(flavor = "multi_thread", worker_threads = 2)] +async fn api_rejects_user_op_when_balance_below_gas_cost() { + // §2.6.1: if sender's balance < `fee_to_linear(current_frame_fee)` the + // user op must be rejected with 422 `InsufficientGasBalance` and leave + // state unchanged. Exercises the balance check in + // `WalletApp::validate_user_op` (app-core). A fresh sender with no + // deposits has balance 0, well below `fee_to_linear(1060)` (the + // bootstrapped frame fee). 
+ let db = temp_db("insufficient-gas-balance"); + let domain = test_domain(); + let signing_key = SigningKey::from_bytes((&[11_u8; 32]).into()).expect("create signing key"); + let sender = address_from_signing_key(&signing_key); + // No deposit for `sender` → balance = 0. + bootstrap_open_frame(db.path.as_str()); + + let Some(runtime) = start_full_server(db.path.as_str(), domain.clone()).await else { + return; + }; + + let endpoint = format!("http://{}", runtime.addr); + let client = SequencerClient::new_with_timeout(endpoint, Duration::from_secs(2)) + .expect("build sequencer client"); + + let user_op = UserOp { + nonce: 0, + max_fee: TEST_MAX_FEE, + data: ssz::Encode::as_ssz_bytes(&Method::Withdrawal(Withdrawal { + amount: U256::from(0_u64), + })) + .into(), + }; + let request = TxRequest { + signature: sign_user_op_hex(&domain, &user_op, &signing_key), + sender: sender.to_string(), + message: user_op, + }; + + let (status, body) = client + .submit_tx_with_status(&request) + .await + .expect("submit tx"); + assert_eq!( + status, 422, + "insufficient-balance must produce 422, got {status}: {body}", + ); + assert!( + body.contains("insufficient balance for gas"), + "expected InsufficientGasBalance message, got: {body}", + ); + + shutdown_runtime(runtime).await; +} + +#[tokio::test(flavor = "multi_thread", worker_threads = 2)] +async fn api_concurrent_same_nonce_leaves_exactly_one_committed() { + // §2.8.2: two concurrent POSTs for the same (sender, nonce) — one + // succeeds, one is rejected with a nonce-class error. Pins the invariant + // that the rejected half does NOT leave any state artifact: the final + // balance/nonce must match the single-commit path. 
+ let db = temp_db("concurrent-same-nonce"); + let domain = test_domain(); + let signing_key = SigningKey::from_bytes((&[13_u8; 32]).into()).expect("create signing key"); + let sender = address_from_signing_key(&signing_key); + bootstrap_open_frame_with_deposits(db.path.as_str(), &[(sender, U256::from(10_000_000_u64))]); + + let Some(runtime) = start_full_server(db.path.as_str(), domain.clone()).await else { + return; + }; + + let user_op = UserOp { + nonce: 0, + max_fee: TEST_MAX_FEE, + data: ssz::Encode::as_ssz_bytes(&Method::Withdrawal(Withdrawal { + amount: U256::from(0_u64), + })) + .into(), + }; + let request = TxRequest { + signature: sign_user_op_hex(&domain, &user_op, &signing_key), + sender: sender.to_string(), + message: user_op, + }; + let request_json = serde_json::to_string(&request).expect("serialize request"); + + // Two concurrent POSTs with byte-identical bodies. + let addr = runtime.addr; + let body_a = request_json.clone(); + let body_b = request_json; + let a = tokio::spawn(async move { post_raw_json(addr, body_a.as_str()).await }); + let b = tokio::spawn(async move { post_raw_json(addr, body_b.as_str()).await }); + let (res_a, res_b) = tokio::try_join!(a, b).expect("join concurrent posts"); + + let outcomes = [res_a, res_b]; + let accepted = outcomes.iter().filter(|(s, _)| *s == 200).count(); + let rejected_bodies: Vec<&String> = outcomes + .iter() + .filter_map(|(s, b)| (*s == 422).then_some(b)) + .collect(); + assert_eq!( + accepted, 1, + "exactly one concurrent submission must be accepted, outcomes: {outcomes:?}", + ); + assert_eq!( + rejected_bodies.len(), + 1, + "exactly one concurrent submission must be rejected with 422, outcomes: {outcomes:?}", + ); + let rejected_body = rejected_bodies[0]; + assert!( + rejected_body.contains("bad nonce") || rejected_body.contains("INVALID_NONCE"), + "rejected concurrent op should be nonce-class, got: {rejected_body}", + ); + + shutdown_runtime(runtime).await; +} + #[tokio::test(flavor = 
"multi_thread", worker_threads = 2)] async fn restart_replays_same_ordered_l2_tx_stream_from_db() { let db = temp_db("restart-replay-golden"); @@ -357,15 +1004,16 @@ async fn restart_replays_same_ordered_l2_tx_stream_from_db() { let second_live = recv_ws_message(&mut ws).await; drop(ws); - let expected = load_all_ordered_l2_txs(db.path.as_str()); + let expected = all_ordered_l2_txs(db.path.as_str()); assert_eq!( expected.len(), 3, "expected deposit, direct input, and user op" ); - assert_ws_message_matches_tx(deposit_live, &expected[0], 0); - assert_ws_message_matches_tx(first_live, &expected[1], 1); - assert_ws_message_matches_tx(second_live, &expected[2], 2); + // DB offsets (SQLite rowid) start at 1. + assert_ws_message_matches_tx(deposit_live, &expected[0], 1); + assert_ws_message_matches_tx(first_live, &expected[1], 2); + assert_ws_message_matches_tx(second_live, &expected[2], 3); shutdown_runtime(runtime).await; @@ -384,9 +1032,10 @@ async fn restart_replays_same_ordered_l2_tx_stream_from_db() { .expect("timeout connecting websocket after restart") .expect("connect websocket after restart"); - for (offset, expected_tx) in expected.iter().enumerate() { + for (i, expected_tx) in expected.iter().enumerate() { let replayed = recv_ws_message(&mut restarted_ws).await; - assert_ws_message_matches_tx(replayed, expected_tx, offset as u64); + // DB offsets start at 1. 
+ assert_ws_message_matches_tx(replayed, expected_tx, (i + 1) as u64); } drop(restarted_ws); @@ -396,9 +1045,10 @@ async fn restart_replays_same_ordered_l2_tx_stream_from_db() { struct FullServerRuntime { addr: std::net::SocketAddr, shutdown: ShutdownSignal, - server_task: Option, - lane_handle: - Option>>, + server_task: Option, + lane_handle: Option< + tokio::task::JoinHandle>, + >, _parked_rx: Option>, } @@ -435,7 +1085,7 @@ async fn start_full_server_with_max_body( }; let addr = listener.local_addr().expect("read listener addr"); - let storage = Storage::open(db_path, "NORMAL").expect("open storage"); + let storage = Storage::open(db_path).expect("open storage"); let shutdown = ShutdownSignal::default(); let (tx, lane_handle) = InclusionLane::start( @@ -449,6 +1099,7 @@ async fn start_full_server_with_max_body( safe_input_buffer_capacity: 32, max_batch_open: Duration::from_secs(60 * 60), idle_poll_interval: Duration::from_millis(2), + frontier_min_interval: Duration::ZERO, }, ); @@ -462,7 +1113,7 @@ async fn start_full_server_with_max_body( }, ); - let server_task = api::start_on_listener( + let server_task = http::start_on_listener( listener, tx, domain, @@ -500,7 +1151,7 @@ async fn start_api_only_server( }; let addr = listener.local_addr().expect("read listener addr"); - let _storage = Storage::open(db_path, "NORMAL").expect("open storage"); + let _storage = Storage::open(db_path).expect("open storage"); let (tx, rx) = mpsc::channel::(queue_capacity); let shutdown = ShutdownSignal::default(); let tx_feed = L2TxFeed::new( @@ -512,7 +1163,7 @@ async fn start_api_only_server( batch_submitter_address: None, }, ); - let server_task = api::start_on_listener( + let server_task = http::start_on_listener( listener, tx, domain, @@ -567,7 +1218,7 @@ fn bootstrap_open_frame(db_path: &str) { /// Bootstrap open frame, optionally seeding ERC-20 deposits for the given senders. /// Each sender receives `amount` tokens before the frame is opened. 
fn bootstrap_open_frame_with_deposits(db_path: &str, deposits: &[(Address, U256)]) { - let mut storage = Storage::open(db_path, "NORMAL").expect("open storage"); + let mut storage = Storage::open(db_path).expect("open storage"); let config = WalletConfig::default(); if !deposits.is_empty() { @@ -586,7 +1237,16 @@ fn bootstrap_open_frame_with_deposits(db_path: &str, deposits: &[(Address, U256) }) .collect(); storage - .append_safe_inputs(1, &safe_inputs) + .append_safe_inputs( + 1, + &safe_inputs, + &sequencer_core::protocol::ProtocolConfig { + batch_submitter: Address::ZERO, + max_wait_blocks: sequencer_core::MAX_WAIT_BLOCKS, + preemptive_margin_blocks: 75, + seconds_per_block: 12, + }, + ) .expect("seed deposits"); } @@ -622,7 +1282,7 @@ fn make_valid_request(domain: &Eip712Domain) -> TxRequest { } fn seed_safe_direct_input(db_path: &str, safe_block: u64, payload: Vec) { - let mut storage = Storage::open(db_path, "NORMAL").expect("open storage"); + let mut storage = Storage::open(db_path).expect("open storage"); storage .append_safe_inputs( safe_block, @@ -631,18 +1291,24 @@ fn seed_safe_direct_input(db_path: &str, safe_block: u64, payload: Vec) { payload, block_number: safe_block, }], + &sequencer_core::protocol::ProtocolConfig { + batch_submitter: Address::ZERO, + max_wait_blocks: sequencer_core::MAX_WAIT_BLOCKS, + preemptive_margin_blocks: 75, + seconds_per_block: 12, + }, ) .expect("append safe direct input"); } -fn load_all_ordered_l2_txs(db_path: &str) -> Vec { +fn all_ordered_l2_txs(db_path: &str) -> Vec { let mut storage = Storage::open_read_only(db_path).expect("open read-only storage"); - let total = storage - .ordered_l2_tx_count() - .expect("query ordered l2 tx count"); storage - .load_ordered_l2_txs_page_from(0, total as usize) + .ordered_l2_txs_page_from(0, 1_000_000) .expect("load ordered l2 txs") + .into_iter() + .map(|(_offset, tx)| tx) + .collect() } fn assert_ws_message_matches_tx( @@ -696,6 +1362,30 @@ fn assert_ws_message_matches_tx( } } 
+async fn post_raw_body_no_content_type(addr: std::net::SocketAddr, body: &str) -> (u16, String) { + let host_port = addr.to_string(); + let mut stream = tokio::net::TcpStream::connect(host_port.as_str()) + .await + .expect("connect test http socket"); + // Deliberately omit Content-Type header. + let request = format!( + "POST /tx HTTP/1.1\r\nHost: {host_port}\r\nContent-Length: {}\r\nConnection: close\r\n\r\n{body}", + body.len() + ); + stream + .write_all(request.as_bytes()) + .await + .expect("write raw request"); + stream.flush().await.expect("flush raw request"); + + let mut response = Vec::new(); + stream + .read_to_end(&mut response) + .await + .expect("read raw response"); + parse_http_response(response.as_slice()) +} + async fn post_raw_json(addr: std::net::SocketAddr, body: &str) -> (u16, String) { let host_port = addr.to_string(); let mut stream = tokio::net::TcpStream::connect(host_port.as_str()) @@ -785,28 +1475,5 @@ fn decode_hex_prefixed(value: &str) -> Vec { } fn test_domain() -> Eip712Domain { - Eip712Domain { - name: Some("CartesiAppSequencer".to_string().into()), - version: Some("1".to_string().into()), - chain_id: Some(U256::from(1_u64)), - verifying_contract: Some(Address::from_slice(&[0_u8; 20])), - salt: None, - } -} - -struct TestDb { - _dir: TempDir, - path: String, -} - -fn temp_db(name: &str) -> TestDb { - let dir = tempfile::Builder::new() - .prefix(format!("sequencer-full-e2e-{name}-").as_str()) - .tempdir() - .expect("create temporary test directory"); - let path = dir.path().join("sequencer.sqlite"); - TestDb { - _dir: dir, - path: path.to_string_lossy().into_owned(), - } + sequencer_core::build_input_domain(1, Address::from_slice(&[0_u8; 20])) } diff --git a/sequencer/tests/ws_broadcaster.rs b/sequencer/tests/ws_broadcaster.rs index 5b25f4f..68b07f4 100644 --- a/sequencer/tests/ws_broadcaster.rs +++ b/sequencer/tests/ws_broadcaster.rs @@ -8,20 +8,22 @@ use alloy_primitives::{Address, Signature}; use alloy_sol_types::Eip712Domain; 
use app_core::application::MAX_METHOD_PAYLOAD_BYTES; use futures_util::{SinkExt, StreamExt}; -use sequencer::api::{self, ApiConfig, WS_CATCHUP_WINDOW_EXCEEDED_REASON}; -use sequencer::inclusion_lane::{PendingUserOp, SequencerError}; -use sequencer::l2_tx_feed::{L2TxFeed, L2TxFeedConfig}; -use sequencer::shutdown::ShutdownSignal; +use sequencer::egress::l2_tx_feed::{L2TxFeed, L2TxFeedConfig}; +use sequencer::http::{self, ApiConfig, WS_CATCHUP_WINDOW_EXCEEDED_REASON}; +use sequencer::ingress::inclusion_lane::{PendingUserOp, SequencerError}; +use sequencer::runtime::shutdown::ShutdownSignal; use sequencer::storage::{SafeInputRange, Storage, StoredSafeInput}; use sequencer_core::api::WsTxMessage; use sequencer_core::l2_tx::SequencedL2Tx; use sequencer_core::user_op::{SignedUserOp, UserOp}; use sequencer_rust_client::SequencerClient; -use tempfile::TempDir; use tokio::sync::{mpsc, oneshot}; use tokio_tungstenite::connect_async; use tokio_tungstenite::tungstenite::Message; +mod common; +use common::temp_db; + #[tokio::test(flavor = "multi_thread", worker_threads = 2)] async fn ws_subscribe_streams_ordered_txs_from_offset_zero() { let db = temp_db("ws-subscribe-zero"); @@ -44,19 +46,21 @@ async fn ws_subscribe_streams_ordered_txs_from_offset_zero() { shutdown_runtime(runtime).await; - assert_ws_message_matches_tx(first, &expected[0], 0); - assert_ws_message_matches_tx(second, &expected[1], 1); + // DB offsets (SQLite rowid) start at 1. + assert_ws_message_matches_tx(first, &expected[0], 1); + assert_ws_message_matches_tx(second, &expected[1], 2); } #[tokio::test(flavor = "multi_thread", worker_threads = 2)] async fn ws_subscribe_resumes_from_given_offset() { let db = temp_db("ws-subscribe-resume"); seed_ordered_txs(db.path.as_str()); + // Resume from DB offset 1 — should get items with offset > 1. 
let expected = load_ordered_l2_txs_page(db.path.as_str(), 1, 1); assert_eq!( expected.len(), 1, - "resume snapshot must contain one event at offset 1" + "resume snapshot must contain one event at offset 2" ); let Some(runtime) = start_test_server(db.path.as_str()).await else { @@ -73,7 +77,7 @@ async fn ws_subscribe_resumes_from_given_offset() { shutdown_runtime(runtime).await; - assert_ws_message_matches_tx(first, &expected[0], 1); + assert_ws_message_matches_tx(first, &expected[0], 2); } #[tokio::test(flavor = "multi_thread", worker_threads = 2)] @@ -105,7 +109,7 @@ async fn ws_subscribe_receives_live_events_after_subscribing() { shutdown_runtime(runtime).await; - assert_ws_message_matches_tx(live, &expected[0], base_offset); + assert_ws_message_matches_tx(live, &expected[0], base_offset + 1); } #[tokio::test(flavor = "multi_thread", worker_threads = 2)] @@ -143,8 +147,8 @@ async fn ws_subscribe_fanout_delivers_live_event_to_multiple_subscribers() { shutdown_runtime(runtime).await; - assert_ws_message_matches_tx(event_a, &expected[0], base_offset); - assert_ws_message_matches_tx(event_b, &expected[0], base_offset); + assert_ws_message_matches_tx(event_a, &expected[0], base_offset + 1); + assert_ws_message_matches_tx(event_b, &expected[0], base_offset + 1); } #[tokio::test(flavor = "multi_thread", worker_threads = 2)] @@ -265,8 +269,8 @@ async fn ws_subscribe_allows_catchup_exactly_at_limit() { shutdown_runtime(runtime).await; - assert_ws_message_matches_tx(first, &expected[0], 0); - assert_ws_message_matches_tx(second, &expected[1], 1); + assert_ws_message_matches_tx(first, &expected[0], 1); + assert_ws_message_matches_tx(second, &expected[1], 2); } #[tokio::test(flavor = "multi_thread", worker_threads = 2)] @@ -304,7 +308,7 @@ async fn ws_subscribe_closes_on_oversized_inbound_message() { } fn seed_ordered_txs(db_path: &str) { - let mut storage = Storage::open(db_path, "NORMAL").expect("open storage"); + let mut storage = Storage::open(db_path).expect("open 
storage"); let mut head = storage .initialize_open_state(0, SafeInputRange::empty_at(0)) .expect("initialize open state"); @@ -335,6 +339,12 @@ fn seed_ordered_txs(db_path: &str) { payload: vec![0xaa], block_number: 10, }], + &sequencer_core::protocol::ProtocolConfig { + batch_submitter: Address::ZERO, + max_wait_blocks: sequencer_core::MAX_WAIT_BLOCKS, + preemptive_margin_blocks: 75, + seconds_per_block: 12, + }, ) .expect("append direct input"); storage @@ -343,9 +353,9 @@ fn seed_ordered_txs(db_path: &str) { } fn append_drained_direct_input(db_path: &str, payload: Vec) { - let mut storage = Storage::open(db_path, "NORMAL").expect("open storage"); + let mut storage = Storage::open(db_path).expect("open storage"); let mut head = storage - .load_open_state() + .open_state() .expect("load open state") .expect("open state should exist"); let safe_block = storage @@ -363,6 +373,12 @@ fn append_drained_direct_input(db_path: &str, payload: Vec) { payload, block_number: safe_block, }], + &sequencer_core::protocol::ProtocolConfig { + batch_submitter: Address::ZERO, + max_wait_blocks: sequencer_core::MAX_WAIT_BLOCKS, + preemptive_margin_blocks: 75, + seconds_per_block: 12, + }, ) .expect("append direct input"); storage @@ -377,7 +393,7 @@ fn append_drained_direct_input(db_path: &str, payload: Vec) { struct WsServerRuntime { addr: std::net::SocketAddr, shutdown: ShutdownSignal, - server_task: Option, + server_task: Option, } impl Drop for WsServerRuntime { @@ -421,7 +437,7 @@ async fn start_test_server_with_limits( batch_submitter_address: None, }, ); - let task = api::start_on_listener( + let task = http::start_on_listener( listener, tx_sender, Eip712Domain { @@ -507,15 +523,18 @@ fn ws_subscribe_url(addr: std::net::SocketAddr, from_offset: u64) -> String { fn ordered_l2_tx_count(db_path: &str) -> u64 { let mut storage = Storage::open_read_only(db_path).expect("open read-only storage"); storage - .ordered_l2_tx_count() - .expect("query ordered l2 count") + 
.ordered_l2_tx_head_offset() + .expect("query ordered l2 head offset") } fn load_ordered_l2_txs_page(db_path: &str, from_offset: u64, limit: usize) -> Vec { let mut storage = Storage::open_read_only(db_path).expect("open read-only storage"); storage - .load_ordered_l2_txs_page_from(from_offset, limit) + .ordered_l2_txs_page_from(from_offset, limit) .expect("load ordered l2 tx page") + .into_iter() + .map(|(_offset, tx)| tx) + .collect() } fn assert_ws_message_matches_tx( @@ -566,20 +585,3 @@ fn assert_ws_message_matches_tx( } } } - -struct TestDb { - _dir: TempDir, - path: String, -} - -fn temp_db(name: &str) -> TestDb { - let dir = tempfile::Builder::new() - .prefix(format!("sequencer-ws-feed-{name}-").as_str()) - .tempdir() - .expect("create temporary test directory"); - let path = dir.path().join("sequencer.sqlite"); - TestDb { - _dir: dir, - path: path.to_string_lossy().into_owned(), - } -} diff --git a/tests/TEST_PLAN.md b/tests/TEST_PLAN.md new file mode 100644 index 0000000..0547bc8 --- /dev/null +++ b/tests/TEST_PLAN.md @@ -0,0 +1,611 @@ +# Sequencer Test Plan + +A living document tracking the scenarios we need to exercise to have confidence the sequencer is correct under its threat model. This is **scenario-first** — it describes behaviors, not code paths. A behavior without a test is a liability regardless of how much code coverage the implementation has. + +The project is **security-critical**. The open-batch-staleness bug was caught by an e2e test written for the behavior ("after a stale sequencer restarts, the invalid transfer must not reappear"), not by any code-level check. That experience is why this plan prioritizes *what should happen* over *what code runs*. 
+ +## Status markers + +- `[x]` — scenario has a test, known to pass +- `[ ]` — planned, needs implementation +- `[!]` — test exists but is flaky, partial, or needs hardening +- `[?]` — coverage unclear, needs verification against existing tests +- `[-]` — out of scope under current tooling (see §14) + +## Recent regression work + +**Phase 1 — Security-review regressions** (completed): 19 new tests locking in the fixes from the staged security review. See §1.1 (V1), §7.3 (open-batch staleness), §8.5 (H3/H4 provider), §2.10 (H2 error body), §6.5/§8.3 (H7 chain-id cache path). Notably, the IPv6-loopback test caught a latent bug in the H4 fix itself (`host_str()` returns bracket-wrapped `[::1]` for IPv6 literals; original `matches!` check missed it). + +**Phase 2 — Tooling + zone matrix** (completed): +- Built `tests/harness/src/proxy.rs` — programmable TCP proxy (`TcpProxy::spawn/disconnect/reconnect`) with 6 unit tests exercising the forwarder, disconnect, and reconnect paths. Handles both clean-EOF and RST close behavior (OS-dependent). +- Added `ManagedSequencer::set_l1_endpoint_override` so tests can route the sequencer through the proxy while still mining blocks directly on Anvil (bypassing the proxy) to simulate "L1 advanced while the gateway was down." +- 3 new e2e scenarios registered and **verified end-to-end with `just test-rollups-e2e`**: §11.1.1 (sequencer outage, pre-danger), §11.1.2 (sequencer outage, danger zone), §11.2.3 (provider outage, past-stale using the proxy). Full suite: 15/15 passing in ~53s. +- 3 H8 clap-validation regression tests locking `SEQ_SECONDS_PER_BLOCK >= 1`. + +**Lessons surfaced by actually running the e2e suite:** +- Wallet-client nonce state: the harness's `WalletL2Client` initializes `next_nonce: 0`. In no-cascade restart scenarios (where on-chain nonce is preserved), post-restart submissions need explicit nonce-state plumbing. Current workaround: the pre-danger/danger-zone scenarios don't submit new work after the restart. 
+- Wall-clock fallback measures *real* seconds, not mined blocks. `anvil_mine(N)` advances the chain's block count in milliseconds of wall-clock time, so the fallback correctly reports "not yet in danger" even after mining 1250+ blocks. The block-time coupling assumption is documented in `docs/threat-model/README.md`. +- Built `ManagedSequencer::rewind_synced_at_ms` helper — rewrites `l1_safe_head.synced_at_ms` in the DB while the sequencer is stopped. Semantically equivalent to advancing the wall clock. + +**Danger-check unification bug (fixed):** + +The first e2e attempt at `provider_outage_wall_clock_refuses_boot_test` surfaced a real structural bug. Two code paths asked "is a batch in danger" with asymmetric scope: + +- `check_danger_zone` (live submitter tick + wall-clock fallback at boot) — closed-and-nonced batches only. +- `detect_and_recover` (atomic cascade) — closed + open batches (post §7.3 fix). + +The asymmetry meant an open batch could age past the danger threshold while L1 was unreachable and the preemptive path would miss it. Fixed by splitting the public API around the semantic distinction: + +- **`Storage::check_danger_zone`** (closed-only) — zombie-detection check. Live submitter keeps using this: its response (shutdown → flush pending nonces → restart) only makes sense for submitted batches with potential zombie risk. +- **`Storage::check_any_unresolved_batch_in_danger`** (unified, closed + open) — wall-clock fallback uses this at startup when L1 is unreachable. Refuses to boot if any unresolved batch might be past-stale. +- **`detect_and_recover`** (at `MAX_WAIT_BLOCKS`) — uses `find_first_batch_in_danger` (unified). Handles actually-stale open batches via cascade. + +Behind the scenes, all three share `find_first_batch_in_danger` and `find_closed_frontier_batch_in_danger` in `storage/recovery.rs`. The old one-step helpers `detect_stale_and_cascade` and `check_open_batch_staleness` are removed. 
+ +**Key insight from the failure:** a first attempted refactor unified ALL callers behind the unified helper. That broke the live submitter — it started crashing on aging open batches (which have no zombies to flush), causing a restart loop. The corrected split keeps "zombie danger" (closed-only) separate from "any danger" (unified), because their expected responses differ: zombie-danger → flush + shutdown; open-batch-danger → let the batch close naturally or refuse to boot. + +**Tests landed:** +- `check_danger_zone_does_not_flag_open_batch_zombie` — regression for the submitter worker loop. +- `check_any_unresolved_flags_stale_open_batch` + `check_any_unresolved_does_not_flag_fresh_open_batch` — regressions for the unified helper. +- `provider_outage_wall_clock_refuses_boot_test` — e2e proving the full chain works end-to-end. + +**Still open from Phase 1**: +- §2.10.1 (H1 rusqlite leak) — needs failpoint injection (tool T5) +- (§6.5.1 / §8.3.1 (H7 RPC-path) closed by `tests/e2e` in commit `6f47b38`.) + +**Phase 3 — Unit-test hygiene** (in progress): +- Shared `TestDb` / `temp_db` unified: `storage::test_helpers` promoted to `pub(crate)` and reused across 4 inline test modules; `sequencer/tests/common/mod.rs` added for integration tests. 6 local `temp_db` clones removed. +- `storage/recovery.rs`'s 38 flat tests split into 8 nested sub-modules (`invalid_batches`, `detect_and_recover`, `tip_staleness`, `check_danger_zone`, `check_any_unresolved`, `boundary`, `schema_invariants`, `tree_invariants`). Test names now self-locate (e.g. `tests::tip_staleness::open_batch_exactly_at_threshold_is_invalidated`). +- `sequencer-core/src/batch.rs` unit tests added (was zero tests): §1.4 SSZ roundtrip for `Batch`/`Frame`/`WireUserOp`, cross-call determinism, and §1.5 decode robustness (empty, below-header, truncated, invalid offset, garbage fuzz). 12 new tests. 
+- Stale markers cleaned: §1.4 `[?]`→`[x]`, §1.5 `[ ]`→`[x]`, §2.4.2 `[?]`→`[x]`, §2.7.1 `[ ]`→`[x]`, §5.1.1 `[?]`→`[x]`. + +**SSZ library finding (Phase 3):** `ethereum_ssz::Decode::from_ssz_bytes` silently accepts trailing bytes after a valid `Batch` encoding. Not a security issue under our threat model (only the trusted batch-submitter sender is classified as `Batch` at L1; the scheduler also authenticates by msg_sender). Flagging for visibility: if any future path decodes a non-authenticated payload as `Batch`, this would need a pre-decode length check or a wrapper that enforces full-consumption. Referenced in §1.5 notes. + +**Landed in Phase 3** (cumulative, unit-layer): +- §1.4, §1.5 — batch SSZ roundtrip + decode robustness (`sequencer-core/src/batch.rs`). +- §1.7 — S-malleability: malleable variant cannot recover a different address (alloy/k256 regression lock). +- §7.4.2, §7.4.3 — undrained safe input reaches recovery batch; empty recovery first frame. **Also covered at e2e in `6f47b38`** — both layers retained for defense in depth. +- §7.5.1 — first-batch-stale → nonce 0 reused after torn cascade. **Also covered at e2e in `6f47b38`.** +- §7.6.3 — post-`open_recovery_batch` crash → restart is no-op over persisted state. +- §7.7.4, §7.7.5 — flusher fee-bump and timeout helpers extracted + H5/H6 regression-locked. +- §8.4.1 — `preemptive_margin_blocks` validation extracted + `#[should_panic]` covered. + +**Prioritized unit-layer gaps still open:** +- §2.10.1 (H1 rusqlite leak) — needs failpoint injection (tool T5). + +**Completed design-review items:** +- [x] **TLA+ spec alignment with the danger-check split.** `docs/recovery/preemptive.tla` now distinguishes the zombie path (stale Silver frontier) from the aging-open-Tip path, `docs/recovery/README.md` documents the same split explicitly, and TLC was re-run against the updated model. 
+ +## Test layers + +| Layer | Purpose | Examples | Runs where | +|-------|---------|----------|-----------| +| **Unit** | Pure functions, data structures, per-module invariants | `fee.rs`, `batch.rs` SSZ round-trip, `storage/recovery.rs` inline tests | `cargo test --lib` | +| **Integration** | Crate-level wiring with mocks or Anvil | `sequencer/tests/*.rs`, inclusion-lane tests | `cargo test` (Anvil optional) | +| **E2E** | Full binary + Anvil + harness, real RPC, real DB | `tests/e2e/src/test_cases.rs` | `cargo test -p rollups-e2e` | +| **Formal** | Bounded model checking | `docs/recovery/preemptive.tla` | `tlc` | + +The existing convention is documented in [`AGENTS.md`](../AGENTS.md). This plan should coexist with that guide, not replace it. + +--- + +## 1. Wire Compatibility (Sequencer ↔ Scheduler) + +These are the **cross-boundary** invariants. Any divergence here is catastrophic: the scheduler is the canonical authority, and a mismatch breaks every honest transaction. + +| # | Scenario | Layer | Status | Notes | +|---|----------|-------|--------|-------| +| 1.1 | Sign a `UserOp` with `sequencer_core::build_input_domain(chain_id, app)`, decode with the same constructor, assert recovered sender matches signer | Integration (`sequencer/tests/e2e_sequencer.rs::v1_regression_shared_domain_recovers_signer`) | `[x]` | **V1 regression.** Plus a negative test that a `name:None` domain recovers a DIFFERENT address — catches any reintroduction of the V1 bug. 
| +| 1.2 | Sign with chain_id=X, attempt recover with chain_id=Y → recovered address ≠ signer | Integration (`v1_regression_domain_fields_all_affect_recovery`) | `[x]` | Cross-chain replay protection | +| 1.3 | Sign with app=X, attempt recover with app=Y → recovered address ≠ signer | Integration (same test) | `[x]` | Cross-app replay protection | +| 1.4 | SSZ encode a `Batch`, decode, re-encode → byte-identical | Unit (`sequencer-core/src/batch.rs::tests::ssz_roundtrip_*`) | `[x]` | Covers empty batch, populated batch, empty-user-ops frame, wire user op, and cross-call determinism | +| 1.5 | SSZ decode fails cleanly on truncated payload, garbage bytes, malformed offsets → returns `DecodeError`, never panics | Unit (`sequencer-core/src/batch.rs::tests::ssz_decode_*`) | `[x]` | Covers empty payload, sub-header lengths, truncated valid batch, invalid offset, and garbage-pattern fuzz. **Known library behavior:** `ethereum_ssz` silently accepts trailing bytes after a valid batch. Not a security issue under our threat model (only the trusted batch-submitter sender is classified as `Batch`), but worth noting if the scheduler side ever decodes a non-authenticated payload as `Batch`. | +| 1.6 | `MAX_WAIT_BLOCKS` constant is the same value on sequencer and scheduler sides at link time | Unit | `[x]` | Shared via `sequencer_core::MAX_WAIT_BLOCKS` — structural guarantee, no runtime check needed | +| 1.7 | S-malleability neutralized: signing the same op twice produces low-s and high-s forms; both recover the same sender | Unit (`sequencer/src/ingress/api.rs::tests::s_malleable_signature_cannot_recover_a_different_address`) | `[x]` | Constructs the malleable variant (`s' = n - s`, flipped parity) and asserts recovery either errors (EIP-2 rejection) or yields the same address. Regression lock against alloy/k256 behavioral drift. | + +--- + +## 2. 
`POST /tx` — Public Attack Surface + +### 2.1 Happy path + +| # | Scenario | Status | Notes | +|---|----------|--------|-------| +| 2.1.1 | Valid signature, correct sender, correct nonce, sufficient balance → admitted, returns sender + nonce in 200 body | `[x]` | `deposit_transfer_withdrawal_test` | +| 2.1.2 | Soft confirmation arrives on WS within 500 ms of successful POST | `[?]` | Check e2e tests assert this | + +### 2.2 Signature validation + +| # | Scenario | Status | Notes | +|---|----------|--------|-------| +| 2.2.1 | Forged signature (valid format, wrong key) → 400 `INVALID_SIGNATURE`, no state change | `[x]` | `forged_signature_rejected_test` (e2e). **Note on status code**: observed contract is 400 `INVALID_SIGNATURE` for all signature-class rejections (not 422). Prior TEST_PLAN text said 422; updated to match reality. | +| 2.2.2 | Signature wrong hex length → 400 before crypto work | `[x]` | `sequencer/tests/e2e_sequencer.rs::api_rejects_signature_with_wrong_hex_length` — passes a 4-byte signature (`0xdeadbeef`); rejection fires from `validate_hex_lengths` before any crypto runs. | +| 2.2.3 | Signature valid bytes, invalid parity byte → 400 `INVALID_SIGNATURE` | `[x]` | `sequencer/tests/e2e_sequencer.rs::api_rejects_signature_with_invalid_parity_byte` — sends a 65-byte signature with the parity byte set to `0xFF`. Observed `"cannot recover sender"` path. Defensively asserts the rejection is *not* from the hex-length or payload-size gates. | +| 2.2.4 | Signature recovers a different address than claimed `sender` field → 400 `INVALID_SIGNATURE` | `[x]` | `sequencer/tests/e2e_sequencer.rs::api_rejects_sender_claim_that_mismatches_signature_recovery` — key A signs the op, request claims sender is B; asserts `sender mismatch` + `INVALID_SIGNATURE` code. Complements the e2e `forged_signature_rejected_test` (which covers the full end-to-end shape including the empty WS); this one pins the direct API response. 
| + +### 2.3 Body / format + +| # | Scenario | Status | Notes | +|---|----------|--------|-------| +| 2.3.1 | Body exceeds `max_body_bytes` (default 4 KB) → 413 before JSON parse | `[x]` | `sequencer/tests/e2e_sequencer.rs::api_rejects_oversized_json_body_before_parsing` — uses a small `max_body_bytes` (256) to make the 413 trigger fast; asserts status `PAYLOAD_TOO_LARGE`. Regression for `DefaultBodyLimit` enforcement. | +| 2.3.2 | Body is not JSON → 400 with `"invalid JSON"` (H2 regression: must NOT leak serde internals) | `[x]` | `sequencer/tests/e2e_sequencer.rs::api_rejects_malformed_json_as_bad_request` — sends a malformed body containing the bytes `0x1234`; asserts response message is exactly `"invalid JSON"` AND that `0x1234` does not appear in the body (no input echo). | +| 2.3.3 | Body is JSON but missing fields → 400, doesn't leak deserialization error text | `[x]` | `sequencer/tests/e2e_sequencer.rs::api_rejects_json_with_missing_fields_using_fixed_envelope` — sends `{}`; parses the response envelope and asserts `message == "invalid JSON"` and `code == "BAD_REQUEST"`; sweeps for serde leak vocabulary (`"missing field"`, `"expected"`, `"deserializ"`, `"line "`, `"column "`). H2 regression. | +| 2.3.4 | Content-Type other than `application/json` → 400 with `"missing content type"` | `[x]` | `sequencer/tests/e2e_sequencer.rs::api_rejects_missing_content_type_with_fixed_message` — sends a valid JSON body without the header; asserts the fixed `"missing content type"` envelope message. H2 regression. 
| +| 2.3.5 | User op `data` field exceeds `max_user_op_data_bytes` → 400 before signature verify | `[x]` | Two complementary tests: `api_rejects_user_op_payloads_above_application_limit` (oversized data + valid signature → 400 with `"user op payload too large"`, body echoes the limit) and `api_payload_size_check_fires_before_signature_recovery` (oversized data + correctly-shaped *garbage* signature → still gets the size-class error, never a signature error — proves the validation order in `validate_payload_size` runs before `recover_sender`, so signature recovery isn't a DoS amplifier on huge bodies). | + +### 2.4 Nonce rules + +| # | Scenario | Status | Notes | +|---|----------|--------|-------| +| 2.4.1 | First tx with nonce 0 → accepted, next expected becomes 1 | `[x]` | `deposit_transfer_withdrawal_test` | +| 2.4.2 | Tx with nonce too low (e.g., replay) → 422 `InvalidNonce`, no state change | `[x]` | `rejected_user_op_not_broadcast_test` | +| 2.4.3 | Tx with nonce too high (gap) → 422 `InvalidNonce`, no state change | `[x]` | `sequencer/tests/e2e_sequencer.rs::api_rejects_user_op_with_nonce_gap` — submits nonce 7 when the expected nonce is 0; asserts 422 + nonce-class message. Complement to §2.4.2 (nonce too low); together they pin strict-equality on `current_user_nonce`. | +| 2.4.4 | `InvalidNonce` response does NOT get broadcast on WS | `[x]` | `rejected_user_op_not_broadcast_test` | + +### 2.5 Fee rules (V3 regression) + +| # | Scenario | Status | Notes | +|---|----------|--------|-------| +| 2.5.1 | `max_fee < current_frame_fee` → 422 `InvalidMaxFee` | `[x]` | `fee_below_minimum_rejected_test` | +| 2.5.2 | `max_fee == current_frame_fee` → accepted (boundary) | `[x]` | `sequencer/tests/e2e_sequencer.rs::api_accepts_user_op_with_max_fee_equal_to_current_frame_fee` — submits `max_fee = 1060` (exactly the bootstrapped frame's fee); asserts 200. Paired with §2.5.1 (`fee_below_minimum_rejected_test`), pins the comparator as strict `<` (not `<=`). 
| +| 2.5.3 | Rejection handled by trait-default `validate_and_execute_user_op` (V3 regression) | `[x]` | Unit test in `app-core/wallet.rs` | + +### 2.6 Balance rules + +| # | Scenario | Status | Notes | +|---|----------|--------|-------| +| 2.6.1 | `balance < fee_to_linear(current_fee)` → 422 `InsufficientGasBalance`, no state change | `[x]` | `sequencer/tests/e2e_sequencer.rs::api_rejects_user_op_when_balance_below_gas_cost` — fresh signer with no deposit (balance = 0) submits a user-op; asserts 422 + `"insufficient balance for gas"` (the `InvalidReason::InsufficientGasBalance` Display text from `sequencer_core::application`). Exercises `WalletApp::validate_user_op`'s balance check in app-core. | +| 2.6.2 | Rejected op does NOT broadcast | `[x]` | Covered indirectly by `rejected_user_op_not_broadcast_test` (e2e) which asserts the WS no-message-after-reject invariant on the bad-nonce variant. The broadcast filter in the lane is rejection-class-agnostic (any `SequencerError` rejection path → no WS event), so bad-nonce coverage applies to the insufficient-gas path too. A dedicated insufficient-gas test would add belt-and-suspenders and could land alongside §2.6.1. 
| + +### 2.7 Admission control + +| # | Scenario | Status | Notes | +|---|----------|--------|-------| +| 2.7.1 | Queue full → `429 OVERLOADED` with body `"queue full"` | `[x]` | `sequencer/tests/e2e_sequencer.rs::api_returns_429_when_queue_is_full` | +| 2.7.2 | Queue-full response does not leak per-sender info | `[ ]` | Hardening | + +### 2.8 Concurrency + +| # | Scenario | Status | Notes | +|---|----------|--------|-------| +| 2.8.1 | Two concurrent POSTs for same (sender, nonce) → exactly one admitted, one gets `InvalidNonce` | `[x]` | `concurrent_user_ops_test` | +| 2.8.2 | Rejected concurrent op produces no state change | `[x]` | `sequencer/tests/e2e_sequencer.rs::api_concurrent_same_nonce_leaves_exactly_one_committed` — two `tokio::spawn`-ed POSTs with byte-identical bodies (same sender, same nonce) join concurrently; asserts exactly one 200 + one 422 with a nonce-class message. Complements `concurrent_user_ops_test` (distinct-sender happy path, at e2e) by pinning the rejected-branch outcome specifically. | + +### 2.9 Shutdown semantics + +| # | Scenario | Status | Notes | +|---|----------|--------|-------| +| 2.9.1 | Mid-request shutdown: in-flight requests get 503 or clean error | `[ ]` | Not currently covered. The old `shutdown_during_inflight_test` was renamed to `restart_after_committed_tx_replays_cleanly_test` because it only proves replay-after-restart for an already-committed tx. A deterministic hook would be needed for a real in-flight shutdown test. | +| 2.9.2 | Post-shutdown POST → 503 immediately | `[x]` | `sequencer/src/ingress/api.rs::tests::submit_tx_rejects_when_shutdown_has_started` — requests shutdown on the `ShutdownSignal`, then submits; asserts `StatusCode::SERVICE_UNAVAILABLE` with code `UNAVAILABLE`. 
| + +### 2.10 Error-body hardening (regression tests for security review findings) + +| # | Scenario | Status | Notes | +|---|----------|--------|-------| +| 2.10.1 | DB-error response body contains `"internal storage error"`, not rusqlite text | `[-]` | **H1 regression** deferred — requires failpoint injection (tool T5). Covered by code review for now; the code change itself is trivial (`format!` removed in favor of a fixed string). | +| 2.10.2 | Malformed JSON response body is from fixed taxonomy, doesn't reflect bytes | `[x]` | **H2 regression** in `e2e_sequencer.rs::api_rejects_malformed_json_as_bad_request` — asserts `"message":"invalid JSON"` AND that attacker-submitted bytes don't appear in response. | +| 2.10.3 | Missing Content-Type produces fixed `"missing content type"` message | `[x]` | H2 regression in `api_rejects_missing_content_type_with_fixed_message` | + +--- + +## 3. Inclusion Lane (Hot Path) + +### 3.1 Chunk commit semantics + +| # | Scenario | Status | Notes | +|---|----------|--------|-------| +| 3.1.1 | Ack returns AFTER chunk is durably committed to SQLite, not merely enqueued | `[x]` | `ingress/inclusion_lane/tests.rs` | +| 3.1.2 | Storage failure during chunk commit → every pending op gets `Err`, lane crashes, no partial ack | `[x]` | Covered by existing lane tests | +| 3.1.3 | Chunk commit triggers autoincrement insert into `sequenced_l2_txs` via SQL trigger | `[x]` | `trg_sequence_user_op` — verified by integration tests | + +### 3.2 Frame rotation + +| # | Scenario | Status | Notes | +|---|----------|--------|-------| +| 3.2.1 | Frame closes on direct-input drain and opens a new one at the current safe_block | `[?]` | | +| 3.2.2 | New frame's `fee_price` sampled from `batch_policy_derived.recommended_fee` at rotation | `[?]` | | +| 3.2.3 | Frame fee stays fixed for the frame's lifetime even if policy is updated mid-frame | `[x]` | `storage/ingress.rs::tests::frame_fee_is_immutable_for_the_lifetime_of_the_frame` — opens a frame at default fee (1060), calls
`set_log_gas_price(100)` mid-frame (derived policy now recommends 1160), asserts the open frame's persisted `frames.fee` is still 1060 AND the `WriteHead.frame_fee` mirror is stable; then closes the frame and asserts the *next* frame opens at 1160 (policy flows in at close). Regression for "frames.fee immutable" invariant. | + +### 3.3 Batch closure + +| # | Scenario | Status | Notes | +|---|----------|--------|-------| +| 3.3.1 | Batch closes when `max_batch_user_op_bytes` target is reached | `[x]` | `batch_closes_when_max_user_op_bytes_is_reached` | +| 3.3.2 | Batch closes when deadline (`max_open_time`) elapses | `[x]` | `batch_closes_when_max_open_time_is_reached` | +| 3.3.3 | Closed batch becomes eligible for nonce assignment | `[x]` | `storage/l1_submission.rs::tests::closed_batch_becomes_eligible_for_submission_with_assigned_nonce` — asserts `load_pending_batches(0)` is empty before close and returns `[batch_index=0, nonce=0]` after `close_frame_and_batch`; also asserts the new open Tip (batch 1) is NOT eligible. Pins the open→closed→eligible transition + the genesis nonce invariant. | + +### 3.4 Single-writer invariant + +| # | Scenario | Status | Notes | +|---|----------|--------|-------| +| 3.4.1 | Inclusion lane is sole writer of open batch/frame state; no cross-task races | `[-]` | Structural, enforced by `&mut self` and single-task spawn; not testable at runtime | + +### 3.5 Direct-input draining + +| # | Scenario | Status | Notes | +|---|----------|--------|-------| +| 3.5.1 | Direct input arriving between two user ops is drained before the next frame's ops (ordering) | `[x]` | `direct_input_not_safe_yet_test`, `safe_inputs_already_available_are_sequenced_before_later_user_ops` | +| 3.5.2 | Multiple direct inputs in the same block drained in `safe_input_index` order | `[x]` | `multi_deposit_same_block_test` | + +--- + +## 4. 
WS Subscribe / L2 Feed + +### 4.1 Happy subscription + +| # | Scenario | Status | Notes | +|---|----------|--------|-------| +| 4.1.1 | Subscribe `from_offset=0` → receive all historical events then live | `[x]` | Many tests | +| 4.1.2 | Subscribe `from_offset=N` (N < head) → receive tail only | `[x]` | `reconnect_from_offset_test` | +| 4.1.3 | Subscribe `from_offset=future` → waits for new events, doesn't error | `[x]` | `ws_subscribe_from_future_offset_waits_silently_test` — pins the contract: subscribe with offset well beyond current head succeeds, delivers nothing until an event with a greater offset arrives. Consistent with `from_offset=0` on an empty head — we don't want the wait-for-new-events path to differ based on whether history happens to exist. | + +### 4.2 Catch-up bounds + +| # | Scenario | Status | Notes | +|---|----------|--------|-------| +| 4.2.1 | Catch-up window exceeded (>50000 events behind) → WS close code 1008, reason `"catch-up window exceeded"` | `[ ]` | Hard to produce 50000 events in a test; maybe reduce cap for test builds | +| 4.2.2 | Close reason is a constant string, not attacker-influenced | `[ ]` | Hardening regression | + +### 4.3 Subscriber limit + +| # | Scenario | Status | Notes | +|---|----------|--------|-------| +| 4.3.1 | 65th concurrent subscriber → rejected at handshake | `[ ]` | | + +### 4.4 Invalidation visibility + +| # | Scenario | Status | Notes | +|---|----------|--------|-------| +| 4.4.1 | After cascade-invalidation, subscribing `from_offset=0` does NOT deliver events from invalidated batches | `[x]` | `recovery_after_stale_batches_test` (regression for open-batch bug) | +| 4.4.2 | Reconnect after a cascade at a previously-observed offset that got invalidated → cursor delivers only post-recovery events. Complement to §4.4.1: that test reconnects at `from_offset=0` (trivial walk of the valid view); this tests the non-zero case where the client's last-seen offset is *itself* now hidden by `valid_sequenced_l2_txs`.
A WS connection can't span invalidation — the sequencer exits (DangerZone or stop) first and the socket dies — so the scenario is specifically "client had last_seen=N before the break, reconnects at N post-recovery, query `WHERE offset > N` against the valid view skips cleanly past N". | `[x]` | `ws_reconnect_at_invalidated_offset_skips_cleanly_test` — captures the transfer's offset pre-cascade, reconnects at that offset post-recovery, asserts (a) delivered event's offset is strictly greater and (b) reconnect-at-invalidated matches reconnect-at-zero. | + +### 4.5 Data exposure + +| # | Scenario | Status | Notes | +|---|----------|--------|-------| +| 4.5.1 | Broadcast message contains only `sender`, `fee`, `data`, `offset`, `kind` — no DB internals, no debug info | `[?]` | Structural; unit-test the `BroadcastTxMessage` serializer | +| 4.5.2 | No timing side channel exposes internal batch-close decisions | `[-]` | Out of scope (timing attacks) | + +--- + +## 5. L1 Input Reader + +### 5.1 Event ingestion + +| # | Scenario | Status | Notes | +|---|----------|--------|-------| +| 5.1.1 | `InputAdded` event at safe block N → row in `safe_inputs` with block_number=N | `[x]` | Covered by `deposit_transfer_withdrawal_test` (deposit e2e) | +| 5.1.2 | Multiple events in one `eth_getLogs` response ingested in order | `[?]` | | +| 5.1.3 | Zero events in a safe-head advance → `l1_safe_head.block_number` advances, `synced_at_ms` updates | `[ ]` | | + +### 5.2 Sender classification + +| # | Scenario | Status | Notes | +|---|----------|--------|-------| +| 5.2.1 | Event from batch-submitter address → NOT stored as direct input (opaque to safe_inputs) | `[?]` | | +| 5.2.2 | Event from any other address → stored verbatim as direct input regardless of payload bytes | `[?]` | | + +### 5.3 Safe-head atomicity + +| # | Scenario | Status | Notes | +|---|----------|--------|-------| +| 5.3.1 | Event insert + safe_head update are atomic (same transaction); crash mid-insert leaves both
unchanged | `[ ]` | Could test via injected mid-tx panic | + +### 5.4 RPC error handling + +| # | Scenario | Status | Notes | +|---|----------|--------|-------| +| 5.4.1 | Transient `Provider` error → reader retries, does not crash | `[x]` `provider_outage_input_reader_retries_after_reconnect_test` | Routes through T1 proxy. Disconnect → deposit on L1 (bypasses the proxy) → mine 20 blocks for safe depth → reader keeps retrying with connection errors for ≥5 s (`observe_for` asserts no exit) → reconnect → reader pulls the backlog → WS delivers the deposit event. | +| 5.4.2 | Provider times out → reader logs and retries | `[x]` | Covered by the same test — T1's `disconnect()` simulates any provider failure mode (connection refused / closed socket / pending read timeout); at e2e level there's no clean way to distinguish a refused connection from a timeout, and the retry path is identical. | +| 5.4.3 | Storage error during insert → reader fails loudly (fail-stop) | `[ ]` | | + +### 5.5 Long-range partition + +| # | Scenario | Status | Notes | +|---|----------|--------|-------| +| 5.5.1 | Range that triggers `SEQ_LONG_BLOCK_RANGE_ERROR_CODES` splits in half, both halves succeed | `[ ]` | Not cheaply testable at e2e: the proxy (T1) is a dumb TCP pass-through and can't selectively error based on RPC params / block-range size. Clean coverage would need either an HTTP-inspecting proxy (substantial new tooling) or a mock `Provider` (alloy's trait surface is large; non-trivial scaffolding) or a closure-refactor of `get_input_added_events` (production-code change for testability). The interesting logic — error-code matching in `error_message_matches_retry_codes` — is already unit-tested; the recursion itself is a standard bisect over that predicate. Low regression risk without dedicated coverage. | +| 5.5.2 | Range splits down to 1 block and still fails → bubbles up cleanly | `[ ]` | Same blocker as §5.5.1. 
Covered by inspection: the termination condition `if start_block >= end_block { return Err(...) }` in `get_input_added_events` is a 3-line bisect guard. | + +--- + +## 6. Batch Submitter + +### 6.1 Nonce management + +| # | Scenario | Status | Notes | +|---|----------|--------|-------| +| 6.1.1 | Nonce derived from `Latest` account nonce each tick — no local state | `[x]` | `batch_submitter_integration.rs` | +| 6.1.2 | Multiple pending batches → submitted at contiguous nonces starting from `Latest` | `[x]` | Same | +| 6.1.3 | After confirmation, next tick's `Latest` reflects the increment | `[?]` | | + +### 6.2 Confirmation depth + +| # | Scenario | Status | Notes | +|---|----------|--------|-------| +| 6.2.1 | `SEQ_BATCH_SUBMITTER_CONFIRMATION_DEPTH=2` means tx watched until `depth+1=3` confirmations | `[?]` | | +| 6.2.2 | Confirmation timeout returns `Ok` (not error); next tick reassesses | `[?]` | | + +### 6.3 Fee handling + +| # | Scenario | Status | Notes | +|---|----------|--------|-------| +| 6.3.1 | Batch submission uses `estimate_eip1559_fees()` result | `[?]` | | +| 6.3.2 | "Replacement underpriced" is not a stall (just retry next tick with current estimate) | `[?]` | Documented in security review as expected behavior | + +### 6.4 Provider outage + +See §11 matrix rows for full outage behavior. + +### 6.5 Chain-id validation at startup (H7 regression) + +| # | Scenario | Status | Notes | +|---|----------|--------|-------| +| 6.5.1 | Sequencer configured with `--chain-id=X`, RPC returns Y → startup returns `RunError::ChainIdMismatch`, no panic, no DB writes | `[x]` Covered at e2e level by `chain_id_mismatch_via_live_rpc_refuses_boot_test` (see §8.2.1). The `tests/e2e/` harness's deployed-InputBox setup is what made this feasible. 
| +| 6.5.2 | L1 unreachable at startup with cache present, cached chain_id matches config → boots | `[x]` | Positive control in `chain_id_match_does_not_produce_mismatch_error` | +| 6.5.3 | L1 unreachable at startup with cache present, cached chain_id differs → returns `RunError::ChainIdMismatch`, no panic | `[x]` | **H7 regression (cache path)**: `chain_id_mismatch_from_cache_returns_typed_error` | + +--- + +## 7. Recovery Procedure (CRITICAL) + +The largest and most sensitive section. The open-batch bug demonstrates that design gaps here have silent-corruption consequences. Every transition in the recovery state machine needs a test. + +### 7.1 Detection paths + +| # | Scenario | Status | Notes | +|---|----------|--------|-------| +| 7.1.1 | Frontier batch (nonce-bearing, closed, accepted) crosses `MAX_WAIT_BLOCKS` by inclusion staleness → cascade-invalidated on next check | `[-]` | Scoped out: the unique submitter-side path (live `check_danger_zone` firing on a closed-in-danger batch) is already covered by §7.3.5. The *other* unique path — `populate_safe_accepted_batches_inner`'s inclusion-stale skip (the `batch_age_is_stale` continue) — has unit coverage and is hard to exercise e2e: Anvil's `anvil_mine(N)` includes any pending tx in the first mined block, so you can't mine empty blocks past a held mempool tx. Also, the submitter's live-exit path is gated by `wait_for_confirmations`'s 24–72 s timeout (hard-coded against ETHEREUM_BLOCK_TIME_SECS, not config-tunable). Would become cheap if that timeout became test-configurable (T3-adjacent). 
| +| 7.1.2 | Open batch (not yet closed) crosses `MAX_WAIT_BLOCKS` by current staleness → cascade-invalidated | `[x]` | `recovery_after_stale_batches_test` (**the bug we caught**) | +| 7.1.3 | Batch in danger zone but not yet stale → flush triggers, but no cascade | `[ ]` | See §11 zone matrix | +| 7.1.4 | Batch pre-danger-zone → no flush, no cascade | `[ ]` | See §11 zone matrix | + +### 7.2 Cascade invalidation + +| # | Scenario | Status | Notes | +|---|----------|--------|-------| +| 7.2.1 | Stale batch N cascades to all batches with `batch_index >= N` | `[x]` | `storage/recovery.rs` unit tests | +| 7.2.2 | Cascade is a single atomic SQL transaction; crash mid-cascade leaves DB unchanged | `[x]` | `detect_and_recover_rolls_back_when_cascade_update_aborts` injects a SQLite trigger abort during the cascade UPDATE and proves the DB rolls back cleanly | +| 7.2.3 | `valid_*` views hide invalidated batches immediately after cascade | `[x]` | Covered by inline tests | +| 7.2.4 | Nonce reuse works automatically via parent-pointer (new Tip's `parent.nonce + 1` equals the invalidated suffix's first nonce) | `[x]` | Covered by `detect_and_recover_does_not_false_match_after_nonce_reuse`, `nonce_reuse_after_cascade_with_valid_ancestor`, `nonce_is_reused_after_torn_cascade` | + +### 7.3 Open-batch-only case (NEW regression zone — V4 + open-batch fix) + +| # | Scenario | Status | Notes | +|---|----------|--------|-------| +| 7.3.1 | Sequencer stops before batch closure, L1 advances past MAX_WAIT_BLOCKS, restart invalidates open batch | `[x]` | `recovery_after_stale_batches_test` (e2e) + `open_batch_stale_by_current_safe_block_is_invalidated` (unit) | +| 7.3.2 | Same scenario with NO direct inputs pending → recovery batch opens, empty frame | `[x]` | Implicit in `open_batch_stale_by_current_safe_block_is_invalidated` (no deposits seeded) | +| 7.3.3 | Closed-and-nonced batch stale + open batch also stale → both in one cascade | `[x]` | 
`closed_unsubmitted_stale_and_open_stale_both_cascade` | +| 7.3.4 | `check_open_batch_staleness` returns `None` when open batch is NOT stale → no false positive cascade | `[x]` | **Critical negative test**: `open_batch_not_yet_stale_is_not_invalidated` + boundary tests (`open_batch_exactly_at_threshold_is_invalidated`, `open_batch_one_block_below_threshold_is_not_invalidated`) | +| 7.3.5 | **Aging Tip while sequencer is UP and L1 is reachable**: Tip ages past `danger_threshold` without crossing `MAX_WAIT_BLOCKS`. Submitter's zombie check (closed-only) must NOT trigger shutdown loop; Tip closes/invalidates by natural policy; no doomed soft confirmations are issued. Closes the gap the schema refactor was designed to prevent. | `[x]` `aging_open_tip_tolerated_by_zombie_check_test` | Decoupled L1/wall-clock advance: `mine_l1_blocks(1150)` jumps L1 into the danger zone while the wall clock stays put so the Tip remains open. `observe_for(8s)` asserts the sequencer keeps running (would catch any regression that unifies the zombie check across open + closed batches). Then `set_faketime_offset("+7500s")` (past `DEFAULT_MAX_BATCH_OPEN` = 7200s) forces the inclusion lane's natural time-based close; submitter's next tick exits with `DangerZone`. Asserts `counts.invalidated == 0` (danger zone, below MAX_WAIT → no cascade). | + +### 7.4 Re-drain direct inputs + +| # | Scenario | Status | Notes | +|---|----------|--------|-------| +| 7.4.1 | Direct input was drained into invalidated batch → re-drained into recovery batch | `[x]` | `recovery_redrains_direct_inputs_and_replay_sees_them_once` | +| 7.4.2 | Direct input that was already safe but NOT yet drained → included in recovery batch's first frame | `[x]` | **e2e:** `recovery_drains_safe_but_undrained_direct_input_test` — stops the sequencer before any user activity, deposits on L1 (bypasses the sequencer's process), advances past MAX_WAIT. 
Respawn's startup recovery syncs safe head, sees the previously-invisible deposit in `safe_inputs`, cascades the aged empty initial Tip, opens a recovery batch whose `leading_range` includes the never-drained deposit. Distinct from §7.4.1 (`recovery_after_stale_batches_test`), which re-drains an already-drained-into-invalidated-batch input. **Unit:** `storage/recovery.rs::tests::tip_staleness::undrained_safe_input_appears_in_recovery_batch_first_frame` — covers the same recovery-drain branch via direct Storage-layer setup (no harness/Anvil). | +| 7.4.3 | No direct inputs pending → recovery batch opens empty | `[x]` | **e2e:** `recovery_batch_opens_empty_when_no_direct_inputs_pending_test` — negative control for §7.4.2: same shape, no L1 deposit. `leading_range = [0, 0)` → recovery batch's first frame is empty → WS(0) sees nothing. Cascade still fires on the aged empty initial Tip. **Unit:** `storage/recovery.rs::tests::tip_staleness::recovery_batch_opens_empty_when_no_direct_inputs_pending`. | +| 7.4.4 | A subscriber seeing events across recovery sees each direct input exactly once | `[x]` | Implicit in 7.4.1 | + +### 7.5 Nonce-0 edge case + +| # | Scenario | Status | Notes | +|---|----------|--------|-------| +| 7.5.1 | First-ever batch (nonce 0) goes stale before any batch reaches Gold → recovery invalidates and opens fresh batch 0 | `[x]` | **e2e:** `nonce_zero_recovery_invalidates_then_accepts_at_nonce_zero_test` — uses T2 (auto-mining off + drop) to ensure the first-ever batch's L1 submission never lands. Cascade fires → recovery batch opens with `parent_batch_index = NULL` and reused `nonce = 0`. Structural invariants (NULL parent → nonce 0, contiguous valid-path nonces) verified by post-test `assert_schema_invariants`. **Unit:** `storage/recovery.rs::tests::tip_staleness::first_batch_stale_recovery_reuses_nonce_zero` — asserts the same `nonce = 0` / `parent_batch_index = NULL` invariants directly at the Storage layer via raw SQL. 
| +| 7.5.2 | After 7.5.1, scheduler accepts the recovery batch at nonce 0 (nonce space reused) | `[x]` | Same e2e test as §7.5.1 — drives 150 transfers into the recovery batch to size-trigger close + submit, then explicitly mines L1 blocks for confirmations. Asserts `safe_accepted_batches` has a row with `MIN(nonce) = 0` — proving `populate_safe_accepted_batches_inner` accepts a reused-nonce batch after cascade. | + +### 7.6 Idempotency & crash-safety + +| # | Scenario | Status | Notes | +|---|----------|--------|-------| +| 7.6.1 | Run `detect_and_recover` twice on the same state → second run is no-op | `[x]` | `detect_and_recover_is_idempotent` | +| 7.6.2 | Crash AFTER cascade INSERT but BEFORE `open_recovery_batch_in_tx` → on restart, a recovery batch is opened (torn state) | `[x]` | `detect_and_recover_opens_batch_after_torn_invalidation` | +| 7.6.3 | Crash AFTER open_recovery_batch → restart finds valid open batch, does nothing | `[x]` | `storage/recovery.rs::tests::tip_staleness::detect_and_recover_after_post_recovery_crash_is_no_op` — drops Storage between calls to model a restart over the persisted DB. Distinct from §7.6.1's back-to-back same-handle idempotence. 
| +| 7.6.4 | The entire recovery procedure (populate + detect + open) runs in a single `Immediate` transaction | `[x]` | Structural, verified by reading | +| 7.6.5 | `populate_safe_accepted_batches` is resumable (cursor-tracked, `INSERT OR IGNORE`) | `[x]` | | +| 7.6.6 | Nonce assignment is structural (not a discrete step); `insert_new_batch` derives nonce from `parent.nonce + 1` at creation time | `[x]` | `trg_enforce_nonce_contiguity` verifies; `schema_rejects_bad_nonce_contiguity` covers the trigger path | + +### 7.7 Mempool flusher + +| # | Scenario | Status | Notes | +|---|----------|--------|-------| +| 7.7.1 | Pending wallet-nonce slot → flusher submits a no-op that consumes the slot | `[x]` | Existing Anvil-backed flusher tests | +| 7.7.2 | No pending slots → flush is instant no-op | `[x]` | | +| 7.7.3 | Flusher no-op competes with a batch tx at the same nonce; one of them lands, slot is consumed | `[x]` | | +| 7.7.4 | Flusher fee bump satisfies Ethereum's ≥10% replacement rule (H5 regression) | `[x]` | Extracted `bumped_replacement_fees()` helper in `recovery/flusher.rs`; covered by `replacement_fee_bump_exceeds_ten_percent_for_max_fee`, `replacement_fee_bump_doubles_priority_fee`, `replacement_fee_floor_is_positive_even_when_base_is_zero`, `replacement_fee_bump_saturates_at_u128_max`. | +| 7.7.5 | Flusher `confirmation_timeout` derives from `seconds_per_block` config (H6 regression) | `[x]` | Extracted `derive_timeouts()` helper; covered by `timeouts_derive_from_seconds_per_block` (tests 1/2/12 s/block) and `confirmation_timeout_is_ten_times_safe_poll_interval` (structural invariant). 
| +| 7.7.6 | Flusher outer loop runs without timeout; inner watch-timeout re-enters the loop | `[x]` | Verified in review | +| 7.7.7 | Flusher survives extended provider outage — retries forever, completes when provider returns | `[x]` | `sequencer/src/recovery/flusher.rs::tests::flush_surfaces_provider_error_under_disconnect_and_completes_on_reconnect` — spawns a `TcpProxy` (from `rollups-harness`, added as sequencer dev-dep) in front of Anvil; seeds pending wallet-nonce state; disconnects proxy and asserts `flush_and_wait` returns `FlushError::Provider` fast (no internal retry); reconnects proxy + starts mining; asserts a fresh flusher call completes and the nonce-0 slot reaches safe. **Implementation note pinned by the test**: `flush_and_wait` does NOT retry internally; "retries forever" in this row is the *orchestrator restart loop* (covered at e2e by §11.1.5 / §11.2.2-followup's `respawn_until_stable`). This test pins the flusher's error surface under disconnect + its completion on reconnect — the two ends of what the orchestrator is looping over. | + +### 7.8 Wall-clock fallback + +| # | Scenario | Status | Notes | +|---|----------|--------|-------| +| 7.8.1 | L1 unreachable, elapsed wall time estimates `missed_blocks > danger_threshold` → recovery triggers | `[x]` | `provider_outage_wall_clock_refuses_boot_test` in `tests/e2e`. Validated end-to-end: proxy disconnected → `anvil_mine(1500)` + `faketime '+5h'` → respawn fails with `StartupDangerZoneEstimate` → proxy reconnect + respawn succeeds + cascade fires. Migrated from the now-removed `rewind_synced_at_ms` helper to faketime. | +| 7.8.2 | `l1_safe_head.synced_at_ms == 0` (never synced) → treat as danger zone, return `StartupDangerZoneEstimate` error | `[x]` `first_boot_l1_unreachable_never_synced_refuses_boot_test` | Normal boot seeds the bootstrap cache; `ManagedSequencer::reset_l1_safe_head_synced_at_ms` then rewrites `synced_at_ms` to 0 on disk while the sequencer is stopped. 
Respawning with the proxy disconnected triggers the wall-clock fallback's `synced_at_ms == 0` branch → `StartupDangerZoneEstimate`. Scope limit: the separate "truly first-ever boot (no bootstrap cache)" path is tested elsewhere; this one pins the wall-clock branch specifically. | +| 7.8.3 | `SystemTime::now()` backward jump → `saturating_sub` handles cleanly, no panic | `[x]` | `wall_clock_backward_jump_no_panic_test` in `tests/e2e`. Uses `faketime '-1h'` with proxy disconnected to force the wall-clock-fallback path with `now < last_sync_ms`. | +| 7.8.4 | `SEQ_SECONDS_PER_BLOCK=0` rejected at config parse (H8 regression) | `[x]` | Clap integration tests at §8.4.2 | +| 7.8.5 | L1 reachable, safe head frozen, startup estimates danger from stale safe-progress timestamp and refuses boot | `[x]` | `stalled_safe_head_startup_refuses_boot_test` — ages an open Tip into the danger window while L1 is healthy, stops the sequencer, advances only `faketime`, and verifies startup sync succeeds but still refuses because the safe head did not advance. Mining one new block makes the next respawn stable again. | +| 7.8.6 | L1 reachable, safe head frozen, running submitter self-exits before provider failure | `[x]` | `stalled_safe_head_live_exit_test` — starts from the same aging-open-Tip shape as §7.3.5, then advances only `faketime` so the live stalled-safe-head estimate trips `DangerZone` while the provider remains reachable. | + +--- + +## 8. Startup / Bootstrap + +| # | Scenario | Status | Notes | +|---|----------|--------|-------| +| 8.1.1 | First boot, L1 reachable → discovers InputBox + genesis + chain_id from L1, writes bootstrap cache | `[?]` | Covered by normal e2e | +| 8.1.2 | First boot, L1 unreachable → returns error (`"L1 unreachable and no bootstrap cache"`) | `[x]` `first_boot_no_cache_l1_unreachable_refuses_boot_test` | Distinct from §7.8.2 (wall-clock fallback): this hits the *earlier* `InputReader::new` discovery step. 
Harness `clear_l1_bootstrap_cache` empties the cache table after a normal boot; respawn through a disconnected proxy hits the no-cache + L1-unreachable code path. Verifies reversibility: reconnect proxy, respawn succeeds. | +| 8.2.1 | Restart, L1 reachable → validates RPC chain_id against config before any DB write (H7 regression) | `[x]` `chain_id_mismatch_via_live_rpc_refuses_boot_test` | **H7 regression (RPC path).** Spawns the full sequencer binary against real Anvil with mismatched `--chain-id` (override on `ManagedSequencer`); asserts respawn fails with `RunError::ChainIdMismatch`. Reset-to-correct-chain-id respawn succeeds — proves the failed attempt didn't poison the bootstrap cache. Complements the cache-path test in `sequencer/tests/chain_id_validation.rs`. | +| 8.2.2 | Restart, L1 unreachable, cache present → uses cache, validates cached chain_id | `[x]` | `restart_and_replay_test` + `chain_id_match_does_not_produce_mismatch_error` | +| 8.3.1 | Chain-id mismatch (config vs RPC) → `RunError::ChainIdMismatch`, no DB contamination | `[x]` Same test as §8.2.1 — `chain_id_mismatch_via_live_rpc_refuses_boot_test` covers both since they're the same code path with different framings. | +| 8.3.2 | Chain-id mismatch (config vs cache) → `RunError::ChainIdMismatch`, no DB contamination | `[x]` | **H7 regression (cache)**: `chain_id_mismatch_from_cache_returns_typed_error` | +| 8.4.1 | `SEQ_PREEMPTIVE_MARGIN_BLOCKS >= MAX_WAIT_BLOCKS` rejected at startup | `[x]` | Validation extracted to `runtime::compute_danger_threshold` and covered by `runtime::tests::margin_equal_to_max_wait_panics`, `margin_greater_than_max_wait_panics`, plus positive-control tests for 0, default (75), and just-below-max-wait. 
| +| 8.4.2 | `SEQ_SECONDS_PER_BLOCK=0` rejected by clap parser | `[x]` | **H8 regression**: `run_config_rejects_seconds_per_block_zero` + `run_config_accepts_seconds_per_block_one` + `run_config_default_seconds_per_block_is_12` in `runtime/config.rs` | +| 8.5.1 | Private-key parse failure does not echo key bytes in error (H3 regression) | `[x]` | **H3 regression**: `create_signer_provider_does_not_echo_key_bytes_on_invalid_hex` + `_on_odd_length` in `l1/provider.rs::tests` | +| 8.5.2 | `http://` URL for non-loopback host rejected (H4 regression) | `[x]` | **H4 regression**: `create_client_rejects_http_for_remote_host` | +| 8.5.3 | `http://127.0.0.1:8545` accepted (loopback exception) | `[x]` | `create_client_accepts_http_for_127_0_0_1` + `_for_localhost` + `_for_ipv6_loopback` (caught a bug in the H4 fix: bracket-wrapped IPv6 literal) | + +--- + +## 9. Shutdown + +| # | Scenario | Status | Notes | +|---|----------|--------|-------| +| 9.1.1 | `runtime.stop()` drains pending user ops with explicit `Err(Unavailable)`; no silent drops | `[ ]` | Not currently covered. `restart_after_committed_tx_replays_cleanly_test` exercises replay consistency after restart, not shutdown with a tx still pending. | +| 9.1.2 | Post-shutdown POST → 503 immediately (before consuming channel slot) | `[?]` | | +| 9.1.3 | Shutdown during batch submission: in-flight tx either completes or is abandoned cleanly | `[ ]` | Needs proxy or controlled timing | +| 9.1.4 | Shutdown during L1 input reader poll: reader exits cleanly, no corrupt safe-head state | `[ ]` | | + +--- + +## 10. Application Trait Contract + +Derived from the `Application Trait Contract` section in [`AGENTS.md`](../AGENTS.md). 
+ +| # | Scenario | Status | Notes | +|---|----------|--------|-------| +| 10.1.1 | An input that executed successfully live MUST succeed on replay (catch-up) | `[x]` `replay_matches_live_for_mixed_workload_test` | Diverse multi-sender workload (Alice/Bob/Charlie, two interleaved deposits, transfers in both directions, two withdrawals). Post-restart WS catch-up assembles a fresh replay; test asserts per-user balance + nonce + executed-input-count equality against the live replay. Any Application non-determinism or catch-up bug diverges the two replays immediately. Complements `restart_and_replay_test` (narrower single-sender workload, implicit equality). | +| 10.1.2 | `AppError::Internal` during catch-up → lane crashes, sequencer fails to start | `[x]` | `catch_up.rs` error handling | +| 10.1.3 | `ExecutionOutcome::Invalid` during catch-up → skipped cleanly | `[x]` | | +| 10.2.1 | `validate_user_op` is pure: no mutations, no time dependence, no randomness | `[-]` | Enforced by code review; can't test directly | +| 10.2.2 | No state mutation from `current_user_nonce` or `current_user_balance` | `[-]` | Same | + +--- + +## 11. Outage × Zone Matrix + +The two primary failure dimensions: **who is offline** (sequencer or its RPC) and **how stale did L1 get during the outage** (pre-danger, danger, past-stale). Each cell needs a deterministic test. Use `--no-mining` + explicit `anvil_mine(N)` to hit zone boundaries exactly. + +The danger threshold is `MAX_WAIT_BLOCKS - preemptive_margin`. With `MAX_WAIT_BLOCKS = 1200` and `preemptive_margin = 75` (default), boundaries are: +- **Pre-danger:** advance < 1125 blocks +- **Danger zone:** 1125 ≤ advance < 1200 +- **Past-stale:** advance ≥ 1200 + +For deterministic tests, pick margins well inside each zone (e.g., 500 / 1150 / 1250). + +### 11.1 Sequencer outage (anvil stays up, sequencer killed) + +| # | Zone | Expected behavior | Status | +|---|------|-------------------|--------| +| 11.1.1 | Pre-danger (500) | No recovery. 
Sequencer resumes; pending batches submit normally. | `[x]` `sequencer_outage_pre_danger_no_recovery_test` | +| 11.1.2 | Danger zone (1150), decoupled wall clock | Narrow: only L1 advances; wall clock stays put. No closed batch past frontier is stale → no flush, no cascade, sequencer resumes. | `[x]` `sequencer_outage_danger_zone_no_cascade_test`. Uses `mine_l1_blocks` directly (no wall-clock advance) because coupled advance triggers the aged-Tip-auto-close → flush-cycle path covered by §11.1.5 below. | +| 11.1.3 | Past-stale, open batch (1250) | Open batch invalidated via staleness check. Recovery batch opened. Resume. | `[x]` `recovery_after_stale_batches_test`. Uses `advance_wall_and_mine` — coupled wall-clock+L1 advance models real outage semantics. | +| 11.1.4 | Past-stale, closed+submitted batch (1250) | Closed batch invalidated. Recovery batch opened. Resume. | `[x]` `delayed_inclusion_cascades_on_restart_test` | Uses T2. Setup: deposit + 150 transfers force a size-triggered batch close while auto-mining is disabled, so the submitter's L1 tx lands in a held mempool. Stop sequencer → `drop_all_pending_txs` → `advance_wall_and_mine(1250 * 12s)` (genuinely empty blocks since mempool is empty) → re-enable auto-mining → respawn. Startup recovery detects the closed batch is past `MAX_WAIT_BLOCKS` and cascades; flush runs against the (now live) auto-miner. WS replay asserts the transfers are rolled back. | +| 11.1.5 | Danger zone (1150), **coupled wall+L1 advance** | Realistic: outage advances both L1 and wall clock. On respawn the aged Tip auto-closes, the resulting closed batch is in danger, the detector/submitter cycle drives a restart loop, and the system converges to a healthy state. Two end states are valid: either a later respawn cascades once the closed batch ages past `MAX_WAIT_BLOCKS`, or the submitter gets one last batch onto L1 before shutdown and the branch remains canonical. 
| `[x]` `sequencer_outage_danger_zone_coupled_restart_cycle_recovers_test` — drives the full orchestrator loop via `respawn_until_stable` (T8). Asserts the loop requires at least two attempts (not a cheap no-op) and accepts either the rollback branch (cascade-invalidation fired, transfer removed) or the canonical-landing branch (transfer remained valid because the batch landed before shutdown). | + +### 11.2 Provider outage (proxy disconnects, sequencer stays up, anvil advances behind the proxy) + +| # | Zone | Expected behavior | Status | +|---|------|-------------------|--------| +| 11.2.1 | Pre-danger (500), sequencer stays UP, load applied | Sequencer retries. Wall-clock estimate < threshold. Inclusion lane continues accepting user ops **and closes batches by size**. Reconnect → sync, resume. | `[x]` `provider_outage_pre_danger_sequencer_continues_test` — submits ~150 transfers during the outage, asserts `count_batches().sealed` strictly increased. | +| 11.2.2 | Danger zone (3h55min), sequencer UP, self-exits | Running sequencer's wall-clock fallback detects danger mid-run → exits with `DangerZone`. Startup wall-clock fallback refuses subsequent boot while proxy still disconnected. No invalidation (not past-stale). | `[x]` `provider_outage_danger_zone_sequencer_self_exits_test` — uses dynamic faketime (file-based) to shift the running sequencer's clock into the danger zone without a respawn. Stops at the "refuse to reboot" assertion. | +| 11.2.2-follow-up | Danger zone → mid-run exit → reconnect → restart cycle | Completes §11.2.2: proxy reconnects, `respawn_until_stable` drives the orchestrator loop (advancing L1 each retry) until the aged closed batch crosses `MAX_WAIT_BLOCKS` and cascade fires. Asserts Stable convergence + cascade-invalidation. | `[x]` `provider_outage_danger_zone_mid_run_exit_then_restart_cycle_recovers_test` — uses T8 (`respawn_until_stable`). | +| 11.2.3 | Past-stale (1250) | Wall-clock estimate past stale. 
Recovery + flush block on proxy. Reconnect → flush + cascade. | `[x]` `provider_outage_past_stale_cascades_test` — stops sequencer, disconnects proxy, advances L1, verifies restart refuses while proxy is disconnected (wall-clock fallback past stale → `StartupDangerZoneEstimate`), then reconnects and verifies cascade | + +### 11.3 Combined: outage both sides at once + +| # | Scenario | Status | Notes | +|---|----------|--------|-------| +| 11.3.1 | Sequencer stopped, proxy disconnected, anvil mines 1250 blocks, BOTH reconnect → recovery triggers correctly | `[x]` | Effectively covered by §11.2.3 — the "sequencer stopped + proxy disconnected" path is tested end-to-end there | +| 11.3.2 | Both stopped, advance to danger zone, then turn on sequencer ONLY (proxy still disconnected) | `[x]` `both_down_danger_zone_sequencer_first_refuses_boot_test` | Realistic datacenter-outage-recovery scenario: sequencer boots while L1 is still unreachable, wall-clock fallback sees past-danger → `StartupDangerZoneEstimate`. Stops at the refuse-boot assertion (no cascade yet — we're below MAX_WAIT). Complement to §11.2.3 in the danger-zone window instead of past-stale. | +| 11.3.3 | Both stopped, advance to danger zone, proxy returns FIRST (sequencer still down), then sequencer → normal sync, startup sees aged batches and handles them | `[x]` `both_down_danger_zone_proxy_first_restart_cycle_recovers_test` | Tests the "L1 recovered before us" reconnect ordering. Uses T8: first respawn exits with `DangerZone` after the aged Tip closes, `respawn_until_stable` advances L1 by 100 blocks per retry until cascade fires on a subsequent respawn. | + +### 11.4 Short-duration provider hiccups (heal-within-pre-danger) + +| # | Scenario | Status | Notes | +|---|----------|--------|-------| +| 11.4.1 | Sequencer running, proxy disconnects for a few seconds (pre-danger), reconnects. Sequencer retries, resumes without any recovery action. 
| `[x]` `provider_outage_short_hiccup_no_recovery_test` | Most-common production fault — RPC flaked briefly, retry succeeded. Disconnect lasts ≥1 submitter poll interval (6s) with zero L1/wall-clock advance, then reconnects; asserts POST /tx keeps working and no batch gets invalidated. Complement to §11.2.1 (load-under-outage); this covers the "pure retry loop" path with no wall-clock pressure. | + +--- + +## 12. Storage Layer + +| # | Scenario | Status | Notes | +|---|----------|--------|-------| +| 12.1.1 | Schema CHECK constraints enforced: `safe_inputs.sender` length 20, `frames.fee >= 0`, XOR on `sequenced_l2_txs`, etc. | `[x]` | `storage/recovery.rs::tests::schema_invariants::schema_rejects_*` — six new tests exercise CHECK-level refusals: `safe_input_with_wrong_sender_length`, `user_op_with_wrong_sender_length`, `user_op_with_wrong_signature_length`, `sequenced_l2_tx_with_neither_xor_branch`, `l1_bootstrap_cache_with_zero_chain_id`, `safe_input_with_negative_block_number`. Each asserts `CHECK constraint failed` specifically (not a trigger/FK/NOT NULL error). 
| +| 12.1.2 | FK cascade: deleting a `batches` row (should be impossible via PK) doesn't orphan children | `[-]` | Structural; writes are append-only | +| 12.2.1 | `valid_batches` correctly filters by `invalidated_at_ms IS NULL` | `[x]` | Implicit in recovery tests | +| 12.2.2 | `valid_closed_batches` correctly filters (sealed + valid) | `[x]` | Submitter pending-batch load covers it | +| 12.2.3 | `valid_sequenced_l2_txs` correctly filters | `[x]` | | +| 12.2.4 | `valid_open_batch` has at most one row (partial unique index `ux_single_valid_tip`) | `[x]` | `schema_rejects_second_valid_tip` | +| 12.2.5 | Schema triggers reject: bad nonce, re-seal, re-invalidate, writes to non-Tip, parent mutation | `[x]` | `schema_rejects_*` test group | +| 12.3.1 | Multi-statement writers wrap in `Immediate` transaction; partial failure leaves DB unchanged | `[?]` | | +| 12.3.2 | `trg_sequence_user_op` does not fire if outer user_ops INSERT rolls back | `[?]` | | +| 12.4.1 | Rowid pagination correctly skips invalidated rows via `valid_sequenced_l2_txs` view | `[x]` | Implicit in WS catch-up after recovery | + +### 12.5 Parent-pointer tree invariants (NEW) + +| # | Scenario | Status | Notes | +|---|----------|--------|-------| +| 12.5.1 | **Tree integrity property test**: for a mixed workload (opens, closes, partial/torn cascades), every valid batch satisfies `nonce = parent.nonce + 1`, `parent_batch_index` is NULL (genesis) or references an existing batch, and parent-walk terminates within `batch_index` hops. | `[x]` | `tree_invariants_hold_across_mixed_workload` in `storage/recovery.rs` tests. | +| 12.5.2 | **Subtree equivalence**: among *valid* batches, `{batch_index >= N}` equals the subtree rooted at N via recursive `parent_batch_index` walk. Documents the equivalence the cascade query relies on. | `[x]` | `subtree_by_batch_index_equals_subtree_by_parent_walk`. If this ever diverges, cascade must switch to recursive CTE. 
| +| 12.5.3 | **Post-e2e schema invariants**: after each passing e2e test, harness-side DB inspection asserts at most one `valid_open_batch` row, `nonce = parent.nonce + 1` across all batches, contiguous valid-path nonces, and no FK orphans. | `[x]` | `ManagedSequencer::assert_schema_invariants` wired into `tests/e2e/src/main.rs` as a post-scenario step. Harness-only; no sequencer changes. | + +--- + +## 13. Fee Model + +| # | Scenario | Status | Notes | +|---|----------|--------|-------| +| 13.1.1 | `fee_to_linear(0) = 1`, `fee_to_linear(MAX_EXPONENT)` does not panic | `[x]` | `sequencer-core/src/fee.rs` unit tests | +| 13.1.2 | `fee_to_linear(MAX_EXPONENT + 1)` panics loudly (assert_eq message) | `[x]` | | +| 13.1.3 | `fee_from_linear(U256::MAX)` saturates to `MAX_EXPONENT` | `[x]` | | +| 13.1.4 | Round-trip `fee_from_linear(fee_to_linear(n))` within 1% | `[x]` | | +| 13.1.5 | `log_fee_ratio` handles `num < denom` via negation | `[x]` | | +| 13.2.1 | `batch_policy_derived.recommended_fee` clamps at `MAX_EXPONENT` at Rust read boundary | `[x]` | `query_batch_policy` test | +| 13.2.2 | High `log_gas_price` via `set_log_gas_price` → clamped, doesn't panic | `[x]` | `high_gas_price_clamps_recommended_fee_to_max_exponent` | +| 13.3.1 | `set_alpha` CHECK constraint rejects configs where `log_batch_size_target >= log_max_batch_bytes` | `[x]` | | +| 13.3.2 | `set_alpha(0, _)` or `set_alpha(_, 0)` panics with clear message | `[?]` | | + +--- + +## 14. Out-of-scope under current tooling + +Documented here so we are deliberate about what we *aren't* testing at the e2e level. These remain covered at the code-review + formal-verification level per the [threat model](../docs/threat-model/README.md) and [recovery spec](../docs/recovery/README.md). 
+ +| Threat | Why not e2e | Covered by | +|--------|-------------|-----------| +| Adversarial mempool: a previously-submitted tx lands long after we gave up | Anvil auto-mines everything in the mempool when `anvil_mine` is called; we cannot "hold" a specific tx indefinitely | TLA+ spec (157M states) + Part 6 code review | +| Replacement-by-nonce races | Same — we cannot model two builders racing | TLA+ + code review | +| Byzantine L1 / RPC (lying about events or `safe`) | Out of scope per threat model | Threat model + code review | +| Reorgs beyond safe depth | Anvil doesn't do reorgs | Threat model excludes | +| Timing side channels in WS feed | Timing attacks out of scope | Threat model excludes | +| DoS / resource exhaustion | Explicitly out of scope | Threat model excludes | + +To cover the adversarial-mempool gap at e2e level we would need a **mock L1** with programmable inclusion logic (a custom JSON-RPC server that accepts txs but selectively mines them). Significant investment; not planned. + +--- + +## Tooling dependencies + +Coverage of the above requires the following test-harness additions. Each unlocks a row of the matrix: + +| # | Tool | Unlocks | Status | +|---|------|---------|--------| +| T1 | TCP proxy with `disconnect()` / `reconnect()` | §11.2, §11.3, §7.7.7, §5.4 | `[x]` Built — `tests/harness/src/proxy.rs`; 6 unit tests; `ManagedSequencer::set_l1_endpoint_override` routes sequencer through it | +| T2 | Runtime toggle of Anvil's auto-mining + mempool drop | §11.1.4 (done); §7.1.1, §7.1.3, §7.1.4 (pending — live-runtime variants) | `[x]` `ManagedSequencer::set_automine(bool)` (via `anvil_setAutomine`) holds or releases the mempool without respawning Anvil; `drop_all_pending_txs` (via `anvil_dropAllTransactions`) simulates gateway packet loss. Chosen over `--no-mining` spawn flag because it's runtime-toggleable — existing tests stay on auto-mining, only delayed-inclusion tests flip it. 
| +| T3 | Shorter poll intervals for tests (sub-second `SEQ_BATCH_SUBMITTER_IDLE_POLL_INTERVAL_MS`) | Reduces raciness in §11, §7.7, §6 | `[ ]` Not built | +| T4 | `wait_for_recovery_complete` helper (poll a health / debug endpoint) | Replaces sleep-based waits throughout §11, §7 | `[ ]` Not built | +| T5 | Injectable failpoints (SQLite error, sub-transaction crash) | §2.10.1 (H1) | `[?]` Partial — inline SQLite-trigger tests now cover recovery crash-atomicity; a broader framework is still only needed for the API error-body leak case | +| T6 | Smaller `MAX_WAIT_BLOCKS` for test builds (optional optimization) | Shortens mine-1200-blocks tests | `[-]` Probably not needed — 1200 empty blocks mines in <1s | +| T7 | libfaketime via `FAKETIME_TIMESTAMP_FILE` (dynamic) for the sequencer subprocess | §7.8.1 (done), §7.8.3 (clock skew, done), §11.2.2 (done, live danger-zone detection), §7.3.5 (aging-Tip, pending), §7.8.2 (first-boot-L1-down, pending) | `[x]` `ManagedSequencer::set_faketime_offset(Option)` writes to the rc file; `ManagedSequencer::advance_wall_and_mine(Duration)` is the coupled (cumulative) helper. Harness sets `FAKETIME_TIMESTAMP_FILE` + `FAKETIME_NO_CACHE=1` + `DYLD_INSERT_LIBRARIES`/`LD_PRELOAD` on the child. Dynamic: the running sequencer re-reads the file on every time call, so tests can shift time mid-run without a respawn. Added to `flake.nix` + CI (`apt install faketime` on Ubuntu). | +| T8 | Orchestrator-restart primitive (`respawn_until_stable`) | §11.1.5 (done), §11.2.2-follow-up (done), §11.3.3 (done) | `[x]` `ManagedSequencer::respawn_and_watch(Duration) -> RespawnAttemptOutcome` classifies a single attempt into `Stable` / `RespawnFailed(String)` / `ExitedPostRespawn(ExitStatus)`. 
`respawn_until_stable(RespawnPolicy)` wraps it in a retry loop with optional `advance_per_retry` — required for the danger-zone-to-cascade convergence path (aged closed batch only cascades once it ages past `MAX_WAIT_BLOCKS`, so each retry needs to advance L1 + wall clock). Returns the full attempt sequence so tests can assert *both* convergence and that the loop actually exercised the flush/shutdown path (not a cheap first-attempt success). | + +--- + +## How to use this document + +1. **Adding a test:** find the relevant row, flip `[ ]` to `[x]` when the test is written and passing. +2. **Adding a scenario:** add a new row under the relevant section. Include the status marker and one-line rationale. +3. **Before merging a bug fix:** find the scenario that should have caught it. If there isn't one, add it. +4. **Before a security review:** scan for `[!]` and `[?]` rows — these are the areas where confidence is weakest. +5. **For changes to tooling (T1-T8):** update the dependency table; flip status markers on unlocked rows. 
+ +## Relationship to other docs + +- [`AGENTS.md`](../AGENTS.md) — architecture, invariants, coding conventions +- [`docs/threat-model/README.md`](../docs/threat-model/README.md) — what's in and out of scope +- [`docs/recovery/README.md`](../docs/recovery/README.md) — recovery design + TLA+ spec +- [`SECURITY_TODO.md`](../SECURITY_TODO.md) — open security findings +- This doc — what should be tested to gain confidence those invariants hold in practice diff --git a/tests/benchmarks/src/bin/report.rs b/tests/benchmarks/src/bin/report.rs index a380edd..3cfb8de 100644 --- a/tests/benchmarks/src/bin/report.rs +++ b/tests/benchmarks/src/bin/report.rs @@ -282,7 +282,7 @@ fn load_latest_multi_row_sweep(dir: &Path) -> Option Eip712Domain { - Eip712Domain { - name: Some(DOMAIN_NAME.to_string().into()), - version: Some(DOMAIN_VERSION.to_string().into()), - chain_id: Some(U256::from(self.chain_id)), - verifying_contract: Some(self.verifying_contract), - salt: None, - } + sequencer_core::build_input_domain(self.chain_id, self.verifying_contract) } } diff --git a/tests/benchmarks/src/lib.rs b/tests/benchmarks/src/lib.rs index 788322a..01d8423 100644 --- a/tests/benchmarks/src/lib.rs +++ b/tests/benchmarks/src/lib.rs @@ -16,8 +16,7 @@ mod workload; pub use ack::{AckRunConfig, AckRunReport, run_ack_benchmark}; pub use domain::{ - BenchmarkDomain, DEFAULT_ENDPOINT, DOMAIN_NAME, DOMAIN_VERSION, parse_address, - resolve_external_benchmark_domain, + BenchmarkDomain, DEFAULT_ENDPOINT, parse_address, resolve_external_benchmark_domain, }; pub use evaluation::{ ACK_P99_TARGET_MS, DIAGNOSTIC_P999_MIN_ACCEPTED_COUNT, NetworkProfile, NetworkProfileKind, @@ -34,6 +33,7 @@ pub use rt_sweep::{ RtSweepMeasurements, RtSweepRow, RtSweepRunReport, RtSweepSummary, compute_rt_sweep_summary, print_rt_sweep_report, write_csv as write_rt_sweep_csv, }; +pub use sequencer_core::{DOMAIN_NAME, DOMAIN_VERSION}; pub use stats::{ Stats, StatsMs, format_optional_f64, print_stats, rejection_rate, summarize, 
throughput_tx_per_s, diff --git a/tests/e2e/src/main.rs b/tests/e2e/src/main.rs index 69483fc..56b0c09 100644 --- a/tests/e2e/src/main.rs +++ b/tests/e2e/src/main.rs @@ -19,8 +19,20 @@ fn main() { ManagedSequencer::spawn(default_devnet_sequencer_config(log_prefix)) .await?; let scenario_result = scenario(&mut runtime).await; + // Post-test schema invariants (TEST_PLAN §12.5.3): + // assert the DB's structural invariants only if the + // scenario succeeded — otherwise we'd mask the original + // failure with downstream weirdness. Checks the partial + // unique index, nonce contiguity, and FK validity + // directly against the DB file. + let invariant_result = if scenario_result.is_ok() { + runtime.assert_schema_invariants() + } else { + Ok(()) + }; let shutdown_result = runtime.shutdown().await; shutdown_result?; + invariant_result?; scenario_result }) }) diff --git a/tests/e2e/src/test_cases.rs b/tests/e2e/src/test_cases.rs index 827ba77..6061527 100644 --- a/tests/e2e/src/test_cases.rs +++ b/tests/e2e/src/test_cases.rs @@ -6,7 +6,8 @@ use std::time::Duration; use crate::{ScenarioFn, ScenarioResult}; use alloy_primitives::{Address, U256}; use rollups_harness::{ - ManagedSequencer, ReplayWalletApp, TestSigner, WalletL1Client, WsClient, sign_user_op_hex, + ManagedSequencer, ReplayWalletApp, RespawnAttemptOutcome, RespawnPolicy, TcpProxy, TestSigner, + WalletL1Client, WsClient, sign_user_op_hex, }; use sequencer_core::api::{TxRequest, WsTxMessage}; use sequencer_core::fee::fee_to_linear; @@ -21,6 +22,95 @@ const DEFAULT_FRAME_FEE: u16 = 1060; /// Max fee used for raw TxRequest construction. Must be >= DEFAULT_FRAME_FEE. const DEFAULT_MAX_FEE: u16 = 1200; +// ── Zone-math constants for §11 outage matrix + §7 recovery tests ───────── +// +// These derive from the sequencer's default config so a change to +// `MAX_WAIT_BLOCKS`, `SEQ_PREEMPTIVE_MARGIN_BLOCKS`, or `SEQ_SECONDS_PER_BLOCK` +// flows through here automatically. 
The compile-time asserts below catch any +// drift that would invalidate the zone framing of the tests (e.g., a per-retry +// advance that no longer crosses MAX_WAIT in the orchestrator loop). +// +// The picks (PRE / DANGER / PAST_STALE) are deliberately well inside their +// zones to give tests slack against scheduling jitter and timing drift. + +/// Source of truth: shared between sequencer + scheduler via +/// `sequencer_core::MAX_WAIT_BLOCKS`. +const MAX_WAIT_BLOCKS: u64 = sequencer_core::MAX_WAIT_BLOCKS; + +/// Default `SEQ_PREEMPTIVE_MARGIN_BLOCKS` from `runtime/config.rs`. If the +/// default changes, update here so `DANGER_THRESHOLD_BLOCKS` stays aligned. +const DEFAULT_PREEMPTIVE_MARGIN_BLOCKS: u64 = 75; + +/// Default `SEQ_SECONDS_PER_BLOCK` from `runtime/config.rs`. The harness +/// `advance_wall_and_mine` also assumes this value internally. +const DEFAULT_SECONDS_PER_BLOCK: u64 = 12; + +/// Derived: the preemptive-recovery danger threshold. Below this we're safe; +/// above it (but below `MAX_WAIT_BLOCKS`) is the danger zone where the +/// sequencer triggers flush + shutdown but no cascade. +const DANGER_THRESHOLD_BLOCKS: u64 = MAX_WAIT_BLOCKS - DEFAULT_PREEMPTIVE_MARGIN_BLOCKS; + +/// Pre-danger pick — well below `DANGER_THRESHOLD_BLOCKS` so background drift +/// can't accidentally tip a test into the danger zone. +const PRE_DANGER_BLOCKS: u64 = 500; + +/// Danger-zone pick — comfortably past `DANGER_THRESHOLD_BLOCKS`, comfortably +/// below `MAX_WAIT_BLOCKS`. Used by tests that want "danger detected, no +/// cascade" framing. +const DANGER_ZONE_BLOCKS: u64 = 1150; + +/// Past-stale pick — comfortably past `MAX_WAIT_BLOCKS`. Startup recovery +/// must cascade at this point. +const PAST_STALE_BLOCKS: u64 = 1250; + +/// Per-retry L1 + wall-clock advance for `respawn_until_stable` loops that +/// start in the danger zone. 
The closed in-danger batch only cascades once +/// it ages past `MAX_WAIT_BLOCKS`, so each retry has to push the system +/// across that boundary within `RespawnPolicy::max_attempts`. The +/// compile-time check below pins the load-bearing relationship. +const RESPAWN_RETRY_ADVANCE_BLOCKS: u64 = 100; + +/// Convert a block count to wall-clock duration assuming the default block time. +const fn blocks_as_duration(blocks: u64) -> Duration { + Duration::from_secs(blocks * DEFAULT_SECONDS_PER_BLOCK) +} + +// Compile-time guards: drift in the constants above that breaks the test +// framing fails the build instead of failing tests at runtime. +const _: () = { + assert!( + DANGER_THRESHOLD_BLOCKS < MAX_WAIT_BLOCKS, + "danger threshold must precede the staleness boundary", + ); + assert!( + PRE_DANGER_BLOCKS < DANGER_THRESHOLD_BLOCKS, + "PRE_DANGER_BLOCKS must stay below DANGER_THRESHOLD_BLOCKS", + ); + assert!( + DANGER_ZONE_BLOCKS > DANGER_THRESHOLD_BLOCKS, + "DANGER_ZONE_BLOCKS must clear DANGER_THRESHOLD_BLOCKS", + ); + assert!( + DANGER_ZONE_BLOCKS < MAX_WAIT_BLOCKS, + "DANGER_ZONE_BLOCKS must stay below MAX_WAIT_BLOCKS (no premature cascade)", + ); + assert!( + PAST_STALE_BLOCKS > MAX_WAIT_BLOCKS, + "PAST_STALE_BLOCKS must exceed MAX_WAIT_BLOCKS (cascade must fire)", + ); + // Load-bearing for §11.1.5 / §11.3.3 / §7.5.x: starting from a closed + // in-danger batch, one retry advance must push it past MAX_WAIT_BLOCKS + // so cascade fires before max_attempts is exhausted. If + // `RESPAWN_RETRY_ADVANCE_BLOCKS` shrinks or `MAX_WAIT_BLOCKS` grows + // such that this no longer holds, tests would silently start failing + // by exhausting their retries — the compile-time check makes the + // breakage visible immediately. 
+ assert!( + DANGER_ZONE_BLOCKS + RESPAWN_RETRY_ADVANCE_BLOCKS > MAX_WAIT_BLOCKS, + "RESPAWN_RETRY_ADVANCE_BLOCKS must cross MAX_WAIT from DANGER_ZONE in one retry", + ); +}; + struct ExpectedWalletState { address: Address, balance: U256, @@ -59,9 +149,160 @@ pub fn test_cases() -> Vec<(&'static str, ScenarioFn)> { ("multi_deposit_same_block_test", |runtime| { Box::pin(run_multi_deposit_same_block_test(runtime)) }), - ("shutdown_during_inflight_test", |runtime| { - Box::pin(run_shutdown_during_inflight_test(runtime)) + ( + "restart_after_committed_tx_replays_cleanly_test", + |runtime| Box::pin(run_restart_after_committed_tx_replays_cleanly_test(runtime)), + ), + ("recovery_after_stale_batches_test", |runtime| { + Box::pin(run_recovery_after_stale_batches_test(runtime)) }), + ("sequencer_outage_pre_danger_no_recovery_test", |runtime| { + Box::pin(run_sequencer_outage_pre_danger_no_recovery_test(runtime)) + }), + ("sequencer_outage_danger_zone_no_cascade_test", |runtime| { + Box::pin(run_sequencer_outage_danger_zone_no_cascade_test(runtime)) + }), + ("provider_outage_past_stale_cascades_test", |runtime| { + Box::pin(run_provider_outage_past_stale_cascades_test(runtime)) + }), + ("provider_outage_wall_clock_refuses_boot_test", |runtime| { + Box::pin(run_provider_outage_wall_clock_refuses_boot_test(runtime)) + }), + ("wall_clock_backward_jump_no_panic_test", |runtime| { + Box::pin(run_wall_clock_backward_jump_no_panic_test(runtime)) + }), + ("stalled_safe_head_startup_refuses_boot_test", |runtime| { + Box::pin(run_stalled_safe_head_startup_refuses_boot_test(runtime)) + }), + ( + "provider_outage_pre_danger_sequencer_continues_test", + |runtime| { + Box::pin(run_provider_outage_pre_danger_sequencer_continues_test( + runtime, + )) + }, + ), + ( + "provider_outage_danger_zone_sequencer_self_exits_test", + |runtime| { + Box::pin(run_provider_outage_danger_zone_sequencer_self_exits_test( + runtime, + )) + }, + ), + ("provider_outage_short_hiccup_no_recovery_test", |runtime| 
{ + Box::pin(run_provider_outage_short_hiccup_no_recovery_test(runtime)) + }), + ( + "both_down_danger_zone_sequencer_first_refuses_boot_test", + |runtime| { + Box::pin(run_both_down_danger_zone_sequencer_first_refuses_boot_test( + runtime, + )) + }, + ), + ( + "both_down_danger_zone_proxy_first_restart_cycle_recovers_test", + |runtime| { + Box::pin(run_both_down_danger_zone_proxy_first_restart_cycle_recovers_test(runtime)) + }, + ), + ( + "sequencer_outage_danger_zone_coupled_restart_cycle_recovers_test", + |runtime| { + Box::pin( + run_sequencer_outage_danger_zone_coupled_restart_cycle_recovers_test(runtime), + ) + }, + ), + ( + "provider_outage_danger_zone_mid_run_exit_then_restart_cycle_recovers_test", + |runtime| { + Box::pin( + run_provider_outage_danger_zone_mid_run_exit_then_restart_cycle_recovers_test( + runtime, + ), + ) + }, + ), + ( + "first_boot_l1_unreachable_never_synced_refuses_boot_test", + |runtime| { + Box::pin(run_first_boot_l1_unreachable_never_synced_refuses_boot_test(runtime)) + }, + ), + ("delayed_inclusion_cascades_on_restart_test", |runtime| { + Box::pin(run_delayed_inclusion_cascades_on_restart_test(runtime)) + }), + ("aging_open_tip_tolerated_by_zombie_check_test", |runtime| { + Box::pin(run_aging_open_tip_tolerated_by_zombie_check_test(runtime)) + }), + ("stalled_safe_head_live_exit_test", |runtime| { + Box::pin(run_stalled_safe_head_live_exit_test(runtime)) + }), + ( + "ws_reconnect_at_invalidated_offset_skips_cleanly_test", + |runtime| { + Box::pin(run_ws_reconnect_at_invalidated_offset_skips_cleanly_test( + runtime, + )) + }, + ), + ( + "ws_subscribe_from_future_offset_waits_silently_test", + |runtime| { + Box::pin(run_ws_subscribe_from_future_offset_waits_silently_test( + runtime, + )) + }, + ), + ( + "recovery_drains_safe_but_undrained_direct_input_test", + |runtime| { + Box::pin(run_recovery_drains_safe_but_undrained_direct_input_test( + runtime, + )) + }, + ), + ( + "recovery_batch_opens_empty_when_no_direct_inputs_pending_test", 
+ |runtime| { + Box::pin(run_recovery_batch_opens_empty_when_no_direct_inputs_pending_test(runtime)) + }, + ), + ("replay_matches_live_for_mixed_workload_test", |runtime| { + Box::pin(run_replay_matches_live_for_mixed_workload_test(runtime)) + }), + ( + "provider_outage_input_reader_retries_after_reconnect_test", + |runtime| { + Box::pin(run_provider_outage_input_reader_retries_after_reconnect_test(runtime)) + }, + ), + ( + "first_boot_no_cache_l1_unreachable_refuses_boot_test", + |runtime| { + Box::pin(run_first_boot_no_cache_l1_unreachable_refuses_boot_test( + runtime, + )) + }, + ), + ( + "chain_id_mismatch_via_live_rpc_refuses_boot_test", + |runtime| { + Box::pin(run_chain_id_mismatch_via_live_rpc_refuses_boot_test( + runtime, + )) + }, + ), + ( + "nonce_zero_recovery_invalidates_then_accepts_at_nonce_zero_test", + |runtime| { + Box::pin( + run_nonce_zero_recovery_invalidates_then_accepts_at_nonce_zero_test(runtime), + ) + }, + ), ] } @@ -241,7 +482,9 @@ async fn run_reconnect_from_offset_test(runtime: &mut ManagedSequencer) -> Scena let deposit_message = apply_safe_supported_deposit(runtime, &mut ws, &mut replay, &alice_l1, deposit_amount) .await?; - let reconnect_offset = deposit_message.offset().saturating_add(1); + // WS replay is cursor-based and exclusive: `from_offset` means + // "start after this already-consumed DB offset". + let reconnect_offset = deposit_message.offset(); drop(ws); alice_l2.transfer(bob_address, transfer_amount).await?; @@ -583,7 +826,15 @@ async fn run_multi_deposit_same_block_test(runtime: &mut ManagedSequencer) -> Sc Ok(()) } -async fn run_shutdown_during_inflight_test(runtime: &mut ManagedSequencer) -> ScenarioResult<()> { +// Restart after a committed tx and verify replay stays consistent. +// +// This is intentionally not an "in-flight request during shutdown" test: +// `WalletL2Client::transfer()` awaits the HTTP ack, so by the time restart +// happens the user-op is already durable. 
What this locks down is the +// committed-tx replay path across restart. +async fn run_restart_after_committed_tx_replays_cleanly_test( + runtime: &mut ManagedSequencer, +) -> ScenarioResult<()> { let alice = TestSigner::from_default(1)?; let alice_address = alice.address(); @@ -628,14 +879,2475 @@ async fn run_shutdown_during_inflight_test(runtime: &mut ManagedSequencer) -> Sc Ok(()) } -fn eip712_domain(runtime: &ManagedSequencer) -> alloy_sol_types::Eip712Domain { - alloy_sol_types::Eip712Domain { - name: Some("CartesiAppSequencer".to_string().into()), - version: Some("1".to_string().into()), - chain_id: Some(U256::from(runtime.domain_chain_id())), - verifying_contract: Some(runtime.verifying_contract()), - salt: None, - } +async fn run_recovery_after_stale_batches_test( + runtime: &mut ManagedSequencer, +) -> ScenarioResult<()> { + let alice = TestSigner::from_default(1)?; + let bob = TestSigner::from_default(2)?; + let alice_address = alice.address(); + let bob_address = bob.address(); + + let mut ws = runtime.ws(0).await?; + let alice_l1 = runtime.wallet_l1(alice.clone()).await?; + let mut alice_l2 = runtime.wallet_l2(alice.clone())?; + let mut replay_before = ReplayWalletApp::devnet(); + + let deposit_amount = U256::from(600_000_u64); + let transfer_amount = U256::from(100_000_u64); + let post_recovery_transfer = U256::from(200_000_u64); + let gas = fee_to_linear(DEFAULT_FRAME_FEE); + + // Step 1: Fund Alice via L1 deposit. + apply_safe_supported_deposit( + runtime, + &mut ws, + &mut replay_before, + &alice_l1, + deposit_amount, + ) + .await?; + + // Step 2: Alice transfers to Bob (this will be lost after recovery). + alice_l2.transfer(bob_address, transfer_amount).await?; + replay_before.apply(ws.expect_user_op_from(alice_address).await?)?; + + // Verify pre-recovery state. 
+ assert_eq!( + replay_before.current_user_balance(alice_address), + deposit_amount - transfer_amount - gas, + ); + assert_eq!( + replay_before.current_user_balance(bob_address), + transfer_amount, + ); + + // Step 3: Kill the sequencer (Anvil stays up). + drop(ws); + runtime.stop().await?; + + // Step 4: Simulate ~4h of outage: advance both L1 and wall clock by + // MAX_WAIT_BLOCKS * SECONDS_PER_BLOCK = 1200 * 12 = 14400s. On respawn, + // l1_safe_head will be >1200 blocks past the frames' safe_block. + runtime + .advance_wall_and_mine(blocks_as_duration(MAX_WAIT_BLOCKS)) + .await?; + + // Step 5: Respawn the sequencer. Startup recovery should detect staleness. + runtime.respawn().await?; + + // Step 6: Replay from offset 0 after recovery. + // The deposit should be re-drained into the recovery batch. + // The transfer should be GONE (it was in an invalidated batch). + let mut ws_after = runtime.ws(0).await?; + let mut replay_after = ReplayWalletApp::devnet(); + + // Expect the re-drained deposit. + replay_after.apply( + ws_after + .expect_direct_input_from(runtime.erc20_portal_address()) + .await?, + )?; + + // No more messages — the transfer was invalidated. + ws_after.expect_no_message_for(NO_WS_MESSAGE_WAIT).await?; + + // Alice should have her full deposit back (no transfer deducted). + assert_eq!( + replay_after.current_user_balance(alice_address), + deposit_amount, + "after recovery, Alice should have full deposit (transfer was invalidated)" + ); + assert_eq!( + replay_after.current_user_balance(bob_address), + U256::ZERO, + "after recovery, Bob should have zero (transfer was invalidated)" + ); + assert_eq!(replay_after.current_user_nonce(alice_address), 0); + + // Step 7: Verify new work succeeds after recovery. 
+ let mut alice_l2_fresh = runtime.wallet_l2(alice)?; + alice_l2_fresh + .transfer(bob_address, post_recovery_transfer) + .await?; + replay_after.apply(ws_after.expect_user_op_from(alice_address).await?)?; + + assert_eq!( + replay_after.current_user_balance(alice_address), + deposit_amount - post_recovery_transfer - gas, + ); + assert_eq!( + replay_after.current_user_balance(bob_address), + post_recovery_transfer, + ); + assert_eq!(replay_after.current_user_nonce(alice_address), 1); + + Ok(()) +} + +// ── §11.1.1 — Sequencer outage, pre-danger zone ──────────────────────────── +// +// Sequencer stops with an open batch (deposit + transfer); L1 advances 500 +// blocks (well below the danger threshold of ~1125). On restart: +// - Startup recovery runs but finds no danger zone → no flush. +// - No batches are stale → no cascade invalidation. +// - The deposit and transfer persist across the restart. +// - New txs succeed against the unchanged state. +// +// This is the positive control for the recovery procedure: it must NOT fire +// (or over-fire) when L1 hasn't drifted enough to cause trouble. + +async fn run_sequencer_outage_pre_danger_no_recovery_test( + runtime: &mut ManagedSequencer, +) -> ScenarioResult<()> { + // Pick an advance that's safely below the 1125-block danger threshold + // (MAX_WAIT_BLOCKS 1200 - default margin 75 = 1125). + const PRE_DANGER: Duration = blocks_as_duration(PRE_DANGER_BLOCKS); + + let alice = TestSigner::from_default(1)?; + let bob = TestSigner::from_default(2)?; + let alice_address = alice.address(); + let bob_address = bob.address(); + + let mut ws = runtime.ws(0).await?; + let alice_l1 = runtime.wallet_l1(alice.clone()).await?; + let mut alice_l2 = runtime.wallet_l2(alice.clone())?; + let mut replay_before = ReplayWalletApp::devnet(); + + let deposit_amount = U256::from(600_000_u64); + let transfer_amount = U256::from(100_000_u64); + let gas = fee_to_linear(DEFAULT_FRAME_FEE); + + // Step 1: Fund Alice and record a transfer. 
+ apply_safe_supported_deposit( + runtime, + &mut ws, + &mut replay_before, + &alice_l1, + deposit_amount, + ) + .await?; + alice_l2.transfer(bob_address, transfer_amount).await?; + replay_before.apply(ws.expect_user_op_from(alice_address).await?)?; + + let expected_alice_balance = deposit_amount - transfer_amount - gas; + let expected_bob_balance = transfer_amount; + + // Step 2: Stop the sequencer. Leave Anvil running. + drop(ws); + runtime.stop().await?; + + // Step 3: Advance L1 + wall-clock a pre-danger amount (500 blocks ≈ 100min + // < 1125 block danger threshold). + runtime.advance_wall_and_mine(PRE_DANGER).await?; + + // Step 4: Restart. No recovery should fire. + runtime.respawn().await?; + + // Step 5: Replay via WS from offset 0. Both the deposit and transfer must + // still be present (no invalidation). + let mut ws_after = runtime.ws(0).await?; + let mut replay_after = ReplayWalletApp::devnet(); + replay_after.apply( + ws_after + .expect_direct_input_from(runtime.erc20_portal_address()) + .await?, + )?; + replay_after.apply(ws_after.expect_user_op_from(alice_address).await?)?; + + assert_eq!( + replay_after.current_user_balance(alice_address), + expected_alice_balance, + "pre-danger restart must preserve Alice's balance", + ); + assert_eq!( + replay_after.current_user_balance(bob_address), + expected_bob_balance, + "pre-danger restart must preserve Bob's balance", + ); + assert_eq!( + replay_after.current_user_nonce(alice_address), + 1, + "Alice's nonce must NOT be reset", + ); + + // Step 6: No further messages queued. Confirm nothing else comes through. + // (A follow-up "new work succeeds" step is omitted here because the + // harness's `wallet_l2` initializes its local nonce counter at 0, and + // this scenario explicitly does NOT reset the on-chain nonce — the + // post-restart nonce is 1. Adding a "submit at nonce 1" check would + // require harness plumbing beyond the scope of this regression test.) 
+ ws_after.expect_no_message_for(NO_WS_MESSAGE_WAIT).await?; + + Ok(()) +} + +// ── §11.1.2 — Sequencer outage, danger zone (not yet stale) ──────────────── +// +// Sequencer stops; L1 advances into the danger zone (past 1125 blocks) but +// strictly below the staleness threshold (1200). On restart: +// - `check_danger_zone` returns Some(_) — flush runs (no-op: nothing was +// submitted and no w_nonce is pending). +// - `detect_and_recover` finds nothing stale — no cascade. +// - Pre-outage state is preserved (same positive invariant as §11.1.1). +// +// This exercises the flush-runs-but-cascade-doesn't path specifically. + +async fn run_sequencer_outage_danger_zone_no_cascade_test( + runtime: &mut ManagedSequencer, +) -> ScenarioResult<()> { + // Pick advance in the danger zone: > danger_threshold (1125) but < MAX_WAIT (1200). + // Decoupled from wall clock on purpose: this test exercises the + // block-based danger check in isolation. A coupled advance (wall+L1) + // is more realistic but triggers the aged-Tip → close → submitter- + // detects-danger → flush-and-restart cycle, which is a different + // scenario (tracked separately — coupling this test would need the + // harness to handle the restart cycle). §11.2.x cells use the proxy + // to exercise the danger-zone + flush path with realistic timing. + // Uses module-level `DANGER_ZONE_BLOCKS` (see top-of-file zone constants). 
+ + let alice = TestSigner::from_default(1)?; + let bob = TestSigner::from_default(2)?; + let alice_address = alice.address(); + let bob_address = bob.address(); + + let mut ws = runtime.ws(0).await?; + let alice_l1 = runtime.wallet_l1(alice.clone()).await?; + let mut alice_l2 = runtime.wallet_l2(alice.clone())?; + let mut replay_before = ReplayWalletApp::devnet(); + + let deposit_amount = U256::from(600_000_u64); + let transfer_amount = U256::from(100_000_u64); + let gas = fee_to_linear(DEFAULT_FRAME_FEE); + + apply_safe_supported_deposit( + runtime, + &mut ws, + &mut replay_before, + &alice_l1, + deposit_amount, + ) + .await?; + alice_l2.transfer(bob_address, transfer_amount).await?; + replay_before.apply(ws.expect_user_op_from(alice_address).await?)?; + + let expected_alice_balance = deposit_amount - transfer_amount - gas; + let expected_bob_balance = transfer_amount; + + drop(ws); + runtime.stop().await?; + + // L1 advances into the danger zone but strictly below the staleness + // threshold. The danger-zone path should fire (flush is a no-op here + // because no batch was ever submitted to L1), and the recovery procedure + // should find no stale batches. + runtime.mine_l1_blocks(DANGER_ZONE_BLOCKS).await?; + + runtime.respawn().await?; + + // Same positive invariant as §11.1.1: pre-outage state preserved, nonces + // not reset, feed replay produces identical history. 
+ let mut ws_after = runtime.ws(0).await?; + let mut replay_after = ReplayWalletApp::devnet(); + replay_after.apply( + ws_after + .expect_direct_input_from(runtime.erc20_portal_address()) + .await?, + )?; + replay_after.apply(ws_after.expect_user_op_from(alice_address).await?)?; + + assert_eq!( + replay_after.current_user_balance(alice_address), + expected_alice_balance, + "danger-zone restart must preserve Alice's balance \ + (flush runs but no cascade)", + ); + assert_eq!( + replay_after.current_user_balance(bob_address), + expected_bob_balance, + ); + assert_eq!( + replay_after.current_user_nonce(alice_address), + 1, + "nonce must not be reset when no cascade happens", + ); + + ws_after.expect_no_message_for(NO_WS_MESSAGE_WAIT).await?; + + Ok(()) +} + +// ── §11.2.3 — Provider outage, past-stale (recovery through proxy) ────────── +// +// Scenario: the sequencer is routed through a `TcpProxy`, simulating a +// gateway in front of the real L1 node. While the sequencer is stopped, +// a temporary outage happens (proxy disconnected), L1 advances past the +// staleness threshold, and the outage ends (proxy reconnected). The next +// sequencer restart connects via the proxy, sees the advanced safe head, +// and cascade-invalidates the stale open batch. +// +// What this locks down that the sequencer-outage tests don't: +// - The proxy is actually wired into the RPC path. Subsequent RPC calls +// from the sequencer (safe-head sync, batch submission) route through +// it. If `set_l1_endpoint_override` ever regressed (e.g., respawn +// ignored the override), this test would fail. +// - Recovery over a non-direct connection works end-to-end. +// +// Note on wall-clock fallback: in principle this scenario would also test +// the fallback refusing to boot when L1 is unreachable AND real time has +// elapsed past the danger threshold. 
In practice, `anvil_mine(N)` takes +// milliseconds of real wall-clock time, so the fallback correctly reports +// "not yet in danger by wall-clock" and lets the sequencer boot with stale +// data. Exercising the wall-clock-refuses-to-boot path requires either +// direct `synced_at_ms` DB manipulation or a time-skew tool — deferred. + +async fn run_provider_outage_past_stale_cascades_test( + runtime: &mut ManagedSequencer, +) -> ScenarioResult<()> { + // Advance comfortably past staleness so the test is robust to small + // scheduling drifts. + const PAST_STALE: Duration = blocks_as_duration(PAST_STALE_BLOCKS); + + let alice = TestSigner::from_default(1)?; + let bob = TestSigner::from_default(2)?; + let alice_address = alice.address(); + let bob_address = bob.address(); + + // Step 1: Normal setup — deposit + transfer (the transfer will be lost). + let mut ws = runtime.ws(0).await?; + let alice_l1 = runtime.wallet_l1(alice.clone()).await?; + let mut alice_l2 = runtime.wallet_l2(alice.clone())?; + let mut replay_before = ReplayWalletApp::devnet(); + + let deposit_amount = U256::from(600_000_u64); + let transfer_amount = U256::from(100_000_u64); + + apply_safe_supported_deposit( + runtime, + &mut ws, + &mut replay_before, + &alice_l1, + deposit_amount, + ) + .await?; + alice_l2.transfer(bob_address, transfer_amount).await?; + replay_before.apply(ws.expect_user_op_from(alice_address).await?)?; + + // Step 2: Stop the sequencer and insert a proxy into the L1 path. + drop(ws); + runtime.stop().await?; + + let proxy = TcpProxy::spawn(runtime.l1_endpoint()).await?; + runtime.set_l1_endpoint_override(Some(proxy.endpoint())); + + // Step 3: Simulate a gateway outage that spans the staleness window. + // - Disconnect the proxy (gateway is down). + // - Mine 1250 blocks directly on Anvil (bypasses the proxy). + // - Reconnect the proxy (gateway is back). 
+ // During the outage the sequencer is stopped; when it comes back up, + // it will see the advanced safe head through the proxy. + proxy.disconnect(); + runtime.advance_wall_and_mine(PAST_STALE).await?; + proxy.reconnect(); + + // Step 4: Respawn. The sequencer dials the proxy, the proxy forwards + // to Anvil, `sync_to_current_safe_head` returns 1250+ blocks past the + // open batch's first frame. `check_open_batch_staleness` fires, cascade + // invalidates, recovery batch opens. + runtime.respawn().await?; + + // Step 5: Verify via WS replay. + let mut ws_after = runtime.ws(0).await?; + let mut replay_after = ReplayWalletApp::devnet(); + replay_after.apply( + ws_after + .expect_direct_input_from(runtime.erc20_portal_address()) + .await?, + )?; + ws_after.expect_no_message_for(NO_WS_MESSAGE_WAIT).await?; + + assert_eq!( + replay_after.current_user_balance(alice_address), + deposit_amount, + "transfer must be invalidated after past-stale outage routed through proxy", + ); + assert_eq!( + replay_after.current_user_balance(bob_address), + U256::ZERO, + "Bob's receiving balance must be rolled back", + ); + assert_eq!(replay_after.current_user_nonce(alice_address), 0); + + // Step 6: Tear down the proxy cleanly. + proxy.shutdown().await?; + + Ok(()) +} + +// ── §7.8.1 — Wall-clock fallback refuses to boot past danger threshold ───── +// +// Scenario: L1 is unreachable AND wall-clock time has elapsed past the +// danger threshold since the last successful L1 sync. The sequencer must +// refuse to boot — proceeding would mean issuing soft confirmations against +// stale L1 state, potentially missing that batches are already doomed. +// +// This test only became possible after the `find_first_batch_in_danger` +// unification. Prior to that fix, an open batch was invisible to +// `check_danger_zone`, so the wall-clock fallback could "miss" an open +// batch aging into danger while L1 was unreachable and boot anyway. 
+//
+// The wall-clock illusion is created via faketime (a time-skew tool):
+// `advance_wall_and_mine` shifts the sequencer's wall clock forward by
+// the outage duration — equivalent to real time passing — and mines the
+// equivalent number of blocks on Anvil to keep the block-time coupling
+// documented in `docs/threat-model/README.md`.
+
+async fn run_provider_outage_wall_clock_refuses_boot_test(
+    runtime: &mut ManagedSequencer,
+) -> ScenarioResult<()> {
+    // Pick an elapsed time comfortably past the danger threshold. Defaults:
+    // seconds_per_block=12, danger_threshold=MAX_WAIT_BLOCKS(1200)-margin(75)=1125.
+    // We need elapsed_secs / 12 > 1125 → elapsed_secs > 13500. Use 5h.
+    const OUTAGE: Duration = Duration::from_secs(5 * 60 * 60);
+
+    let alice = TestSigner::from_default(1)?;
+    let bob = TestSigner::from_default(2)?;
+    let alice_address = alice.address();
+    let bob_address = bob.address();
+
+    // Step 1: Normal setup — deposit + transfer (transfer will be lost).
+    let mut ws = runtime.ws(0).await?;
+    let alice_l1 = runtime.wallet_l1(alice.clone()).await?;
+    let mut alice_l2 = runtime.wallet_l2(alice.clone())?;
+    let mut replay_before = ReplayWalletApp::devnet();
+
+    apply_safe_supported_deposit(
+        runtime,
+        &mut ws,
+        &mut replay_before,
+        &alice_l1,
+        U256::from(600_000_u64),
+    )
+    .await?;
+    alice_l2
+        .transfer(bob_address, U256::from(100_000_u64))
+        .await?;
+    replay_before.apply(ws.expect_user_op_from(alice_address).await?)?;
+
+    // Step 2: Stop the sequencer, insert proxy, disconnect it, advance both
+    // the wall clock and L1 by the outage duration — block-time coupled so
+    // the sequencer sees a consistent view (5h ≈ 1500 blocks at 12s/block).
+    drop(ws);
+    runtime.stop().await?;
+
+    let proxy = TcpProxy::spawn(runtime.l1_endpoint()).await?;
+    runtime.set_l1_endpoint_override(Some(proxy.endpoint()));
+    proxy.disconnect();
+    runtime.advance_wall_and_mine(OUTAGE).await?;
+
+    // Step 3: Attempt respawn with proxy disconnected.
The sequencer: + // - dials the proxy → sync_to_current_safe_head fails (L1 unreachable). + // - falls back to wall-clock estimation. + // - computes missed_blocks = 18000s / 12 = 1500 > danger_threshold 1125. + // - `find_first_batch_in_danger(adjusted_threshold=0)` flags the open + // batch (first_frame_safe_block << current_safe_block - 0). + // - decide_startup_action returns Refuse(StalledSafeHead) → process exits with failure. + let respawn_result = runtime.respawn().await; + assert!( + respawn_result.is_err(), + "respawn must fail: wall-clock says past-danger AND open batch is in danger", + ); + + // Step 4: Reconnect the proxy and respawn normally. Sync now succeeds, + // the stale open batch is cascade-invalidated, recovery batch opens. + proxy.reconnect(); + runtime.respawn().await?; + + // Step 5: Verify the invalidation: only the re-drained deposit appears. + let mut ws_after = runtime.ws(0).await?; + let mut replay_after = ReplayWalletApp::devnet(); + replay_after.apply( + ws_after + .expect_direct_input_from(runtime.erc20_portal_address()) + .await?, + )?; + ws_after.expect_no_message_for(NO_WS_MESSAGE_WAIT).await?; + + assert_eq!( + replay_after.current_user_balance(alice_address), + U256::from(600_000_u64), + "transfer must be invalidated after wall-clock-triggered recovery", + ); + assert_eq!(replay_after.current_user_balance(bob_address), U256::ZERO,); + assert_eq!(replay_after.current_user_nonce(alice_address), 0); + + proxy.shutdown().await?; + Ok(()) +} + +// §7.8.3: `SystemTime::now()` backward jump → `saturating_sub` handles +// cleanly, no panic. +// +// Scenario: normal setup creates DB state at real time T. Stop, disconnect +// proxy, backward-jump the clock via faketime, respawn with L1 unreachable. +// The wall-clock fallback runs: +// +// elapsed = now(T-1h).saturating_sub(last_sync_at_ms(≈T)) = 0 +// +// No danger → boot proceeds. After reconnect, normal operation resumes. 
+// If `saturating_sub` ever regresses to a plain subtraction (underflow +// panic on u64), this test panics at respawn. +async fn run_wall_clock_backward_jump_no_panic_test( + runtime: &mut ManagedSequencer, +) -> ScenarioResult<()> { + let alice = TestSigner::from_default(1)?; + let alice_l1 = runtime.wallet_l1(alice.clone()).await?; + let mut ws = runtime.ws(0).await?; + let mut replay_before = ReplayWalletApp::devnet(); + + apply_safe_supported_deposit( + runtime, + &mut ws, + &mut replay_before, + &alice_l1, + U256::from(100_000_u64), + ) + .await?; + drop(ws); + + runtime.stop().await?; + let proxy = TcpProxy::spawn(runtime.l1_endpoint()).await?; + runtime.set_l1_endpoint_override(Some(proxy.endpoint())); + proxy.disconnect(); + runtime.set_faketime_offset(Some("-1h".to_string()))?; + + // Respawn must NOT panic. With L1 unreachable, the wall-clock fallback + // is the only path that sees `now - last_sync_ms` — if the subtraction + // ever became non-saturating, this call would panic via u64 underflow. + runtime.respawn().await?; + + // Clean up: reconnect and let the sequencer catch up normally. + proxy.reconnect(); + // Clear the offset for subsequent respawns (not used here, but keeps the + // teardown deterministic if future cleanup code respawns). + runtime.set_faketime_offset(None)?; + + proxy.shutdown().await?; + Ok(()) +} + +// §7.8.5 — Provider reachable, safe head frozen, startup refuses to boot. +// +// Scenario: +// 1. Create an open Tip and age it into the danger window while L1 is +// still reachable. +// 2. Stop the sequencer without mining any more L1 blocks, so the next +// startup sees the same safe head again. +// 3. Jump only the sequencer's wall clock forward by >1 block interval. +// Startup sync succeeds, but because the safe head did not advance, the +// reader preserves the old safe-progress timestamp. +// 4. `stalled_safe_head_danger_estimate` treats that as a reachable-but- +// frozen safe head and refuses boot. +// 5. 
Mine one more L1 block and respawn again; safe-head progress resumes, +// the timestamp refreshes, and the sequencer stays up. +async fn run_stalled_safe_head_startup_refuses_boot_test( + runtime: &mut ManagedSequencer, +) -> ScenarioResult<()> { + const STALLED_SAFE_HEAD_OFFSET: &str = "+30s"; + const SAFE_HEAD_SYNC_WINDOW: Duration = Duration::from_secs(8); + + let alice = TestSigner::from_default(1)?; + let bob = TestSigner::from_default(2)?; + let alice_address = alice.address(); + let bob_address = bob.address(); + + let alice_l1 = runtime.wallet_l1(alice.clone()).await?; + let mut alice_l2 = runtime.wallet_l2(alice)?; + let mut replay = ReplayWalletApp::devnet(); + { + let mut ws = runtime.ws(0).await?; + apply_safe_supported_deposit( + runtime, + &mut ws, + &mut replay, + &alice_l1, + U256::from(600_000_u64), + ) + .await?; + alice_l2.transfer(bob_address, U256::from(1_u64)).await?; + replay.apply(ws.expect_user_op_from(alice_address).await?)?; + } + + runtime.mine_l1_blocks(DANGER_ZONE_BLOCKS).await?; + + let early_exit = runtime.observe_for(SAFE_HEAD_SYNC_WINDOW).await?; + assert!( + early_exit.is_none(), + "aging open Tip alone must not crash while the safe head is still progressing: \ + got unexpected exit {early_exit:?}", + ); + + runtime.stop().await?; + runtime.set_faketime_offset(Some(STALLED_SAFE_HEAD_OFFSET.to_string()))?; + + let respawn_result = runtime.respawn().await; + assert!( + respawn_result.is_err(), + "startup must refuse when L1 is reachable but the safe head stayed frozen long enough to estimate danger", + ); + + let counts = runtime.count_batches()?; + assert_eq!( + counts.invalidated, 0, + "startup refusal on a reachable-but-stalled safe head must not cascade batches: {counts:?}", + ); + + runtime.mine_l1_blocks(1).await?; + runtime.respawn().await?; + + let stable_after_progress = runtime.observe_for(SAFE_HEAD_SYNC_WINDOW).await?; + assert!( + stable_after_progress.is_none(), + "once safe-head progress resumes, the sequencer should 
boot and remain stable; got {stable_after_progress:?}", + ); + + Ok(()) +} + +// §11.2.1: provider outage in the pre-danger zone while the sequencer stays +// running. +// +// Load-under-outage check: the sequencer must continue to accept user ops, +// persist them, broadcast on WS, and CLOSE BATCHES BY SIZE while its L1 +// connection is down. Proves the inclusion lane is independent of L1 +// reachability — as long as the wall-clock fallback keeps the pre-danger +// verdict, the sequencer keeps doing useful work. +// +// Scenario: +// 1. Spawn + apply a large deposit so Alice can fund many transfers. +// 2. Route the sequencer through a proxy (stop → set override → respawn). +// 3. Disconnect the proxy, advance L1 by a pre-danger amount (500 blocks). +// 4. Submit enough transfers (~150 × ~100 B each ≈ 15 KB) to exceed the +// default ~12 KB batch-size target, guaranteeing at least one size- +// triggered batch close during the outage. +// 5. Assert `count_batches().sealed` strictly increased during the outage. +// 6. Reconnect the proxy; confirm one more transfer goes through and the +// schema invariants hold post-test. +async fn run_provider_outage_pre_danger_sequencer_continues_test( + runtime: &mut ManagedSequencer, +) -> ScenarioResult<()> { + // Pre-danger budget: see module-level `PRE_DANGER_BLOCKS` (500 blocks = + // 100min at 12s/block, well below the danger threshold). + // Enough transfers to exceed the default ~12 KB batch size target. Each + // transfer user_op is ≈ 100 B (SSZ-encoded Transfer + signature + nonce), + // so 150 ops ≈ 15 KB — one batch close is guaranteed; two or more is + // typical. + const TRANSFERS_DURING_OUTAGE: usize = 150; + + let alice = TestSigner::from_default(1)?; + let bob = TestSigner::from_default(2)?; + let alice_address = alice.address(); + let bob_address = bob.address(); + + // Step 1: Deposit big — Alice needs to cover 150+ transfers and their fees. 
+ // Default fee per user-op ≈ 3873 units (log-fee 1060); reserve margin. + let alice_l1 = runtime.wallet_l1(alice.clone()).await?; + let deposit_amount = U256::from(10_000_000_u64); + let mut replay = ReplayWalletApp::devnet(); + { + let mut ws = runtime.ws(0).await?; + apply_safe_supported_deposit(runtime, &mut ws, &mut replay, &alice_l1, deposit_amount) + .await?; + } + + // Step 2: Insert the proxy and route the sequencer through it via + // stop → set override → respawn. The initial spawn (direct to Anvil) is + // treated as setup only; from here on, all sequencer → L1 traffic flows + // through the proxy. + runtime.stop().await?; + let proxy = TcpProxy::spawn(runtime.l1_endpoint()).await?; + runtime.set_l1_endpoint_override(Some(proxy.endpoint())); + runtime.respawn().await?; + + // Step 3: Connect a fresh WS (catches up the deposit from offset 0) and + // a fresh L2 wallet. Consume the deposit replay so subsequent + // `expect_user_op_from` calls line up. + let mut ws = runtime.ws(0).await?; + let mut alice_l2 = runtime.wallet_l2(alice.clone())?; + ws.expect_direct_input_from(runtime.erc20_portal_address()) + .await?; + + // Baseline: one transfer while the proxy is still connected, confirming + // end-to-end plumbing works through the proxy. + alice_l2.transfer(bob_address, U256::from(1_u64)).await?; + replay.apply(ws.expect_user_op_from(alice_address).await?)?; + + let batches_before = runtime.count_batches()?; + + // Step 4: Cut the L1 connection, advance Anvil by 500 blocks (pre-danger). + // The sequencer is still running; its wall-clock fallback sees real time + // not yet past the threshold, so it keeps retrying rather than shutting + // down. + proxy.disconnect(); + runtime.mine_l1_blocks(PRE_DANGER_BLOCKS).await?; + + // Step 5: Submit many transfers during the outage. Each should be + // accepted (POST /tx succeeds), broadcast on WS, and eventually packed + // into a new batch. 
Size-triggered close fires when the cumulative user-op
+    // bytes exceed the default target.
+    for _ in 0..TRANSFERS_DURING_OUTAGE {
+        alice_l2.transfer(bob_address, U256::from(1_u64)).await?;
+        replay.apply(ws.expect_user_op_from(alice_address).await?)?;
+    }
+
+    // Step 6: Batch closure during outage — the whole point of this test.
+    let batches_mid = runtime.count_batches()?;
+    assert!(
+        batches_mid.sealed > batches_before.sealed,
+        "sequencer must continue closing batches during L1 outage: \
+         before={before:?}, after={after:?}",
+        before = batches_before,
+        after = batches_mid,
+    );
+
+    // Step 7: Restore L1 connectivity. The batch submitter's next tick
+    // reaches L1 again and starts draining the pending batches.
+    proxy.reconnect();
+
+    // Final check: one more transfer goes through after reconnect, proving
+    // the sequencer didn't just survive — it's fully operational.
+    alice_l2.transfer(bob_address, U256::from(1_u64)).await?;
+    replay.apply(ws.expect_user_op_from(alice_address).await?)?;
+
+    // Sanity: total sealed batches grew from baseline to final, and nothing
+    // got invalidated (pre-danger → no recovery triggered).
+    let batches_final = runtime.count_batches()?;
+    assert!(
+        batches_final.sealed > batches_before.sealed,
+        "final sealed count {final_counts:?} must exceed baseline {before:?}",
+        final_counts = batches_final,
+        before = batches_before,
+    );
+    assert_eq!(
+        batches_final.invalidated, 0,
+        "pre-danger outage must not invalidate any batches, got {:?}",
+        batches_final,
+    );
+
+    proxy.shutdown().await?;
+    Ok(())
+}
+
+// §11.2.2: provider outage aging into the danger zone while the sequencer is
+// running — sequencer detects via its live wall-clock fallback and self-exits
+// with `DangerZone`. Also verifies the startup wall-clock fallback refuses
+// subsequent boots while L1 is still unreachable.
+// +// The full "reconnect → recover → no cascade" cycle needs the harness to +// handle an orchestrator-style restart loop (the first post-reconnect boot +// may still trip the danger check and exit, requiring another boot after +// enough blocks age out). That's tracked as §11.1.5 / §11.2.2-follow-up and +// deliberately out of scope here. +// +// Uses dynamic faketime (FAKETIME_TIMESTAMP_FILE re-read on every time call) +// to jump the sequencer's clock past the danger threshold mid-run without +// respawning — the scenario we'd otherwise need 3h45min of real wall-clock +// time to reproduce. +async fn run_provider_outage_danger_zone_sequencer_self_exits_test( + runtime: &mut ManagedSequencer, +) -> ScenarioResult<()> { + // Defaults: MAX_WAIT_BLOCKS=1200, margin=75, danger_threshold=1125 + // blocks at 12s/block = 13500s = 3h45min. Use 3h55min: past danger, + // under MAX_WAIT (so no cascade fires later). + const INTO_DANGER: Duration = Duration::from_secs(3 * 60 * 60 + 55 * 60); + + let alice = TestSigner::from_default(1)?; + let bob = TestSigner::from_default(2)?; + let alice_address = alice.address(); + let bob_address = bob.address(); + + // Step 1: Baseline — deposit + transfer so there's observable state. + let alice_l1 = runtime.wallet_l1(alice.clone()).await?; + let mut replay = ReplayWalletApp::devnet(); + { + let mut ws = runtime.ws(0).await?; + apply_safe_supported_deposit( + runtime, + &mut ws, + &mut replay, + &alice_l1, + U256::from(500_000_u64), + ) + .await?; + } + + // Step 2: Switch routing to the proxy (stop → set override → respawn). 
+ runtime.stop().await?; + let proxy = TcpProxy::spawn(runtime.l1_endpoint()).await?; + runtime.set_l1_endpoint_override(Some(proxy.endpoint())); + runtime.respawn().await?; + + let mut ws = runtime.ws(0).await?; + let mut alice_l2 = runtime.wallet_l2(alice.clone())?; + ws.expect_direct_input_from(runtime.erc20_portal_address()) + .await?; + alice_l2.transfer(bob_address, U256::from(1_u64)).await?; + replay.apply(ws.expect_user_op_from(alice_address).await?)?; + + // Step 3: Disconnect the proxy and advance both clocks into the danger + // zone. The running sequencer's batch-submitter tick will try L1, hit + // the proxy's disconnect, fall into the wall-clock fallback, and see + // elapsed > danger_threshold → exit with DangerZone. + proxy.disconnect(); + runtime.advance_wall_and_mine(INTO_DANGER).await?; + + // Step 4: Wait for the sequencer to detect and self-exit. Dynamic + // faketime means the shift hits the submitter's next tick immediately — + // no real-time wait needed. + let exit_status = runtime.wait_for_exit(Duration::from_secs(30)).await?; + assert!( + !exit_status.success(), + "sequencer must self-exit with non-zero status on DangerZone, got {exit_status:?}", + ); + + // Step 5: Try to respawn while proxy is still disconnected. Startup + // runs the same wall-clock fallback via `run_preemptive_recovery` and + // should refuse to boot (`decide_startup_action → Refuse(...)`). + let respawn_result = runtime.respawn().await; + assert!( + respawn_result.is_err(), + "respawn must fail while proxy disconnected and wall-clock past danger", + ); + + // No cascade happened yet — batches under MAX_WAIT are not invalidated + // by startup recovery, only preemptively shut-down-and-flushed. + let counts = runtime.count_batches()?; + assert_eq!( + counts.invalidated, 0, + "danger-zone (not past-stale) must not invalidate batches: {counts:?}", + ); + + proxy.shutdown().await?; + Ok(()) +} + +// §11.4.1 — Short-duration provider hiccup, heals within pre-danger. 
+// +// The most-common production fault: an RPC gateway flakes briefly, retries +// succeed. No recovery should fire. +// +// What this tests that §11.2.1 doesn't: §11.2.1 disconnects for a +// 500-block L1 advance + 150 transfers worth of real time, exercising the +// inclusion lane under load. §11.4.1 instead exercises the "pure retry +// loop" path: **no** L1 advance, **no** faketime advance, just a few seconds +// of real-time wall-clock downtime across at least one +// `idle_poll_interval_ms` (default 5 s) so the submitter definitely attempts +// and fails a tick, then the reconnect path lets the next tick succeed. +// +// Scenario: +// 1. Route through proxy; establish a baseline transfer. +// 2. Disconnect, submit one more transfer (inclusion lane must still +// accept), sleep >5 s so the submitter's tick hits the disconnect. +// 3. Reconnect, submit another transfer. +// 4. Assert no batches were invalidated and POST /tx still works. +async fn run_provider_outage_short_hiccup_no_recovery_test( + runtime: &mut ManagedSequencer, +) -> ScenarioResult<()> { + // Long enough to straddle the default 5 s submitter idle_poll_interval so + // at least one retry actually fails against the disconnected proxy. + const HICCUP_DURATION: Duration = Duration::from_secs(6); + + let alice = TestSigner::from_default(1)?; + let bob = TestSigner::from_default(2)?; + let alice_address = alice.address(); + let bob_address = bob.address(); + + let alice_l1 = runtime.wallet_l1(alice.clone()).await?; + let deposit_amount = U256::from(2_000_000_u64); + let mut replay = ReplayWalletApp::devnet(); + { + let mut ws = runtime.ws(0).await?; + apply_safe_supported_deposit(runtime, &mut ws, &mut replay, &alice_l1, deposit_amount) + .await?; + } + + // Route through the proxy (stop → override → respawn). 
+ runtime.stop().await?; + let proxy = TcpProxy::spawn(runtime.l1_endpoint()).await?; + runtime.set_l1_endpoint_override(Some(proxy.endpoint())); + runtime.respawn().await?; + + let mut ws = runtime.ws(0).await?; + let mut alice_l2 = runtime.wallet_l2(alice)?; + ws.expect_direct_input_from(runtime.erc20_portal_address()) + .await?; + + // Baseline transfer via the proxy, proving the proxy path works. + alice_l2.transfer(bob_address, U256::from(1_u64)).await?; + replay.apply(ws.expect_user_op_from(alice_address).await?)?; + + let batches_before = runtime.count_batches()?; + + // Disconnect: the submitter's next tick (within 5 s) fails against the + // disconnected proxy, runs wall_clock_danger_estimate with ~zero + // elapsed — far below danger threshold — and just retries. + proxy.disconnect(); + + // Inclusion lane is independent of L1; POST /tx still accepts. + alice_l2.transfer(bob_address, U256::from(1_u64)).await?; + replay.apply(ws.expect_user_op_from(alice_address).await?)?; + + // Wait at least one full submitter idle_poll_interval (default 5 s) so the + // failed-retry path is definitely exercised under the disconnect. + tokio::time::sleep(HICCUP_DURATION).await; + + proxy.reconnect(); + + // Reconnect: another transfer goes through normally — proves the + // sequencer didn't just sit there, its next tick genuinely recovered. + alice_l2.transfer(bob_address, U256::from(1_u64)).await?; + replay.apply(ws.expect_user_op_from(alice_address).await?)?; + + let batches_after = runtime.count_batches()?; + assert_eq!( + batches_after.invalidated, 0, + "a short pre-danger hiccup must not invalidate any batch: {batches_after:?}", + ); + assert!( + batches_after.sealed >= batches_before.sealed, + "sealed-batch count must be monotonic across a hiccup: \ + before={batches_before:?}, after={batches_after:?}", + ); + + proxy.shutdown().await?; + Ok(()) +} + +// §11.3.2 — Both down, sequencer returns first into the danger zone, refuses +// to boot. 
+// +// Companion to §11.2.3 (past-stale cascades through proxy): this is the +// *danger-zone* window of the same setup. Sequencer is stopped AND the proxy +// is disconnected; wall-clock and L1 advance into the danger zone but stay +// below `MAX_WAIT_BLOCKS`; the sequencer comes back first while L1 is still +// unreachable. Startup's wall-clock fallback must see "past danger" and +// refuse the boot — advancing the safe head off stale data would risk +// issuing soft confirmations against a state that may already be doomed. +// +// No cascade is expected yet (we haven't crossed MAX_WAIT_BLOCKS). The test +// stops at the refuse-to-boot assertion — the full reconnect+recovery cycle +// is covered by §11.3.3 below. +async fn run_both_down_danger_zone_sequencer_first_refuses_boot_test( + runtime: &mut ManagedSequencer, +) -> ScenarioResult<()> { + // Safely inside the danger zone: past 1125-block threshold, below 1200. + // 3h55min at 12 s/block = 1175 blocks — same slot the existing + // §11.2.2 test uses. + const INTO_DANGER: Duration = Duration::from_secs(3 * 60 * 60 + 55 * 60); + + let alice = TestSigner::from_default(1)?; + let bob = TestSigner::from_default(2)?; + let alice_address = alice.address(); + let bob_address = bob.address(); + + // Baseline: deposit + transfer (both will survive — no cascade expected + // in the danger-zone window). + let alice_l1 = runtime.wallet_l1(alice.clone()).await?; + let mut alice_l2 = runtime.wallet_l2(alice.clone())?; + let mut replay_before = ReplayWalletApp::devnet(); + { + let mut ws = runtime.ws(0).await?; + apply_safe_supported_deposit( + runtime, + &mut ws, + &mut replay_before, + &alice_l1, + U256::from(600_000_u64), + ) + .await?; + alice_l2 + .transfer(bob_address, U256::from(100_000_u64)) + .await?; + replay_before.apply(ws.expect_user_op_from(alice_address).await?)?; + } + + // Both down: stop sequencer, insert proxy, disconnect proxy. 
+ runtime.stop().await?; + let proxy = TcpProxy::spawn(runtime.l1_endpoint()).await?; + runtime.set_l1_endpoint_override(Some(proxy.endpoint())); + proxy.disconnect(); + + // Coupled advance into the danger zone. Anvil mines behind the proxy + // (direct connection via `mine_l1_blocks`), and faketime shifts the + // sequencer's wall clock cumulatively. + runtime.advance_wall_and_mine(INTO_DANGER).await?; + + // Respawn while proxy is still disconnected: sync fails → wall-clock + // fallback computes past-danger → refuses to boot. + let respawn_result = runtime.respawn().await; + assert!( + respawn_result.is_err(), + "sequencer must refuse to boot while L1 unreachable and wall-clock past danger", + ); + + // No cascade should have run yet — we haven't crossed MAX_WAIT_BLOCKS. + let counts = runtime.count_batches()?; + assert_eq!( + counts.invalidated, 0, + "refuse-to-boot must not invalidate any batch: {counts:?}", + ); + + proxy.shutdown().await?; + Ok(()) +} + +// §11.3.3 — Both down, proxy returns first, then sequencer — restart cycle +// converges. +// +// Complement to §11.3.2 (sequencer first): here L1 comes back before the +// sequencer does. Once the sequencer restarts, startup recovery's wall-clock +// fallback sees L1 is now reachable and proceeds. The first boot cycle +// closes the aged Tip, the submitter detects a closed batch in danger, and +// the process exits. The orchestrator (simulated by `respawn_until_stable`) +// retries after a small additional L1 advance — the closed batch ages past +// `MAX_WAIT_BLOCKS`, startup recovery cascades, a fresh recovery batch opens, +// and the sequencer is healthy. +// +// The key invariant this tests that the existing §11.x tests don't: the full +// *restart-loop* works. Earlier tests stopped at "first respawn exits" +// because the harness lacked an orchestrator-restart primitive; now we have +// `respawn_until_stable`, so we can drive the loop to convergence. 
+async fn run_both_down_danger_zone_proxy_first_restart_cycle_recovers_test( + runtime: &mut ManagedSequencer, +) -> ScenarioResult<()> { + const INTO_DANGER: Duration = Duration::from_secs(3 * 60 * 60 + 55 * 60); + // Each restart attempt advances ~20 min (100 blocks) of additional L1 + // time, simulating the real orchestrator-restart cadence. One extra + // tick past the first failed attempt is enough to push an aged Tip's + // closed-batch form past MAX_WAIT_BLOCKS (1175 + 100 > 1200). + const ADVANCE_PER_RETRY: Duration = blocks_as_duration(RESPAWN_RETRY_ADVANCE_BLOCKS); + + let alice = TestSigner::from_default(1)?; + let bob = TestSigner::from_default(2)?; + let alice_address = alice.address(); + let bob_address = bob.address(); + + // Baseline: deposit + transfer — the transfer will be invalidated when + // cascade finally fires. + let alice_l1 = runtime.wallet_l1(alice.clone()).await?; + let mut alice_l2 = runtime.wallet_l2(alice.clone())?; + let mut replay_before = ReplayWalletApp::devnet(); + { + let mut ws = runtime.ws(0).await?; + apply_safe_supported_deposit( + runtime, + &mut ws, + &mut replay_before, + &alice_l1, + U256::from(600_000_u64), + ) + .await?; + alice_l2 + .transfer(bob_address, U256::from(100_000_u64)) + .await?; + replay_before.apply(ws.expect_user_op_from(alice_address).await?)?; + } + + // Both down: stop sequencer, insert proxy, disconnect. + runtime.stop().await?; + let proxy = TcpProxy::spawn(runtime.l1_endpoint()).await?; + runtime.set_l1_endpoint_override(Some(proxy.endpoint())); + proxy.disconnect(); + + // Coupled advance into the danger zone. + runtime.advance_wall_and_mine(INTO_DANGER).await?; + + // L1 recovers first — proxy back online while sequencer is still stopped. + proxy.reconnect(); + + // Simulated orchestrator loop. Each failed attempt advances L1 (and + // wall-clock) by ~20 min; the aged Tip eventually ages past + // `MAX_WAIT_BLOCKS` and cascade fires on a subsequent respawn. 
+ let outcomes = runtime + .respawn_until_stable(RespawnPolicy { + max_attempts: 5, + stabilization: Duration::from_secs(8), + advance_per_retry: Some(ADVANCE_PER_RETRY), + }) + .await?; + assert!( + matches!(outcomes.last(), Some(RespawnAttemptOutcome::Stable)), + "restart cycle must converge to Stable, got: {outcomes:?}", + ); + + // The cascade fired somewhere in the loop — the transfer was invalidated. + let counts = runtime.count_batches()?; + assert!( + counts.invalidated >= 1, + "expected at least one invalidation after restart-cycle cascade: {counts:?}", + ); + + // Verify via WS replay: only the re-drained deposit appears, the transfer + // is gone. + let mut ws_after = runtime.ws(0).await?; + let mut replay_after = ReplayWalletApp::devnet(); + replay_after.apply( + ws_after + .expect_direct_input_from(runtime.erc20_portal_address()) + .await?, + )?; + ws_after.expect_no_message_for(NO_WS_MESSAGE_WAIT).await?; + + assert_eq!( + replay_after.current_user_balance(alice_address), + U256::from(600_000_u64), + "Alice must get her full deposit back after cascade", + ); + assert_eq!( + replay_after.current_user_balance(bob_address), + U256::ZERO, + "Bob's balance must roll back", + ); + assert_eq!(replay_after.current_user_nonce(alice_address), 0); + + proxy.shutdown().await?; + Ok(()) +} + +// §11.1.5 — Sequencer outage, coupled wall+L1 advance into the danger zone, +// orchestrator restart cycle converges. +// +// The realistic counterpart to the decoupled §11.1.2 +// (`sequencer_outage_danger_zone_no_cascade_test`), which advances L1 without +// touching the wall clock to keep the aged-Tip-auto-close path out of scope. +// In a real outage both advance together, which means: on respawn, the aged +// Tip's `max_open_time` is exceeded, the inclusion lane closes it into a +// now-nonced closed batch, and the submitter's first tick detects the closed +// batch is in the danger zone (`age > danger_threshold`) and exits with +// `BatchSubmitterError::DangerZone`. 
+// +// That's a flush-and-restart signal, not a cascade. Under orchestration, the +// next boot's preemptive recovery runs `check_danger_zone` (closed-only), +// flushes the mempool, re-syncs, then runs `run_startup_recovery` with the +// `MAX_WAIT_BLOCKS` threshold. Two end states are valid: +// - the closed batch does NOT land before the detector-triggered shutdown, +// so a later respawn ages it past `MAX_WAIT_BLOCKS` and recovery cascades; +// - the submitter gets one last batch onto L1 before shutdown, so the next +// respawn sees that batch in `safe_inputs` and converges without any +// invalidation. +// +// The test's load-bearing assertion is therefore restart-loop convergence +// under a realistic coupled outage, not mandatory cascade. +// +// Proves the sequencer-outage danger-zone path (not just the provider-outage +// analogue §11.2.2) follows the same flush/shutdown → respawn → cascade +// lifecycle to a healthy state. +async fn run_sequencer_outage_danger_zone_coupled_restart_cycle_recovers_test( + runtime: &mut ManagedSequencer, +) -> ScenarioResult<()> { + const INTO_DANGER: Duration = Duration::from_secs(3 * 60 * 60 + 55 * 60); + const ADVANCE_PER_RETRY: Duration = blocks_as_duration(RESPAWN_RETRY_ADVANCE_BLOCKS); + + let alice = TestSigner::from_default(1)?; + let bob = TestSigner::from_default(2)?; + let alice_address = alice.address(); + let bob_address = bob.address(); + + let alice_l1 = runtime.wallet_l1(alice.clone()).await?; + let mut alice_l2 = runtime.wallet_l2(alice.clone())?; + let mut replay_before = ReplayWalletApp::devnet(); + { + let mut ws = runtime.ws(0).await?; + apply_safe_supported_deposit( + runtime, + &mut ws, + &mut replay_before, + &alice_l1, + U256::from(600_000_u64), + ) + .await?; + alice_l2 + .transfer(bob_address, U256::from(100_000_u64)) + .await?; + replay_before.apply(ws.expect_user_op_from(alice_address).await?)?; + } + + // Sequencer outage: stop, do NOT insert a proxy. 
Coupled L1+wall advance + // into the danger zone. + runtime.stop().await?; + runtime.advance_wall_and_mine(INTO_DANGER).await?; + + let outcomes = runtime + .respawn_until_stable(RespawnPolicy { + max_attempts: 5, + stabilization: Duration::from_secs(8), + advance_per_retry: Some(ADVANCE_PER_RETRY), + }) + .await?; + assert!( + matches!(outcomes.last(), Some(RespawnAttemptOutcome::Stable)), + "restart cycle must converge to Stable, got: {outcomes:?}", + ); + + // At least one orchestrator cycle expected before convergence — the + // first respawn succeeds but the submitter tick exits with DangerZone. + assert!( + outcomes.len() >= 2, + "danger-zone restart cycle must involve at least one failed attempt \ + before converging (else we're not exercising the flush/shutdown path): {outcomes:?}", + ); + + let counts = runtime.count_batches()?; + + let mut ws_after = runtime.ws(0).await?; + let mut replay_after = ReplayWalletApp::devnet(); + replay_after.apply( + ws_after + .expect_direct_input_from(runtime.erc20_portal_address()) + .await?, + )?; + if counts.invalidated >= 1 { + ws_after.expect_no_message_for(NO_WS_MESSAGE_WAIT).await?; + assert_eq!( + replay_after.current_user_balance(alice_address), + U256::from(600_000_u64), + "cascade must roll Alice back to the full deposit", + ); + assert_eq!(replay_after.current_user_balance(bob_address), U256::ZERO); + assert_eq!(replay_after.current_user_nonce(alice_address), 0); + } else { + replay_after.apply(ws_after.expect_user_op_from(alice_address).await?)?; + ws_after.expect_no_message_for(NO_WS_MESSAGE_WAIT).await?; + assert_eq!( + replay_after.current_user_balance(alice_address), + U256::from(500_000_u64), + "if no cascade fired, the pre-outage transfer must have remained canonical", + ); + assert_eq!( + replay_after.current_user_balance(bob_address), + U256::from(100_000_u64), + ); + assert_eq!(replay_after.current_user_nonce(alice_address), 1); + } + + Ok(()) +} + +// §11.2.2 follow-up — Provider outage into the 
danger zone while the +// sequencer is running, mid-run DangerZone exit, then reconnect + restart +// cycle converges. +// +// The existing §11.2.2 test stops at "refuse to reboot while proxy still +// disconnected". This completes that story: after the sequencer self-exits +// mid-run via its live wall-clock fallback and the proxy reconnects, the +// orchestrator restart cycle eventually converges — same +// `respawn_until_stable` pattern as §11.1.5 / §11.3.3. +// +// Ordering detail: the wall-clock advance only advances the sequencer's +// clock; the proxy has been disconnecting Anvil traffic, so Anvil's block +// count advanced via `mine_l1_blocks` (which bypasses the proxy). When the +// proxy reconnects, the sequencer sees both the shifted wall clock and the +// fresh safe head via the same RPC connection. +async fn run_provider_outage_danger_zone_mid_run_exit_then_restart_cycle_recovers_test( + runtime: &mut ManagedSequencer, +) -> ScenarioResult<()> { + const INTO_DANGER: Duration = Duration::from_secs(3 * 60 * 60 + 55 * 60); + const ADVANCE_PER_RETRY: Duration = blocks_as_duration(RESPAWN_RETRY_ADVANCE_BLOCKS); + + let alice = TestSigner::from_default(1)?; + let bob = TestSigner::from_default(2)?; + let alice_address = alice.address(); + let bob_address = bob.address(); + + // Baseline — deposit + transfer while running directly against Anvil, + // then route through the proxy for the outage. 
+ let alice_l1 = runtime.wallet_l1(alice.clone()).await?; + let mut replay_before = ReplayWalletApp::devnet(); + { + let mut ws = runtime.ws(0).await?; + apply_safe_supported_deposit( + runtime, + &mut ws, + &mut replay_before, + &alice_l1, + U256::from(600_000_u64), + ) + .await?; + } + + runtime.stop().await?; + let proxy = TcpProxy::spawn(runtime.l1_endpoint()).await?; + runtime.set_l1_endpoint_override(Some(proxy.endpoint())); + runtime.respawn().await?; + + let mut ws = runtime.ws(0).await?; + let mut alice_l2 = runtime.wallet_l2(alice.clone())?; + ws.expect_direct_input_from(runtime.erc20_portal_address()) + .await?; + alice_l2 + .transfer(bob_address, U256::from(100_000_u64)) + .await?; + replay_before.apply(ws.expect_user_op_from(alice_address).await?)?; + drop(ws); + + // Mid-run outage: proxy goes down, coupled wall+L1 advance into danger. + // The running sequencer's submitter tick hits the disconnect, runs + // wall_clock_danger_estimate, sees past-danger, and exits with + // `DangerZone`. + proxy.disconnect(); + runtime.advance_wall_and_mine(INTO_DANGER).await?; + + let exit_status = runtime.wait_for_exit(Duration::from_secs(30)).await?; + assert!( + !exit_status.success(), + "sequencer must self-exit on mid-run DangerZone, got {exit_status:?}", + ); + + // L1 comes back. Run the orchestrator cycle: the aged closed batch + // eventually ages past `MAX_WAIT_BLOCKS` and startup recovery cascades. 
+ proxy.reconnect(); + + let outcomes = runtime + .respawn_until_stable(RespawnPolicy { + max_attempts: 5, + stabilization: Duration::from_secs(8), + advance_per_retry: Some(ADVANCE_PER_RETRY), + }) + .await?; + assert!( + matches!(outcomes.last(), Some(RespawnAttemptOutcome::Stable)), + "restart cycle must converge to Stable, got: {outcomes:?}", + ); + + let counts = runtime.count_batches()?; + assert!( + counts.invalidated >= 1, + "expected cascade after mid-run exit + restart cycle: {counts:?}", + ); + + let mut ws_after = runtime.ws(0).await?; + let mut replay_after = ReplayWalletApp::devnet(); + replay_after.apply( + ws_after + .expect_direct_input_from(runtime.erc20_portal_address()) + .await?, + )?; + ws_after.expect_no_message_for(NO_WS_MESSAGE_WAIT).await?; + + assert_eq!( + replay_after.current_user_balance(alice_address), + U256::from(600_000_u64), + ); + assert_eq!(replay_after.current_user_balance(bob_address), U256::ZERO); + assert_eq!(replay_after.current_user_nonce(alice_address), 0); + + proxy.shutdown().await?; + Ok(()) +} + +// §7.8.2 — First-boot-with-L1-down refuses to boot (wall-clock fallback +// treats "never synced" as danger). +// +// `wall_clock_danger_estimate` has a distinguished branch for `last_sync_ms +// == 0`: it refuses to proceed because the sequencer has no baseline to +// measure drift against, so issuing soft confirmations against whatever +// stale safe head we last saw is unsafe. There's a unit test covering that +// branch in isolation; this e2e confirms the full `run()` boot path +// respects it end-to-end. +// +// How we reach the condition: the harness's `spawn()` does a successful +// first boot (needs L1 reachable to deploy contracts and bootstrap the +// chain-id/InputBox cache). We stop, rewrite `l1_safe_head.synced_at_ms` +// to 0 directly, then respawn with the proxy disconnected. 
The bootstrap +// cache is still populated — so the sequencer gets past the +// contract-discovery phase — but the wall-clock fallback sees the zeroed +// timestamp and `decide_startup_action` returns +// `Refuse(NeverSyncedAndUnreachable)`. +// +// Scope note: a "truly" first-ever boot would fail even earlier (no +// bootstrap cache, can't discover contracts). That's a separate test; this +// one targets only the wall-clock-fallback branch. +async fn run_first_boot_l1_unreachable_never_synced_refuses_boot_test( + runtime: &mut ManagedSequencer, +) -> ScenarioResult<()> { + // Baseline boot so the bootstrap cache lands on disk. + { + let _ws = runtime.ws(0).await?; + } + + runtime.stop().await?; + + // Simulate "never synced L1" by zeroing the timestamp. The block number + // stays whatever it already is — the wall-clock fallback keys off + // `synced_at_ms == 0`, not the block count. + runtime.reset_l1_safe_head_synced_at_ms()?; + + // Route the sequencer through a disconnected proxy so L1 is unreachable + // from the sequencer's perspective. + let proxy = TcpProxy::spawn(runtime.l1_endpoint()).await?; + runtime.set_l1_endpoint_override(Some(proxy.endpoint())); + proxy.disconnect(); + + let respawn_result = runtime.respawn().await; + assert!( + respawn_result.is_err(), + "never-synced + L1-unreachable must refuse to boot, got {respawn_result:?}", + ); + + // Confirm the refusal is reversible: reconnect the proxy and the + // sequencer boots normally (the wall-clock fallback path is gated on + // L1 unreachability, not on any persistent flag). + proxy.reconnect(); + runtime.respawn().await?; + + proxy.shutdown().await?; + Ok(()) +} + +// §11.1.4 — Past-stale closed+submitted batch (delayed-inclusion cascade). +// +// Scenario: a batch closes and the submitter's L1 tx is never mined (the +// gateway dropped it, mempool evicted it, whatever). Blocks accumulate. 
On +// the sequencer's next startup recovery, the batch's first frame is > +// `MAX_WAIT_BLOCKS` behind current_safe_block, so the scheduler skips it +// in `populate_safe_accepted_batches` and `find_first_batch_in_danger` +// flags it — cascade fires. +// +// This is the structural sibling of §11.1.3 (open-batch variant) for +// closed+submitted batches. The `find_first_batch_in_danger` path has two +// flavors: "open batch got old" (§11.1.3) and "closed batch submission +// got lost" (this one). Both need to cascade correctly; §11.1.3 had e2e +// coverage, the closed-submitted variant had none. +// +// Setup shape: we use Anvil's `setAutomine(false)` + `dropAllPendingTxs` +// (new T2 harness primitives) to hold the sequencer's batch-submission tx +// out of the chain, then drop it entirely — cleaner than the mempool-hold +// approach, because `anvil_mine(N)` with a pending tx would include it in +// the first mined block, not the Nth. Dropping simulates gateway packet +// loss directly and advances 1250 genuinely empty blocks. +async fn run_delayed_inclusion_cascades_on_restart_test( + runtime: &mut ManagedSequencer, +) -> ScenarioResult<()> { + // Past-stale: 1250 blocks > MAX_WAIT_BLOCKS (1200). + const PAST_STALE: Duration = blocks_as_duration(PAST_STALE_BLOCKS); + // Enough transfers to trigger at least one size-based batch close. + // Matches §11.2.1's sizing (≈100 B/op × 150 ops ≈ 15 KB > 12 KB target). + const TRANSFERS_TO_FORCE_BATCH_CLOSE: usize = 150; + // After the last transfer, wait for the submitter's next tick so it + // picks up the closed batch and sends the L1 tx to the (now-held) + // mempool. Default `idle_poll_interval` is 5 s. + const WAIT_FOR_SUBMITTER_TICK: Duration = Duration::from_secs(7); + + let alice = TestSigner::from_default(1)?; + let bob = TestSigner::from_default(2)?; + let alice_address = alice.address(); + let bob_address = bob.address(); + + // Fund Alice generously — 151 transfers + fees is well under 10 M. 
+ let alice_l1 = runtime.wallet_l1(alice.clone()).await?; + let deposit_amount = U256::from(10_000_000_u64); + let mut replay_before = ReplayWalletApp::devnet(); + + let mut ws = runtime.ws(0).await?; + apply_safe_supported_deposit( + runtime, + &mut ws, + &mut replay_before, + &alice_l1, + deposit_amount, + ) + .await?; + + // Capture the sealed-batch baseline BEFORE we disable auto-mining so + // we can assert that at least one new batch sealed during the + // mempool-held phase. + let batches_before_close = runtime.count_batches()?; + + // Hold the mempool. From here, txs go to Anvil but don't mine until + // we either re-enable auto-mining or call `anvil_mine`. + runtime.set_automine(false).await?; + + // Submit enough transfers to trigger at least one size-triggered + // batch close. Each POST /tx is processed by the sequencer + // synchronously; the inclusion lane seals a batch when cumulative + // user-op bytes exceed the target. + let mut alice_l2 = runtime.wallet_l2(alice)?; + for _ in 0..TRANSFERS_TO_FORCE_BATCH_CLOSE { + alice_l2.transfer(bob_address, U256::from(1_u64)).await?; + replay_before.apply(ws.expect_user_op_from(alice_address).await?)?; + } + + // Give the submitter tick time to fire and put the batch-submission + // tx into the (held) mempool. + tokio::time::sleep(WAIT_FOR_SUBMITTER_TICK).await; + + let batches_after_close = runtime.count_batches()?; + assert!( + batches_after_close.sealed > batches_before_close.sealed, + "expected at least one new sealed batch: before={batches_before_close:?} after={batches_after_close:?}", + ); + + // Shut the sequencer down, then drop the mempool so the submitted + // batch tx never lands. The sequencer's DB still shows a sealed + // batch; L1 has no corresponding event. + drop(ws); + runtime.stop().await?; + runtime.drop_all_pending_txs().await?; + + // Advance past MAX_WAIT_BLOCKS. With auto-mining still off but the + // mempool empty, these are genuinely empty blocks — nothing to + // include. 
`advance_wall_and_mine` also shifts the sequencer's + // faketime offset so the wall-clock fallback stays in sync with L1. + runtime.advance_wall_and_mine(PAST_STALE).await?; + + // Re-enable auto-mining before respawn: startup recovery's flush step + // submits a no-op at the stuck wallet-nonce slot and needs it mined + // to progress. With auto-mining off, the flusher would hang. + runtime.set_automine(true).await?; + + runtime.respawn().await?; + + // Verify cascade fired. + let counts = runtime.count_batches()?; + assert!( + counts.invalidated >= 1, + "expected cascade-invalidation of the delayed-inclusion batch: {counts:?}", + ); + + // Replay from offset 0: the deposit must be re-drained (it's still a + // safe L1 input), and the sealed batch's transfers must be gone. + let mut ws_after = runtime.ws(0).await?; + let mut replay_after = ReplayWalletApp::devnet(); + replay_after.apply( + ws_after + .expect_direct_input_from(runtime.erc20_portal_address()) + .await?, + )?; + ws_after.expect_no_message_for(NO_WS_MESSAGE_WAIT).await?; + + assert_eq!( + replay_after.current_user_balance(alice_address), + deposit_amount, + "Alice must have her full deposit back (all transfers invalidated)", + ); + assert_eq!( + replay_after.current_user_balance(bob_address), + U256::ZERO, + "Bob's receiving balance must roll back", + ); + assert_eq!(replay_after.current_user_nonce(alice_address), 0); + + Ok(()) +} + +// §7.3.5 — Aging Tip while sequencer is UP and L1 is reachable. +// +// Negative control for the danger-check split (see TEST_PLAN §7.3.5 and +// `check_danger_zone_does_not_flag_open_batch_zombie`). The submitter's +// `check_danger_zone` runs every tick and is intentionally **closed-only**: +// its response (shutdown → flush → restart) only makes sense for batches +// that have a pending L1 submission and could zombie on confirm. An open +// Tip has no submission and no zombie risk — flagging it would trigger a +// pointless restart loop. 
+// +// This test exercises the invariant end-to-end. With L1 reachable and the +// wall clock held (`max_batch_open` not yet elapsed), we advance L1 into +// the danger zone and verify the sequencer keeps running. Only once we +// shift the wall clock past `max_batch_open` — forcing the Tip to close +// naturally — does the submitter's tick rightly fire `DangerZone`. +// +// Staging (decoupled L1/wall-clock advance): +// 1. Baseline: deposit + transfer → Tip at first_frame_safe_block X. +// 2. `mine_l1_blocks(1150)` — current_safe_block jumps 1150 past X +// (into the danger window, below MAX_WAIT). Wall clock unchanged, +// so inclusion lane's time-based close doesn't fire and Tip stays +// open. +// 3. `observe_for(8 s)` — real wall-clock wait that gives the input +// reader (~2 s poll) time to sync the new safe head and the +// submitter (~5 s poll) time to tick at least once. Assert the +// child is still alive: the only way it would exit here is if the +// zombie check wrongly flagged the open Tip. +// 4. `set_faketime_offset("+2h5m")` — jump the wall clock past +// `DEFAULT_MAX_BATCH_OPEN` (2 h). Inclusion lane's next iteration +// closes the Tip. Closed batch's age in L1 blocks is already 1150 +// > `danger_threshold` (1125), so the submitter's next tick exits +// with `DangerZone`. +// 5. `wait_for_exit` + assert exit status is non-zero and +// `counts.invalidated == 0` (we never crossed MAX_WAIT_BLOCKS, so +// no cascade). +// +// If someone accidentally unifies `check_danger_zone` to include open +// batches, step 3's `observe_for` captures a `Some(exit)` and this test +// fails with a clear message. That's the bug class the schema refactor +// was designed to prevent. +async fn run_aging_open_tip_tolerated_by_zombie_check_test( + runtime: &mut ManagedSequencer, +) -> ScenarioResult<()> { + // Comfortably past `DANGER_THRESHOLD_BLOCKS`, below `MAX_WAIT_BLOCKS`. No + // cascade expected. Uses module-level `DANGER_ZONE_BLOCKS`. 
+ // Must exceed `DEFAULT_MAX_BATCH_OPEN` (2 h = 7200 s). 5 min of headroom. + // Use the `+Ns` format that `advance_wall_and_mine` writes — libfaketime + // parses it reliably; combined-unit forms like `+2h5m` are unreliable. + const WALL_CLOCK_PAST_MAX_BATCH_OPEN: &str = "+7500s"; + // Spans at least one submitter `idle_poll_interval` (default 5 s) plus + // input-reader lag (~2 s) with a safety margin, so we can reliably + // observe "did not exit" rather than racing the first tick. + const TOLERATE_WINDOW: Duration = Duration::from_secs(8); + + let alice = TestSigner::from_default(1)?; + let bob = TestSigner::from_default(2)?; + let alice_address = alice.address(); + let bob_address = bob.address(); + + // Baseline: one transfer into the open Tip. The Tip's first frame is + // anchored at the current safe_block; we'll advance L1 past this + // without closing. + let alice_l1 = runtime.wallet_l1(alice.clone()).await?; + let mut alice_l2 = runtime.wallet_l2(alice)?; + let mut replay = ReplayWalletApp::devnet(); + { + let mut ws = runtime.ws(0).await?; + apply_safe_supported_deposit( + runtime, + &mut ws, + &mut replay, + &alice_l1, + U256::from(600_000_u64), + ) + .await?; + alice_l2.transfer(bob_address, U256::from(1_u64)).await?; + replay.apply(ws.expect_user_op_from(alice_address).await?)?; + } + + // L1 jumps into the danger window; wall clock stays put (Tip stays + // open). `mine_l1_blocks` doesn't touch faketime, so this is a + // genuinely decoupled advance. + runtime.mine_l1_blocks(DANGER_ZONE_BLOCKS).await?; + + // Negative control: the submitter's zombie check must NOT fire on + // the aging open Tip. If it did, `observe_for` returns `Some(exit)` + // and we fail with a clear message. 
+ let early_exit = runtime.observe_for(TOLERATE_WINDOW).await?; + assert!( + early_exit.is_none(), + "sequencer must tolerate an aging open Tip while L1 is reachable — \ + zombie check is closed-only; got unexpected exit {early_exit:?}", + ); + + // Trigger the natural close: jump the wall clock past + // `max_batch_open`. The inclusion lane closes on its next iteration + // (~10 ms), producing a closed batch already in danger. The + // submitter's next tick (within ~5 s) sees it and exits. + runtime.set_faketime_offset(Some(WALL_CLOCK_PAST_MAX_BATCH_OPEN.to_string()))?; + + let exit = runtime.wait_for_exit(Duration::from_secs(15)).await?; + assert!( + !exit.success(), + "sequencer must exit non-zero on submitter `DangerZone` after Tip closes, got {exit:?}", + ); + + // Below MAX_WAIT_BLOCKS: no cascade. The batch is flush-eligible but + // not invalidated. If anyone changes `run_startup_recovery` to + // cascade at `danger_threshold` instead of `MAX_WAIT_BLOCKS`, this + // assertion fails and signals the regression. + let counts = runtime.count_batches()?; + assert_eq!( + counts.invalidated, 0, + "danger-zone self-exit must not invalidate batches: {counts:?}", + ); + + Ok(()) +} + +// §7.8.6 — Provider reachable, safe head frozen, live submitter self-exits. +// +// This is the runtime twin of §7.8.5. We first reproduce the existing +// "aging open Tip under reachable L1" negative control: the reader catches up +// to a danger-window safe head and the sequencer stays alive because the +// closed-batch zombie check intentionally ignores the open Tip. We then freeze +// safe-head progress (no more L1 blocks) and jump only the sequencer's wall +// clock forward. The live submitter should notice the missing safe-progress +// timestamp advance and exit with `DangerZone` before the provider itself +// fails. 
+async fn run_stalled_safe_head_live_exit_test( + runtime: &mut ManagedSequencer, +) -> ScenarioResult<()> { + const STALLED_SAFE_HEAD_OFFSET: &str = "+30s"; + const SAFE_HEAD_SYNC_WINDOW: Duration = Duration::from_secs(8); + + let alice = TestSigner::from_default(1)?; + let bob = TestSigner::from_default(2)?; + let alice_address = alice.address(); + let bob_address = bob.address(); + + let alice_l1 = runtime.wallet_l1(alice.clone()).await?; + let mut alice_l2 = runtime.wallet_l2(alice)?; + let mut replay = ReplayWalletApp::devnet(); + { + let mut ws = runtime.ws(0).await?; + apply_safe_supported_deposit( + runtime, + &mut ws, + &mut replay, + &alice_l1, + U256::from(600_000_u64), + ) + .await?; + alice_l2.transfer(bob_address, U256::from(1_u64)).await?; + replay.apply(ws.expect_user_op_from(alice_address).await?)?; + } + + runtime.mine_l1_blocks(DANGER_ZONE_BLOCKS).await?; + + let early_exit = runtime.observe_for(SAFE_HEAD_SYNC_WINDOW).await?; + assert!( + early_exit.is_none(), + "the closed-only zombie check must keep tolerating an aging open Tip before the safe head stalls; got unexpected exit {early_exit:?}", + ); + + runtime.set_faketime_offset(Some(STALLED_SAFE_HEAD_OFFSET.to_string()))?; + + let exit = runtime.wait_for_exit(Duration::from_secs(15)).await?; + assert!( + !exit.success(), + "reachable-but-stalled safe head must force a non-zero self-exit before the provider fails, got {exit:?}", + ); + + let counts = runtime.count_batches()?; + assert_eq!( + counts.invalidated, 0, + "live stalled-safe-head shutdown must not cascade batches on its own: {counts:?}", + ); + + Ok(()) +} + +// §4.4.2 — Reconnect at a previously-observed offset that got invalidated +// after the WS connection dropped. +// +// A WS connection cannot span invalidation: the sequencer necessarily exits +// (DangerZone or stop) before `detect_and_recover` runs, and the socket dies +// with the process. 
The meaningful invariant is the **reconnect** behavior — +// a client that reconnects at `from_offset=N`, where `N` was an offset it +// previously received and whose row is *now invalidated*, must see the +// cursor skip cleanly past `N` and deliver only post-recovery events. +// +// §4.4.1 covers the adjacent case (`from_offset=0`), which trivially walks +// `valid_sequenced_l2_txs` from the start. This case is distinct because +// the query `WHERE offset > N` is pointed at an offset that no longer +// exists in the valid view. +async fn run_ws_reconnect_at_invalidated_offset_skips_cleanly_test( + runtime: &mut ManagedSequencer, +) -> ScenarioResult<()> { + // Past-stale: matches `recovery_after_stale_batches_test` sizing. + const PAST_STALE: Duration = blocks_as_duration(MAX_WAIT_BLOCKS); + + let alice = TestSigner::from_default(1)?; + let bob = TestSigner::from_default(2)?; + let alice_address = alice.address(); + let bob_address = bob.address(); + + let alice_l1 = runtime.wallet_l1(alice.clone()).await?; + let mut alice_l2 = runtime.wallet_l2(alice)?; + let mut replay_before = ReplayWalletApp::devnet(); + + // Build up offsets 0 (deposit) and 1 (transfer) and capture the + // transfer's offset so we can later reconnect at it. + let mut ws = runtime.ws(0).await?; + apply_safe_supported_deposit( + runtime, + &mut ws, + &mut replay_before, + &alice_l1, + U256::from(600_000_u64), + ) + .await?; + alice_l2 + .transfer(bob_address, U256::from(100_000_u64)) + .await?; + let transfer_msg = ws.expect_user_op_from(alice_address).await?; + let last_seen_offset = transfer_msg.offset(); + replay_before.apply(transfer_msg)?; + + // Kill the WS socket and the sequencer (same way a real reconnect arc + // works — process dies, client dials back in). + drop(ws); + runtime.stop().await?; + + runtime.advance_wall_and_mine(PAST_STALE).await?; + runtime.respawn().await?; + + // Reconnect at the last offset the client observed — now invalidated. 
+ // The query `WHERE offset > last_seen_offset` against + // `valid_sequenced_l2_txs` must skip cleanly past the invalidated + // rows and deliver only the post-recovery events (the re-drained + // deposit). + let mut ws_after = runtime.ws(last_seen_offset).await?; + let redrained = ws_after + .expect_direct_input_from(runtime.erc20_portal_address()) + .await?; + // The re-drained deposit's offset is strictly greater than the + // last-seen offset — if the cursor ever delivered an invalidated row + // or the same offset again, that'd be the regression. + assert!( + redrained.offset() > last_seen_offset, + "re-drained event must have a strictly-greater offset: \ + last_seen={last_seen_offset}, redrained={}", + redrained.offset(), + ); + ws_after.expect_no_message_for(NO_WS_MESSAGE_WAIT).await?; + + // Sanity check: also reconnecting at 0 produces the same single event + // (§4.4.1's property), to rule out any one-off weirdness in the + // non-zero reconnect path. + drop(ws_after); + let mut ws_from_zero = runtime.ws(0).await?; + let redrained_from_zero = ws_from_zero + .expect_direct_input_from(runtime.erc20_portal_address()) + .await?; + assert_eq!( + redrained.offset(), + redrained_from_zero.offset(), + "reconnect-at-invalidated and reconnect-at-zero must deliver the \ + same next valid event", + ); + ws_from_zero + .expect_no_message_for(NO_WS_MESSAGE_WAIT) + .await?; + + Ok(()) +} + +// §4.1.3 — `from_offset=future` waits silently without erroring. +// +// A subscribe at a far-future offset is a valid subscription that should +// behave the same way `from_offset=0` does on an empty feed: sit idle on +// the live broadcast channel until an event with a greater offset arrives, +// no error, no close. +// +// The behavior is deliberately consistent with `from_offset=0` on an empty +// head — otherwise we'd be making the wait-for-something-new path differ +// based on whether history exists. Test pins this as part of the WS +// subscription contract. 
+async fn run_ws_subscribe_from_future_offset_waits_silently_test( + runtime: &mut ManagedSequencer, +) -> ScenarioResult<()> { + // Comfortably beyond any offset this test will produce. `sequenced_l2_txs` + // is rowid-based; rowid_u64 ≤ a few by the end of the short workload. + const FUTURE_OFFSET: u64 = 1_000_000; + // Enough real time to observe "waits silently" without being slow. + const WAIT_WINDOW: Duration = Duration::from_secs(2); + + let alice = TestSigner::from_default(1)?; + let bob = TestSigner::from_default(2)?; + let alice_address = alice.address(); + let bob_address = bob.address(); + + // Seed some actual events so we're not testing "empty head, future + // offset" (trivial case). We want "non-trivial head, offset beyond it". + let alice_l1 = runtime.wallet_l1(alice.clone()).await?; + let mut alice_l2 = runtime.wallet_l2(alice)?; + let mut replay = ReplayWalletApp::devnet(); + { + let mut ws = runtime.ws(0).await?; + apply_safe_supported_deposit( + runtime, + &mut ws, + &mut replay, + &alice_l1, + U256::from(500_000_u64), + ) + .await?; + alice_l2.transfer(bob_address, U256::from(1_u64)).await?; + replay.apply(ws.expect_user_op_from(alice_address).await?)?; + } + + // Subscribe far beyond the current head. The subscribe itself must + // succeed (no 4xx / WS close code), and the resulting stream must be + // quiet until something with a greater offset arrives. + let mut ws_future = runtime.ws(FUTURE_OFFSET).await?; + ws_future.expect_no_message_for(WAIT_WINDOW).await?; + + // Generate more activity. These events are still at offsets far below + // `FUTURE_OFFSET`, so they must not be delivered — the subscription + // keeps waiting. 
+ alice_l2.transfer(bob_address, U256::from(1_u64)).await?; + alice_l2.transfer(bob_address, U256::from(1_u64)).await?; + ws_future.expect_no_message_for(WAIT_WINDOW).await?; + + Ok(()) +} + +// §7.4.2 — Safe direct input that was NOT yet drained before the cascade +// must be drained into the recovery batch's first frame. +// +// Distinct from §7.4.1 (`recovery_after_stale_batches_test`), where the +// direct input was drained into an invalidated batch and gets *re*-drained +// on recovery. Here we exercise the simpler case: the input hit the +// sequencer's view post-stop, so it was never referenced by any frame; +// recovery must include it in the fresh batch's leading range. +// +// Setup: +// 1. Spawn + stop immediately. Initial Tip is empty and anchored at an +// early safe_block. +// 2. Deposit on L1 directly (sequencer is stopped, so the event isn't +// consumed yet). +// 3. Advance L1 past MAX_WAIT_BLOCKS to age the empty initial Tip past +// stale. +// 4. Respawn. Startup recovery syncs the new safe head, sees the +// deposit in `safe_inputs`, cascades the aged initial Tip, and opens +// a recovery batch with `leading_range = [next_undrained, end)` — +// including the undrained deposit. +// 5. WS replay at offset 0 must deliver the deposit event (drained +// exactly once, into the recovery batch's first frame). +async fn run_recovery_drains_safe_but_undrained_direct_input_test( + runtime: &mut ManagedSequencer, +) -> ScenarioResult<()> { + const PAST_STALE: Duration = blocks_as_duration(PAST_STALE_BLOCKS); + + let alice = TestSigner::from_default(1)?; + let alice_address = alice.address(); + + let alice_l1 = runtime.wallet_l1(alice.clone()).await?; + + // Stop the sequencer before any user-level activity. The initial Tip + // is empty and anchored at whatever safe_block the lane saw on first + // boot. 
+ runtime.stop().await?; + + // Deposit happens entirely on L1 while the sequencer is offline — + // WalletL1Client dials Anvil directly, not through the sequencer. + let deposit_amount = U256::from(600_000_u64); + alice_l1.mint_supported_token(deposit_amount).await?; + alice_l1.deposit_supported_token(deposit_amount).await?; + + // Advance L1 past MAX_WAIT_BLOCKS + safe-depth so the aged empty + // initial Tip gets cascaded and the deposit event is safe. + runtime.advance_wall_and_mine(PAST_STALE).await?; + + runtime.respawn().await?; + + // WS from offset 0. Recovery batch's first frame must contain the + // deposit (never drained before), and nothing else. + let mut ws_after = runtime.ws(0).await?; + let mut replay_after = ReplayWalletApp::devnet(); + let deposit_msg = ws_after + .expect_direct_input_from(runtime.erc20_portal_address()) + .await?; + replay_after.apply(deposit_msg)?; + ws_after.expect_no_message_for(NO_WS_MESSAGE_WAIT).await?; + + assert_eq!( + replay_after.current_user_balance(alice_address), + deposit_amount, + "the deposit, never previously drained, must land in the recovery \ + batch's first frame", + ); + + // Cascade fired on the empty initial Tip. + let counts = runtime.count_batches()?; + assert!( + counts.invalidated >= 1, + "expected the empty initial Tip to be cascaded: {counts:?}", + ); + + Ok(()) +} + +// §7.4.3 — Recovery batch opens empty when no direct inputs are pending. +// +// Negative control for §7.4.2: same overall shape but with no L1 deposit +// before respawn. The recovery batch's `leading_range` is `[0, 0)` and the +// batch's first frame is empty. WS replay delivers nothing. +async fn run_recovery_batch_opens_empty_when_no_direct_inputs_pending_test( + runtime: &mut ManagedSequencer, +) -> ScenarioResult<()> { + const PAST_STALE: Duration = blocks_as_duration(PAST_STALE_BLOCKS); + + runtime.stop().await?; + + // No deposits, no user ops. Just age the initial Tip past stale. 
+ runtime.advance_wall_and_mine(PAST_STALE).await?; + + runtime.respawn().await?; + + // WS from offset 0 must deliver nothing — the recovery batch is empty. + let mut ws_after = runtime.ws(0).await?; + ws_after.expect_no_message_for(NO_WS_MESSAGE_WAIT).await?; + + // Cascade still fired (empty initial Tip past MAX_WAIT). + let counts = runtime.count_batches()?; + assert!( + counts.invalidated >= 1, + "expected the empty initial Tip to be cascaded even without direct \ + inputs: {counts:?}", + ); + + Ok(()) +} + +// §10.1.1 — Replay determinism: for any workload accepted live, catch-up +// replay must produce an identical per-user state. +// +// This is the `Application` trait's fundamental contract (see +// `AGENTS.md` §Application-Trait-Contract). Without it, restart +// replay and WS catch-up aren't equivalent to live execution — the +// whole soft-confirmation model collapses. +// +// `restart_and_replay_test` covers a single-user two-op workload; this +// test uses a deliberately diverse multi-user, multi-op mix (three +// senders, deposits interleaved with transfers and withdrawals) and +// asserts a *direct* equality between the live replay (assembled from +// WS events observed during execution) and the post-restart replay +// (assembled from WS catch-up at offset 0). Any per-user balance or +// nonce divergence would signal a non-deterministic application or a +// catch-up bug. 
+async fn run_replay_matches_live_for_mixed_workload_test( + runtime: &mut ManagedSequencer, +) -> ScenarioResult<()> { + let alice = TestSigner::from_default(1)?; + let bob = TestSigner::from_default(2)?; + let charlie = TestSigner::from_default(3)?; + let alice_address = alice.address(); + let bob_address = bob.address(); + let charlie_address = charlie.address(); + + let alice_l1 = runtime.wallet_l1(alice.clone()).await?; + let charlie_l1 = runtime.wallet_l1(charlie.clone()).await?; + let mut alice_l2 = runtime.wallet_l2(alice)?; + let mut bob_l2 = runtime.wallet_l2(bob)?; + let mut charlie_l2 = runtime.wallet_l2(charlie)?; + + let mut ws = runtime.ws(0).await?; + let mut replay_live = ReplayWalletApp::devnet(); + + // Diverse workload — exercises deposit-interleaving and every op + // combination supported by the wallet app. + apply_safe_supported_deposit( + runtime, + &mut ws, + &mut replay_live, + &alice_l1, + U256::from(1_000_000_u64), + ) + .await?; + alice_l2 + .transfer(bob_address, U256::from(400_000_u64)) + .await?; + replay_live.apply(ws.expect_user_op_from(alice_address).await?)?; + + apply_safe_supported_deposit( + runtime, + &mut ws, + &mut replay_live, + &charlie_l1, + U256::from(500_000_u64), + ) + .await?; + bob_l2 + .transfer(charlie_address, U256::from(150_000_u64)) + .await?; + replay_live.apply(ws.expect_user_op_from(bob_address).await?)?; + + charlie_l2.withdraw(U256::from(100_000_u64)).await?; + replay_live.apply(ws.expect_user_op_from(charlie_address).await?)?; + + alice_l2 + .transfer(charlie_address, U256::from(50_000_u64)) + .await?; + replay_live.apply(ws.expect_user_op_from(alice_address).await?)?; + + bob_l2.withdraw(U256::from(50_000_u64)).await?; + replay_live.apply(ws.expect_user_op_from(bob_address).await?)?; + + let expected_input_count = replay_live.executed_input_count(); + + // Restart + catch-up replay. 
Each WS catch-up event feeds the fresh + // replay identically to how the live stream fed the original; if the + // application is deterministic, the two replays must be bit-identical + // across every per-user view the replay exposes. + drop(ws); + runtime.restart().await?; + let mut ws_after = runtime.ws(0).await?; + let mut replay_post = ReplayWalletApp::devnet(); + + // Two deposits + five user ops = seven events. + for _ in 0..expected_input_count { + replay_post.apply(ws_after.next_message().await?)?; + } + ws_after.expect_no_message_for(NO_WS_MESSAGE_WAIT).await?; + + for addr in [alice_address, bob_address, charlie_address] { + assert_eq!( + replay_post.current_user_balance(addr), + replay_live.current_user_balance(addr), + "balance divergence for {addr:?}: live vs. replay must match", + ); + assert_eq!( + replay_post.current_user_nonce(addr), + replay_live.current_user_nonce(addr), + "nonce divergence for {addr:?}: live vs. replay must match", + ); + } + assert_eq!( + replay_post.executed_input_count(), + replay_live.executed_input_count(), + ); + + Ok(()) +} + +// §5.4.1 / §5.4.2 — Transient provider outage: the L1 input reader must +// retry on provider errors (connection refused, timeout) without +// crashing, and pick up the backlog on reconnect. +// +// Distinct from §11.4.1 (short hiccup under load), which tests the batch +// submitter's retry path via POST activity. Here the interesting +// component is the **input reader**: its only job is polling L1 for new +// events, so the only observable signal that its retry loop works is +// whether a deposit made *during the disconnect* (and thus invisible +// until the proxy comes back) lands on the WS feed after reconnect. +// +// Scenario: +// 1. Route the sequencer through the proxy. +// 2. Disconnect proxy. Alice deposits on L1 (via `WalletL1Client`, +// which dials Anvil directly — bypassing the proxy). +// 3. Advance a few L1 blocks to push the deposit past safe depth. 
The +// sequencer's reader keeps failing to fetch (connection refused +// from the disconnected proxy) and retrying. +// 4. Reconnect proxy. The reader's next poll succeeds; backlog is +// pulled in; the WS subscriber (still connected) receives the +// deposit event. +// 5. Assert the sequencer didn't crash (no respawn needed, still same +// child) and the deposit landed. +async fn run_provider_outage_input_reader_retries_after_reconnect_test( + runtime: &mut ManagedSequencer, +) -> ScenarioResult<()> { + // Well below any stale threshold — we just need safe-depth headroom. + const SAFE_DEPTH_HEADROOM_BLOCKS: u64 = 20; + + let alice = TestSigner::from_default(1)?; + let alice_address = alice.address(); + + // Route through the proxy (stop → override → respawn). + runtime.stop().await?; + let proxy = TcpProxy::spawn(runtime.l1_endpoint()).await?; + runtime.set_l1_endpoint_override(Some(proxy.endpoint())); + runtime.respawn().await?; + + let alice_l1 = runtime.wallet_l1(alice).await?; + let mut ws = runtime.ws(0).await?; + let mut replay = ReplayWalletApp::devnet(); + + // Baseline deposit with the proxy connected — proves the WS + reader + // path works end-to-end before we break it. + apply_safe_supported_deposit( + runtime, + &mut ws, + &mut replay, + &alice_l1, + U256::from(300_000_u64), + ) + .await?; + + // Proxy down. The sequencer's reader polls on a ~2 s cadence; each + // poll will fail with a connection-refused-style provider error until + // we reconnect. + proxy.disconnect(); + + // Deposit while the proxy is down. The L1 wallet bypasses the proxy, + // so Anvil sees the deposit but the sequencer can't. + let late_amount = U256::from(400_000_u64); + alice_l1.mint_supported_token(late_amount).await?; + alice_l1.deposit_supported_token(late_amount).await?; + runtime.mine_l1_blocks(SAFE_DEPTH_HEADROOM_BLOCKS).await?; + + // During the disconnect, the reader should keep retrying rather than + // crashing. 
Assert the sequencer stays up for a few real seconds + // (long enough for multiple reader polls to fail + retry). + let early_exit = runtime.observe_for(Duration::from_secs(5)).await?; + assert!( + early_exit.is_none(), + "input reader must retry provider errors, not crash the process: \ + got unexpected exit {early_exit:?}", + ); + + // Reconnect. The reader's next poll succeeds, picks up the backlog, + // WS subscriber receives the event. + proxy.reconnect(); + + let late_deposit_msg = ws + .expect_direct_input_from(runtime.erc20_portal_address()) + .await?; + replay.apply(late_deposit_msg)?; + + assert_eq!( + replay.current_user_balance(alice_address), + U256::from(700_000_u64), + "both deposits should be reflected after reader catches up", + ); + + proxy.shutdown().await?; + Ok(()) +} + +// §8.1.2 — First-ever boot with empty bootstrap cache + L1 unreachable +// returns a fixed `RunError::Io("L1 unreachable and no bootstrap cache")`. +// +// Distinct from §7.8.2 (already covered): that test exercises the +// wall-clock fallback inside `run_preemptive_recovery`, which only fires +// AFTER bootstrap discovery has succeeded once (so the cache is +// populated). §8.1.2 targets the EARLIER failure — the +// `InputReader::new` discovery step where the sequencer asks L1 for the +// InputBox address + chain id. With nothing cached, that call has no +// fallback and the boot fails before recovery logic runs. +// +// The harness simulates "no cache" by `clear_l1_bootstrap_cache()` after +// a normal boot has populated it (truly first-ever boot would also lack +// a chain-id cache, but the failure mode is identical: the bootstrap +// step has nothing to fall back to). +async fn run_first_boot_no_cache_l1_unreachable_refuses_boot_test( + runtime: &mut ManagedSequencer, +) -> ScenarioResult<()> { + // Baseline boot to ensure the schema is fully migrated. We then + // clear the cache to mimic a first-ever boot. 
+ { + let _ws = runtime.ws(0).await?; + } + runtime.stop().await?; + runtime.clear_l1_bootstrap_cache()?; + + // Route through a disconnected proxy so InputReader::new fails with + // a provider error. + let proxy = TcpProxy::spawn(runtime.l1_endpoint()).await?; + runtime.set_l1_endpoint_override(Some(proxy.endpoint())); + proxy.disconnect(); + + let respawn_result = runtime.respawn().await; + assert!( + respawn_result.is_err(), + "first boot with no cache + L1 unreachable must refuse boot, got {respawn_result:?}", + ); + + // Verify reversibility: reconnect proxy, respawn, this time the + // bootstrap step succeeds and populates the cache. + proxy.reconnect(); + runtime.respawn().await?; + + proxy.shutdown().await?; + Ok(()) +} + +// §8.2.1 / §8.3.1 — Chain-id mismatch via the live RPC path. +// +// Companion to `chain_id_mismatch_from_cache_returns_typed_error` in +// `sequencer/tests/chain_id_validation.rs`. The cache-path test runs +// in-process against a stub; this test runs the full sequencer binary +// against real Anvil with a deliberately mismatched `--chain-id`, +// proving the RPC-comparison path returns +// `RunError::ChainIdMismatch` *before* writing the wrong-chain +// bootstrap cache. +// +// The pre-write ordering matters: a regression that swapped the +// cache-write and the chain-id check would leave a bad cache row on +// disk, poisoning future startups. Asserting `respawn_result.is_err()` +// alone catches the bad-error case; we additionally verify a +// post-correction respawn succeeds, which only happens if the cache +// wasn't poisoned (bootstrap reads the L1 chain-id again, sees it +// matches, writes the correct cache). +async fn run_chain_id_mismatch_via_live_rpc_refuses_boot_test( + runtime: &mut ManagedSequencer, +) -> ScenarioResult<()> { + // Anvil runs at `DEVNET_CHAIN_ID = 31337`. Pick something obviously + // different that's still valid (chain_id > 0). 
+ const WRONG_CHAIN_ID: u64 = 99_999; + + // Initial boot completes normally (no override). This populates the + // cache with the correct chain id. + { + let _ws = runtime.ws(0).await?; + } + runtime.stop().await?; + + // Clear the cache so the live RPC path runs (otherwise the cache + // path would catch the mismatch first). + runtime.clear_l1_bootstrap_cache()?; + + // Configure a mismatched chain id and respawn. The bootstrap-time + // RPC check returns the actual chain id (31337), compares it with + // the configured `--chain-id` (99999), and returns ChainIdMismatch. + runtime.set_chain_id_override(Some(WRONG_CHAIN_ID)); + let respawn_result = runtime.respawn().await; + assert!( + respawn_result.is_err(), + "chain-id mismatch via live RPC must refuse boot, got {respawn_result:?}", + ); + + // Reset to the correct chain id. Respawn must succeed — proves the + // failed attempt didn't poison the cache or other DB state. + runtime.set_chain_id_override(None); + runtime.respawn().await?; + + Ok(()) +} + +// §7.5.1 / §7.5.2 — Nonce-0 first batch recovery edge. +// +// Two coupled invariants: +// - §7.5.1: If the FIRST-EVER batch (nonce 0) goes stale before any +// batch reaches `Gold` (i.e., before any batch is L1-accepted), +// recovery cascades it and opens a fresh recovery batch that itself +// has nonce 0 (parent NULL — there's no valid ancestor to point +// at). No genesis sentinel exists in the implementation; the +// parent-pointer schema must handle "all batches invalidated" +// natively. +// - §7.5.2: After §7.5.1, the recovery batch (with nonce 0 reused) +// submits to L1, gets accepted by `populate_safe_accepted_batches`, +// and lands in `safe_accepted_batches` — proving the scheduler- +// simulation cursor handles a reused nonce after cascade correctly. 
+// +// The structural invariants in §7.5.1 are validated by +// `assert_schema_invariants` (post-test hook in `tests/e2e/src/main.rs`): +// it checks that NULL-parent batches have nonce 0 and that valid-path +// nonces form a contiguous `0..N`. So this test asserts those +// observable consequences plus the explicit `safe_accepted_batches` +// post-condition for §7.5.2. +// +// Setup uses T2 (auto-mining off + drop) so the first batch's L1 +// submission is dropped before reaching the chain — guaranteeing it +// never reaches `Gold` before being cascaded. +async fn run_nonce_zero_recovery_invalidates_then_accepts_at_nonce_zero_test( + runtime: &mut ManagedSequencer, +) -> ScenarioResult<()> { + // Past stale to ensure the cascade fires. + const PAST_STALE: Duration = blocks_as_duration(PAST_STALE_BLOCKS); + // Force a size-triggered batch close. Same sizing as §11.1.4. + const TRANSFERS_TO_FORCE_CLOSE: usize = 150; + // Submitter idle_poll_interval = 5 s; allow one tick for the batch + // to enter the (held) mempool. + const WAIT_FOR_SUBMITTER_TICK: Duration = Duration::from_secs(7); + + let alice = TestSigner::from_default(1)?; + let bob = TestSigner::from_default(2)?; + let alice_address = alice.address(); + let bob_address = bob.address(); + + let alice_l1 = runtime.wallet_l1(alice.clone()).await?; + + // Fund Alice and queue many transfers into the open batch (which is + // the FIRST EVER batch — nonce 0). Using auto-mining-off across the + // submitter's tick so the batch's L1 tx hits the mempool but never + // mines, then dropping it. 
+ let mut replay_before = ReplayWalletApp::devnet(); + { + let mut ws = runtime.ws(0).await?; + let mut alice_l2 = runtime.wallet_l2(alice.clone())?; + apply_safe_supported_deposit( + runtime, + &mut ws, + &mut replay_before, + &alice_l1, + U256::from(10_000_000_u64), + ) + .await?; + + runtime.set_automine(false).await?; + + for _ in 0..TRANSFERS_TO_FORCE_CLOSE { + alice_l2.transfer(bob_address, U256::from(1_u64)).await?; + replay_before.apply(ws.expect_user_op_from(alice_address).await?)?; + } + + // Let the submitter tick fire and put the (nonce-0) batch's L1 + // tx into the held mempool. + tokio::time::sleep(WAIT_FOR_SUBMITTER_TICK).await; + } + + runtime.stop().await?; + runtime.drop_all_pending_txs().await?; + + runtime.advance_wall_and_mine(PAST_STALE).await?; + runtime.set_automine(true).await?; + + runtime.respawn().await?; + + // §7.5.1 assertions: the only existing batch (the original nonce-0 + // one) was cascaded, and a recovery batch was opened. The recovery + // batch's invariants (NULL parent → nonce 0) are checked structurally + // by the post-test `assert_schema_invariants` hook. + let counts = runtime.count_batches()?; + assert!( + counts.invalidated >= 1, + "expected the original nonce-0 batch to be invalidated: {counts:?}", + ); + assert!( + counts.total > counts.invalidated, + "recovery batch must exist alongside the invalidated original: {counts:?}", + ); + + // Replay shows the deposit re-drained, transfers gone (rolled back). + // Recreate WS + wallet against the post-respawn HTTP endpoint + // (`runtime.endpoint()` rebinds to a fresh port on every respawn). 
+ let mut ws_after = runtime.ws(0).await?; + let mut alice_l2_fresh = runtime.wallet_l2(alice)?; + let mut replay_after = ReplayWalletApp::devnet(); + replay_after.apply( + ws_after + .expect_direct_input_from(runtime.erc20_portal_address()) + .await?, + )?; + assert_eq!( + replay_after.current_user_balance(alice_address), + U256::from(10_000_000_u64), + "Alice must have her full deposit back after nonce-0 cascade", + ); + assert_eq!(replay_after.current_user_balance(bob_address), U256::ZERO,); + assert_eq!(replay_after.current_user_nonce(alice_address), 0); + + // §7.5.2 — drive enough work into the recovery batch that the + // submitter closes it by size and submits to L1. With auto-mining + // back on, the submission lands and the input reader picks it up + // into `safe_inputs`; `populate_safe_accepted_batches` accepts it + // at the expected nonce (0, reused). + for _ in 0..TRANSFERS_TO_FORCE_CLOSE { + alice_l2_fresh + .transfer(bob_address, U256::from(1_u64)) + .await?; + replay_after.apply(ws_after.expect_user_op_from(alice_address).await?)?; + } + + // Wait for the submitter to fire a tick + submit the batch. Anvil's + // instamine puts the submission at 1 confirmation; the submitter's + // `wait_for_confirmations` needs `confirmation_depth + 1 = 3`. We + // explicitly mine the remaining 2 blocks below to unblock it without + // having to wait the full 72s timeout. + tokio::time::sleep(Duration::from_secs(7)).await; + runtime.mine_l1_blocks(2).await?; + + // After confirmations land, the submitter's tick loop continues: + // next iteration runs `refresh_recovery_metadata` → + // `populate_safe_accepted_batches_inner`, which appends the batch + // to `safe_accepted_batches` at its expected nonce (0, reused). 
+ tokio::time::sleep(Duration::from_secs(10)).await; + + let (accepted_count, min_accepted_nonce) = runtime.count_safe_accepted_batches()?; + assert!( + accepted_count >= 1, + "expected at least one batch to land in safe_accepted_batches \ + post-recovery (proves §7.5.2 reused-nonce-0 was accepted): \ + count={accepted_count}", + ); + assert_eq!( + min_accepted_nonce, + Some(0), + "the first L1-accepted batch must have nonce 0 (reused after \ + cascade) — got {min_accepted_nonce:?}", + ); + + Ok(()) +} + +fn eip712_domain(runtime: &ManagedSequencer) -> alloy_sol_types::Eip712Domain { + sequencer_core::build_input_domain(runtime.domain_chain_id(), runtime.verifying_contract()) } fn ssz_encode_transfer(to: Address, amount: U256) -> Vec { diff --git a/tests/harness/Cargo.toml b/tests/harness/Cargo.toml index 7c9644b..76256a8 100644 --- a/tests/harness/Cargo.toml +++ b/tests/harness/Cargo.toml @@ -17,6 +17,7 @@ alloy-sol-types = "1.4.1" cartesi-rollups-contracts = "=2.2.0" futures-util = "0.3" k256 = "0.13.4" +rusqlite = { version = "0.38.0", features = ["bundled"] } sequencer-core = { path = "../../sequencer-core" } sequencer-rust-client = { path = "../../sdk/rust-client" } serde = { version = "1", features = ["derive"] } diff --git a/tests/harness/src/lib.rs b/tests/harness/src/lib.rs index 2f4d5c5..a528461 100644 --- a/tests/harness/src/lib.rs +++ b/tests/harness/src/lib.rs @@ -2,6 +2,7 @@ // SPDX-License-Identifier: Apache-2.0 (see LICENSE) pub mod paths; +pub mod proxy; pub mod replay; pub mod rollups; pub mod sequencer; @@ -11,11 +12,12 @@ pub mod ws; pub type HarnessResult = Result>; +pub use proxy::TcpProxy; pub use replay::ReplayWalletApp; pub use rollups::{DEVNET_CHAIN_ID, DevnetRollupsStack}; pub use sequencer::{ - DEFAULT_DEVNET_SEQUENCER_BIN, DEFAULT_TEST_LOGS_DIR, ManagedSequencer, ManagedSequencerConfig, - default_devnet_sequencer_config, + BatchCounts, DEFAULT_DEVNET_SEQUENCER_BIN, DEFAULT_TEST_LOGS_DIR, ManagedSequencer, + ManagedSequencerConfig, 
RespawnAttemptOutcome, RespawnPolicy, default_devnet_sequencer_config, }; pub use wallet::{ TestSigner, WalletL1Client, WalletL2Client, address_from_signing_key, sign_user_op_hex, diff --git a/tests/harness/src/proxy.rs b/tests/harness/src/proxy.rs new file mode 100644 index 0000000..778325b --- /dev/null +++ b/tests/harness/src/proxy.rs @@ -0,0 +1,439 @@ +// (c) Cartesi and individual authors (see AUTHORS) +// SPDX-License-Identifier: Apache-2.0 (see LICENSE) + +//! TCP proxy with programmatic `disconnect()` / `reconnect()` for outage +//! simulation in tests. +//! +//! Layout: +//! +//! ```text +//! Sequencer ──→ TcpProxy (127.0.0.1:proxy_port) ──→ Anvil (upstream) +//! ↑ +//! disconnect() / reconnect() +//! controlled from test code +//! ``` +//! +//! Behavior: +//! +//! - `disconnect()` flips an internal flag. All existing forwarded connections +//! are torn down (their forwarding tasks observe the flag and exit, dropping +//! the sockets). New connection attempts still succeed at the TCP accept +//! level, but are immediately closed. To the sequencer, this looks like the +//! upstream aggressively resets every connection — the same client-visible +//! behavior as a node that went down. +//! +//! - `reconnect()` flips the flag back. Subsequent connections forward +//! normally; the sequencer's next retry after backoff reconnects as if the +//! upstream is back. +//! +//! - Anvil (the real upstream) stays running behind the proxy the whole time, +//! so the test can bypass the proxy to mine blocks on it directly via a +//! separate client connected to the upstream port. That's how we simulate +//! "L1 advanced while the sequencer's gateway was down." +//! +//! The proxy listens on `127.0.0.1:0` by default, picking an ephemeral port +//! the OS hands out; the actual port is read back via `endpoint()`. 
+ +use std::net::SocketAddr; +use std::sync::Arc; +use std::sync::atomic::{AtomicBool, Ordering}; + +use tokio::io::{AsyncReadExt, AsyncWriteExt}; +use tokio::net::{TcpListener, TcpStream}; +use tokio::task::JoinHandle; + +use crate::HarnessResult; +use crate::util::io_other; + +/// A programmable TCP proxy for L1 RPC outage simulation. +/// +/// Construct with [`TcpProxy::spawn`]. Flip the outage flag via +/// [`TcpProxy::disconnect`] / [`TcpProxy::reconnect`]. Retrieve the HTTP +/// endpoint for the sequencer to connect to via [`TcpProxy::endpoint`]. +pub struct TcpProxy { + listen_addr: SocketAddr, + upstream_addr: SocketAddr, + connected: Arc, + accept_task: Option>, + shutdown: Arc, +} + +impl TcpProxy { + /// Spawn a proxy forwarding to `upstream_url` (e.g., `http://127.0.0.1:8545`). + /// + /// The proxy binds `127.0.0.1:0` (an ephemeral port) and starts accepting + /// immediately. Use [`Self::endpoint`] to get the `http://127.0.0.1:` + /// URL for the sequencer to connect to. + pub async fn spawn(upstream_url: &str) -> HarnessResult { + let upstream_addr = parse_http_upstream(upstream_url)?; + let listener = TcpListener::bind("127.0.0.1:0") + .await + .map_err(|err| io_other(format!("proxy bind failed: {err}")))?; + let listen_addr = listener + .local_addr() + .map_err(|err| io_other(format!("proxy local_addr failed: {err}")))?; + + let connected = Arc::new(AtomicBool::new(true)); + let shutdown = Arc::new(AtomicBool::new(false)); + + let accept_task = { + let connected = connected.clone(); + let shutdown = shutdown.clone(); + tokio::spawn(async move { + accept_loop(listener, upstream_addr, connected, shutdown).await; + }) + }; + + Ok(Self { + listen_addr, + upstream_addr, + connected, + accept_task: Some(accept_task), + shutdown, + }) + } + + /// HTTP URL the sequencer should dial (e.g., `http://127.0.0.1:54321`). + pub fn endpoint(&self) -> String { + format!("http://{}", self.listen_addr) + } + + /// TCP address the proxy listens on. 
+ pub fn listen_addr(&self) -> SocketAddr { + self.listen_addr + } + + /// Upstream Anvil TCP address (so tests can bypass the proxy to mine blocks). + pub fn upstream_addr(&self) -> SocketAddr { + self.upstream_addr + } + + /// Simulate upstream outage. All active connections are torn down and + /// future connection attempts are immediately closed. + /// + /// Idempotent: calling while already disconnected is a no-op. + pub fn disconnect(&self) { + self.connected.store(false, Ordering::SeqCst); + } + + /// Restore forwarding. Future connections forward to the upstream normally. + /// + /// Idempotent: calling while already connected is a no-op. Note that + /// existing TCP sockets that were torn down during `disconnect()` remain + /// closed; clients must establish new connections. + pub fn reconnect(&self) { + self.connected.store(true, Ordering::SeqCst); + } + + /// Returns `true` if the proxy is currently forwarding. + pub fn is_connected(&self) -> bool { + self.connected.load(Ordering::SeqCst) + } + + /// Shutdown the proxy cleanly. Called automatically on drop. + pub async fn shutdown(mut self) -> HarnessResult<()> { + self.shutdown.store(true, Ordering::SeqCst); + // Nudge the accept loop by opening a self-connection so it observes + // the shutdown flag on the next iteration. 
+ let _ = TcpStream::connect(self.listen_addr).await; + if let Some(task) = self.accept_task.take() { + task.abort(); + let _ = task.await; + } + Ok(()) + } +} + +impl Drop for TcpProxy { + fn drop(&mut self) { + self.shutdown.store(true, Ordering::SeqCst); + if let Some(task) = self.accept_task.take() { + task.abort(); + } + } +} + +async fn accept_loop( + listener: TcpListener, + upstream_addr: SocketAddr, + connected: Arc, + shutdown: Arc, +) { + loop { + if shutdown.load(Ordering::SeqCst) { + return; + } + let (client, _) = match listener.accept().await { + Ok(pair) => pair, + Err(_) => continue, + }; + + // If the proxy is in "disconnected" mode, accept the TCP connection + // and immediately drop it. This produces the same visible effect as + // an upstream node refusing new connections. + if !connected.load(Ordering::SeqCst) { + drop(client); + continue; + } + + let connected = connected.clone(); + let shutdown = shutdown.clone(); + tokio::spawn(async move { + forward_connection(client, upstream_addr, connected, shutdown).await; + }); + } +} + +async fn forward_connection( + mut client: TcpStream, + upstream_addr: SocketAddr, + connected: Arc, + shutdown: Arc, +) { + let Ok(mut upstream) = TcpStream::connect(upstream_addr).await else { + // Upstream is unreachable — drop client (mirrors a broken forward). + return; + }; + + let (mut client_read, mut client_write) = client.split(); + let (mut upstream_read, mut upstream_write) = upstream.split(); + + // Pump bytes both directions concurrently. Exit on: + // - either half closing cleanly + // - proxy disconnect() being called + // - proxy shutdown + let client_to_upstream = async { + copy_until_disconnected(&mut client_read, &mut upstream_write, &connected, &shutdown).await + }; + let upstream_to_client = async { + copy_until_disconnected(&mut upstream_read, &mut client_write, &connected, &shutdown).await + }; + + // Race: as soon as either direction ends, the whole connection is done. + tokio::select! 
{ + _ = client_to_upstream => {} + _ = upstream_to_client => {} + } +} + +/// Copy bytes until EOF, error, or disconnect/shutdown flag flips. +async fn copy_until_disconnected( + mut reader: R, + mut writer: W, + connected: &AtomicBool, + shutdown: &AtomicBool, +) where + R: AsyncReadExt + Unpin, + W: AsyncWriteExt + Unpin, +{ + // Small buffer is fine; JSON-RPC messages are small. We poll the flags + // between reads so a disconnect() is observed within one read of + // additional latency. + let mut buf = [0_u8; 8 * 1024]; + loop { + if shutdown.load(Ordering::SeqCst) || !connected.load(Ordering::SeqCst) { + return; + } + let read_result = + tokio::time::timeout(std::time::Duration::from_millis(50), reader.read(&mut buf)).await; + let n = match read_result { + Err(_) => continue, // timeout — poll the flags again + Ok(Ok(0)) => return, // clean EOF + Ok(Ok(n)) => n, + Ok(Err(_)) => return, + }; + if writer.write_all(&buf[..n]).await.is_err() { + return; + } + } +} + +fn parse_http_upstream(url: &str) -> HarnessResult { + // Expect `http://host:port` (optionally with a trailing slash). The proxy + // operates at the TCP level, so the scheme must be http(s) and the + // host:port pair must resolve to a single address synchronously. 
+ let stripped = url + .strip_prefix("http://") + .or_else(|| url.strip_prefix("https://")) + .ok_or_else(|| io_other(format!("proxy upstream URL must be http(s)://, got: {url}")))?; + let host_port = stripped + .trim_end_matches('/') + .split('/') + .next() + .unwrap_or(""); + host_port + .parse::() + .map_err(|err| { + io_other(format!( + "proxy upstream URL '{url}' has invalid host:port: {err}" + )) + }) + .map_err(Into::into) +} + +#[cfg(test)] +mod tests { + use super::*; + use tokio::io::AsyncReadExt; + + async fn start_echo_server() -> (tokio::task::JoinHandle<()>, SocketAddr) { + let listener = TcpListener::bind("127.0.0.1:0").await.expect("bind"); + let addr = listener.local_addr().expect("local_addr"); + let handle = tokio::spawn(async move { + loop { + let Ok((mut stream, _)) = listener.accept().await else { + return; + }; + tokio::spawn(async move { + let mut buf = [0_u8; 1024]; + while let Ok(n) = stream.read(&mut buf).await { + if n == 0 { + return; + } + if stream.write_all(&buf[..n]).await.is_err() { + return; + } + } + }); + } + }); + (handle, addr) + } + + #[tokio::test] + async fn forwards_bytes_when_connected() { + let (_echo, echo_addr) = start_echo_server().await; + let proxy = TcpProxy::spawn(&format!("http://{echo_addr}")) + .await + .expect("spawn proxy"); + + let mut client = TcpStream::connect(proxy.listen_addr()) + .await + .expect("connect via proxy"); + client.write_all(b"hello").await.expect("write"); + + let mut buf = [0_u8; 5]; + client.read_exact(&mut buf).await.expect("read"); + assert_eq!(&buf, b"hello"); + } + + #[tokio::test] + async fn disconnect_closes_new_connections() { + let (_echo, echo_addr) = start_echo_server().await; + let proxy = TcpProxy::spawn(&format!("http://{echo_addr}")) + .await + .expect("spawn proxy"); + proxy.disconnect(); + + // New connection is accepted at TCP level but immediately closed. 
+ let mut client = TcpStream::connect(proxy.listen_addr()) + .await + .expect("connect"); + let _ = client.write_all(b"hello").await; // may succeed or fail + let mut buf = [0_u8; 8]; + // Reading should end quickly. The OS may deliver this as EOF (n=0) or + // as ConnectionReset depending on whether our write raced ahead of + // the proxy's drop. Both are valid "connection closed" signals — we + // just assert the read doesn't hang. + let result = + tokio::time::timeout(std::time::Duration::from_millis(500), client.read(&mut buf)) + .await + .expect("read did not hang"); + match result { + Ok(0) => {} // clean EOF + Err(err) + if matches!( + err.kind(), + std::io::ErrorKind::ConnectionReset + | std::io::ErrorKind::ConnectionAborted + | std::io::ErrorKind::BrokenPipe + ) => {} // RST, also valid + other => panic!("disconnected proxy must close the connection, got: {other:?}"), + } + } + + #[tokio::test] + async fn disconnect_tears_down_active_connections() { + let (_echo, echo_addr) = start_echo_server().await; + let proxy = TcpProxy::spawn(&format!("http://{echo_addr}")) + .await + .expect("spawn proxy"); + + let mut client = TcpStream::connect(proxy.listen_addr()) + .await + .expect("connect"); + client.write_all(b"hi").await.expect("write"); + let mut buf = [0_u8; 2]; + client.read_exact(&mut buf).await.expect("initial read"); + assert_eq!(&buf, b"hi"); + + // Now disconnect. The active socket should be torn down. 
+ proxy.disconnect(); + let mut tail = [0_u8; 8]; + let result = tokio::time::timeout( + std::time::Duration::from_millis(500), + client.read(&mut tail), + ) + .await + .expect("read did not hang"); + match result { + Ok(0) => {} // clean EOF + Err(err) + if matches!( + err.kind(), + std::io::ErrorKind::ConnectionReset + | std::io::ErrorKind::ConnectionAborted + | std::io::ErrorKind::BrokenPipe + ) => {} // RST + other => { + panic!("disconnected proxy must tear down existing connections, got: {other:?}") + } + } + } + + #[tokio::test] + async fn reconnect_accepts_new_connections_again() { + let (_echo, echo_addr) = start_echo_server().await; + let proxy = TcpProxy::spawn(&format!("http://{echo_addr}")) + .await + .expect("spawn proxy"); + + proxy.disconnect(); + // Old socket is dead. Reconnect and try a fresh one. + proxy.reconnect(); + + let mut client = TcpStream::connect(proxy.listen_addr()) + .await + .expect("connect after reconnect"); + client.write_all(b"back").await.expect("write"); + let mut buf = [0_u8; 4]; + client + .read_exact(&mut buf) + .await + .expect("read after reconnect"); + assert_eq!(&buf, b"back"); + } + + #[tokio::test] + async fn is_connected_reflects_state() { + let (_echo, echo_addr) = start_echo_server().await; + let proxy = TcpProxy::spawn(&format!("http://{echo_addr}")) + .await + .expect("spawn proxy"); + assert!(proxy.is_connected()); + proxy.disconnect(); + assert!(!proxy.is_connected()); + proxy.reconnect(); + assert!(proxy.is_connected()); + } + + #[test] + fn parse_upstream_url_forms() { + assert!(parse_http_upstream("http://127.0.0.1:8545").is_ok()); + assert!(parse_http_upstream("http://127.0.0.1:8545/").is_ok()); + assert!(parse_http_upstream("https://127.0.0.1:8545").is_ok()); + assert!(parse_http_upstream("ws://127.0.0.1:8545").is_err()); + assert!(parse_http_upstream("127.0.0.1:8545").is_err()); + assert!(parse_http_upstream("http://not-a-host").is_err()); + } +} diff --git a/tests/harness/src/rollups.rs 
b/tests/harness/src/rollups.rs index e14a412..e8e7ea6 100644 --- a/tests/harness/src/rollups.rs +++ b/tests/harness/src/rollups.rs @@ -91,6 +91,18 @@ impl DevnetRollupsStack { self.anvil.mine_blocks(block_count).await } + /// Toggle Anvil's auto-mining mode. When disabled, txs accumulate in + /// the mempool until an explicit `anvil_mine` call (or re-enable). + pub async fn set_automine(&self, enabled: bool) -> HarnessResult<()> { + self.anvil.set_automine(enabled).await + } + + /// Drop every pending tx from Anvil's mempool. Useful for simulating + /// mempool eviction or gateway packet loss. + pub async fn drop_all_pending_txs(&self) -> HarnessResult<()> { + self.anvil.drop_all_pending_txs().await + } + pub async fn shutdown(self) -> HarnessResult<()> { self.anvil.shutdown().await } @@ -214,6 +226,30 @@ impl ManagedAnvil { })?; Ok(()) } + + async fn set_automine(&self, enabled: bool) -> HarnessResult<()> { + let provider = ProviderBuilder::new() + .connect(self.endpoint.as_str()) + .await + .map_err(|err| io_other(format!("failed to connect anvil provider: {err}")))?; + provider + .anvil_set_auto_mine(enabled) + .await + .map_err(|err| io_other(format!("failed to set auto_mine={enabled}: {err}")))?; + Ok(()) + } + + async fn drop_all_pending_txs(&self) -> HarnessResult<()> { + let provider = ProviderBuilder::new() + .connect(self.endpoint.as_str()) + .await + .map_err(|err| io_other(format!("failed to connect anvil provider: {err}")))?; + provider + .anvil_drop_all_transactions() + .await + .map_err(|err| io_other(format!("failed to drop all pending txs: {err}")))?; + Ok(()) + } } fn read_deployment_address(path: &Path, contract_name: &str) -> HarnessResult
{ diff --git a/tests/harness/src/sequencer.rs b/tests/harness/src/sequencer.rs index e69a69b..559a630 100644 --- a/tests/harness/src/sequencer.rs +++ b/tests/harness/src/sequencer.rs @@ -34,6 +34,47 @@ pub struct ManagedSequencerConfig { pub logs_dir: PathBuf, } +/// Snapshot of the `batches` table. Returned by +/// [`ManagedSequencer::count_batches`]. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub struct BatchCounts { + pub total: u64, + pub sealed: u64, + pub invalidated: u64, +} + +/// Outcome of a single [`ManagedSequencer::respawn_and_watch`] attempt. +#[derive(Debug)] +pub enum RespawnAttemptOutcome { + /// The child came up and stayed alive for the requested stabilization + /// window. + Stable, + /// `respawn()` itself returned `Err` — the child exited during bootstrap + /// before HTTP became ready. Typically surfaces + /// `RecoveryError::Refuse(...)` from the startup decision table. + RespawnFailed(String), + /// `respawn()` returned `Ok` but the child exited within the + /// stabilization window. Typically surfaces + /// `RunError::DangerZoneDetected` from the runtime danger detector's + /// first post-boot poll. + ExitedPostRespawn(std::process::ExitStatus), +} + +impl RespawnAttemptOutcome { + pub fn is_stable(&self) -> bool { + matches!(self, Self::Stable) + } +} + +/// Parameters for [`ManagedSequencer::respawn_until_stable`]. See that +/// method's doc for how `advance_per_retry` interacts with the restart cycle. +#[derive(Debug, Clone)] +pub struct RespawnPolicy { + pub max_attempts: u32, + pub stabilization: Duration, + pub advance_per_retry: Option, +} + pub struct ManagedSequencer { rollups: DevnetRollupsStack, child: Child, @@ -45,6 +86,27 @@ pub struct ManagedSequencer { data_dir_path: PathBuf, endpoint: String, log_path: PathBuf, + /// Overrides the `--eth-rpc-url` the sequencer uses. When `None`, the + /// sequencer dials Anvil directly. 
When `Some(url)`, it dials the + /// override (e.g., a `TcpProxy` in front of Anvil for outage tests). + /// Persists across `respawn()` so post-restart behavior is consistent. + l1_endpoint_override: Option, + /// Overrides the `--chain-id` argument passed to the sequencer binary. + /// When `None`, defaults to `DEVNET_CHAIN_ID` (matches Anvil). Set to + /// a non-matching value to test chain-id-mismatch failure modes + /// (§8.2.1 / §8.3.1). + chain_id_override: Option, + /// Path to the file libfaketime re-reads for its offset, on every time + /// call (combined with `FAKETIME_NO_CACHE=1`). Writing to this file + /// shifts the sequencer's view of `SystemTime::now()` / `Instant::now()` + /// immediately — no respawn needed. + faketime_rc_path: PathBuf, + /// Cached libfaketime dylib/so path (computed once on spawn). + libfaketime_path: PathBuf, + /// Internal cumulative forward-offset tracker for + /// [`Self::advance_wall_and_mine`]. Not touched by + /// [`Self::set_faketime_offset`]. + cumulative_offset_secs: u64, } pub fn default_devnet_sequencer_config(log_prefix: impl Into) -> ManagedSequencerConfig { @@ -66,6 +128,16 @@ impl ManagedSequencer { let data_dir = TempDir::new() .map_err(|err| io_other(format!("failed to create temp data dir: {err}")))?; let data_dir_path = data_dir.path().to_path_buf(); + + // Set up faketime: locate libfaketime + create the rc file. Initial + // content `+0` means no offset; tests can overwrite with a new offset + // at any time and the running sequencer will see it on its next + // `SystemTime::now()` / `Instant::now()` call (FAKETIME_NO_CACHE=1). 
+ let libfaketime_path = find_libfaketime()?; + let faketime_rc_path = data_dir_path.join("faketime.rc"); + fs::write(faketime_rc_path.as_path(), "+0\n") + .map_err(|err| io_other(format!("create faketime rc file: {err}")))?; + let SpawnedSequencerProcess { child, endpoint, @@ -76,6 +148,10 @@ impl ManagedSequencer { logs_dir.as_path(), data_dir_path.as_path(), &rollups, + None, + None, + libfaketime_path.as_path(), + faketime_rc_path.as_path(), ) .await?; @@ -90,9 +166,268 @@ impl ManagedSequencer { data_dir_path, endpoint, log_path, + l1_endpoint_override: None, + chain_id_override: None, + faketime_rc_path, + libfaketime_path, + cumulative_offset_secs: 0, }) } + /// Configure the sequencer to dial `l1_endpoint` instead of Anvil directly. + /// The override applies to the *next* `respawn()` and persists until cleared. + /// Intended for tests that route through a [`crate::TcpProxy`]. + /// + /// Does not affect the currently-running sequencer process. + pub fn set_l1_endpoint_override(&mut self, l1_endpoint: Option) { + self.l1_endpoint_override = l1_endpoint; + } + + /// Override the `--chain-id` argument the sequencer is spawned with on + /// the next [`Self::respawn`]. When `None`, defaults to the devnet + /// chain id (matches Anvil). + /// + /// Used by §8.2.1 / §8.3.1 to inject a mismatched chain id and assert + /// that bootstrap returns `RunError::ChainIdMismatch` instead of + /// silently writing a wrong-chain bootstrap cache. Does not affect + /// the currently-running sequencer process. + pub fn set_chain_id_override(&mut self, chain_id: Option) { + self.chain_id_override = chain_id; + } + + /// Write a faketime offset to the rc file. Effective **immediately** for + /// the running sequencer (if any) and persists across respawns. The + /// libfaketime library re-reads the file on every time call (we pass + /// `FAKETIME_NO_CACHE=1`), so the next `SystemTime::now()` inside the + /// child sees the new offset. 
+ /// + /// Format follows faketime's `-f` flag: `"+5h"`, `"-1h"`, `"+1d"`, or + /// `"+NNNs"` for absolute seconds. Passing `None` resets to `+0`. + /// See `man faketime` for advanced options (speed-up, interval mode). + /// + /// Does not mine L1 blocks — use [`Self::advance_wall_and_mine`] when you + /// want wall-clock and L1 to move together. + /// + /// Replaces any cumulative advance tracked by + /// [`Self::advance_wall_and_mine`], and resets its counter. + pub fn set_faketime_offset(&mut self, offset: Option) -> HarnessResult<()> { + let s = offset.as_deref().unwrap_or("+0"); + fs::write(self.faketime_rc_path.as_path(), format!("{s}\n")) + .map_err(|err| io_other(format!("write faketime rc file: {err}")))?; + self.cumulative_offset_secs = 0; + Ok(()) + } + + /// Delete the row in `l1_bootstrap_cache`, simulating a DB that has + /// never successfully completed bootstrap discovery (no cached + /// `input_box_address` / `genesis_block` / `chain_id`). Call while the + /// sequencer is stopped. + /// + /// Used by §8.1.2: with no cache and L1 unreachable, the bootstrap + /// path returns the "L1 required for first startup" error before any + /// recovery logic can run. + pub fn clear_l1_bootstrap_cache(&self) -> HarnessResult<()> { + let db_path = self.data_dir_path.join("sequencer.db"); + let conn = rusqlite::Connection::open(db_path.as_path()) + .map_err(|err| io_other(format!("open DB: {err}")))?; + conn.execute("DELETE FROM l1_bootstrap_cache", []) + .map_err(|err| io_other(format!("clear l1_bootstrap_cache: {err}")))?; + Ok(()) + } + + /// Rewrite `l1_safe_head.synced_at_ms` to `0`, simulating a DB that has + /// never successfully synced from L1. Call while the sequencer is + /// stopped. + /// + /// Used by §7.8.2: the wall-clock fallback treats `synced_at_ms == 0` + /// as "first boot, L1 required" and refuses to proceed if L1 is + /// unreachable. 
Setting this field while the bootstrap cache is + /// populated lets us hit that branch without losing the cached chain + /// ID / InputBox address (which would fail earlier in bootstrap, not + /// in the wall-clock fallback). + pub fn reset_l1_safe_head_synced_at_ms(&self) -> HarnessResult<()> { + let db_path = self.data_dir_path.join("sequencer.db"); + let conn = rusqlite::Connection::open(db_path.as_path()) + .map_err(|err| io_other(format!("open DB: {err}")))?; + conn.execute( + "UPDATE l1_safe_head SET synced_at_ms = 0 WHERE singleton_id = 0", + [], + ) + .map_err(|err| io_other(format!("reset synced_at_ms: {err}")))?; + Ok(()) + } + + /// Read-only snapshot of the `safe_accepted_batches` view: rows + /// recovered from the L1-side scheduler frontier (i.e., batches the + /// sequencer has *observed accepted on chain*). Returns `(count, + /// min_nonce)` — count is the row count, min_nonce is `MIN(nonce)` or + /// `None` if empty. + /// + /// Used by §7.5.2 to confirm a recovery batch (which reuses nonce 0) + /// actually lands and gets accepted on L1 — proving the + /// `populate_safe_accepted_batches_inner` cursor handles + /// reused-nonce-after-cascade correctly. 
+ pub fn count_safe_accepted_batches(&self) -> HarnessResult<(u64, Option)> { + let db_path = self.data_dir_path.join("sequencer.db"); + let conn = rusqlite::Connection::open_with_flags( + db_path.as_path(), + rusqlite::OpenFlags::SQLITE_OPEN_READ_ONLY, + ) + .map_err(|err| io_other(format!("open DB read-only: {err}")))?; + + let count: i64 = conn + .query_row("SELECT COUNT(*) FROM safe_accepted_batches", [], |row| { + row.get(0) + }) + .map_err(|err| io_other(format!("count safe_accepted_batches: {err}")))?; + let min_nonce: Option = conn + .query_row("SELECT MIN(nonce) FROM safe_accepted_batches", [], |row| { + row.get(0) + }) + .map_err(|err| io_other(format!("min nonce: {err}")))?; + Ok((count as u64, min_nonce.map(|n| n as u64))) + } + + /// Snapshot of the `batches` table: `(total, sealed, invalidated)`. + /// Reads the DB file read-only; safe to call while the sequencer is + /// running. Useful for asserting that batch closure happened during a + /// test segment (e.g., the sequencer kept processing through an outage). 
+ pub fn count_batches(&self) -> HarnessResult { + let db_path = self.data_dir_path.join("sequencer.db"); + let conn = rusqlite::Connection::open_with_flags( + db_path.as_path(), + rusqlite::OpenFlags::SQLITE_OPEN_READ_ONLY, + ) + .map_err(|err| io_other(format!("open DB read-only: {err}")))?; + + let total: i64 = conn + .query_row("SELECT COUNT(*) FROM batches", [], |row| row.get(0)) + .map_err(|err| io_other(format!("count batches: {err}")))?; + let sealed: i64 = conn + .query_row( + "SELECT COUNT(*) FROM batches WHERE sealed_at_ms IS NOT NULL", + [], + |row| row.get(0), + ) + .map_err(|err| io_other(format!("count sealed batches: {err}")))?; + let invalidated: i64 = conn + .query_row( + "SELECT COUNT(*) FROM batches WHERE invalidated_at_ms IS NOT NULL", + [], + |row| row.get(0), + ) + .map_err(|err| io_other(format!("count invalidated batches: {err}")))?; + + Ok(BatchCounts { + total: total as u64, + sealed: sealed as u64, + invalidated: invalidated as u64, + }) + } + + /// Assert the schema-level tree invariants on the sequencer's DB. Runs + /// against the DB file read-only; safe to call whether the sequencer is + /// running or stopped (SQLite WAL + read-only flag handles concurrent + /// writers). + /// + /// Invariants checked: + /// 1. At most one `valid_open_batch` row (partial unique index + /// `ux_single_valid_tip` should guarantee this structurally — + /// we verify it in case the index ever regressed). + /// 2. Every valid batch's `nonce` equals `parent.nonce + 1`, or 0 if + /// `parent_batch_index IS NULL`. + /// 3. Every `parent_batch_index` is NULL or references an existing + /// batch (FK-backed, verified explicitly for cross-DB-tool + /// portability). + /// 4. The nonces on the valid path form a contiguous `0..N` sequence. + /// + /// Panics with a specific violation message if any invariant fails. 
+ /// See `tests/TEST_PLAN.md` §12.5.3 for the design rationale — this is + /// a harness-only check (no sequencer changes) that catches regressions + /// which slip past user-visible e2e assertions. + pub fn assert_schema_invariants(&self) -> HarnessResult<()> { + let db_path = self.data_dir_path.join("sequencer.db"); + let conn = rusqlite::Connection::open_with_flags( + db_path.as_path(), + rusqlite::OpenFlags::SQLITE_OPEN_READ_ONLY, + ) + .map_err(|err| io_other(format!("open DB read-only: {err}")))?; + + // 1. At most one valid open batch. + let open_count: i64 = conn + .query_row("SELECT COUNT(*) FROM valid_open_batch", [], |row| { + row.get(0) + }) + .map_err(|err| io_other(format!("count valid_open_batch: {err}")))?; + if open_count > 1 { + panic!("schema invariant: more than one valid Tip ({open_count} rows)"); + } + + // 2. Nonce contiguity via parent. + let mut stmt = conn + .prepare( + "SELECT b.batch_index, b.parent_batch_index, b.nonce, p.nonce \ + FROM batches b LEFT JOIN batches p ON p.batch_index = b.parent_batch_index", + ) + .map_err(|err| io_other(format!("prepare nonce-check: {err}")))?; + let rows: Vec<(i64, Option, i64, Option)> = stmt + .query_map([], |row| { + Ok((row.get(0)?, row.get(1)?, row.get(2)?, row.get(3)?)) + }) + .map_err(|err| io_other(format!("query nonce-check: {err}")))? + .collect::>() + .map_err(|err| io_other(format!("collect nonce-check: {err}")))?; + for (bi, parent, nonce, parent_nonce) in &rows { + match (parent, parent_nonce) { + (None, _) => { + if *nonce != 0 { + panic!( + "schema invariant: batch {bi} has NULL parent but nonce {nonce} (expected 0)" + ); + } + } + (Some(p), None) => { + panic!( + "schema invariant: batch {bi}'s parent {p} doesn't exist (FK violation)" + ); + } + (Some(_), Some(pn)) => { + if *nonce != pn + 1 { + panic!( + "schema invariant: batch {bi} nonce={nonce}, expected parent.nonce+1 = {}", + pn + 1 + ); + } + } + } + } + + // 3. Valid-path nonce uniqueness and contiguity. 
+ let mut stmt = conn + .prepare("SELECT nonce FROM valid_batches ORDER BY nonce ASC") + .map_err(|err| io_other(format!("prepare valid-nonces: {err}")))?; + let valid_nonces: Vec = stmt + .query_map([], |row| row.get::<_, i64>(0)) + .map_err(|err| io_other(format!("query valid-nonces: {err}")))? + .collect::>() + .map_err(|err| io_other(format!("collect valid-nonces: {err}")))?; + for pair in valid_nonces.windows(2) { + if pair[0] == pair[1] { + panic!( + "schema invariant: duplicate valid nonce {} in {valid_nonces:?}", + pair[0] + ); + } + } + for (i, &n) in valid_nonces.iter().enumerate() { + if n != i as i64 { + panic!("schema invariant: valid nonces not contiguous: {valid_nonces:?}"); + } + } + + Ok(()) + } + pub fn endpoint(&self) -> &str { self.endpoint.as_str() } @@ -141,8 +476,198 @@ impl ManagedSequencer { self.rollups.mine_l1_blocks(block_count).await } - pub async fn restart(&mut self) -> HarnessResult<()> { - self.shutdown_child().await?; + /// Toggle Anvil's auto-mining mode. When disabled, txs accumulate in + /// the mempool until an explicit mine or re-enable. Used to hold a + /// sequencer's batch-submission tx out of a block while the chain + /// advances, reproducing the "delayed inclusion" fault that the + /// scheduler handles by skipping past-stale batches. + pub async fn set_automine(&self, enabled: bool) -> HarnessResult<()> { + self.rollups.set_automine(enabled).await + } + + /// Drop every pending tx from Anvil's mempool. Typical use: after the + /// sequencer has submitted a batch-submission tx, drop it to simulate + /// a gateway losing the payload. Combined with `mine_l1_blocks` to + /// advance the chain without the dropped tx landing, this reproduces + /// the "tx never mined" variant of delayed-inclusion. 
+ pub async fn drop_all_pending_txs(&self) -> HarnessResult<()> { + self.rollups.drop_all_pending_txs().await + } + + /// Advance both the sequencer's wall clock and the L1 chain by `duration`, + /// maintaining the block-time coupling invariant (`seconds_per_block`, + /// default 12 for Ethereum mainnet parity). + /// + /// This is the primary tool for simulating elapsed outage time. Effective + /// **immediately** — works whether the sequencer is running or stopped: + /// - The faketime rc file is updated; the running sequencer's next time + /// call (or a post-respawn first call) sees the shifted clock. + /// - Anvil mines `duration.as_secs() / SECONDS_PER_BLOCK` blocks. + /// + /// **Cumulative**: calling with `1h` twice totals `+2h`, not `+1h`. Use + /// [`Self::set_faketime_offset`] to jump to a specific offset or reset. + /// + /// Tests that need decoupled wall-clock vs L1 (e.g., the `saturating_sub` + /// backward-jump test) should use [`Self::set_faketime_offset`] and + /// [`Self::mine_l1_blocks`] directly. + /// + /// Assumes `SEQ_SECONDS_PER_BLOCK = 12`. If a test changes that via env, + /// this helper's block count will be wrong — prefer the direct dials in + /// that case. + pub async fn advance_wall_and_mine(&mut self, duration: Duration) -> HarnessResult<()> { + const SECONDS_PER_BLOCK: u64 = 12; + let secs = duration.as_secs(); + let blocks = secs / SECONDS_PER_BLOCK; + self.mine_l1_blocks(blocks).await?; + self.cumulative_offset_secs = self.cumulative_offset_secs.saturating_add(secs); + fs::write( + self.faketime_rc_path.as_path(), + format!("+{}s\n", self.cumulative_offset_secs), + ) + .map_err(|err| io_other(format!("write faketime rc file: {err}")))?; + Ok(()) + } + + /// Watch the sequencer child for `grace` time without consuming its + /// exit handle. + /// + /// - Returns `Ok(None)` if the child is still alive when `grace` + /// elapses. 
The internal `wait()` future is dropped, so subsequent + /// calls to [`Self::wait_for_exit`] / [`Self::respawn_and_watch`] + /// still work. + /// - Returns `Ok(Some(status))` if the child exited inside the + /// window. The exit status is captured and the child is reaped; + /// the caller shouldn't call `wait_for_exit` afterwards (it would + /// hang). + /// + /// Used by negative-control tests that need to assert the sequencer + /// *stayed up* across a condition that, if a bug existed, would make + /// it exit. + pub async fn observe_for( + &mut self, + grace: Duration, + ) -> HarnessResult> { + tokio::select! { + wait_result = self.child.wait() => { + let status = wait_result + .map_err(|err| io_other(format!("child.wait(): {err}")))?; + Ok(Some(status)) + } + _ = tokio::time::sleep(grace) => Ok(None), + } + } + + /// Wait for the sequencer process to exit on its own. Returns the + /// process's exit status. Times out after `timeout` to avoid hanging + /// tests when the process refuses to exit. + /// + /// Used by tests that expect the sequencer to detect a condition + /// (e.g., wall-clock danger) and self-exit with a non-zero status. + /// After this returns, call [`Self::respawn`] to start a fresh process. + pub async fn wait_for_exit( + &mut self, + timeout: Duration, + ) -> HarnessResult { + let status = tokio::time::timeout(timeout, self.child.wait()) + .await + .map_err(|_| { + io_other(format!( + "wait_for_exit: sequencer did not exit within {timeout:?}" + )) + })? + .map_err(|err| io_other(format!("wait_for_exit: {err}")))?; + Ok(status) + } + + /// Respawn the sequencer and watch the child for `stabilization` to + /// confirm it stays alive. Classifies the outcome so tests can model an + /// orchestrator restart cycle without re-deriving the failure modes. + /// + /// There are two distinct "unstable" shapes the sequencer can take: + /// - The child dies during bootstrap (before HTTP readiness), which + /// makes `respawn()` itself return `Err`. 
Canonical cause: + /// `RecoveryError::Refuse(...)` from the startup decision table + /// when L1 is unreachable and the persisted state looks stalled. + /// - The child comes up (HTTP ready, bootstrap passed), then one of + /// the internal tasks returns a fatal error and the process exits. + /// Canonical cause: `RunError::DangerZoneDetected` when the first + /// danger-detector poll after boot sees a batch past `danger_threshold`. + /// + /// The race between bootstrap-finishes and submitter-first-tick is + /// short (the poll interval is 5s by default, but the first tick runs + /// immediately), so both cases can surface for a single logical event — + /// tests should generally treat either as "not stable" and retry. + /// + /// Callers must ensure the previous child is already reaped (via + /// [`Self::stop`] or [`Self::wait_for_exit`]) — same rule as + /// [`Self::respawn`]. + pub async fn respawn_and_watch( + &mut self, + stabilization: Duration, + ) -> HarnessResult { + if let Err(err) = self.respawn().await { + return Ok(RespawnAttemptOutcome::RespawnFailed(err.to_string())); + } + tokio::select! { + wait_result = self.child.wait() => { + let status = wait_result + .map_err(|err| io_other(format!("child.wait(): {err}")))?; + Ok(RespawnAttemptOutcome::ExitedPostRespawn(status)) + } + _ = tokio::time::sleep(stabilization) => { + Ok(RespawnAttemptOutcome::Stable) + } + } + } + + /// Loop [`Self::respawn_and_watch`] until the sequencer stays up for + /// `policy.stabilization`, or `policy.max_attempts` is reached. Returns + /// the full sequence of attempts. + /// + /// The restart-loop convergence story: an aged Tip in the danger zone + /// (not yet past-stale) auto-closes on respawn, and the resulting closed + /// batch is in the danger zone, so the submitter exits with `DangerZone`. 
+ /// Startup recovery's cascade fires at `MAX_WAIT_BLOCKS`, not at the + /// danger threshold — so the loop only converges once enough *additional* + /// L1 blocks have aged the batch past `MAX_WAIT_BLOCKS`. In production + /// the orchestrator restart itself takes seconds, during which real L1 + /// blocks are produced; `advance_per_retry` simulates that drift. Tests + /// that expect a short hiccup to self-heal (no danger involved) should + /// leave `advance_per_retry` unset. + /// + /// The loop always returns Ok — assert on the final attempt's outcome + /// to decide pass/fail in the test body. + pub async fn respawn_until_stable( + &mut self, + policy: RespawnPolicy, + ) -> HarnessResult> { + let mut outcomes = Vec::with_capacity(policy.max_attempts as usize); + for attempt in 0..policy.max_attempts { + let outcome = self.respawn_and_watch(policy.stabilization).await?; + let stable = outcome.is_stable(); + outcomes.push(outcome); + if stable { + break; + } + let is_last = attempt + 1 == policy.max_attempts; + if let Some(advance) = policy.advance_per_retry.filter(|_| !is_last) { + self.advance_wall_and_mine(advance).await?; + } + } + Ok(outcomes) + } + + /// Kill the sequencer process. Anvil stays running, so `mine_l1_blocks()` still works. + pub async fn stop(&mut self) -> HarnessResult<()> { + self.shutdown_child().await + } + + /// Respawn the sequencer process using the same data directory and Anvil instance. + /// + /// Honors any `l1_endpoint_override` set via [`Self::set_l1_endpoint_override`] + /// and the faketime offset in the rc file (see [`Self::set_faketime_offset`] / + /// [`Self::advance_wall_and_mine`]). 
+ pub async fn respawn(&mut self) -> HarnessResult<()> { let SpawnedSequencerProcess { child, endpoint, @@ -153,6 +678,10 @@ impl ManagedSequencer { self.logs_dir.as_path(), self.data_dir_path.as_path(), &self.rollups, + self.l1_endpoint_override.as_deref(), + self.chain_id_override, + self.libfaketime_path.as_path(), + self.faketime_rc_path.as_path(), ) .await?; self.child = child; @@ -161,6 +690,16 @@ impl ManagedSequencer { Ok(()) } + pub async fn restart(&mut self) -> HarnessResult<()> { + self.stop().await?; + self.respawn().await + } + + /// Read the current sequencer log file contents. + pub fn read_log_contents(&self) -> HarnessResult { + std::fs::read_to_string(&self.log_path).map_err(Into::into) + } + pub async fn ws(&self, from_offset: u64) -> HarnessResult { let client = self.sequencer_client()?; WsClient::connect(&client, from_offset).await @@ -222,12 +761,17 @@ struct SpawnedSequencerProcess { log_path: PathBuf, } +#[allow(clippy::too_many_arguments)] async fn spawn_sequencer_process( sequencer_bin: &Path, log_prefix: &str, logs_dir: &Path, data_dir: &Path, rollups: &DevnetRollupsStack, + l1_endpoint_override: Option<&str>, + chain_id_override: Option, + libfaketime_path: &Path, + faketime_rc_path: &Path, ) -> HarnessResult { let (endpoint, http_addr) = build_local_endpoint()?; let log_path = timestamped_log_path(logs_dir, log_prefix); @@ -241,15 +785,27 @@ async fn spawn_sequencer_process( let batch_submitter_key = default_private_keys().first().cloned().unwrap_or_else(|| { "0xac0974bec39a17e36ba4a6b4d238ff944bacb478cbed5efcae784d7bf4f2ff80".to_string() }); - let mut child = Command::new(path_as_str(sequencer_bin)?) + let eth_rpc_url = l1_endpoint_override.unwrap_or_else(|| rollups.l1_endpoint()); + + // Set up libfaketime via env vars (not the `faketime` wrapper binary). + // The wrapper sets the FAKETIME env var, which has priority over + // FAKETIME_TIMESTAMP_FILE — bypassing it lets the file-based mechanism + // work. 
The file's contents are re-read on every `SystemTime::now()` / + // `Instant::now()` call thanks to FAKETIME_NO_CACHE=1, so tests can + // shift the clock dynamically during a run. + let mut cmd = Command::new(path_as_str(sequencer_bin)?); + apply_faketime_env(&mut cmd, libfaketime_path, faketime_rc_path)?; + + let chain_id = chain_id_override.unwrap_or(DEVNET_CHAIN_ID); + let mut child = cmd .arg("--http-addr") .arg(http_addr) .arg("--data-dir") .arg(path_as_str(data_dir)?) .arg("--eth-rpc-url") - .arg(rollups.l1_endpoint()) + .arg(eth_rpc_url) .arg("--chain-id") - .arg(DEVNET_CHAIN_ID.to_string()) + .arg(chain_id.to_string()) .arg("--app-address") .arg(rollups.app_address().to_string()) .arg("--batch-submitter-private-key") @@ -278,3 +834,161 @@ async fn spawn_sequencer_process( log_path, }) } + +/// Configure the child process env to preload libfaketime and point it at +/// the rc file for dynamic offsets. macOS uses `DYLD_INSERT_LIBRARIES` + +/// `DYLD_FORCE_FLAT_NAMESPACE=1`; Linux uses `LD_PRELOAD`. +fn apply_faketime_env( + cmd: &mut Command, + libfaketime_path: &Path, + faketime_rc_path: &Path, +) -> HarnessResult<()> { + let lib = path_as_str(libfaketime_path)?; + let rc = path_as_str(faketime_rc_path)?; + if cfg!(target_os = "macos") { + cmd.env("DYLD_INSERT_LIBRARIES", lib) + .env("DYLD_FORCE_FLAT_NAMESPACE", "1"); + } else { + cmd.env("LD_PRELOAD", lib); + } + cmd.env("FAKETIME_TIMESTAMP_FILE", rc) + .env("FAKETIME_NO_CACHE", "1"); + Ok(()) +} + +/// Locate the libfaketime shared library. Searches: +/// 1. `$LIBFAKETIME_LIB` (explicit override). +/// 2. `lib/faketime/libfaketime.{1.dylib,so.1}` relative to the `faketime` +/// binary's prefix (Nix layout). +/// 3. Linux distro multiarch lib dirs such as +/// `/usr/lib/x86_64-linux-gnu/faketime` (Debian/Ubuntu apt layout). 
+fn find_libfaketime() -> HarnessResult { + if let Ok(p) = std::env::var("LIBFAKETIME_LIB") { + let p = PathBuf::from(p); + if p.exists() { + return Ok(p); + } + return Err(io_other(format!("LIBFAKETIME_LIB={p:?} does not exist")).into()); + } + + let path = + std::env::var("PATH").map_err(|err| io_other(format!("PATH env var unreadable: {err}")))?; + let faketime_bin = std::env::split_paths(&path) + .map(|p| p.join("faketime")) + .find(|p| p.exists()) + .ok_or_else(|| { + io_other("`faketime` binary not found in PATH; add libfaketime to the dev shell") + })?; + + let prefix = faketime_bin + .parent() + .and_then(|p| p.parent()) + .ok_or_else(|| { + io_other(format!( + "faketime path has no grandparent: {faketime_bin:?}" + )) + })?; + let lib_dirs = candidate_libfaketime_dirs(prefix); + let candidates = libfaketime_file_names(); + if let Some(path) = find_libfaketime_in_dirs(lib_dirs.as_slice(), candidates) { + return Ok(path); + } + + let searched = lib_dirs + .iter() + .map(|p| format!("{p:?}")) + .collect::>() + .join(", "); + Err(io_other(format!( + "libfaketime not found under any searched directory [{searched}] (tried {candidates:?})" + )) + .into()) +} + +fn libfaketime_file_names() -> &'static [&'static str] { + if cfg!(target_os = "macos") { + &["libfaketime.1.dylib", "libfaketime.dylib"] + } else { + &["libfaketime.so.1", "libfaketime.so"] + } +} + +fn candidate_libfaketime_dirs(prefix: &Path) -> Vec { + let mut dirs = Vec::new(); + let lib_dir = prefix.join("lib"); + dirs.push(lib_dir.join("faketime")); + + if cfg!(target_os = "linux") { + if let Ok(entries) = fs::read_dir(&lib_dir) { + let mut multiarch_dirs = entries + .filter_map(Result::ok) + .map(|entry| entry.path()) + .filter(|path| path.is_dir()) + .filter(|path| path.file_name().is_some_and(|name| name != "faketime")) + .map(|path| path.join("faketime")) + .collect::>(); + multiarch_dirs.sort(); + dirs.extend(multiarch_dirs); + } + dirs.push(prefix.join("lib64").join("faketime")); + } + + 
dirs.dedup(); + dirs +} + +fn find_libfaketime_in_dirs(lib_dirs: &[PathBuf], candidates: &[&str]) -> Option { + for lib_dir in lib_dirs { + for name in candidates { + let path = lib_dir.join(name); + if path.exists() { + return Some(path); + } + } + } + None +} + +#[cfg(test)] +mod tests { + use std::fs; + + use super::{candidate_libfaketime_dirs, find_libfaketime_in_dirs}; + + #[cfg(target_os = "linux")] + #[test] + fn libfaketime_lookup_finds_debian_multiarch_layout() { + let temp = tempfile::TempDir::new().expect("tempdir"); + let prefix = temp.path(); + let multiarch_dir = prefix.join("lib").join("x86_64-linux-gnu").join("faketime"); + fs::create_dir_all(&multiarch_dir).expect("create multiarch faketime dir"); + let expected = multiarch_dir.join("libfaketime.so.1"); + fs::write(&expected, b"fake so").expect("write fake lib"); + + let dirs = candidate_libfaketime_dirs(prefix); + let found = find_libfaketime_in_dirs(dirs.as_slice(), &["libfaketime.so.1"]) + .expect("multiarch lib should be discovered"); + + assert_eq!(found, expected); + } + + #[test] + fn libfaketime_lookup_prefers_direct_prefix_layout() { + let temp = tempfile::TempDir::new().expect("tempdir"); + let prefix = temp.path(); + let direct_dir = prefix.join("lib").join("faketime"); + let multiarch_dir = prefix.join("lib").join("x86_64-linux-gnu").join("faketime"); + fs::create_dir_all(&direct_dir).expect("create direct faketime dir"); + fs::create_dir_all(&multiarch_dir).expect("create multiarch faketime dir"); + let expected = direct_dir.join("libfaketime.so.1"); + let fallback = multiarch_dir.join("libfaketime.so.1"); + fs::write(&expected, b"direct").expect("write direct lib"); + fs::write(&fallback, b"fallback").expect("write fallback lib"); + + let dirs = candidate_libfaketime_dirs(prefix); + let found = find_libfaketime_in_dirs(dirs.as_slice(), &["libfaketime.so.1"]) + .expect("direct lib should be discovered"); + + assert_eq!(found, expected); + } +} diff --git a/tests/harness/src/wallet.rs 
b/tests/harness/src/wallet.rs index 14768ff..f713c58 100644 --- a/tests/harness/src/wallet.rs +++ b/tests/harness/src/wallet.rs @@ -234,13 +234,7 @@ impl WalletL2Client { endpoint.to_string(), DEFAULT_SEQUENCER_CLIENT_TIMEOUT, )?; - let domain = Eip712Domain { - name: Some("CartesiAppSequencer".to_string().into()), - version: Some("1".to_string().into()), - chain_id: Some(U256::from(chain_id)), - verifying_contract: Some(verifying_contract), - salt: None, - }; + let domain = sequencer_core::build_input_domain(chain_id, verifying_contract); Ok(Self { signer, client,