diff --git a/CHANGELOG.md b/CHANGELOG.md index 4f4dccda3a..2c66c12796 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -18,6 +18,7 @@ #### Changes - Documented sortedness precondition more prominently for sorted array operations ([#2832](https://github.com/0xMiden/miden-vm/pull/2832)). +- Added a synthetic transaction-kernel Criterion benchmark for VM-level proving regression detection ([#3024](https://github.com/0xMiden/miden-vm/pull/3024)). - Borrowed operation slices in basic-block batching helpers to avoid cloning in the fingerprinting path ([#2994](https://github.com/0xMiden/miden-vm/pull/2994)). - [BREAKING] Sync execution and proving APIs now require `SyncHost`; async `Host`, `execute`, and `prove` remain available ([#2865](https://github.com/0xMiden/miden-vm/pull/2865)). - [BREAKING] `miden_processor::execute()` and `execute_sync()` now return `ExecutionOutput`; trace building remains explicit via `execute_trace_inputs*()` and `trace::build_trace()` ([#2865](https://github.com/0xMiden/miden-vm/pull/2865)). 
diff --git a/Cargo.lock b/Cargo.lock index c1a326829f..d99ffde2c6 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1607,6 +1607,19 @@ dependencies = [ "p3-goldilocks", ] +[[package]] +name = "miden-synthetic-tx-kernel" +version = "0.23.0" +dependencies = [ + "criterion", + "miden-core", + "miden-processor", + "miden-vm", + "serde", + "serde_json", + "thiserror", +] + [[package]] name = "miden-test-serde-macros" version = "0.1.0" diff --git a/Cargo.toml b/Cargo.toml index bb188afcb4..f5128b6fae 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,7 @@ [workspace] members = [ "air", + "benches/synthetic-tx-kernel", "core", "crates/ace-codegen", "crates/assembly", diff --git a/benches/synthetic-tx-kernel/Cargo.toml b/benches/synthetic-tx-kernel/Cargo.toml new file mode 100644 index 0000000000..72e285b977 --- /dev/null +++ b/benches/synthetic-tx-kernel/Cargo.toml @@ -0,0 +1,27 @@ +[package] +name = "miden-synthetic-tx-kernel" +version.workspace = true +edition.workspace = true +license.workspace = true +authors.workspace = true +homepage.workspace = true +repository.workspace = true +publish = false + +[lints] +workspace = true + +[dependencies] +miden-core = { workspace = true } +miden-processor = { workspace = true, features = ["concurrent"] } +miden-vm = { path = "../../miden-vm", features = ["concurrent"] } +serde = { workspace = true, features = ["std"] } +serde_json = { workspace = true, features = ["std"] } +thiserror = { workspace = true, features = ["std"] } + +[dev-dependencies] +criterion = { workspace = true } + +[[bench]] +name = "synthetic_kernel" +harness = false diff --git a/benches/synthetic-tx-kernel/README.md b/benches/synthetic-tx-kernel/README.md new file mode 100644 index 0000000000..2137037edf --- /dev/null +++ b/benches/synthetic-tx-kernel/README.md @@ -0,0 +1,213 @@ +# miden-synthetic-tx-kernel + +Criterion benchmark that reproduces the **proving-cost brackets** of a real +Miden transaction workload from a small JSON snapshot -- without pulling in 
+any protocol-level code. + +## Approach + +STARK proving cost is dominated by the padded power-of-two lengths of the +execution trace's segments. Everything else -- per-chiplet row counts, +instruction mix, which procedures get called -- is second-order once the +brackets are known. + +This crate takes a snapshot of per-segment trace-row counts captured from +a real transaction in the `protocol` repo, generates a tiny MASM program +whose execution reproduces those brackets, and runs `execute` + +`execute_and_prove` Criterion groups against it. The result is a +VM-level regression detector that isolates *prover* changes from +*workload* changes without depending on protocol's transaction +machinery. + +## Pipeline (per bench run) + +Each bench invocation rebuilds every synthetic program from scratch, +so the numbers always reflect the current commit's VM -- there are no +stale calibration constants checked into the repo. + +1. **Calibrate (once)** -- run each MASM snippet as `repeat.K ...` and + divide the resulting per-component row counts by `K` to learn how + many core/hasher/memory/... rows a single iteration costs *on this + VM*. Running this on every bench invocation is what keeps the + bench honest across VM changes: if `hperm` gets cheaper tomorrow, + tomorrow's iteration count grows to compensate, and the target + bracket is still hit. + +For each snapshot under `snapshots/` (or the single path in +`SYNTH_SNAPSHOT`): + +2. **Load snapshot** -- read the target row counts; this is the shape + we want the emitted program to reproduce. See + [Snapshot format](#snapshot-format). +3. **Solve** -- pick an iteration count for each snippet so that their + combined row contributions add up to the snapshot's target. We do + this by fixed-point refinement: start from zero, and on each pass + update every snippet's count from the current guesses of the others, + clamping negatives to zero. 
A handful of passes is enough because + each snippet is designed to drive mostly *one* component, so the + counts barely depend on each other and the sweep converges quickly. + (For the linear-algebra reader: this is Jacobi iteration on a + near-diagonal matrix with a non-negativity projection.) +4. **Emit** -- wrap each snippet's body in a `repeat.N ... end` block, + concatenate, and enclose in `begin ... end`. The output is the MASM + program that Criterion actually runs. +5. **Verify** -- execute the emitted program, measure its real row + counts, and assert that `padded_core_side` and `padded_chiplets` + match the snapshot's. A bracket miss fails the bench; smaller drift + inside the same bracket is reported but tolerated, because proving + cost is driven by the padded length, not the raw count. + +## Snippets + +Five patterns cover every component the solver targets: + +| Snippet | Body | Drives | +|---------------|----------------------------------------------|-------------------------------| +| `hasher` | `hperm` | Poseidon2 hasher chiplet | +| `bitwise` | `u32split u32xor` | bitwise chiplet | +| `u32arith` | `u32assert2 push.65537 add swap push.65537 add swap` | range chiplet | +| `memory` | `dup.4 mem_storew_le dup.4 mem_loadw_le movup.4 push.262148 add movdn.4` | memory chiplet | +| `decoder_pad` | `swap dup.1 add` | core (decoder + stack) | + +`u32arith` and `memory` use banded counters (strides of 65537 and +262148) so that their 16-bit limbs form disjoint contiguous bands, +keeping the range chiplet from deduplicating limb values across +iterations. + +The solver has no snippets targeting the ACE or kernel-ROM chiplets. + +- **ACE** is reachable from plain MASM, but exercising it requires + building an arithmetic circuit and preparing a memory region for its + READ section -- more setup than the other snippets warrant, and not + currently done here. 
+- **Kernel-ROM** rows are a small, near-constant contribution in + practice, so we simplify by folding them into the memory target + rather than driving them directly. + +Since snapshots still carry row counts for both, they're **folded into +the memory target** -- growing memory ops preserves the overall +chiplet-trace length and therefore the chiplet bracket. + +One producer-side caveat: the consumer can measure `ace_chiplet_len()` +when it runs synthetic programs, but protocol snapshots may report +`ace_rows: 0` until the protocol-side `miden-processor` dependency +exposes that accessor. Treat zero ACE rows in a snapshot as a producer +visibility limitation, not as proof that the VM emitted no ACE rows. + +## Snapshot format + +Two-tier: **hard-contract totals** in `trace`, **advisory breakdown** in +`shape`. The loader validates `trace.chiplets_rows == sum(shape) + 1`. +Schema version is currently `"0"`. + +```json +{ + "schema_version": "0", + "source": "protocol/bench-transaction:consume-single-p2id", + "timestamp": "unix-1776428820", + "miden_vm_version": "0.22", + "trace": { + "core_rows": 77699, + "chiplets_rows": 123129, + "range_rows": 20203 + }, + "shape": { + "hasher_rows": 120352, + "bitwise_rows": 416, + "memory_rows": 2297, + "kernel_rom_rows": 63, + "ace_rows": 0 + } +} +``` + +Snapshots live in `snapshots/`. The bench loads every `*.json` file in +that directory and runs one Criterion group per snapshot, named +`synthetic_transaction_kernel/<snapshot-stem>`. Set +`SYNTH_SNAPSHOT=/path/to/file.json` to bench a single snapshot +instead. + +## Verifier contract + +Once the emitted program has run, the verifier compares its actual +row counts against the snapshot's targets and decides whether the +bench passed. The checks come in three tiers -- **hard**, **soft**, +and **info** -- graded by how directly each number maps to proving +cost. There's also one free-standing **warning** for snippet-balance +regressions. 
+ +### Hard checks -- fail the bench + +Proving cost is dominated by the padded (power-of-two) length of each +trace segment, not by the raw row count. So the only assertions that +can fail the bench are on two padded proxies: + +- `padded_core_side = next_pow2(max(core_rows, range_rows))` -- the + non-chiplets side of the AIR. +- `padded_chiplets = next_pow2(chiplets_rows)`. + +These two can land in *different* brackets on the same workload -- +`consume-two-p2id`, for example, has `padded_core_side = 131072` but +`padded_chiplets = 262144`. Checking them independently catches a +bracket miss on either side that a single global `padded_total` +check would hide. + +### Soft checks -- report, don't fail + +`core_rows` and `chiplets_rows` are compared against the targets +within a 2% band. A drift inside that band is harmless for proving +cost (same bracket either way), so the bench only reports it. A +drift that *crosses* a bracket is already caught by the hard tier +above, so this tier exists purely to surface in-bracket near-misses +worth noticing. + +### Info -- no judgement + +Per-chiplet deltas (hasher/bitwise/memory/...) from `shape` are +printed for visibility but never asserted. Some divergence is +unavoidable: MAST hashing at program init contributes hasher rows +that the synthetic program can't suppress, so a snapshot with +`core_rows / hasher_rows > 4` cannot be per-chiplet-matched even +though it still matches both padded brackets. See `src/snippets.rs` +for the cases where this structural mismatch shows up. + +### Warning -- range dominates + +If `range_rows` turns out to be the largest unpadded component in +either the target or the actual shape, the bench prints a warning. +The solver treats range as a derived quantity driven mostly by u32 +arithmetic; if it starts setting the bracket, snippet balance has +drifted and should be revisited. 
+ ## Refreshing snapshots from protocol + +The producer lives in the `protocol` repo as +`bin/bench-transaction/src/bin/tx-trace-snapshot.rs` and emits one +JSON per scenario under `bin/bench-transaction/snapshots/`. Flow: + +1. In `protocol`: `cargo run --release -p bench-transaction --bin tx-trace-snapshot` +2. Copy the regenerated JSONs over the same-named files in + `miden-vm/benches/synthetic-tx-kernel/snapshots/`. +3. Run `cargo bench -p miden-synthetic-tx-kernel` and verify + `=> BRACKET MATCH` for every snapshot in the printed verifier + tables. + +Snapshots travel by hand so that the two repos can evolve independently. The +loader rejects unknown `schema_version` values. A `miden_vm_version` major/minor +mismatch is intentionally a loud warning, not a hard failure, because protocol +often pins one miden-vm release behind `next`. Read bracket matches across a +version mismatch as useful regression signals, then refresh the snapshots when +the protocol-side pin catches up. + +## Running + +```sh +cargo bench -p miden-synthetic-tx-kernel +``` + +Env vars: + +- `SYNTH_SNAPSHOT=<path>` -- bench only the specified snapshot file + (instead of iterating over every `snapshots/*.json`). +- `SYNTH_MASM_WRITE=1` -- dump each emitted MASM program to + `target/synthetic_kernel_<snapshot-stem>.masm` for inspection. diff --git a/benches/synthetic-tx-kernel/benches/synthetic_kernel.rs b/benches/synthetic-tx-kernel/benches/synthetic_kernel.rs new file mode 100644 index 0000000000..1dab09f53b --- /dev/null +++ b/benches/synthetic-tx-kernel/benches/synthetic_kernel.rs @@ -0,0 +1,346 @@ +//! Synthetic transaction-kernel benchmark. +//! +//! Pipeline each bench run: +//! 1. Calibrate each snippet's per-iteration cost against the current VM (shared across all +//! snapshots in this run). +//! 2. For each snapshot JSON (every `snapshots/*.json`, or the single file in `SYNTH_SNAPSHOT`): +//! solve for per-snippet iteration counts, emit the resulting MASM program, verify it lands in +//! 
the snapshot's padded-trace bracket, and run `execute` + `execute_and_prove` Criterion +//! benches. +//! +//! Env vars: +//! - `SYNTH_SNAPSHOT`: path to a single snapshot JSON; if set, only this snapshot is benched. +//! Otherwise every `snapshots/*.json` in the manifest dir is used. +//! - `SYNTH_MASM_WRITE`: if set, write the emitted MASM to +//! `target/synthetic_kernel_<snapshot_stem>.masm`. + +use std::{hint::black_box, path::PathBuf, time::Duration}; + +use criterion::{BatchSize, Criterion, SamplingMode, criterion_group, criterion_main}; +use miden_processor::{ + DefaultHost, ExecutionOptions, FastProcessor, StackInputs, advice::AdviceInputs, +}; +use miden_synthetic_tx_kernel::{ + calibrator::{Calibration, calibrate, measure_program}, + snapshot::{TraceShape, TraceSnapshot}, + snippets::{SNIPPETS, memory_max_iters, u32arith_max_iters}, + solver::{Plan, emit, solve}, + verifier::VerificationReport, +}; +use miden_vm::{Assembler, ProvingOptions, prove_sync}; + +fn resolve_snapshot_paths() -> Vec<PathBuf> { + if let Ok(explicit) = std::env::var("SYNTH_SNAPSHOT") { + return vec![PathBuf::from(explicit)]; + } + let snapshots_dir = PathBuf::from(env!("CARGO_MANIFEST_DIR")).join("snapshots"); + let mut paths: Vec<PathBuf> = std::fs::read_dir(&snapshots_dir) + .unwrap_or_else(|e| panic!("failed to read {}: {e}", snapshots_dir.display())) + .filter_map(|entry| entry.ok().map(|e| e.path())) + .filter(|p| p.extension().and_then(|e| e.to_str()) == Some("json")) + .collect(); + paths.sort(); + assert!(!paths.is_empty(), "no snapshots found under {}", snapshots_dir.display()); + paths +} + +fn synthetic_transaction_kernel(c: &mut Criterion) { + let calibration = calibrate().expect("failed to calibrate snippets"); + println!("\n=== calibration (rows/iter)"); + for snippet in SNIPPETS { + let cost = calibration[snippet.name]; + println!( + " {:<14} core={:7.3} hasher={:6.3} bitwise={:6.3} memory={:6.3} range={:6.3}", + snippet.name, cost.core, cost.hasher, cost.bitwise, cost.memory, cost.range, + ); + } + 
+ for path in resolve_snapshot_paths() { + bench_one_snapshot(c, &calibration, &path); + } +} + +fn bench_one_snapshot(c: &mut Criterion, calibration: &Calibration, snapshot_path: &PathBuf) { + let snapshot = TraceSnapshot::load(snapshot_path) + .unwrap_or_else(|e| panic!("failed to load snapshot at {}: {e}", snapshot_path.display())); + + println!("\n=== snapshot: {}", snapshot_path.display()); + println!(" source: {}", snapshot.source); + println!(" vm_ver: {}", snapshot.miden_vm_version); + + let local_vm_version = env!("CARGO_PKG_VERSION"); + if !versions_align(&snapshot.miden_vm_version, local_vm_version) { + println!( + " WARNING: snapshot captured against miden-vm {}, consumer is running {}; \ + continuing because version skew is expected when protocol lags miden-vm", + snapshot.miden_vm_version, local_vm_version, + ); + } + println!( + " trace: core={} chiplets={} range={} (padded_total={})", + snapshot.trace.core_rows, + snapshot.trace.chiplets_rows, + snapshot.trace.range_rows, + snapshot.trace.padded_total(), + ); + println!( + " shape: hasher={} bitwise={} memory={} kernel_rom={} ace={}", + snapshot.shape.hasher_rows, + snapshot.shape.bitwise_rows, + snapshot.shape.memory_rows, + snapshot.shape.kernel_rom_rows, + snapshot.shape.ace_rows, + ); + if snapshot.shape.substituted_rows() > 0 { + println!( + " note: {} rows (ace={} + kernel_rom={}) folded into memory target", + snapshot.shape.substituted_rows(), + snapshot.shape.ace_rows, + snapshot.shape.kernel_rom_rows, + ); + } + + let target_shape = snapshot.shape(); + let mut plan = solve(calibration, &target_shape); + range_correction_pass(&mut plan, &target_shape); + assert_counters_fit(&plan); + + println!("\n=== plan"); + for snippet in SNIPPETS { + println!(" {:<14} iters={}", snippet.name, plan.iters(snippet.name),); + } + + let snapshot_stem = snapshot_path.file_stem().and_then(|s| s.to_str()).unwrap_or("unknown"); + + let source = emit(&plan); + if std::env::var("SYNTH_MASM_WRITE").is_ok() { + let 
out = PathBuf::from(env!("CARGO_MANIFEST_DIR")) + .join("target") + .join(format!("synthetic_kernel_{snapshot_stem}.masm")); + std::fs::create_dir_all(out.parent().expect("parent")) + .expect("create target dir for MASM dump"); + std::fs::write(&out, &source).expect("write MASM dump"); + println!("\n=== wrote MASM dump to {}", out.display()); + } + + let actual = measure_program(&source).expect("measure emitted program"); + let report = VerificationReport::new(target_shape, actual); + println!("\n=== verification\n{report}"); + assert!( + report.brackets_match(), + "emitted program lands in a different padded-trace bracket than the snapshot", + ); + + let program = Assembler::default() + .assemble_program(&source) + .expect("assemble emitted program"); + + let mut group = c.benchmark_group(format!("synthetic_transaction_kernel/{snapshot_stem}")); + group + .sampling_mode(SamplingMode::Flat) + .sample_size(10) + .warm_up_time(Duration::from_millis(500)) + .measurement_time(Duration::from_secs(10)); + + group.bench_function("execute", |b| { + b.iter_batched( + || { + let host = DefaultHost::default(); + let processor = FastProcessor::new_with_options( + StackInputs::default(), + AdviceInputs::default(), + ExecutionOptions::default(), + ); + (host, program.clone(), processor) + }, + |(mut host, program, processor)| { + black_box(processor.execute_sync(&program, &mut host).expect("execute")); + }, + BatchSize::SmallInput, + ); + }); + + group.bench_function("execute_and_prove", |b| { + b.iter_batched( + || { + let host = DefaultHost::default(); + let stack = StackInputs::default(); + let advice = AdviceInputs::default(); + (host, program.clone(), stack, advice) + }, + |(mut host, program, stack, advice)| { + black_box( + prove_sync( + &program, + stack, + advice, + &mut host, + ExecutionOptions::default(), + ProvingOptions::default(), + ) + .expect("prove"), + ); + }, + BatchSize::SmallInput, + ); + }); + + group.finish(); +} + +/// Returns true when snapshot and local 
versions agree on `major.minor`. Patch differences are +/// ignored. +fn versions_align(snapshot: &str, local: &str) -> bool { + let mut s = snapshot.split('.'); + let mut l = local.split('.'); + (s.next(), s.next()) == (l.next(), l.next()) +} + +/// Panic if any snippet's plan iteration count would overflow its counter beyond `u32::MAX` (which +/// would trip `u32assert2` or a memory op at runtime). Cheap safeguard in case a snapshot with +/// unusually large range or memory targets gets fed in. +fn assert_counters_fit(plan: &Plan) { + let u32arith = plan.iters("u32arith"); + assert!( + u32arith <= u32arith_max_iters(), + "u32arith iters ({}) would overflow its u32 counter (max {}); \ + revisit the banded-counter start constants", + u32arith, + u32arith_max_iters(), + ); + let memory = plan.iters("memory"); + assert!( + memory <= memory_max_iters(), + "memory iters ({}) would overflow its u32 address counter (max {}); \ + revisit the banded-address start constants", + memory, + memory_max_iters(), + ); +} + +// RANGE CORRECTION PASS +// ------------------------------------------------------------------------ +// +// Range's trace length is not perfectly linear under composition, so the primary solver can leave +// a few-percent residual. This pass closes it by measuring marginal rates and applying a combo of +// `+u32arith -hasher -decoder_pad` that lifts range while keeping core and chiplets approximately +// fixed. Memory is deliberately left alone so the emitted memory-chiplet workload stays +// representative. 
+ +#[derive(Debug, Clone, Copy)] +struct MarginalRates { + core: f64, + chiplets: f64, + range: f64, +} + +const CORRECTION_PROBE_DELTA: u64 = 256; +const CORRECTION_TOLERANCE: f64 = 0.01; +const CORRECTION_MAX_PASSES: usize = 2; + +fn measure_plan(plan: &Plan) -> TraceShape { + let source = emit(plan); + measure_program(&source).expect("measure emitted plan") +} + +fn measure_marginal( + base_plan: &Plan, + base_shape: TraceShape, + snippet: &'static str, + delta: u64, +) -> MarginalRates { + let mut probe = base_plan.clone(); + probe.add(snippet, delta); + let shape = measure_plan(&probe); + let d = delta as f64; + MarginalRates { + core: (shape.totals.core_rows as f64 - base_shape.totals.core_rows as f64) / d, + chiplets: (shape.totals.chiplets_rows as f64 - base_shape.totals.chiplets_rows as f64) / d, + range: (shape.totals.range_rows as f64 - base_shape.totals.range_rows as f64) / d, + } +} + +/// Solve for `(add_u32arith, sub_hasher, sub_pad)` that lifts range by `range_residual` while +/// holding core and chiplets approximately fixed. Returns `None` if the 2x2 system for the two +/// subtractions is degenerate, any coefficient comes out negative, or the net range gain is +/// non-positive. 
+fn solve_range_correction( + range_residual: f64, + u32arith: MarginalRates, + hasher: MarginalRates, + decoder_pad: MarginalRates, +) -> Option<(u64, u64, u64)> { + if range_residual <= 0.0 { + return None; + } + // Solve per unit of u32arith: + // hasher.core * y + decoder_pad.core * z = u32arith.core + // hasher.chiplets * y + decoder_pad.chiplets * z = u32arith.chiplets + let det = hasher.core * decoder_pad.chiplets - decoder_pad.core * hasher.chiplets; + if det.abs() < 1e-9 { + return None; + } + let y_per_x = + (u32arith.core * decoder_pad.chiplets - decoder_pad.core * u32arith.chiplets) / det; + let z_per_x = (hasher.core * u32arith.chiplets - u32arith.core * hasher.chiplets) / det; + if y_per_x < 0.0 || z_per_x < 0.0 { + return None; + } + let net_range_per_x = u32arith.range - hasher.range * y_per_x - decoder_pad.range * z_per_x; + if net_range_per_x <= 0.0 { + return None; + } + let x = (range_residual / net_range_per_x).round(); + if x <= 0.0 { + return None; + } + Some((x as u64, (x * y_per_x).round() as u64, (x * z_per_x).round() as u64)) +} + +fn range_correction_pass(plan: &mut Plan, target: &TraceShape) { + let target_range = target.totals.range_rows; + if target_range == 0 { + return; + } + let tolerance = (target_range as f64 * CORRECTION_TOLERANCE) as u64; + for pass in 0..CORRECTION_MAX_PASSES { + let actual = measure_plan(plan); + let actual_range = actual.totals.range_rows; + let residual = target_range as i64 - actual_range as i64; + println!( + "\n=== range correction pass {}: target={} actual={} residual={}", + pass + 1, + target_range, + actual_range, + residual, + ); + if residual <= tolerance as i64 { + // Already within band, or overshoot (residual <= 0). 
+ return; + } + + let u32arith = measure_marginal(plan, actual, "u32arith", CORRECTION_PROBE_DELTA); + let hasher = measure_marginal(plan, actual, "hasher", CORRECTION_PROBE_DELTA); + let decoder_pad = measure_marginal(plan, actual, "decoder_pad", CORRECTION_PROBE_DELTA); + + match solve_range_correction(residual as f64, u32arith, hasher, decoder_pad) { + Some((add_u32arith, sub_hasher, sub_pad)) => { + let sub_hasher = sub_hasher.min(plan.iters("hasher")); + let sub_pad = sub_pad.min(plan.iters("decoder_pad")); + println!( + " applying: +u32arith {add_u32arith} -hasher {sub_hasher} -decoder_pad {sub_pad}" + ); + plan.add("u32arith", add_u32arith); + plan.sub_saturating("hasher", sub_hasher); + plan.sub_saturating("decoder_pad", sub_pad); + }, + None => { + println!(" no valid local correction found"); + return; + }, + } + } +} + +criterion_group!(benches, synthetic_transaction_kernel); +criterion_main!(benches); diff --git a/benches/synthetic-tx-kernel/snapshots/consume-single-p2id.json b/benches/synthetic-tx-kernel/snapshots/consume-single-p2id.json new file mode 100644 index 0000000000..3b625cf6d1 --- /dev/null +++ b/benches/synthetic-tx-kernel/snapshots/consume-single-p2id.json @@ -0,0 +1,18 @@ +{ + "schema_version": "0", + "source": "protocol/bench-transaction:consume-single-p2id", + "timestamp": "unix-1776673245", + "miden_vm_version": "0.22", + "trace": { + "core_rows": 77699, + "chiplets_rows": 123129, + "range_rows": 20203 + }, + "shape": { + "hasher_rows": 120352, + "bitwise_rows": 416, + "memory_rows": 2297, + "kernel_rom_rows": 63, + "ace_rows": 0 + } +} \ No newline at end of file diff --git a/benches/synthetic-tx-kernel/snapshots/consume-two-p2id.json b/benches/synthetic-tx-kernel/snapshots/consume-two-p2id.json new file mode 100644 index 0000000000..a84c84c0c9 --- /dev/null +++ b/benches/synthetic-tx-kernel/snapshots/consume-two-p2id.json @@ -0,0 +1,18 @@ +{ + "schema_version": "0", + "source": "protocol/bench-transaction:consume-two-p2id", + 
"timestamp": "unix-1776673247", + "miden_vm_version": "0.22", + "trace": { + "core_rows": 80522, + "chiplets_rows": 141260, + "range_rows": 20121 + }, + "shape": { + "hasher_rows": 138208, + "bitwise_rows": 592, + "memory_rows": 2392, + "kernel_rom_rows": 67, + "ace_rows": 0 + } +} \ No newline at end of file diff --git a/benches/synthetic-tx-kernel/src/calibrator.rs b/benches/synthetic-tx-kernel/src/calibrator.rs new file mode 100644 index 0000000000..8e73f56e63 --- /dev/null +++ b/benches/synthetic-tx-kernel/src/calibrator.rs @@ -0,0 +1,255 @@ +//! Calibration and measurement helpers. +//! +//! Each snippet is run as a `repeat.K` loop, measured through the real trace builder, and +//! converted into per-iteration row costs. Calibration happens on every bench run so the synthetic +//! adapts to the current VM's row accounting. + +use std::collections::BTreeMap; + +use miden_processor::{DefaultHost, FastProcessor, StackInputs, trace::build_trace}; +use miden_vm::Assembler; + +use crate::{ + snapshot::{TraceBreakdown, TraceShape, TraceTotals}, + snippets::{self, Component, SNIPPETS}, +}; + +pub const CALIBRATION_ITERS: u64 = 1000; + +// MEASUREMENT +// ------------------------------------------------------------------------ + +/// Assemble and execute `source`, returning the shape of the resulting execution trace. Wraps +/// assembler + fast processor + trace builder. 
+pub fn measure_program(source: &str) -> Result<TraceShape, MeasurementError> { + let program = Assembler::default() + .assemble_program(source) + .map_err(|e| MeasurementError::Assembly(format!("{e}")))?; + + let mut host = DefaultHost::default(); + let processor = FastProcessor::new(StackInputs::default()); + let trace_inputs = processor + .execute_trace_inputs_sync(&program, &mut host) + .map_err(|e| MeasurementError::Execution(format!("{e}")))?; + let trace = + build_trace(trace_inputs).map_err(|e| MeasurementError::TraceBuild(format!("{e}")))?; + let summary = trace.trace_len_summary(); + let chiplets = summary.chiplets_trace_len(); + + let breakdown = TraceBreakdown { + hasher_rows: chiplets.hash_chiplet_len() as u64, + bitwise_rows: chiplets.bitwise_chiplet_len() as u64, + memory_rows: chiplets.memory_chiplet_len() as u64, + kernel_rom_rows: chiplets.kernel_rom_len() as u64, + ace_rows: chiplets.ace_chiplet_len() as u64, + }; + let totals = TraceTotals { + core_rows: summary.main_trace_len() as u64, + chiplets_rows: chiplets.trace_len() as u64, + range_rows: summary.range_trace_len() as u64, + }; + + // Cross-check our derived formulas against the processor's authoritative values; a drift here + // means the AIR-side definitions have moved and the rest of the pipeline will silently + // miscalibrate. 
+ let derived_chiplets = breakdown.chiplets_sum(); + if totals.chiplets_rows != derived_chiplets { + return Err(MeasurementError::InvariantDrift { + quantity: "chiplets_total", + processor: totals.chiplets_rows, + derived: derived_chiplets, + }); + } + let derived_padded = totals.padded_total(); + let processor_padded = summary.padded_trace_len() as u64; + if derived_padded != processor_padded { + return Err(MeasurementError::InvariantDrift { + quantity: "padded_total", + processor: processor_padded, + derived: derived_padded, + }); + } + + Ok(TraceShape::new(totals, breakdown)) +} + +#[derive(Debug, thiserror::Error)] +pub enum MeasurementError { + #[error("failed to assemble program: {0}")] + Assembly(String), + #[error("failed to execute program: {0}")] + Execution(String), + #[error("failed to build trace: {0}")] + TraceBuild(String), + /// One of our derived formulas drifted from the processor's authoritative value; AIR-side + /// definitions have probably changed and the snapshot/verifier formulas need updating. + #[error( + "invariant drift: {quantity} from processor = {processor}, but our derivation = {derived}" + )] + InvariantDrift { + quantity: &'static str, + processor: u64, + derived: u64, + }, +} + +// CALIBRATION +// ------------------------------------------------------------------------ + +/// Per-iteration row rates, kept as `f64` and rounded by the solver. +#[derive(Debug, Clone, Copy, Default)] +pub struct IterCost { + pub core: f64, + pub hasher: f64, + pub bitwise: f64, + pub memory: f64, + pub range: f64, +} + +impl IterCost { + pub fn get(&self, component: Component) -> f64 { + match component { + Component::Core => self.core, + Component::Hasher => self.hasher, + Component::Bitwise => self.bitwise, + Component::Memory => self.memory, + Component::Range => self.range, + } + } +} + +/// Per-snippet rows-per-iter across every tracked component. Cross-terms (e.g. 
a non-zero hasher +/// rate on the `decoder_pad` snippet) are measured so the solver can subtract them. +pub type Calibration = BTreeMap<&'static str, IterCost>; + +/// Run every snippet through a single-point calibration at [`CALIBRATION_ITERS`] and record +/// per-iter cost in each component. +pub fn calibrate() -> Result<Calibration, MeasurementError> { + let mut cal = Calibration::new(); + for snippet in SNIPPETS { + let source = snippets::wrap_program(&snippets::render(snippet, CALIBRATION_ITERS)); + let shape = measure_program(&source)?; + cal.insert(snippet.name, per_iter_cost(shape, CALIBRATION_ITERS)); + } + Ok(cal) +} + +fn per_iter_cost(shape: TraceShape, iters: u64) -> IterCost { + let k = iters as f64; + IterCost { + core: shape.totals.core_rows as f64 / k, + hasher: shape.breakdown.hasher_rows as f64 / k, + bitwise: shape.breakdown.bitwise_rows as f64 / k, + memory: shape.breakdown.memory_rows as f64 / k, + range: shape.totals.range_rows as f64 / k, + } +} + +#[cfg(test)] +mod tests { + use super::*; + + // MEASUREMENT TESTS + // -------------------------------------------------------------------- + + #[test] + fn measures_trivial_program() { + // `measure_program()` already cross-checks our derived totals against the processor's + // authoritative values; this test just smoke-checks basic measurement. 
+ let shape = measure_program("begin push.1 drop end").expect("measure"); + assert!(shape.totals.core_rows > 0, "main trace should include framing rows"); + assert!(shape.totals.padded_total().is_power_of_two()); + } + + #[test] + fn hperm_adds_rows_beyond_baseline() { + let baseline = measure_program("begin push.1 drop end").expect("baseline"); + let with_hperm = + measure_program("begin padw padw padw hperm dropw dropw dropw end").expect("hperm"); + assert!( + with_hperm.breakdown.hasher_rows > baseline.breakdown.hasher_rows, + "hperm should add hasher rows above the baseline ({} vs {})", + with_hperm.breakdown.hasher_rows, + baseline.breakdown.hasher_rows, + ); + } + + // CALIBRATION TESTS + // -------------------------------------------------------------------- + + fn cal() -> Calibration { + calibrate().expect("calibration should succeed") + } + + #[test] + fn every_snippet_has_an_entry() { + let c = cal(); + for snippet in SNIPPETS { + assert!(c.contains_key(snippet.name), "missing calibration for {}", snippet.name); + } + } + + #[test] + fn hasher_snippet_is_hasher_dominant() { + let c = cal(); + let hasher = c["hasher"]; + let pad = c["decoder_pad"]; + assert!( + hasher.hasher > pad.hasher * 10.0, + "hasher/iter ({}) not dominant over decoder_pad leak ({})", + hasher.hasher, + pad.hasher, + ); + } + + #[test] + fn bitwise_snippet_rows_match_op_cycle_len() { + let c = cal(); + let bitwise = c["bitwise"]; + // OP_CYCLE_LEN = 8 per u32 bitwise op. 
+ assert!( + bitwise.bitwise >= 7.5, + "bitwise per-iter ({}) below OP_CYCLE_LEN", + bitwise.bitwise + ); + assert!( + bitwise.bitwise <= 9.0, + "bitwise per-iter ({}) above OP_CYCLE_LEN", + bitwise.bitwise + ); + } + + #[test] + fn memory_snippet_two_rows_per_iter() { + let c = cal(); + let memory = c["memory"]; + assert!(memory.memory >= 1.5, "memory per-iter ({}) too low", memory.memory); + assert!(memory.memory <= 2.5, "memory per-iter ({}) too high", memory.memory); + } + + #[test] + fn u32arith_snippet_drives_range() { + let c = cal(); + let arith = c["u32arith"]; + let pad = c["decoder_pad"]; + assert!( + arith.range > pad.range * 5.0, + "u32arith range/iter ({}) should dominate the baseline decoder_pad range/iter ({})", + arith.range, + pad.range, + ); + } + + #[test] + fn decoder_pad_is_core_dominant() { + let c = cal(); + let pad = c["decoder_pad"]; + assert!(pad.core > 1.0, "decoder_pad core/iter should be > 1.0"); + assert!( + pad.core > pad.hasher * 3.0, + "decoder_pad core ({}) should dominate hasher ({})", + pad.core, + pad.hasher, + ); + } +} diff --git a/benches/synthetic-tx-kernel/src/lib.rs b/benches/synthetic-tx-kernel/src/lib.rs new file mode 100644 index 0000000000..827a2357a0 --- /dev/null +++ b/benches/synthetic-tx-kernel/src/lib.rs @@ -0,0 +1,18 @@ +//! Synthetic benchmark generator for VM-level transaction-load regression tests. +//! +//! Given a snapshot captured from a representative transaction in `protocol`, this crate calibrates +//! a small catalog of MASM snippets against the current VM, solves for iteration counts, emits a +//! synthetic program, and verifies that the program lands in the target core/chiplets padded +//! brackets. +//! +//! The snapshot schema has two tiers: +//! - `trace`: hard totals (`core_rows`, `chiplets_rows`, `range_rows`) +//! - `shape`: advisory per-chiplet breakdown used by the solver +//! +//! See `README.md` for design rationale. 
+ +pub mod calibrator; +pub mod snapshot; +pub mod snippets; +pub mod solver; +pub mod verifier; diff --git a/benches/synthetic-tx-kernel/src/snapshot.rs b/benches/synthetic-tx-kernel/src/snapshot.rs new file mode 100644 index 0000000000..d9a13e6862 --- /dev/null +++ b/benches/synthetic-tx-kernel/src/snapshot.rs @@ -0,0 +1,373 @@ +//! Snapshot schema shared by the protocol-side producer and the VM-side synthetic benchmark. +//! +//! `trace` contains the hard-target totals used by the verifier: +//! - `core_rows` +//! - `chiplets_rows` +//! - `range_rows` +//! +//! `shape` contains an advisory per-chiplet breakdown used by the solver to keep the synthetic +//! workload representative. The loader validates `trace.chiplets_rows == shape.chiplets_sum()`. + +use std::path::Path; + +use serde::{Deserialize, Serialize}; + +/// Schema version this crate understands. +pub const CURRENT_SCHEMA_VERSION: &str = "0"; + +/// Mirrors `miden_air::trace::MIN_TRACE_LEN`. Keep in sync when the processor's minimum padded +/// length changes. +const MIN_TRACE_LEN: u64 = 64; + +/// A snapshot captured from a representative transaction execution. +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct TraceSnapshot { + pub schema_version: String, + pub source: String, + pub timestamp: String, + pub miden_vm_version: String, + /// Hard-target totals. The verifier's bracket check operates on these. + pub trace: TraceTotals, + /// Advisory per-chiplet breakdown used by the solver for shaping. + pub shape: TraceBreakdown, +} + +/// Hard-target aggregates -- the verifier's primary contract. +#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, Eq)] +pub struct TraceTotals { + /// System + decoder + stack trace length. + pub core_rows: u64, + /// Total chiplets trace length, matching `ChipletsLengths::trace_len` in the processor (sum of + /// per-chiplet lengths + 1 mandatory padding row). + pub chiplets_rows: u64, + /// Range-checker trace length. 
Derived from memory + bitwise activity; not independently + /// targeted but tracked so the verifier can warn if it ever dominates. + pub range_rows: u64, +} + +/// Per-chiplet row counts. Advisory only -- the solver uses these to size individual snippets so +/// the synthetic program stays representative (hasher work looks like hasher work, not a pile of +/// decoder-pad), but the verifier does not treat individual values as hard targets. +#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, Eq)] +pub struct TraceBreakdown { + pub hasher_rows: u64, + pub bitwise_rows: u64, + pub memory_rows: u64, + /// Kernel ROM rows. Not drivable from plain MASM; folded into memory. + #[serde(default)] + pub kernel_rom_rows: u64, + /// ACE chiplet rows. Not drivable from plain MASM; folded into memory. Some producer versions + /// may report this as zero until their processor dependency exposes the ACE trace accessor. + #[serde(default)] + pub ace_rows: u64, +} + +/// In-memory bundle used by the solver and verifier; not serialized. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub struct TraceShape { + pub totals: TraceTotals, + pub breakdown: TraceBreakdown, +} + +impl TraceTotals { + /// Padded power-of-two bracket for the non-chiplet side of the trace: + /// `next_pow2(max(core_rows, range_rows))`. Under the current AIR this covers core + /// (system/decoder/stack) and range together; if a future AIR separates them, this accessor + /// can be revisited. + pub fn padded_core_side(&self) -> u64 { + self.core_rows.max(self.range_rows).next_power_of_two().max(MIN_TRACE_LEN) + } + + /// Padded power-of-two bracket for the chiplets side of the trace. + pub fn padded_chiplets(&self) -> u64 { + self.chiplets_rows.next_power_of_two().max(MIN_TRACE_LEN) + } + + /// Single global padded length as reported by the processor's + /// `TraceLenSummary::padded_trace_len`. Used by the calibrator to cross-check our derived + /// formulas against the prover. 
+ pub fn padded_total(&self) -> u64 { + self.core_rows + .max(self.range_rows) + .max(self.chiplets_rows) + .next_power_of_two() + .max(MIN_TRACE_LEN) + } + + /// True iff `range_rows` is the largest unpadded component. + pub fn range_dominates(&self) -> bool { + self.range_rows > self.core_rows && self.range_rows > self.chiplets_rows + } +} + +impl TraceBreakdown { + /// Sum of all chiplet sub-traces plus the mandatory +1 padding row, matching + /// `ChipletsLengths::trace_len` in the processor. Used as the loader's consistency check + /// against `TraceTotals::chiplets_rows`. + pub fn chiplets_sum(&self) -> u64 { + self.hasher_rows + + self.bitwise_rows + + self.memory_rows + + self.kernel_rom_rows + + self.ace_rows + + 1 + } + + /// Memory-row target the solver aims for: snapshot memory plus ACE and kernel_rom (both + /// unreachable from plain MASM) folded in. + pub fn memory_target(&self) -> u64 { + self.memory_rows + self.kernel_rom_rows + self.ace_rows + } + + /// Rows folded into the memory target from unreachable chiplets. + pub fn substituted_rows(&self) -> u64 { + self.kernel_rom_rows + self.ace_rows + } +} + +impl TraceShape { + pub fn new(totals: TraceTotals, breakdown: TraceBreakdown) -> Self { + Self { totals, breakdown } + } +} + +impl TraceSnapshot { + /// Load a snapshot from disk, validate its schema version, and cross-check the internal + /// consistency of the two tiers. 
+ pub fn load(path: impl AsRef) -> Result { + let bytes = std::fs::read(path.as_ref()).map_err(|source| SnapshotError::Io { + path: path.as_ref().display().to_string(), + source, + })?; + let snap: Self = serde_json::from_slice(&bytes).map_err(SnapshotError::Parse)?; + + if snap.schema_version != CURRENT_SCHEMA_VERSION { + return Err(SnapshotError::SchemaVersion { + file: snap.schema_version, + supported: CURRENT_SCHEMA_VERSION, + }); + } + + let expected = snap.shape.chiplets_sum(); + if snap.trace.chiplets_rows != expected { + return Err(SnapshotError::InconsistentChipletsTotal { + from_trace: snap.trace.chiplets_rows, + from_shape: expected, + }); + } + + Ok(snap) + } + + /// Combined target shape that the solver and verifier consume. + pub fn shape(&self) -> TraceShape { + TraceShape::new(self.trace, self.shape) + } +} + +#[derive(Debug, thiserror::Error)] +pub enum SnapshotError { + #[error("failed to read snapshot at {path}: {source}")] + Io { + path: String, + #[source] + source: std::io::Error, + }, + #[error("failed to parse snapshot JSON: {0}")] + Parse(#[source] serde_json::Error), + #[error( + "unsupported snapshot schema version {file:?}; this crate understands version {supported:?}" + )] + SchemaVersion { file: String, supported: &'static str }, + #[error( + "snapshot inconsistency: trace.chiplets_rows = {from_trace} but shape sums to {from_shape}" + )] + InconsistentChipletsTotal { from_trace: u64, from_shape: u64 }, +} + +#[cfg(test)] +mod tests { + use super::*; + + struct CommittedSnapshotExpectation { + file_stem: &'static str, + source: &'static str, + padded_core_side: u64, + padded_chiplets: u64, + } + + const COMMITTED_SNAPSHOT_EXPECTATIONS: &[CommittedSnapshotExpectation] = &[ + CommittedSnapshotExpectation { + file_stem: "consume-single-p2id", + source: "protocol/bench-transaction:consume-single-p2id", + padded_core_side: 131_072, + padded_chiplets: 131_072, + }, + CommittedSnapshotExpectation { + file_stem: "consume-two-p2id", + source: 
"protocol/bench-transaction:consume-two-p2id", + padded_core_side: 131_072, + padded_chiplets: 262_144, + }, + ]; + + fn expectation_for(file_stem: &str) -> Option<&'static CommittedSnapshotExpectation> { + COMMITTED_SNAPSHOT_EXPECTATIONS + .iter() + .find(|expected| expected.file_stem == file_stem) + } + + fn sample_shape() -> (TraceTotals, TraceBreakdown) { + let breakdown = TraceBreakdown { + hasher_rows: 200, + bitwise_rows: 50, + memory_rows: 300, + kernel_rom_rows: 40, + ace_rows: 60, + }; + let totals = TraceTotals { + core_rows: 1000, + chiplets_rows: breakdown.chiplets_sum(), + range_rows: 100, + }; + (totals, breakdown) + } + + #[test] + fn memory_target_folds_ace_and_kernel_rom() { + let (_, b) = sample_shape(); + assert_eq!(b.memory_target(), 400); + assert_eq!(b.substituted_rows(), 100); + // 200 + 50 + 300 + 40 + 60 + 1 padding row = 651 + assert_eq!(b.chiplets_sum(), 651); + } + + #[test] + fn padded_totals_match_processor_formula() { + let (t, _) = sample_shape(); + // max(1000, 100, 651) = 1000 → next pow2 = 1024 + assert_eq!(t.padded_total(), 1024); + // core + range: max(1000, 100) = 1000 → 1024 + assert_eq!(t.padded_core_side(), 1024); + // chiplets alone: 651 → 1024 + assert_eq!(t.padded_chiplets(), 1024); + } + + #[test] + fn padded_total_clamps_to_min_trace_len() { + let totals = TraceTotals { + core_rows: 1, + chiplets_rows: 1, + range_rows: 0, + }; + assert_eq!(totals.padded_total(), MIN_TRACE_LEN); + assert_eq!(totals.padded_core_side(), MIN_TRACE_LEN); + assert_eq!(totals.padded_chiplets(), MIN_TRACE_LEN); + } + + #[test] + fn range_dominates_is_detected() { + let totals = TraceTotals { + core_rows: 100, + chiplets_rows: 200, + range_rows: 500, + }; + assert!(totals.range_dominates()); + let totals = TraceTotals { + core_rows: 500, + chiplets_rows: 200, + range_rows: 100, + }; + assert!(!totals.range_dominates()); + } + + #[test] + fn committed_snapshots_roundtrip() { + let snapshots_dir = 
Path::new(env!("CARGO_MANIFEST_DIR")).join("snapshots"); + let entries = std::fs::read_dir(&snapshots_dir) + .unwrap_or_else(|e| panic!("read {}: {e}", snapshots_dir.display())); + let mut discovered = Vec::new(); + for entry in entries { + let path = entry.expect("dir entry").path(); + if path.extension().and_then(|e| e.to_str()) != Some("json") { + continue; + } + let file_stem = path.file_stem().and_then(|s| s.to_str()).expect("snapshot stem"); + let expected = expectation_for(file_stem) + .unwrap_or_else(|| panic!("unexpected committed snapshot: {}", path.display())); + let snap = TraceSnapshot::load(&path) + .unwrap_or_else(|e| panic!("load {}: {e}", path.display())); + assert_eq!(snap.schema_version, CURRENT_SCHEMA_VERSION); + assert_eq!(snap.source, expected.source); + assert!(snap.trace.core_rows > 0); + assert!(snap.trace.chiplets_rows > 0); + assert_eq!(snap.trace.chiplets_rows, snap.shape.chiplets_sum()); + assert_eq!(snap.trace.padded_core_side(), expected.padded_core_side); + assert_eq!(snap.trace.padded_chiplets(), expected.padded_chiplets); + + let reserialized = serde_json::to_string(&snap).expect("reserialize"); + let roundtripped: TraceSnapshot = + serde_json::from_str(&reserialized).expect("deserialize reserialized"); + assert_eq!(snap.trace, roundtripped.trace); + assert_eq!(snap.shape, roundtripped.shape); + discovered.push(file_stem.to_string()); + } + discovered.sort(); + let mut expected: Vec<_> = COMMITTED_SNAPSHOT_EXPECTATIONS + .iter() + .map(|expected| expected.file_stem) + .collect(); + expected.sort(); + assert_eq!(discovered, expected); + } + + #[test] + fn missing_optional_fields_default_to_zero() { + let minimal = r#"{ + "schema_version": "0", + "source": "test", + "timestamp": "t", + "miden_vm_version": "x", + "trace": { "core_rows": 100, "chiplets_rows": 11, "range_rows": 50 }, + "shape": { "hasher_rows": 10, "bitwise_rows": 0, "memory_rows": 0 } + }"#; + let snap: TraceSnapshot = serde_json::from_str(minimal).expect("parse 
minimal"); + assert_eq!(snap.shape.kernel_rom_rows, 0); + assert_eq!(snap.shape.ace_rows, 0); + } + + #[test] + fn rejects_unsupported_schema_version() { + let wrong = r#"{ + "schema_version": "9999", + "source": "test", + "timestamp": "t", + "miden_vm_version": "x", + "trace": { "core_rows": 100, "chiplets_rows": 11, "range_rows": 0 }, + "shape": { "hasher_rows": 10, "bitwise_rows": 0, "memory_rows": 0 } + }"#; + let tmp = std::env::temp_dir().join("synthetic-tx-kernel-schema-wrong-version.json"); + std::fs::write(&tmp, wrong).unwrap(); + let err = TraceSnapshot::load(&tmp).expect_err("expected schema version rejection"); + assert!(matches!(err, SnapshotError::SchemaVersion { .. })); + let _ = std::fs::remove_file(&tmp); + } + + #[test] + fn rejects_inconsistent_chiplets_total() { + // chiplets_rows says 500 but the breakdown sums to 11 (10 + 0 + 0 + 1). + let mismatched = r#"{ + "schema_version": "0", + "source": "test", + "timestamp": "t", + "miden_vm_version": "x", + "trace": { "core_rows": 100, "chiplets_rows": 500, "range_rows": 0 }, + "shape": { "hasher_rows": 10, "bitwise_rows": 0, "memory_rows": 0 } + }"#; + let tmp = std::env::temp_dir().join("synthetic-tx-kernel-chiplets-mismatch.json"); + std::fs::write(&tmp, mismatched).unwrap(); + let err = TraceSnapshot::load(&tmp).expect_err("expected inconsistency rejection"); + assert!(matches!(err, SnapshotError::InconsistentChipletsTotal { .. })); + let _ = std::fs::remove_file(&tmp); + } +} diff --git a/benches/synthetic-tx-kernel/src/snippets.rs b/benches/synthetic-tx-kernel/src/snippets.rs new file mode 100644 index 0000000000..ecd7ebb18b --- /dev/null +++ b/benches/synthetic-tx-kernel/src/snippets.rs @@ -0,0 +1,228 @@ +//! Static catalog of MASM snippets that drive individual VM components. +//! +//! The patterns are deliberately few and natural -- they mirror work a real transaction does, +//! rather than one synthetic op per chiplet: +//! +//! - **hasher** drives the Poseidon2 chiplet via repeated `hperm`. 
The state evolves between +//! iterations (one `padw padw padw` as setup, no reset), so each permutation has a distinct input +//! and the hasher AIR's multiplicity column does not collapse them. +//! - **bitwise** drives the bitwise chiplet via `u32split + u32xor`. +//! - **u32arith** drives the range checker via two banded u32 counters, advancing each by 65537 per +//! iter so their 16-bit halves evolve as disjoint contiguous bands. `u32assert2` issues fresh +//! range-check values each iter without cross-dedup. +//! - **memory** drives the memory chiplet. The address advances by `4 * 65537 = 262148` per iter so +//! the two 16-bit halves of the word index form disjoint contiguous bands, which also feeds the +//! range chiplet via address-limb decomposition. +//! - **decoder_pad** drives only the core (system/decoder/stack) trace so the solver can top up +//! core-trace budget without adding chiplet rows. +//! +//! Each snippet is partitioned into `setup` / `body` / `cleanup` so that the body alone can be +//! wrapped in a `repeat.N ... end` block. The body must leave stack depth unchanged -- the repeat +//! block would otherwise drift the stack each iteration. +//! +//! Note: on current `next`, decoder-only programs still incur hasher rows due to MAST hashing, so +//! low hasher targets may be unreachable; the solver clamps `hasher` iterations to zero in that +//! case. + +/// A VM component the solver targets. +#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Ord, PartialOrd)] +pub enum Component { + /// System + decoder + stack. + Core, + Hasher, + Bitwise, + Memory, + /// Range checker. Not currently a hard bracket target but the solver still sizes the + /// `u32arith` snippet against it so the synthetic's range workload is representative. + Range, +} + +/// A MASM fragment that dominantly drives one component. +#[derive(Debug, Clone, Copy)] +pub struct Snippet { + pub name: &'static str, + /// Runs once, outside the `repeat.N ... end` block. 
+ pub setup: &'static str, + /// Runs N times inside the repeat block. Must be stack-balanced. + pub body: &'static str, + /// Runs once, outside the repeat block. + pub cleanup: &'static str, + /// The primary component this snippet drives. + pub dominant: Component, +} + +/// The full snippet catalog, in solver order: chiplet drivers first so they saturate their +/// targets, `decoder_pad` last so it absorbs leftover main-trace budget. +pub const SNIPPETS: &[Snippet] = &[ + Snippet { + name: "hasher", + setup: "padw padw padw", + body: "hperm", + cleanup: "dropw dropw dropw", + dominant: Component::Hasher, + }, + Snippet { + name: "bitwise", + setup: "push.1 neg", + body: "u32split u32xor", + cleanup: "drop", + dominant: Component::Bitwise, + }, + Snippet { + name: "u32arith", + // Two u32 counters, each advanced by `65537 = 0x0001_0001` per iter, so their four 16-bit + // half-limbs form disjoint contiguous bands at 45500+i, 50500+i, 55500+i, 60500+i (clear + // of the memory snippet's 4*w1 band, which tops out near 44800). Counters update with + // plain field `add` so only the `u32assert2` emits range-check values -- carry limbs don't + // pollute the bands. See `u32arith_max_iters` for the counter bound. + setup: "push.3637308500 push.2981938500", + body: "u32assert2 push.65537 add swap push.65537 add swap", + cleanup: "drop drop", + dominant: Component::Range, + }, + Snippet { + name: "memory", + // Advance the word-aligned address by `4 * 65537 = 262148` each iter so that both 16-bit + // halves of the word index evolve as disjoint contiguous bands. This makes the memory + // chiplet feed fresh range-check values rather than deduplicating, so the snippet also + // contributes meaningfully to the range chiplet. + // + // The start address `4 * ((10000 << 16) | 20000)` puts the address-limb bands in a high + // region that doesn't overlap u32arith's bands. 
The counter advances with plain field + // `add` so only the memory chiplet emits range-check values -- the update doesn't pollute + // the bands with carry-limb checks. Safe as long as the address stays below `u32::MAX`; + // see `assert_counters_fit`. + setup: "padw push.2621520000", + body: "dup.4 mem_storew_le dup.4 mem_loadw_le movup.4 push.262148 add movdn.4", + cleanup: "drop dropw", + dominant: Component::Memory, + }, + Snippet { + name: "decoder_pad", + setup: "", + body: "swap dup.1 add", + cleanup: "", + dominant: Component::Core, + }, +]; + +/// Look up a snippet by name. Only used by tests. +#[cfg(test)] +pub(crate) fn find(name: &str) -> Option<&'static Snippet> { + SNIPPETS.iter().find(|s| s.name == name) +} + +/// Assemble a single snippet into a complete program body: the setup, followed by +/// `repeat.iters body end`, followed by the cleanup. Returned text has no `begin`/`end` wrapping -- +/// caller composes multiple snippets. +pub fn render(snippet: &Snippet, iters: u64) -> String { + use std::fmt::Write; + let mut out = String::new(); + if !snippet.setup.is_empty() { + writeln!(out, " {}", snippet.setup).unwrap(); + } + if iters > 0 { + writeln!(out, " repeat.{iters}").unwrap(); + writeln!(out, " {}", snippet.body).unwrap(); + writeln!(out, " end").unwrap(); + } + if !snippet.cleanup.is_empty() { + writeln!(out, " {}", snippet.cleanup).unwrap(); + } + out +} + +/// Wrap a snippet fragment into a complete `begin ... end` program. +pub fn wrap_program(body: &str) -> String { + format!("begin\n{body}end\n") +} + +// COUNTER SAFETY +// ------------------------------------------------------------------------ +// +// Both `u32arith` and `memory` advance a counter with plain field `add`, so `u32assert2` / the +// memory chiplet would fail at runtime if the counter crossed `u32::MAX`. These helpers expose the +// per-snippet limits so callers can validate a plan before emitting. 
+ +/// Starting value of `u32arith`'s higher counter -- the one closer to `u32::MAX` and therefore the +/// binding constraint. +const U32ARITH_COUNTER_START: u64 = 3_637_308_500; +const U32ARITH_COUNTER_STRIDE: u64 = 65_537; + +/// Starting value of `memory`'s address counter. +const MEMORY_COUNTER_START: u64 = 2_621_520_000; +const MEMORY_COUNTER_STRIDE: u64 = 262_148; + +const U32_MAX: u64 = u32::MAX as u64; + +/// Maximum iterations of `u32arith` before the `y` counter would exceed `u32::MAX` and trip the +/// next iteration's `u32assert2`. +pub fn u32arith_max_iters() -> u64 { + (U32_MAX - U32ARITH_COUNTER_START) / U32ARITH_COUNTER_STRIDE +} + +/// Maximum iterations of `memory` before the address counter would exceed `u32::MAX`. +pub fn memory_max_iters() -> u64 { + (U32_MAX - MEMORY_COUNTER_START) / MEMORY_COUNTER_STRIDE +} + +#[cfg(test)] +mod tests { + use miden_vm::Assembler; + + use super::*; + + #[test] + fn catalog_has_one_snippet_per_solver_component() { + let targets = [ + Component::Core, + Component::Hasher, + Component::Bitwise, + Component::Memory, + Component::Range, + ]; + for target in targets { + let count = SNIPPETS.iter().filter(|s| s.dominant == target).count(); + assert_eq!(count, 1, "expected exactly one snippet for {target:?}"); + } + } + + #[test] + fn each_snippet_assembles_as_a_standalone_program() { + // Fail fast: if a snippet has malformed MASM, the calibrator will blow up at bench time. + // Catch it in unit tests instead. + for snippet in SNIPPETS { + let source = wrap_program(&render(snippet, 4)); + Assembler::default() + .assemble_program(&source) + .unwrap_or_else(|e| panic!("snippet {:?} failed to assemble: {e}", snippet.name)); + } + } + + #[test] + fn counter_limits_cover_realistic_plans() { + // Realistic plans for consume/create P2ID transactions produce ~5k u32arith iters and + // ~1.2k memory iters. These guards have plenty of headroom for that regime. 
+ assert!(u32arith_max_iters() >= 10_000); + assert!(memory_max_iters() >= 6_000); + } + + #[test] + fn render_emits_repeat_with_body() { + let snippet = find("bitwise").expect("bitwise snippet"); + let out = render(snippet, 42); + assert!(out.contains("repeat.42")); + assert!(out.contains("u32split u32xor")); + assert!(out.contains("push.1 neg")); + assert!(out.contains("drop")); + } + + #[test] + fn render_with_zero_iters_still_emits_setup_and_cleanup() { + let snippet = find("hasher").expect("hasher snippet"); + let out = render(snippet, 0); + assert!(out.contains("padw padw padw")); + assert!(out.contains("dropw dropw dropw")); + assert!(!out.contains("repeat.")); + } +} diff --git a/benches/synthetic-tx-kernel/src/solver.rs b/benches/synthetic-tx-kernel/src/solver.rs new file mode 100644 index 0000000000..91131d11ff --- /dev/null +++ b/benches/synthetic-tx-kernel/src/solver.rs @@ -0,0 +1,217 @@ +//! Solve for per-snippet iteration counts and emit the MASM program. +//! +//! The calibration matrix is close to diagonally dominant: each snippet primarily drives one +//! component and leaks small cross-terms into the others. A short Jacobi refinement with a +//! non-negativity clamp is enough for this problem size and handles infeasible targets gracefully. + +use std::collections::BTreeMap; + +use crate::{ + calibrator::Calibration, + snapshot::TraceShape, + snippets::{self, Component, SNIPPETS}, +}; + +/// Small fixed-point refinement count. The calibrated systems in this crate converge in a few +/// passes, and the clamp keeps iteration counts non-negative. +const REFINEMENT_PASSES: usize = 8; + +/// Iteration counts per snippet, ready to hand to the emitter. +/// +/// Implemented as a sparse map where absence means "zero iterations"; the newtype hides the +/// `unwrap_or(0)` convention behind [`Plan::iters`] so call sites can't forget it. 
+#[derive(Debug, Default, Clone)] +pub struct Plan { + entries: BTreeMap<&'static str, u64>, +} + +impl Plan { + pub fn new() -> Self { + Self::default() + } + + /// Iteration count for `name`, or 0 if the snippet has no entry. + pub fn iters(&self, name: &str) -> u64 { + self.entries.get(name).copied().unwrap_or(0) + } + + /// Set the iteration count for `name`, removing the entry entirely when `n == 0` so that + /// `iters() == 0` is equivalent to the entry being absent. + pub fn set(&mut self, name: &'static str, n: u64) { + if n == 0 { + self.entries.remove(name); + } else { + self.entries.insert(name, n); + } + } + + /// Increment the iteration count for `name` by `delta`. + pub fn add(&mut self, name: &'static str, delta: u64) { + if delta == 0 { + return; + } + self.set(name, self.iters(name) + delta); + } + + /// Decrement the iteration count for `name` by `delta`, saturating at zero. + pub fn sub_saturating(&mut self, name: &'static str, delta: u64) { + self.set(name, self.iters(name).saturating_sub(delta)); + } +} + +/// Solve for the iteration counts that reproduce `target`'s per-component row counts. Per-chiplet +/// targets come from the snapshot's advisory `shape` breakdown -- the solver uses them to keep the +/// synthetic program representative, but the verifier only hard-asserts totals/brackets. 
+pub fn solve(calibration: &Calibration, target: &TraceShape) -> Plan { + let mut iters: BTreeMap<&'static str, f64> = + SNIPPETS.iter().map(|s| (s.name, 0.0_f64)).collect(); + + let component_target = |c: Component| -> f64 { + match c { + Component::Core => target.totals.core_rows as f64, + Component::Hasher => target.breakdown.hasher_rows as f64, + Component::Bitwise => target.breakdown.bitwise_rows as f64, + Component::Memory => target.breakdown.memory_target() as f64, + Component::Range => target.totals.range_rows as f64, + } + }; + + for _ in 0..REFINEMENT_PASSES { + let snapshot = iters.clone(); + for snippet in SNIPPETS { + let cost = match calibration.get(snippet.name) { + Some(c) => *c, + None => continue, + }; + let rate = cost.get(snippet.dominant); + if rate <= 0.0 { + continue; + } + let target_rows = component_target(snippet.dominant); + let cross_rows: f64 = SNIPPETS + .iter() + .filter(|s| s.name != snippet.name) + .map(|s| { + let other = + calibration.get(s.name).map(|c| c.get(snippet.dominant)).unwrap_or(0.0); + other * snapshot[s.name] + }) + .sum(); + let needed = (target_rows - cross_rows).max(0.0); + iters.insert(snippet.name, needed / rate); + } + } + + let mut plan = Plan::new(); + for (name, v) in iters { + plan.set(name, v.round().max(0.0) as u64); + } + plan +} + +/// Render the plan as a single `begin ... end` program. 
+pub fn emit(plan: &Plan) -> String { + use std::fmt::Write; + let mut body = String::new(); + for snippet in SNIPPETS { + let n = plan.iters(snippet.name); + if n == 0 { + continue; + } + write!(body, "{}", snippets::render(snippet, n)).unwrap(); + } + snippets::wrap_program(&body) +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::{ + calibrator::{calibrate, measure_program}, + snapshot::{TraceBreakdown, TraceTotals}, + }; + + fn shape_of( + core_rows: u64, + range_rows: u64, + hasher: u64, + bitwise: u64, + memory: u64, + ) -> TraceShape { + let breakdown = TraceBreakdown { + hasher_rows: hasher, + bitwise_rows: bitwise, + memory_rows: memory, + kernel_rom_rows: 0, + ace_rows: 0, + }; + let totals = TraceTotals { + core_rows, + chiplets_rows: breakdown.chiplets_sum(), + range_rows, + }; + TraceShape::new(totals, breakdown) + } + + fn low_hasher_target() -> TraceShape { + // core/hasher ratio of ~8, well below the intrinsic core/4 floor. Memory kept modest + // (ratio core/memory ~30) so the test exercises the hasher-feasibility path without making + // it infeasible via memory overshoot into core. + shape_of(68900, 40000, 8200, 0, 2300) + } + + fn high_hasher_target() -> TraceShape { + // main/hasher ratio of ~2, above the intrinsic main/4 floor. 
+ shape_of(16000, 0, 8000, 0, 0) + } + + #[test] + fn low_hasher_target_does_not_add_hperm() { + let cal = calibrate().expect("calibrate"); + let plan = solve(&cal, &low_hasher_target()); + assert_eq!( + plan.iters("hasher"), + 0, + "when the decoder (via memory + pad) already overshoots the hasher target, no hperm iterations should be added", + ); + assert!(plan.iters("memory") > 0); + } + + #[test] + fn high_hasher_target_requires_hperm() { + let cal = calibrate().expect("calibrate"); + let plan = solve(&cal, &high_hasher_target()); + assert!( + plan.iters("hasher") > 0, + "a hasher target above the main/4 floor should require hperm iterations", + ); + } + + #[test] + fn emitted_program_matches_padded_bracket() { + let cal = calibrate().expect("calibrate"); + let target = low_hasher_target(); + let plan = solve(&cal, &target); + let source = emit(&plan); + let actual = measure_program(&source).expect("measure emitted program"); + assert_eq!( + actual.totals.padded_total(), + target.totals.padded_total(), + "padded trace length must match target bracket (got {} vs {})", + actual.totals.padded_total(), + target.totals.padded_total(), + ); + } + + #[test] + fn zero_target_yields_empty_program() { + let cal = calibrate().expect("calibrate"); + let target = shape_of(0, 0, 0, 0, 0); + let plan = solve(&cal, &target); + for snippet in SNIPPETS { + assert_eq!(plan.iters(snippet.name), 0, "{}", snippet.name); + } + let source = emit(&plan); + assert_eq!(source.trim(), "begin\nend"); + } +} diff --git a/benches/synthetic-tx-kernel/src/verifier.rs b/benches/synthetic-tx-kernel/src/verifier.rs new file mode 100644 index 0000000000..ef4e78e657 --- /dev/null +++ b/benches/synthetic-tx-kernel/src/verifier.rs @@ -0,0 +1,300 @@ +//! Verification helpers for synthetic-trace matching. +//! +//! Hard checks: +//! - `padded_core_side(actual) == padded_core_side(target)` -- the current AIR's non-chiplets-side +//! bracket, `next_pow2(max(core_rows, range_rows))`. 
//! If a future AIR split gives range its own
//! segment, this check can be revised to assert separate brackets.
//! - `padded_chiplets(actual) == padded_chiplets(target)`
//!
//! Soft reporting:
//! - unpadded totals (`core_rows`, `chiplets_rows`) within [`PER_COMPONENT_TOLERANCE`]
//! - advisory breakdown deltas (info only)
//! - warning if `range_rows` dominates

use std::fmt::{self, Display};

use crate::snapshot::TraceShape;

/// Reporting tolerance for unpadded totals; never used for pass/fail.
pub const PER_COMPONENT_TOLERANCE: f64 = 0.02;

/// Result of comparing an emitted program's measured shape against the snapshot target.
///
/// Pass/fail is decided solely by [`Self::brackets_match`]; the per-component delta
/// vectors only feed the human-readable [`Display`] report.
#[derive(Debug, Clone)]
pub struct VerificationReport {
    pub target: TraceShape,
    pub actual: TraceShape,
    /// Deltas for the top-level totals (core / chiplets / range rows).
    pub total_deltas: Vec<ComponentDelta>,
    /// Deltas for the per-chiplet breakdown (hasher / bitwise / memory).
    pub breakdown_deltas: Vec<ComponentDelta>,
}

/// How a row-count entry participates in the verifier's reporting.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum DeltaStatus {
    /// Prints `ok` if within tolerance, `out` otherwise.
    Enforced,
    /// Always prints `info`; used for rows the solver does not target.
    Informational,
}

/// Per-row-count comparison.
#[derive(Debug, Clone, Copy)]
pub struct ComponentDelta {
    pub name: &'static str,
    pub target: u64,
    pub actual: u64,
    /// Signed relative delta `(actual - target) / target`; `f64::INFINITY` when the
    /// target is 0 but the actual is not (rendered as "+∞" by the report).
    pub delta_pct: f64,
    pub within_tolerance: bool,
    pub status: DeltaStatus,
}

impl VerificationReport {
    /// Builds a report by comparing every tracked row count of `actual` against `target`.
    pub fn new(target: TraceShape, actual: TraceShape) -> Self {
        // (name, target rows, actual rows, reporting status) tuples, in display order.
        let total_rows: &[(&'static str, u64, u64, DeltaStatus)] = &[
            (
                "core_rows",
                target.totals.core_rows,
                actual.totals.core_rows,
                DeltaStatus::Enforced,
            ),
            (
                "chiplets_rows",
                target.totals.chiplets_rows,
                actual.totals.chiplets_rows,
                DeltaStatus::Enforced,
            ),
            (
                // range_rows is derived, not independently driven.
                "range_rows",
                target.totals.range_rows,
                actual.totals.range_rows,
                DeltaStatus::Informational,
            ),
        ];
        let breakdown_rows: &[(&'static str, u64, u64, DeltaStatus)] = &[
            (
                "hasher",
                target.breakdown.hasher_rows,
                actual.breakdown.hasher_rows,
                DeltaStatus::Informational,
            ),
            (
                "bitwise",
                target.breakdown.bitwise_rows,
                actual.breakdown.bitwise_rows,
                DeltaStatus::Informational,
            ),
            (
                // NOTE(review): asymmetric accessors — target side uses memory_target()
                // while actual side uses raw memory_rows. Presumably intentional (the
                // solver drives toward the derived target, the measurement reports raw
                // rows), but confirm this isn't meant to be target.breakdown.memory_rows.
                "memory",
                target.breakdown.memory_target(),
                actual.breakdown.memory_rows,
                DeltaStatus::Informational,
            ),
        ];
        Self {
            target,
            actual,
            total_deltas: total_rows.iter().map(|r| component_delta(*r)).collect(),
            breakdown_deltas: breakdown_rows.iter().map(|r| component_delta(*r)).collect(),
        }
    }

    /// True when both padded proxies match their targets exactly.
    pub fn brackets_match(&self) -> bool {
        self.target.totals.padded_core_side() == self.actual.totals.padded_core_side()
            && self.target.totals.padded_chiplets() == self.actual.totals.padded_chiplets()
    }

    /// True if `range_rows` is the largest unpadded component in either side, which means snippet
    /// balance should be revisited.
    pub fn range_dominates(&self) -> bool {
        self.target.totals.range_dominates() || self.actual.totals.range_dominates()
    }
}

/// Computes the signed relative delta for one row count and classifies it against
/// [`PER_COMPONENT_TOLERANCE`]. Tolerance classification is reporting-only.
fn component_delta((name, t, a, status): (&'static str, u64, u64, DeltaStatus)) -> ComponentDelta {
    let delta_pct = if t == 0 {
        // Zero target: exact match is 0%, any nonzero actual is an unbounded overshoot.
        if a == 0 { 0.0 } else { f64::INFINITY }
    } else {
        (a as f64 - t as f64) / t as f64
    };
    ComponentDelta {
        name,
        target: t,
        actual: a,
        delta_pct,
        within_tolerance: delta_pct.abs() <= PER_COMPONENT_TOLERANCE,
        status,
    }
}

impl Display for VerificationReport {
    /// Renders the three report sections: hard padded brackets, soft totals, and the
    /// informational breakdown, followed by a MATCH/MISS verdict line and an optional
    /// range-domination warning.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        writeln!(f, "-- hard brackets (padded power-of-two) --")?;
        write_bracket_row(
            f,
            "padded_core_side",
            self.target.totals.padded_core_side(),
            self.actual.totals.padded_core_side(),
        )?;
        write_bracket_row(
            f,
            "padded_chiplets",
            self.target.totals.padded_chiplets(),
            self.actual.totals.padded_chiplets(),
        )?;

        writeln!(f, "\n-- totals (soft: {:.0}% band) --", PER_COMPONENT_TOLERANCE * 100.0)?;
        write_delta_header(f)?;
        for d in &self.total_deltas {
            write_delta_row(f, d)?;
        }

        writeln!(f, "\n-- breakdown (info) --")?;
        write_delta_header(f)?;
        for d in &self.breakdown_deltas {
            write_delta_row(f, d)?;
        }

        writeln!(f)?;
        if self.brackets_match() {
            writeln!(f, "=> BRACKET MATCH")?;
        } else {
            writeln!(f, "=> BRACKET MISS")?;
        }
        if self.range_dominates() {
            writeln!(
                f,
                "!! WARNING: range_rows dominates — \"ignore range\" assumption is breaking"
            )?;
        }
        Ok(())
    }
}

/// Writes the column header shared by the totals and breakdown tables.
/// Column widths must stay in sync with `write_delta_row`.
fn write_delta_header(f: &mut fmt::Formatter<'_>) -> fmt::Result {
    writeln!(
        f,
        "{:<16} {:>12} {:>12} {:>10} status",
        "component", "target", "actual", "delta"
    )
}

/// Writes one table row for a [`ComponentDelta`], mapping its status to
/// `ok`/`out` (enforced) or `info` (informational).
fn write_delta_row(f: &mut fmt::Formatter<'_>, d: &ComponentDelta) -> fmt::Result {
    let delta_str = if d.delta_pct.is_finite() {
        format!("{:+6.2}%", d.delta_pct * 100.0)
    } else {
        // Zero target with nonzero actual — see component_delta.
        "+∞".to_string()
    };
    let status = match d.status {
        DeltaStatus::Enforced => {
            if d.within_tolerance {
                "ok"
            } else {
                "out"
            }
        },
        DeltaStatus::Informational => "info",
    };
    writeln!(
        f,
        "{:<16} {:>12} {:>12} {:>10} {}",
        d.name, d.target, d.actual, delta_str, status
    )
}

/// Writes one hard-bracket row; brackets are compared for exact equality only.
fn write_bracket_row(
    f: &mut fmt::Formatter<'_>,
    name: &str,
    target: u64,
    actual: u64,
) -> fmt::Result {
    let ok = if target == actual { "==" } else { "MISS" };
    writeln!(f, "{name:<16} {target:>12} {actual:>12} {ok:>10}")
}

#[cfg(test)]
mod tests {
    use super::*;
    use crate::snapshot::{TraceBreakdown, TraceTotals};

    /// Builds a minimal shape: only hasher/memory chiplets populated, chiplets total
    /// derived via chiplets_sum(), and range_rows fixed at 0.
    fn shape(core: u64, hasher: u64, memory: u64) -> TraceShape {
        let breakdown = TraceBreakdown {
            hasher_rows: hasher,
            bitwise_rows: 0,
            memory_rows: memory,
            kernel_rom_rows: 0,
            ace_rows: 0,
        };
        let totals = TraceTotals {
            core_rows: core,
            chiplets_rows: breakdown.chiplets_sum(),
            range_rows: 0,
        };
        TraceShape::new(totals, breakdown)
    }

    #[test]
    fn exact_match_is_bracket_ok_and_all_within_tolerance() {
        let t = shape(68000, 8000, 12000);
        let r = VerificationReport::new(t, t);
        assert!(r.brackets_match());
        assert!(r.total_deltas.iter().all(|d| d.within_tolerance));
        assert!(r.breakdown_deltas.iter().all(|d| d.within_tolerance));
    }

    #[test]
    fn bracket_miss_is_reported_when_core_bracket_differs() {
        // target.core=68000 → 131072; actual.core=30000 → 32768 (different bracket)
        let target = shape(68000, 8000, 12000);
        let actual = shape(30000, 2000, 1000);
        let r = VerificationReport::new(target, actual);
        assert!(!r.brackets_match());
        assert!(r.to_string().contains("BRACKET MISS"));
    }

    #[test]
    fn chiplets_bracket_can_miss_independently_of_core() {
        // core is the same (same padded bracket); chiplets_rows lands in different brackets.
        // target chiplets = 8000 + 12000 + 1 = 20001 → 32768
        // actual chiplets = 20000 + 30000 + 1 = 50001 → 65536
        let target = shape(40000, 8000, 12000);
        let actual = shape(40000, 20000, 30000);
        let r = VerificationReport::new(target, actual);
        // padded_core_side: both 40000 → 65536 (same)
        assert_eq!(target.totals.padded_core_side(), actual.totals.padded_core_side());
        // padded_chiplets differs
        assert_ne!(target.totals.padded_chiplets(), actual.totals.padded_chiplets());
        assert!(!r.brackets_match());
    }

    #[test]
    fn range_dominates_is_warned() {
        // range_rows (500) exceeds both core (100) and chiplets, so the report must
        // carry the domination warning.
        let breakdown = TraceBreakdown {
            hasher_rows: 100,
            bitwise_rows: 0,
            memory_rows: 0,
            kernel_rom_rows: 0,
            ace_rows: 0,
        };
        let totals = TraceTotals {
            core_rows: 100,
            chiplets_rows: breakdown.chiplets_sum(),
            range_rows: 500,
        };
        let t = TraceShape::new(totals, breakdown);
        let r = VerificationReport::new(t, t);
        assert!(r.range_dominates());
        assert!(r.to_string().contains("range_rows dominates"));
    }

    #[test]
    fn per_component_overshoot_stays_within_bracket() {
        // Hasher overshoots but both core and chiplets stay within their brackets.
        let target = shape(68000, 8000, 12000);
        let actual = shape(68000, 14000, 12000);
        let r = VerificationReport::new(target, actual);
        assert!(r.brackets_match());
        let hasher_delta = r.breakdown_deltas.iter().find(|d| d.name == "hasher").unwrap();
        assert!(!hasher_delta.within_tolerance);
    }
}