diff --git a/CHANGELOG.md b/CHANGELOG.md index 4f4dccda3a..2c66c12796 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -18,6 +18,7 @@ #### Changes - Documented sortedness precondition more prominently for sorted array operations ([#2832](https://github.com/0xMiden/miden-vm/pull/2832)). +- Added a synthetic transaction-kernel Criterion benchmark for VM-level proving regression detection ([#3024](https://github.com/0xMiden/miden-vm/pull/3024)). - Borrowed operation slices in basic-block batching helpers to avoid cloning in the fingerprinting path ([#2994](https://github.com/0xMiden/miden-vm/pull/2994)). - [BREAKING] Sync execution and proving APIs now require `SyncHost`; async `Host`, `execute`, and `prove` remain available ([#2865](https://github.com/0xMiden/miden-vm/pull/2865)). - [BREAKING] `miden_processor::execute()` and `execute_sync()` now return `ExecutionOutput`; trace building remains explicit via `execute_trace_inputs*()` and `trace::build_trace()` ([#2865](https://github.com/0xMiden/miden-vm/pull/2865)). 
diff --git a/Cargo.lock b/Cargo.lock index c1a326829f..d99ffde2c6 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1607,6 +1607,19 @@ dependencies = [ "p3-goldilocks", ] +[[package]] +name = "miden-synthetic-tx-kernel" +version = "0.23.0" +dependencies = [ + "criterion", + "miden-core", + "miden-processor", + "miden-vm", + "serde", + "serde_json", + "thiserror", +] + [[package]] name = "miden-test-serde-macros" version = "0.1.0" diff --git a/Cargo.toml b/Cargo.toml index bb188afcb4..f5128b6fae 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,7 @@ [workspace] members = [ "air", + "benches/synthetic-tx-kernel", "core", "crates/ace-codegen", "crates/assembly", diff --git a/benches/synthetic-tx-kernel/Cargo.toml b/benches/synthetic-tx-kernel/Cargo.toml new file mode 100644 index 0000000000..72e285b977 --- /dev/null +++ b/benches/synthetic-tx-kernel/Cargo.toml @@ -0,0 +1,27 @@ +[package] +name = "miden-synthetic-tx-kernel" +version.workspace = true +edition.workspace = true +license.workspace = true +authors.workspace = true +homepage.workspace = true +repository.workspace = true +publish = false + +[lints] +workspace = true + +[dependencies] +miden-core = { workspace = true } +miden-processor = { workspace = true, features = ["concurrent"] } +miden-vm = { path = "../../miden-vm", features = ["concurrent"] } +serde = { workspace = true, features = ["std"] } +serde_json = { workspace = true, features = ["std"] } +thiserror = { workspace = true, features = ["std"] } + +[dev-dependencies] +criterion = { workspace = true } + +[[bench]] +name = "synthetic_kernel" +harness = false diff --git a/benches/synthetic-tx-kernel/README.md b/benches/synthetic-tx-kernel/README.md new file mode 100644 index 0000000000..2137037edf --- /dev/null +++ b/benches/synthetic-tx-kernel/README.md @@ -0,0 +1,213 @@ +# miden-synthetic-tx-kernel + +Criterion benchmark that reproduces the **proving-cost brackets** of a real +Miden transaction workload from a small JSON snapshot -- without pulling in 
+any protocol-level code. + +## Approach + +STARK proving cost is dominated by the padded power-of-two lengths of the +execution trace's segments. Everything else -- per-chiplet row counts, +instruction mix, which procedures get called -- is second-order once the +brackets are known. + +This crate takes a snapshot of per-segment trace-row counts captured from +a real transaction in the `protocol` repo, generates a tiny MASM program +whose execution reproduces those brackets, and runs `execute` + +`execute_and_prove` Criterion groups against it. The result is a +VM-level regression detector that isolates *prover* changes from +*workload* changes without depending on protocol's transaction +machinery. + +## Pipeline (per bench run) + +Each bench invocation rebuilds every synthetic program from scratch, +so the numbers always reflect the current commit's VM -- there are no +stale calibration constants checked into the repo. + +1. **Calibrate (once)** -- run each MASM snippet as `repeat.K ...` and + divide the resulting per-component row counts by `K` to learn how + many core/hasher/memory/... rows a single iteration costs *on this + VM*. Running this on every bench invocation is what keeps the + bench honest across VM changes: if `hperm` gets cheaper tomorrow, + tomorrow's iteration count grows to compensate, and the target + bracket is still hit. + +For each snapshot under `snapshots/` (or the single path in +`SYNTH_SNAPSHOT`): + +2. **Load snapshot** -- read the target row counts; this is the shape + we want the emitted program to reproduce. See + [Snapshot format](#snapshot-format). +3. **Solve** -- pick an iteration count for each snippet so that their + combined row contributions add up to the snapshot's target. We do + this by fixed-point refinement: start from zero, and on each pass + update every snippet's count from the current guesses of the others, + clamping negatives to zero. 
A handful of passes is enough because + each snippet is designed to drive mostly *one* component, so the + counts barely depend on each other and the sweep converges quickly. + (For the linear-algebra reader: this is Jacobi iteration on a + near-diagonal matrix with a non-negativity projection.) +4. **Emit** -- wrap each snippet's body in a `repeat.N ... end` block, + concatenate, and enclose in `begin ... end`. The output is the MASM + program that Criterion actually runs. +5. **Verify** -- execute the emitted program, measure its real row + counts, and assert that `padded_core_side` and `padded_chiplets` + match the snapshot's. A bracket miss fails the bench; smaller drift + inside the same bracket is reported but tolerated, because proving + cost is driven by the padded length, not the raw count. + +## Snippets + +Five patterns cover every component the solver targets: + +| Snippet | Body | Drives | +|---------------|----------------------------------------------|-------------------------------| +| `hasher` | `hperm` | Poseidon2 hasher chiplet | +| `bitwise` | `u32split u32xor` | bitwise chiplet | +| `u32arith` | `u32assert2 push.65537 add swap push.65537 add swap` | range chiplet | +| `memory` | `dup.4 mem_storew_le dup.4 mem_loadw_le movup.4 push.262148 add movdn.4` | memory chiplet | +| `decoder_pad` | `swap dup.1 add` | core (decoder + stack) | + +`u32arith` and `memory` use banded counters (strides of 65537 and +262148) so that their 16-bit limbs form disjoint contiguous bands, +keeping the range chiplet from deduplicating limb values across +iterations. + +The solver has no snippets targeting the ACE or kernel-ROM chiplets. + +- **ACE** is reachable from plain MASM, but exercising it requires + building an arithmetic circuit and preparing a memory region for its + READ section -- more setup than the other snippets warrant, and not + currently done here. 
+- **Kernel-ROM** rows are a small, near-constant contribution in + practice, so we simplify by folding them into the memory target + rather than driving them directly. + +Since snapshots still carry row counts for both, they're **folded into +the memory target** -- growing memory ops preserves the overall +chiplet-trace length and therefore the chiplet bracket. + +One producer-side caveat: the consumer can measure `ace_chiplet_len()` +when it runs synthetic programs, but protocol snapshots may report +`ace_rows: 0` until the protocol-side `miden-processor` dependency +exposes that accessor. Treat zero ACE rows in a snapshot as a producer +visibility limitation, not as proof that the VM emitted no ACE rows. + +## Snapshot format + +Two-tier: **hard-contract totals** in `trace`, **advisory breakdown** in +`shape`. The loader validates `trace.chiplets_rows == sum(shape) + 1`. +Schema version is currently `"0"`. + +```json +{ + "schema_version": "0", + "source": "protocol/bench-transaction:consume-single-p2id", + "timestamp": "unix-1776428820", + "miden_vm_version": "0.22", + "trace": { + "core_rows": 77699, + "chiplets_rows": 123129, + "range_rows": 20203 + }, + "shape": { + "hasher_rows": 120352, + "bitwise_rows": 416, + "memory_rows": 2297, + "kernel_rom_rows": 63, + "ace_rows": 0 + } +} +``` + +Snapshots live in `snapshots/`. The bench loads every `*.json` file in +that directory and runs one Criterion group per snapshot, named +`synthetic_transaction_kernel/<snapshot-stem>`. Set +`SYNTH_SNAPSHOT=/path/to/file.json` to bench a single snapshot +instead. + +## Verifier contract + +Once the emitted program has run, the verifier compares its actual +row counts against the snapshot's targets and decides whether the +bench passed. The checks come in three tiers -- **hard**, **soft**, +and **info** -- graded by how directly each number maps to proving +cost. There's also one free-standing **warning** for snippet-balance +regressions. 
+ +### Hard checks -- fail the bench + +Proving cost is dominated by the padded (power-of-two) length of each +trace segment, not by the raw row count. So the only assertions that +can fail the bench are on two padded proxies: + +- `padded_core_side = next_pow2(max(core_rows, range_rows))` -- the + non-chiplets side of the AIR. +- `padded_chiplets = next_pow2(chiplets_rows)`. + +These two can land in *different* brackets on the same workload -- +`consume-two-p2id`, for example, has `padded_core_side = 131072` but +`padded_chiplets = 262144`. Checking them independently catches a +bracket miss on either side that a single global `padded_total` +check would hide. + +### Soft checks -- report, don't fail + +`core_rows` and `chiplets_rows` are compared against the targets +within a 2% band. A drift inside that band is harmless for proving +cost (same bracket either way), so the bench only reports it. A +drift that *crosses* a bracket is already caught by the hard tier +above, so this tier exists purely to surface in-bracket near-misses +worth noticing. + +### Info -- no judgement + +Per-chiplet deltas (hasher/bitwise/memory/...) from `shape` are +printed for visibility but never asserted. Some divergence is +unavoidable: MAST hashing at program init contributes hasher rows +that the synthetic program can't suppress, so a snapshot with +`core_rows / hasher_rows > 4` cannot be per-chiplet-matched even +though it still matches both padded brackets. See `src/snippets.rs` +for the cases where this structural mismatch shows up. + +### Warning -- range dominates + +If `range_rows` turns out to be the largest unpadded component in +either the target or the actual shape, the bench prints a warning. +The solver treats range as a derived quantity driven mostly by u32 +arithmetic; if it starts setting the bracket, snippet balance has +drifted and should be revisited. 
+ ## Refreshing snapshots from protocol + +The producer lives in the `protocol` repo as +`bin/bench-transaction/src/bin/tx-trace-snapshot.rs` and emits one +JSON per scenario under `bin/bench-transaction/snapshots/`. Flow: + +1. In `protocol`: `cargo run --release -p bench-transaction --bin tx-trace-snapshot` +2. Copy the regenerated JSONs over the same-named files in + `miden-vm/benches/synthetic-tx-kernel/snapshots/`. +3. Run `cargo bench -p miden-synthetic-tx-kernel` and verify + `=> BRACKET MATCH` for every snapshot in the printed verifier + tables. + +Snapshots travel by hand so that the two repos can evolve independently. The +loader rejects unknown `schema_version` values. A `miden_vm_version` major/minor +mismatch is intentionally a loud warning, not a hard failure, because protocol +often pins one miden-vm release behind `next`. Read bracket matches across a +version mismatch as useful regression signals, then refresh the snapshots when +the protocol-side pin catches up. + +## Running + +```sh +cargo bench -p miden-synthetic-tx-kernel +``` + +Env vars: + +- `SYNTH_SNAPSHOT=<path>` -- bench only the specified snapshot file + (instead of iterating over every `snapshots/*.json`). +- `SYNTH_MASM_WRITE=1` -- dump each emitted MASM program to + `target/synthetic_kernel_<snapshot-stem>.masm` for inspection. diff --git a/benches/synthetic-tx-kernel/benches/synthetic_kernel.rs b/benches/synthetic-tx-kernel/benches/synthetic_kernel.rs new file mode 100644 index 0000000000..1dab09f53b --- /dev/null +++ b/benches/synthetic-tx-kernel/benches/synthetic_kernel.rs @@ -0,0 +1,346 @@ +//! Synthetic transaction-kernel benchmark. +//! +//! Pipeline each bench run: +//! 1. Calibrate each snippet's per-iteration cost against the current VM (shared across all +//! snapshots in this run). +//! 2. For each snapshot JSON (every `snapshots/*.json`, or the single file in `SYNTH_SNAPSHOT`): +//! solve for per-snippet iteration counts, emit the resulting MASM program, verify it lands in +//! 
the snapshot's padded-trace bracket, and run `execute` + `execute_and_prove` Criterion +//! benches. +//! +//! Env vars: +//! - `SYNTH_SNAPSHOT`: path to a single snapshot JSON; if set, only this snapshot is benched. +//! Otherwise every `snapshots/*.json` in the manifest dir is used. +//! - `SYNTH_MASM_WRITE`: if set, write the emitted MASM to +//! `target/synthetic_kernel_<snapshot_stem>.masm`. + +use std::{hint::black_box, path::PathBuf, time::Duration}; + +use criterion::{BatchSize, Criterion, SamplingMode, criterion_group, criterion_main}; +use miden_processor::{ + DefaultHost, ExecutionOptions, FastProcessor, StackInputs, advice::AdviceInputs, +}; +use miden_synthetic_tx_kernel::{ + calibrator::{Calibration, calibrate, measure_program}, + snapshot::{TraceShape, TraceSnapshot}, + snippets::{SNIPPETS, memory_max_iters, u32arith_max_iters}, + solver::{Plan, emit, solve}, + verifier::VerificationReport, +}; +use miden_vm::{Assembler, ProvingOptions, prove_sync}; + +fn resolve_snapshot_paths() -> Vec<PathBuf> { + if let Ok(explicit) = std::env::var("SYNTH_SNAPSHOT") { + return vec![PathBuf::from(explicit)]; + } + let snapshots_dir = PathBuf::from(env!("CARGO_MANIFEST_DIR")).join("snapshots"); + let mut paths: Vec<PathBuf> = std::fs::read_dir(&snapshots_dir) + .unwrap_or_else(|e| panic!("failed to read {}: {e}", snapshots_dir.display())) + .filter_map(|entry| entry.ok().map(|e| e.path())) + .filter(|p| p.extension().and_then(|e| e.to_str()) == Some("json")) + .collect(); + paths.sort(); + assert!(!paths.is_empty(), "no snapshots found under {}", snapshots_dir.display()); + paths +} + +fn synthetic_transaction_kernel(c: &mut Criterion) { + let calibration = calibrate().expect("failed to calibrate snippets"); + println!("\n=== calibration (rows/iter)"); + for snippet in SNIPPETS { + let cost = calibration[snippet.name]; + println!( + " {:<14} core={:7.3} hasher={:6.3} bitwise={:6.3} memory={:6.3} range={:6.3}", + snippet.name, cost.core, cost.hasher, cost.bitwise, cost.memory, cost.range, + ); + } + 
+ for path in resolve_snapshot_paths() { + bench_one_snapshot(c, &calibration, &path); + } +} + +fn bench_one_snapshot(c: &mut Criterion, calibration: &Calibration, snapshot_path: &PathBuf) { + let snapshot = TraceSnapshot::load(snapshot_path) + .unwrap_or_else(|e| panic!("failed to load snapshot at {}: {e}", snapshot_path.display())); + + println!("\n=== snapshot: {}", snapshot_path.display()); + println!(" source: {}", snapshot.source); + println!(" vm_ver: {}", snapshot.miden_vm_version); + + let local_vm_version = env!("CARGO_PKG_VERSION"); + if !versions_align(&snapshot.miden_vm_version, local_vm_version) { + println!( + " WARNING: snapshot captured against miden-vm {}, consumer is running {}; \ + continuing because version skew is expected when protocol lags miden-vm", + snapshot.miden_vm_version, local_vm_version, + ); + } + println!( + " trace: core={} chiplets={} range={} (padded_total={})", + snapshot.trace.core_rows, + snapshot.trace.chiplets_rows, + snapshot.trace.range_rows, + snapshot.trace.padded_total(), + ); + println!( + " shape: hasher={} bitwise={} memory={} kernel_rom={} ace={}", + snapshot.shape.hasher_rows, + snapshot.shape.bitwise_rows, + snapshot.shape.memory_rows, + snapshot.shape.kernel_rom_rows, + snapshot.shape.ace_rows, + ); + if snapshot.shape.substituted_rows() > 0 { + println!( + " note: {} rows (ace={} + kernel_rom={}) folded into memory target", + snapshot.shape.substituted_rows(), + snapshot.shape.ace_rows, + snapshot.shape.kernel_rom_rows, + ); + } + + let target_shape = snapshot.shape(); + let mut plan = solve(calibration, &target_shape); + range_correction_pass(&mut plan, &target_shape); + assert_counters_fit(&plan); + + println!("\n=== plan"); + for snippet in SNIPPETS { + println!(" {:<14} iters={}", snippet.name, plan.iters(snippet.name),); + } + + let snapshot_stem = snapshot_path.file_stem().and_then(|s| s.to_str()).unwrap_or("unknown"); + + let source = emit(&plan); + if std::env::var("SYNTH_MASM_WRITE").is_ok() { + let 
out = PathBuf::from(env!("CARGO_MANIFEST_DIR")) + .join("target") + .join(format!("synthetic_kernel_{snapshot_stem}.masm")); + std::fs::create_dir_all(out.parent().expect("parent")) + .expect("create target dir for MASM dump"); + std::fs::write(&out, &source).expect("write MASM dump"); + println!("\n=== wrote MASM dump to {}", out.display()); + } + + let actual = measure_program(&source).expect("measure emitted program"); + let report = VerificationReport::new(target_shape, actual); + println!("\n=== verification\n{report}"); + assert!( + report.brackets_match(), + "emitted program lands in a different padded-trace bracket than the snapshot", + ); + + let program = Assembler::default() + .assemble_program(&source) + .expect("assemble emitted program"); + + let mut group = c.benchmark_group(format!("synthetic_transaction_kernel/{snapshot_stem}")); + group + .sampling_mode(SamplingMode::Flat) + .sample_size(10) + .warm_up_time(Duration::from_millis(500)) + .measurement_time(Duration::from_secs(10)); + + group.bench_function("execute", |b| { + b.iter_batched( + || { + let host = DefaultHost::default(); + let processor = FastProcessor::new_with_options( + StackInputs::default(), + AdviceInputs::default(), + ExecutionOptions::default(), + ); + (host, program.clone(), processor) + }, + |(mut host, program, processor)| { + black_box(processor.execute_sync(&program, &mut host).expect("execute")); + }, + BatchSize::SmallInput, + ); + }); + + group.bench_function("execute_and_prove", |b| { + b.iter_batched( + || { + let host = DefaultHost::default(); + let stack = StackInputs::default(); + let advice = AdviceInputs::default(); + (host, program.clone(), stack, advice) + }, + |(mut host, program, stack, advice)| { + black_box( + prove_sync( + &program, + stack, + advice, + &mut host, + ExecutionOptions::default(), + ProvingOptions::default(), + ) + .expect("prove"), + ); + }, + BatchSize::SmallInput, + ); + }); + + group.finish(); +} + +/// Returns true when snapshot and local 
versions agree on `major.minor`. Patch differences are +/// ignored. +fn versions_align(snapshot: &str, local: &str) -> bool { + let mut s = snapshot.split('.'); + let mut l = local.split('.'); + (s.next(), s.next()) == (l.next(), l.next()) +} + +/// Panic if any snippet's plan iteration count would overflow its counter beyond `u32::MAX` (which +/// would trip `u32assert2` or a memory op at runtime). Cheap safeguard in case a snapshot with +/// unusually large range or memory targets gets fed in. +fn assert_counters_fit(plan: &Plan) { + let u32arith = plan.iters("u32arith"); + assert!( + u32arith <= u32arith_max_iters(), + "u32arith iters ({}) would overflow its u32 counter (max {}); \ + revisit the banded-counter start constants", + u32arith, + u32arith_max_iters(), + ); + let memory = plan.iters("memory"); + assert!( + memory <= memory_max_iters(), + "memory iters ({}) would overflow its u32 address counter (max {}); \ + revisit the banded-address start constants", + memory, + memory_max_iters(), + ); +} + +// RANGE CORRECTION PASS +// ------------------------------------------------------------------------ +// +// Range's trace length is not perfectly linear under composition, so the primary solver can leave +// a few-percent residual. This pass closes it by measuring marginal rates and applying a combo of +// `+u32arith -hasher -decoder_pad` that lifts range while keeping core and chiplets approximately +// fixed. Memory is deliberately left alone so the emitted memory-chiplet workload stays +// representative. 
+ +#[derive(Debug, Clone, Copy)] +struct MarginalRates { + core: f64, + chiplets: f64, + range: f64, +} + +const CORRECTION_PROBE_DELTA: u64 = 256; +const CORRECTION_TOLERANCE: f64 = 0.01; +const CORRECTION_MAX_PASSES: usize = 2; + +fn measure_plan(plan: &Plan) -> TraceShape { + let source = emit(plan); + measure_program(&source).expect("measure emitted plan") +} + +fn measure_marginal( + base_plan: &Plan, + base_shape: TraceShape, + snippet: &'static str, + delta: u64, +) -> MarginalRates { + let mut probe = base_plan.clone(); + probe.add(snippet, delta); + let shape = measure_plan(&probe); + let d = delta as f64; + MarginalRates { + core: (shape.totals.core_rows as f64 - base_shape.totals.core_rows as f64) / d, + chiplets: (shape.totals.chiplets_rows as f64 - base_shape.totals.chiplets_rows as f64) / d, + range: (shape.totals.range_rows as f64 - base_shape.totals.range_rows as f64) / d, + } +} + +/// Solve for `(add_u32arith, sub_hasher, sub_pad)` that lifts range by `range_residual` while +/// holding core and chiplets approximately fixed. Returns `None` if the 2x2 system for the two +/// subtractions is degenerate, any coefficient comes out negative, or the net range gain is +/// non-positive. 
+fn solve_range_correction( + range_residual: f64, + u32arith: MarginalRates, + hasher: MarginalRates, + decoder_pad: MarginalRates, +) -> Option<(u64, u64, u64)> { + if range_residual <= 0.0 { + return None; + } + // Solve per unit of u32arith: + // hasher.core * y + decoder_pad.core * z = u32arith.core + // hasher.chiplets * y + decoder_pad.chiplets * z = u32arith.chiplets + let det = hasher.core * decoder_pad.chiplets - decoder_pad.core * hasher.chiplets; + if det.abs() < 1e-9 { + return None; + } + let y_per_x = + (u32arith.core * decoder_pad.chiplets - decoder_pad.core * u32arith.chiplets) / det; + let z_per_x = (hasher.core * u32arith.chiplets - u32arith.core * hasher.chiplets) / det; + if y_per_x < 0.0 || z_per_x < 0.0 { + return None; + } + let net_range_per_x = u32arith.range - hasher.range * y_per_x - decoder_pad.range * z_per_x; + if net_range_per_x <= 0.0 { + return None; + } + let x = (range_residual / net_range_per_x).round(); + if x <= 0.0 { + return None; + } + Some((x as u64, (x * y_per_x).round() as u64, (x * z_per_x).round() as u64)) +} + +fn range_correction_pass(plan: &mut Plan, target: &TraceShape) { + let target_range = target.totals.range_rows; + if target_range == 0 { + return; + } + let tolerance = (target_range as f64 * CORRECTION_TOLERANCE) as u64; + for pass in 0..CORRECTION_MAX_PASSES { + let actual = measure_plan(plan); + let actual_range = actual.totals.range_rows; + let residual = target_range as i64 - actual_range as i64; + println!( + "\n=== range correction pass {}: target={} actual={} residual={}", + pass + 1, + target_range, + actual_range, + residual, + ); + if residual <= tolerance as i64 { + // Already within band, or overshoot (residual <= 0). 
+ return; + } + + let u32arith = measure_marginal(plan, actual, "u32arith", CORRECTION_PROBE_DELTA); + let hasher = measure_marginal(plan, actual, "hasher", CORRECTION_PROBE_DELTA); + let decoder_pad = measure_marginal(plan, actual, "decoder_pad", CORRECTION_PROBE_DELTA); + + match solve_range_correction(residual as f64, u32arith, hasher, decoder_pad) { + Some((add_u32arith, sub_hasher, sub_pad)) => { + let sub_hasher = sub_hasher.min(plan.iters("hasher")); + let sub_pad = sub_pad.min(plan.iters("decoder_pad")); + println!( + " applying: +u32arith {add_u32arith} -hasher {sub_hasher} -decoder_pad {sub_pad}" + ); + plan.add("u32arith", add_u32arith); + plan.sub_saturating("hasher", sub_hasher); + plan.sub_saturating("decoder_pad", sub_pad); + }, + None => { + println!(" no valid local correction found"); + return; + }, + } + } +} + +criterion_group!(benches, synthetic_transaction_kernel); +criterion_main!(benches); diff --git a/benches/synthetic-tx-kernel/snapshots/consume-single-p2id.json b/benches/synthetic-tx-kernel/snapshots/consume-single-p2id.json new file mode 100644 index 0000000000..3b625cf6d1 --- /dev/null +++ b/benches/synthetic-tx-kernel/snapshots/consume-single-p2id.json @@ -0,0 +1,18 @@ +{ + "schema_version": "0", + "source": "protocol/bench-transaction:consume-single-p2id", + "timestamp": "unix-1776673245", + "miden_vm_version": "0.22", + "trace": { + "core_rows": 77699, + "chiplets_rows": 123129, + "range_rows": 20203 + }, + "shape": { + "hasher_rows": 120352, + "bitwise_rows": 416, + "memory_rows": 2297, + "kernel_rom_rows": 63, + "ace_rows": 0 + } +} \ No newline at end of file diff --git a/benches/synthetic-tx-kernel/snapshots/consume-two-p2id.json b/benches/synthetic-tx-kernel/snapshots/consume-two-p2id.json new file mode 100644 index 0000000000..a84c84c0c9 --- /dev/null +++ b/benches/synthetic-tx-kernel/snapshots/consume-two-p2id.json @@ -0,0 +1,18 @@ +{ + "schema_version": "0", + "source": "protocol/bench-transaction:consume-two-p2id", + 
"timestamp": "unix-1776673247", + "miden_vm_version": "0.22", + "trace": { + "core_rows": 80522, + "chiplets_rows": 141260, + "range_rows": 20121 + }, + "shape": { + "hasher_rows": 138208, + "bitwise_rows": 592, + "memory_rows": 2392, + "kernel_rom_rows": 67, + "ace_rows": 0 + } +} \ No newline at end of file diff --git a/benches/synthetic-tx-kernel/src/calibrator.rs b/benches/synthetic-tx-kernel/src/calibrator.rs new file mode 100644 index 0000000000..8e73f56e63 --- /dev/null +++ b/benches/synthetic-tx-kernel/src/calibrator.rs @@ -0,0 +1,255 @@ +//! Calibration and measurement helpers. +//! +//! Each snippet is run as a `repeat.K` loop, measured through the real trace builder, and +//! converted into per-iteration row costs. Calibration happens on every bench run so the synthetic +//! adapts to the current VM's row accounting. + +use std::collections::BTreeMap; + +use miden_processor::{DefaultHost, FastProcessor, StackInputs, trace::build_trace}; +use miden_vm::Assembler; + +use crate::{ + snapshot::{TraceBreakdown, TraceShape, TraceTotals}, + snippets::{self, Component, SNIPPETS}, +}; + +pub const CALIBRATION_ITERS: u64 = 1000; + +// MEASUREMENT +// ------------------------------------------------------------------------ + +/// Assemble and execute `source`, returning the shape of the resulting execution trace. Wraps +/// assembler + fast processor + trace builder. 
+pub fn measure_program(source: &str) -> Result<TraceShape, MeasurementError> { + let program = Assembler::default() + .assemble_program(source) + .map_err(|e| MeasurementError::Assembly(format!("{e}")))?; + + let mut host = DefaultHost::default(); + let processor = FastProcessor::new(StackInputs::default()); + let trace_inputs = processor + .execute_trace_inputs_sync(&program, &mut host) + .map_err(|e| MeasurementError::Execution(format!("{e}")))?; + let trace = + build_trace(trace_inputs).map_err(|e| MeasurementError::TraceBuild(format!("{e}")))?; + let summary = trace.trace_len_summary(); + let chiplets = summary.chiplets_trace_len(); + + let breakdown = TraceBreakdown { + hasher_rows: chiplets.hash_chiplet_len() as u64, + bitwise_rows: chiplets.bitwise_chiplet_len() as u64, + memory_rows: chiplets.memory_chiplet_len() as u64, + kernel_rom_rows: chiplets.kernel_rom_len() as u64, + ace_rows: chiplets.ace_chiplet_len() as u64, + }; + let totals = TraceTotals { + core_rows: summary.main_trace_len() as u64, + chiplets_rows: chiplets.trace_len() as u64, + range_rows: summary.range_trace_len() as u64, + }; + + // Cross-check our derived formulas against the processor's authoritative values; a drift here + // means the AIR-side definitions have moved and the rest of the pipeline will silently + // miscalibrate. 
+ let derived_chiplets = breakdown.chiplets_sum(); + if totals.chiplets_rows != derived_chiplets { + return Err(MeasurementError::InvariantDrift { + quantity: "chiplets_total", + processor: totals.chiplets_rows, + derived: derived_chiplets, + }); + } + let derived_padded = totals.padded_total(); + let processor_padded = summary.padded_trace_len() as u64; + if derived_padded != processor_padded { + return Err(MeasurementError::InvariantDrift { + quantity: "padded_total", + processor: processor_padded, + derived: derived_padded, + }); + } + + Ok(TraceShape::new(totals, breakdown)) +} + +#[derive(Debug, thiserror::Error)] +pub enum MeasurementError { + #[error("failed to assemble program: {0}")] + Assembly(String), + #[error("failed to execute program: {0}")] + Execution(String), + #[error("failed to build trace: {0}")] + TraceBuild(String), + /// One of our derived formulas drifted from the processor's authoritative value; AIR-side + /// definitions have probably changed and the snapshot/verifier formulas need updating. + #[error( + "invariant drift: {quantity} from processor = {processor}, but our derivation = {derived}" + )] + InvariantDrift { + quantity: &'static str, + processor: u64, + derived: u64, + }, +} + +// CALIBRATION +// ------------------------------------------------------------------------ + +/// Per-iteration row rates, kept as `f64` and rounded by the solver. +#[derive(Debug, Clone, Copy, Default)] +pub struct IterCost { + pub core: f64, + pub hasher: f64, + pub bitwise: f64, + pub memory: f64, + pub range: f64, +} + +impl IterCost { + pub fn get(&self, component: Component) -> f64 { + match component { + Component::Core => self.core, + Component::Hasher => self.hasher, + Component::Bitwise => self.bitwise, + Component::Memory => self.memory, + Component::Range => self.range, + } + } +} + +/// Per-snippet rows-per-iter across every tracked component. Cross-terms (e.g. 
a non-zero hasher +/// rate on the `decoder_pad` snippet) are measured so the solver can subtract them. +pub type Calibration = BTreeMap<&'static str, IterCost>; + +/// Run every snippet through a single-point calibration at [`CALIBRATION_ITERS`] and record +/// per-iter cost in each component. +pub fn calibrate() -> Result<Calibration, MeasurementError> { + let mut cal = Calibration::new(); + for snippet in SNIPPETS { + let source = snippets::wrap_program(&snippets::render(snippet, CALIBRATION_ITERS)); + let shape = measure_program(&source)?; + cal.insert(snippet.name, per_iter_cost(shape, CALIBRATION_ITERS)); + } + Ok(cal) +} + +fn per_iter_cost(shape: TraceShape, iters: u64) -> IterCost { + let k = iters as f64; + IterCost { + core: shape.totals.core_rows as f64 / k, + hasher: shape.breakdown.hasher_rows as f64 / k, + bitwise: shape.breakdown.bitwise_rows as f64 / k, + memory: shape.breakdown.memory_rows as f64 / k, + range: shape.totals.range_rows as f64 / k, + } +} + +#[cfg(test)] +mod tests { + use super::*; + + // MEASUREMENT TESTS + // -------------------------------------------------------------------- + + #[test] + fn measures_trivial_program() { + // `measure_program()` already cross-checks our derived totals against the processor's + // authoritative values; this test just smoke-checks basic measurement. 
+ let shape = measure_program("begin push.1 drop end").expect("measure"); + assert!(shape.totals.core_rows > 0, "main trace should include framing rows"); + assert!(shape.totals.padded_total().is_power_of_two()); + } + + #[test] + fn hperm_adds_rows_beyond_baseline() { + let baseline = measure_program("begin push.1 drop end").expect("baseline"); + let with_hperm = + measure_program("begin padw padw padw hperm dropw dropw dropw end").expect("hperm"); + assert!( + with_hperm.breakdown.hasher_rows > baseline.breakdown.hasher_rows, + "hperm should add hasher rows above the baseline ({} vs {})", + with_hperm.breakdown.hasher_rows, + baseline.breakdown.hasher_rows, + ); + } + + // CALIBRATION TESTS + // -------------------------------------------------------------------- + + fn cal() -> Calibration { + calibrate().expect("calibration should succeed") + } + + #[test] + fn every_snippet_has_an_entry() { + let c = cal(); + for snippet in SNIPPETS { + assert!(c.contains_key(snippet.name), "missing calibration for {}", snippet.name); + } + } + + #[test] + fn hasher_snippet_is_hasher_dominant() { + let c = cal(); + let hasher = c["hasher"]; + let pad = c["decoder_pad"]; + assert!( + hasher.hasher > pad.hasher * 10.0, + "hasher/iter ({}) not dominant over decoder_pad leak ({})", + hasher.hasher, + pad.hasher, + ); + } + + #[test] + fn bitwise_snippet_rows_match_op_cycle_len() { + let c = cal(); + let bitwise = c["bitwise"]; + // OP_CYCLE_LEN = 8 per u32 bitwise op. 
+ assert!( + bitwise.bitwise >= 7.5, + "bitwise per-iter ({}) below OP_CYCLE_LEN", + bitwise.bitwise + ); + assert!( + bitwise.bitwise <= 9.0, + "bitwise per-iter ({}) above OP_CYCLE_LEN", + bitwise.bitwise + ); + } + + #[test] + fn memory_snippet_two_rows_per_iter() { + let c = cal(); + let memory = c["memory"]; + assert!(memory.memory >= 1.5, "memory per-iter ({}) too low", memory.memory); + assert!(memory.memory <= 2.5, "memory per-iter ({}) too high", memory.memory); + } + + #[test] + fn u32arith_snippet_drives_range() { + let c = cal(); + let arith = c["u32arith"]; + let pad = c["decoder_pad"]; + assert!( + arith.range > pad.range * 5.0, + "u32arith range/iter ({}) should dominate the baseline decoder_pad range/iter ({})", + arith.range, + pad.range, + ); + } + + #[test] + fn decoder_pad_is_core_dominant() { + let c = cal(); + let pad = c["decoder_pad"]; + assert!(pad.core > 1.0, "decoder_pad core/iter should be > 1.0"); + assert!( + pad.core > pad.hasher * 3.0, + "decoder_pad core ({}) should dominate hasher ({})", + pad.core, + pad.hasher, + ); + } +} diff --git a/benches/synthetic-tx-kernel/src/lib.rs b/benches/synthetic-tx-kernel/src/lib.rs new file mode 100644 index 0000000000..827a2357a0 --- /dev/null +++ b/benches/synthetic-tx-kernel/src/lib.rs @@ -0,0 +1,18 @@ +//! Synthetic benchmark generator for VM-level transaction-load regression tests. +//! +//! Given a snapshot captured from a representative transaction in `protocol`, this crate calibrates +//! a small catalog of MASM snippets against the current VM, solves for iteration counts, emits a +//! synthetic program, and verifies that the program lands in the target core/chiplets padded +//! brackets. +//! +//! The snapshot schema has two tiers: +//! - `trace`: hard totals (`core_rows`, `chiplets_rows`, `range_rows`) +//! - `shape`: advisory per-chiplet breakdown used by the solver +//! +//! See `README.md` for design rationale. 
+ +pub mod calibrator; +pub mod snapshot; +pub mod snippets; +pub mod solver; +pub mod verifier; diff --git a/benches/synthetic-tx-kernel/src/snapshot.rs b/benches/synthetic-tx-kernel/src/snapshot.rs new file mode 100644 index 0000000000..d9a13e6862 --- /dev/null +++ b/benches/synthetic-tx-kernel/src/snapshot.rs @@ -0,0 +1,373 @@ +//! Snapshot schema shared by the protocol-side producer and the VM-side synthetic benchmark. +//! +//! `trace` contains the hard-target totals used by the verifier: +//! - `core_rows` +//! - `chiplets_rows` +//! - `range_rows` +//! +//! `shape` contains an advisory per-chiplet breakdown used by the solver to keep the synthetic +//! workload representative. The loader validates `trace.chiplets_rows == shape.chiplets_sum()`. + +use std::path::Path; + +use serde::{Deserialize, Serialize}; + +/// Schema version this crate understands. +pub const CURRENT_SCHEMA_VERSION: &str = "0"; + +/// Mirrors `miden_air::trace::MIN_TRACE_LEN`. Keep in sync when the processor's minimum padded +/// length changes. +const MIN_TRACE_LEN: u64 = 64; + +/// A snapshot captured from a representative transaction execution. +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct TraceSnapshot { + pub schema_version: String, + pub source: String, + pub timestamp: String, + pub miden_vm_version: String, + /// Hard-target totals. The verifier's bracket check operates on these. + pub trace: TraceTotals, + /// Advisory per-chiplet breakdown used by the solver for shaping. + pub shape: TraceBreakdown, +} + +/// Hard-target aggregates -- the verifier's primary contract. +#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, Eq)] +pub struct TraceTotals { + /// System + decoder + stack trace length. + pub core_rows: u64, + /// Total chiplets trace length, matching `ChipletsLengths::trace_len` in the processor (sum of + /// per-chiplet lengths + 1 mandatory padding row). + pub chiplets_rows: u64, + /// Range-checker trace length. 
Derived from memory + bitwise activity; not independently + /// targeted but tracked so the verifier can warn if it ever dominates. + pub range_rows: u64, +} + +/// Per-chiplet row counts. Advisory only -- the solver uses these to size individual snippets so +/// the synthetic program stays representative (hasher work looks like hasher work, not a pile of +/// decoder-pad), but the verifier does not treat individual values as hard targets. +#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, Eq)] +pub struct TraceBreakdown { + pub hasher_rows: u64, + pub bitwise_rows: u64, + pub memory_rows: u64, + /// Kernel ROM rows. Not drivable from plain MASM; folded into memory. + #[serde(default)] + pub kernel_rom_rows: u64, + /// ACE chiplet rows. Not drivable from plain MASM; folded into memory. Some producer versions + /// may report this as zero until their processor dependency exposes the ACE trace accessor. + #[serde(default)] + pub ace_rows: u64, +} + +/// In-memory bundle used by the solver and verifier; not serialized. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub struct TraceShape { + pub totals: TraceTotals, + pub breakdown: TraceBreakdown, +} + +impl TraceTotals { + /// Padded power-of-two bracket for the non-chiplet side of the trace: + /// `next_pow2(max(core_rows, range_rows))`. Under the current AIR this covers core + /// (system/decoder/stack) and range together; if a future AIR separates them, this accessor + /// can be revisited. + pub fn padded_core_side(&self) -> u64 { + self.core_rows.max(self.range_rows).next_power_of_two().max(MIN_TRACE_LEN) + } + + /// Padded power-of-two bracket for the chiplets side of the trace. + pub fn padded_chiplets(&self) -> u64 { + self.chiplets_rows.next_power_of_two().max(MIN_TRACE_LEN) + } + + /// Single global padded length as reported by the processor's + /// `TraceLenSummary::padded_trace_len`. Used by the calibrator to cross-check our derived + /// formulas against the prover. 
+ pub fn padded_total(&self) -> u64 { + self.core_rows + .max(self.range_rows) + .max(self.chiplets_rows) + .next_power_of_two() + .max(MIN_TRACE_LEN) + } + + /// True iff `range_rows` is the largest unpadded component. + pub fn range_dominates(&self) -> bool { + self.range_rows > self.core_rows && self.range_rows > self.chiplets_rows + } +} + +impl TraceBreakdown { + /// Sum of all chiplet sub-traces plus the mandatory +1 padding row, matching + /// `ChipletsLengths::trace_len` in the processor. Used as the loader's consistency check + /// against `TraceTotals::chiplets_rows`. + pub fn chiplets_sum(&self) -> u64 { + self.hasher_rows + + self.bitwise_rows + + self.memory_rows + + self.kernel_rom_rows + + self.ace_rows + + 1 + } + + /// Memory-row target the solver aims for: snapshot memory plus ACE and kernel_rom (both + /// unreachable from plain MASM) folded in. + pub fn memory_target(&self) -> u64 { + self.memory_rows + self.kernel_rom_rows + self.ace_rows + } + + /// Rows folded into the memory target from unreachable chiplets. + pub fn substituted_rows(&self) -> u64 { + self.kernel_rom_rows + self.ace_rows + } +} + +impl TraceShape { + pub fn new(totals: TraceTotals, breakdown: TraceBreakdown) -> Self { + Self { totals, breakdown } + } +} + +impl TraceSnapshot { + /// Load a snapshot from disk, validate its schema version, and cross-check the internal + /// consistency of the two tiers. 
+ pub fn load(path: impl AsRef) -> Result { + let bytes = std::fs::read(path.as_ref()).map_err(|source| SnapshotError::Io { + path: path.as_ref().display().to_string(), + source, + })?; + let snap: Self = serde_json::from_slice(&bytes).map_err(SnapshotError::Parse)?; + + if snap.schema_version != CURRENT_SCHEMA_VERSION { + return Err(SnapshotError::SchemaVersion { + file: snap.schema_version, + supported: CURRENT_SCHEMA_VERSION, + }); + } + + let expected = snap.shape.chiplets_sum(); + if snap.trace.chiplets_rows != expected { + return Err(SnapshotError::InconsistentChipletsTotal { + from_trace: snap.trace.chiplets_rows, + from_shape: expected, + }); + } + + Ok(snap) + } + + /// Combined target shape that the solver and verifier consume. + pub fn shape(&self) -> TraceShape { + TraceShape::new(self.trace, self.shape) + } +} + +#[derive(Debug, thiserror::Error)] +pub enum SnapshotError { + #[error("failed to read snapshot at {path}: {source}")] + Io { + path: String, + #[source] + source: std::io::Error, + }, + #[error("failed to parse snapshot JSON: {0}")] + Parse(#[source] serde_json::Error), + #[error( + "unsupported snapshot schema version {file:?}; this crate understands version {supported:?}" + )] + SchemaVersion { file: String, supported: &'static str }, + #[error( + "snapshot inconsistency: trace.chiplets_rows = {from_trace} but shape sums to {from_shape}" + )] + InconsistentChipletsTotal { from_trace: u64, from_shape: u64 }, +} + +#[cfg(test)] +mod tests { + use super::*; + + struct CommittedSnapshotExpectation { + file_stem: &'static str, + source: &'static str, + padded_core_side: u64, + padded_chiplets: u64, + } + + const COMMITTED_SNAPSHOT_EXPECTATIONS: &[CommittedSnapshotExpectation] = &[ + CommittedSnapshotExpectation { + file_stem: "consume-single-p2id", + source: "protocol/bench-transaction:consume-single-p2id", + padded_core_side: 131_072, + padded_chiplets: 131_072, + }, + CommittedSnapshotExpectation { + file_stem: "consume-two-p2id", + source: 
"protocol/bench-transaction:consume-two-p2id", + padded_core_side: 131_072, + padded_chiplets: 262_144, + }, + ]; + + fn expectation_for(file_stem: &str) -> Option<&'static CommittedSnapshotExpectation> { + COMMITTED_SNAPSHOT_EXPECTATIONS + .iter() + .find(|expected| expected.file_stem == file_stem) + } + + fn sample_shape() -> (TraceTotals, TraceBreakdown) { + let breakdown = TraceBreakdown { + hasher_rows: 200, + bitwise_rows: 50, + memory_rows: 300, + kernel_rom_rows: 40, + ace_rows: 60, + }; + let totals = TraceTotals { + core_rows: 1000, + chiplets_rows: breakdown.chiplets_sum(), + range_rows: 100, + }; + (totals, breakdown) + } + + #[test] + fn memory_target_folds_ace_and_kernel_rom() { + let (_, b) = sample_shape(); + assert_eq!(b.memory_target(), 400); + assert_eq!(b.substituted_rows(), 100); + // 200 + 50 + 300 + 40 + 60 + 1 padding row = 651 + assert_eq!(b.chiplets_sum(), 651); + } + + #[test] + fn padded_totals_match_processor_formula() { + let (t, _) = sample_shape(); + // max(1000, 100, 651) = 1000 → next pow2 = 1024 + assert_eq!(t.padded_total(), 1024); + // core + range: max(1000, 100) = 1000 → 1024 + assert_eq!(t.padded_core_side(), 1024); + // chiplets alone: 651 → 1024 + assert_eq!(t.padded_chiplets(), 1024); + } + + #[test] + fn padded_total_clamps_to_min_trace_len() { + let totals = TraceTotals { + core_rows: 1, + chiplets_rows: 1, + range_rows: 0, + }; + assert_eq!(totals.padded_total(), MIN_TRACE_LEN); + assert_eq!(totals.padded_core_side(), MIN_TRACE_LEN); + assert_eq!(totals.padded_chiplets(), MIN_TRACE_LEN); + } + + #[test] + fn range_dominates_is_detected() { + let totals = TraceTotals { + core_rows: 100, + chiplets_rows: 200, + range_rows: 500, + }; + assert!(totals.range_dominates()); + let totals = TraceTotals { + core_rows: 500, + chiplets_rows: 200, + range_rows: 100, + }; + assert!(!totals.range_dominates()); + } + + #[test] + fn committed_snapshots_roundtrip() { + let snapshots_dir = 
Path::new(env!("CARGO_MANIFEST_DIR")).join("snapshots"); + let entries = std::fs::read_dir(&snapshots_dir) + .unwrap_or_else(|e| panic!("read {}: {e}", snapshots_dir.display())); + let mut discovered = Vec::new(); + for entry in entries { + let path = entry.expect("dir entry").path(); + if path.extension().and_then(|e| e.to_str()) != Some("json") { + continue; + } + let file_stem = path.file_stem().and_then(|s| s.to_str()).expect("snapshot stem"); + let expected = expectation_for(file_stem) + .unwrap_or_else(|| panic!("unexpected committed snapshot: {}", path.display())); + let snap = TraceSnapshot::load(&path) + .unwrap_or_else(|e| panic!("load {}: {e}", path.display())); + assert_eq!(snap.schema_version, CURRENT_SCHEMA_VERSION); + assert_eq!(snap.source, expected.source); + assert!(snap.trace.core_rows > 0); + assert!(snap.trace.chiplets_rows > 0); + assert_eq!(snap.trace.chiplets_rows, snap.shape.chiplets_sum()); + assert_eq!(snap.trace.padded_core_side(), expected.padded_core_side); + assert_eq!(snap.trace.padded_chiplets(), expected.padded_chiplets); + + let reserialized = serde_json::to_string(&snap).expect("reserialize"); + let roundtripped: TraceSnapshot = + serde_json::from_str(&reserialized).expect("deserialize reserialized"); + assert_eq!(snap.trace, roundtripped.trace); + assert_eq!(snap.shape, roundtripped.shape); + discovered.push(file_stem.to_string()); + } + discovered.sort(); + let mut expected: Vec<_> = COMMITTED_SNAPSHOT_EXPECTATIONS + .iter() + .map(|expected| expected.file_stem) + .collect(); + expected.sort(); + assert_eq!(discovered, expected); + } + + #[test] + fn missing_optional_fields_default_to_zero() { + let minimal = r#"{ + "schema_version": "0", + "source": "test", + "timestamp": "t", + "miden_vm_version": "x", + "trace": { "core_rows": 100, "chiplets_rows": 11, "range_rows": 50 }, + "shape": { "hasher_rows": 10, "bitwise_rows": 0, "memory_rows": 0 } + }"#; + let snap: TraceSnapshot = serde_json::from_str(minimal).expect("parse 
minimal"); + assert_eq!(snap.shape.kernel_rom_rows, 0); + assert_eq!(snap.shape.ace_rows, 0); + } + + #[test] + fn rejects_unsupported_schema_version() { + let wrong = r#"{ + "schema_version": "9999", + "source": "test", + "timestamp": "t", + "miden_vm_version": "x", + "trace": { "core_rows": 100, "chiplets_rows": 11, "range_rows": 0 }, + "shape": { "hasher_rows": 10, "bitwise_rows": 0, "memory_rows": 0 } + }"#; + let tmp = std::env::temp_dir().join("synthetic-tx-kernel-schema-wrong-version.json"); + std::fs::write(&tmp, wrong).unwrap(); + let err = TraceSnapshot::load(&tmp).expect_err("expected schema version rejection"); + assert!(matches!(err, SnapshotError::SchemaVersion { .. })); + let _ = std::fs::remove_file(&tmp); + } + + #[test] + fn rejects_inconsistent_chiplets_total() { + // chiplets_rows says 500 but the breakdown sums to 11 (10 + 0 + 0 + 1). + let mismatched = r#"{ + "schema_version": "0", + "source": "test", + "timestamp": "t", + "miden_vm_version": "x", + "trace": { "core_rows": 100, "chiplets_rows": 500, "range_rows": 0 }, + "shape": { "hasher_rows": 10, "bitwise_rows": 0, "memory_rows": 0 } + }"#; + let tmp = std::env::temp_dir().join("synthetic-tx-kernel-chiplets-mismatch.json"); + std::fs::write(&tmp, mismatched).unwrap(); + let err = TraceSnapshot::load(&tmp).expect_err("expected inconsistency rejection"); + assert!(matches!(err, SnapshotError::InconsistentChipletsTotal { .. })); + let _ = std::fs::remove_file(&tmp); + } +} diff --git a/benches/synthetic-tx-kernel/src/snippets.rs b/benches/synthetic-tx-kernel/src/snippets.rs new file mode 100644 index 0000000000..ecd7ebb18b --- /dev/null +++ b/benches/synthetic-tx-kernel/src/snippets.rs @@ -0,0 +1,228 @@ +//! Static catalog of MASM snippets that drive individual VM components. +//! +//! The patterns are deliberately few and natural -- they mirror work a real transaction does, +//! rather than one synthetic op per chiplet: +//! +//! - **hasher** drives the Poseidon2 chiplet via repeated `hperm`. 
The state evolves between +//! iterations (one `padw padw padw` as setup, no reset), so each permutation has a distinct input +//! and the hasher AIR's multiplicity column does not collapse them. +//! - **bitwise** drives the bitwise chiplet via `u32split + u32xor`. +//! - **u32arith** drives the range checker via two banded u32 counters, advancing each by 65537 per +//! iter so their 16-bit halves evolve as disjoint contiguous bands. `u32assert2` issues fresh +//! range-check values each iter without cross-dedup. +//! - **memory** drives the memory chiplet. The address advances by `4 * 65537 = 262148` per iter so +//! the two 16-bit halves of the word index form disjoint contiguous bands, which also feeds the +//! range chiplet via address-limb decomposition. +//! - **decoder_pad** drives only the core (system/decoder/stack) trace so the solver can top up +//! core-trace budget without adding chiplet rows. +//! +//! Each snippet is partitioned into `setup` / `body` / `cleanup` so that the body alone can be +//! wrapped in a `repeat.N ... end` block. The body must leave stack depth unchanged -- the repeat +//! block would otherwise drift the stack each iteration. +//! +//! Note: on current `next`, decoder-only programs still incur hasher rows due to MAST hashing, so +//! low hasher targets may be unreachable; the solver clamps `hasher` iterations to zero in that +//! case. + +/// A VM component the solver targets. +#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Ord, PartialOrd)] +pub enum Component { + /// System + decoder + stack. + Core, + Hasher, + Bitwise, + Memory, + /// Range checker. Not currently a hard bracket target but the solver still sizes the + /// `u32arith` snippet against it so the synthetic's range workload is representative. + Range, +} + +/// A MASM fragment that dominantly drives one component. +#[derive(Debug, Clone, Copy)] +pub struct Snippet { + pub name: &'static str, + /// Runs once, outside the `repeat.N ... end` block. 
+ pub setup: &'static str, + /// Runs N times inside the repeat block. Must be stack-balanced. + pub body: &'static str, + /// Runs once, outside the repeat block. + pub cleanup: &'static str, + /// The primary component this snippet drives. + pub dominant: Component, +} + +/// The full snippet catalog, in solver order: chiplet drivers first so they saturate their +/// targets, `decoder_pad` last so it absorbs leftover main-trace budget. +pub const SNIPPETS: &[Snippet] = &[ + Snippet { + name: "hasher", + setup: "padw padw padw", + body: "hperm", + cleanup: "dropw dropw dropw", + dominant: Component::Hasher, + }, + Snippet { + name: "bitwise", + setup: "push.1 neg", + body: "u32split u32xor", + cleanup: "drop", + dominant: Component::Bitwise, + }, + Snippet { + name: "u32arith", + // Two u32 counters, each advanced by `65537 = 0x0001_0001` per iter, so their four 16-bit + // half-limbs form disjoint contiguous bands at 45500+i, 50500+i, 55500+i, 60500+i (clear + // of the memory snippet's 4*w1 band, which tops out near 44800). Counters update with + // plain field `add` so only the `u32assert2` emits range-check values -- carry limbs don't + // pollute the bands. See `u32arith_max_iters` for the counter bound. + setup: "push.3637308500 push.2981938500", + body: "u32assert2 push.65537 add swap push.65537 add swap", + cleanup: "drop drop", + dominant: Component::Range, + }, + Snippet { + name: "memory", + // Advance the word-aligned address by `4 * 65537 = 262148` each iter so that both 16-bit + // halves of the word index evolve as disjoint contiguous bands. This makes the memory + // chiplet feed fresh range-check values rather than deduplicating, so the snippet also + // contributes meaningfully to the range chiplet. + // + // The start address `4 * ((10000 << 16) | 20000)` puts the address-limb bands in a high + // region that doesn't overlap u32arith's bands. 
The counter advances with plain field + // `add` so only the memory chiplet emits range-check values -- the update doesn't pollute + // the bands with carry-limb checks. Safe as long as the address stays below `u32::MAX`; + // see `assert_counters_fit`. + setup: "padw push.2621520000", + body: "dup.4 mem_storew_le dup.4 mem_loadw_le movup.4 push.262148 add movdn.4", + cleanup: "drop dropw", + dominant: Component::Memory, + }, + Snippet { + name: "decoder_pad", + setup: "", + body: "swap dup.1 add", + cleanup: "", + dominant: Component::Core, + }, +]; + +/// Look up a snippet by name. Only used by tests. +#[cfg(test)] +pub(crate) fn find(name: &str) -> Option<&'static Snippet> { + SNIPPETS.iter().find(|s| s.name == name) +} + +/// Assemble a single snippet into a complete program body: the setup, followed by +/// `repeat.iters body end`, followed by the cleanup. Returned text has no `begin`/`end` wrapping -- +/// caller composes multiple snippets. +pub fn render(snippet: &Snippet, iters: u64) -> String { + use std::fmt::Write; + let mut out = String::new(); + if !snippet.setup.is_empty() { + writeln!(out, " {}", snippet.setup).unwrap(); + } + if iters > 0 { + writeln!(out, " repeat.{iters}").unwrap(); + writeln!(out, " {}", snippet.body).unwrap(); + writeln!(out, " end").unwrap(); + } + if !snippet.cleanup.is_empty() { + writeln!(out, " {}", snippet.cleanup).unwrap(); + } + out +} + +/// Wrap a snippet fragment into a complete `begin ... end` program. +pub fn wrap_program(body: &str) -> String { + format!("begin\n{body}end\n") +} + +// COUNTER SAFETY +// ------------------------------------------------------------------------ +// +// Both `u32arith` and `memory` advance a counter with plain field `add`, so `u32assert2` / the +// memory chiplet would fail at runtime if the counter crossed `u32::MAX`. These helpers expose the +// per-snippet limits so callers can validate a plan before emitting. 
+ +/// Starting value of `u32arith`'s higher counter -- the one closer to `u32::MAX` and therefore the +/// binding constraint. +const U32ARITH_COUNTER_START: u64 = 3_637_308_500; +const U32ARITH_COUNTER_STRIDE: u64 = 65_537; + +/// Starting value of `memory`'s address counter. +const MEMORY_COUNTER_START: u64 = 2_621_520_000; +const MEMORY_COUNTER_STRIDE: u64 = 262_148; + +const U32_MAX: u64 = u32::MAX as u64; + +/// Maximum iterations of `u32arith` before the `y` counter would exceed `u32::MAX` and trip the +/// next iteration's `u32assert2`. +pub fn u32arith_max_iters() -> u64 { + (U32_MAX - U32ARITH_COUNTER_START) / U32ARITH_COUNTER_STRIDE +} + +/// Maximum iterations of `memory` before the address counter would exceed `u32::MAX`. +pub fn memory_max_iters() -> u64 { + (U32_MAX - MEMORY_COUNTER_START) / MEMORY_COUNTER_STRIDE +} + +#[cfg(test)] +mod tests { + use miden_vm::Assembler; + + use super::*; + + #[test] + fn catalog_has_one_snippet_per_solver_component() { + let targets = [ + Component::Core, + Component::Hasher, + Component::Bitwise, + Component::Memory, + Component::Range, + ]; + for target in targets { + let count = SNIPPETS.iter().filter(|s| s.dominant == target).count(); + assert_eq!(count, 1, "expected exactly one snippet for {target:?}"); + } + } + + #[test] + fn each_snippet_assembles_as_a_standalone_program() { + // Fail fast: if a snippet has malformed MASM, the calibrator will blow up at bench time. + // Catch it in unit tests instead. + for snippet in SNIPPETS { + let source = wrap_program(&render(snippet, 4)); + Assembler::default() + .assemble_program(&source) + .unwrap_or_else(|e| panic!("snippet {:?} failed to assemble: {e}", snippet.name)); + } + } + + #[test] + fn counter_limits_cover_realistic_plans() { + // Realistic plans for consume/create P2ID transactions produce ~5k u32arith iters and + // ~1.2k memory iters. These guards have plenty of headroom for that regime. 
+ assert!(u32arith_max_iters() >= 10_000); + assert!(memory_max_iters() >= 6_000); + } + + #[test] + fn render_emits_repeat_with_body() { + let snippet = find("bitwise").expect("bitwise snippet"); + let out = render(snippet, 42); + assert!(out.contains("repeat.42")); + assert!(out.contains("u32split u32xor")); + assert!(out.contains("push.1 neg")); + assert!(out.contains("drop")); + } + + #[test] + fn render_with_zero_iters_still_emits_setup_and_cleanup() { + let snippet = find("hasher").expect("hasher snippet"); + let out = render(snippet, 0); + assert!(out.contains("padw padw padw")); + assert!(out.contains("dropw dropw dropw")); + assert!(!out.contains("repeat.")); + } +} diff --git a/benches/synthetic-tx-kernel/src/solver.rs b/benches/synthetic-tx-kernel/src/solver.rs new file mode 100644 index 0000000000..91131d11ff --- /dev/null +++ b/benches/synthetic-tx-kernel/src/solver.rs @@ -0,0 +1,217 @@ +//! Solve for per-snippet iteration counts and emit the MASM program. +//! +//! The calibration matrix is close to diagonally dominant: each snippet primarily drives one +//! component and leaks small cross-terms into the others. A short Jacobi refinement with a +//! non-negativity clamp is enough for this problem size and handles infeasible targets gracefully. + +use std::collections::BTreeMap; + +use crate::{ + calibrator::Calibration, + snapshot::TraceShape, + snippets::{self, Component, SNIPPETS}, +}; + +/// Small fixed-point refinement count. The calibrated systems in this crate converge in a few +/// passes, and the clamp keeps iteration counts non-negative. +const REFINEMENT_PASSES: usize = 8; + +/// Iteration counts per snippet, ready to hand to the emitter. +/// +/// Implemented as a sparse map where absence means "zero iterations"; the newtype hides the +/// `unwrap_or(0)` convention behind [`Plan::iters`] so call sites can't forget it. 
+#[derive(Debug, Default, Clone)] +pub struct Plan { + entries: BTreeMap<&'static str, u64>, +} + +impl Plan { + pub fn new() -> Self { + Self::default() + } + + /// Iteration count for `name`, or 0 if the snippet has no entry. + pub fn iters(&self, name: &str) -> u64 { + self.entries.get(name).copied().unwrap_or(0) + } + + /// Set the iteration count for `name`, removing the entry entirely when `n == 0` so that + /// `iters() == 0` is equivalent to the entry being absent. + pub fn set(&mut self, name: &'static str, n: u64) { + if n == 0 { + self.entries.remove(name); + } else { + self.entries.insert(name, n); + } + } + + /// Increment the iteration count for `name` by `delta`. + pub fn add(&mut self, name: &'static str, delta: u64) { + if delta == 0 { + return; + } + self.set(name, self.iters(name) + delta); + } + + /// Decrement the iteration count for `name` by `delta`, saturating at zero. + pub fn sub_saturating(&mut self, name: &'static str, delta: u64) { + self.set(name, self.iters(name).saturating_sub(delta)); + } +} + +/// Solve for the iteration counts that reproduce `target`'s per-component row counts. Per-chiplet +/// targets come from the snapshot's advisory `shape` breakdown -- the solver uses them to keep the +/// synthetic program representative, but the verifier only hard-asserts totals/brackets. 
+pub fn solve(calibration: &Calibration, target: &TraceShape) -> Plan { + let mut iters: BTreeMap<&'static str, f64> = + SNIPPETS.iter().map(|s| (s.name, 0.0_f64)).collect(); + + let component_target = |c: Component| -> f64 { + match c { + Component::Core => target.totals.core_rows as f64, + Component::Hasher => target.breakdown.hasher_rows as f64, + Component::Bitwise => target.breakdown.bitwise_rows as f64, + Component::Memory => target.breakdown.memory_target() as f64, + Component::Range => target.totals.range_rows as f64, + } + }; + + for _ in 0..REFINEMENT_PASSES { + let snapshot = iters.clone(); + for snippet in SNIPPETS { + let cost = match calibration.get(snippet.name) { + Some(c) => *c, + None => continue, + }; + let rate = cost.get(snippet.dominant); + if rate <= 0.0 { + continue; + } + let target_rows = component_target(snippet.dominant); + let cross_rows: f64 = SNIPPETS + .iter() + .filter(|s| s.name != snippet.name) + .map(|s| { + let other = + calibration.get(s.name).map(|c| c.get(snippet.dominant)).unwrap_or(0.0); + other * snapshot[s.name] + }) + .sum(); + let needed = (target_rows - cross_rows).max(0.0); + iters.insert(snippet.name, needed / rate); + } + } + + let mut plan = Plan::new(); + for (name, v) in iters { + plan.set(name, v.round().max(0.0) as u64); + } + plan +} + +/// Render the plan as a single `begin ... end` program. 
+pub fn emit(plan: &Plan) -> String { + use std::fmt::Write; + let mut body = String::new(); + for snippet in SNIPPETS { + let n = plan.iters(snippet.name); + if n == 0 { + continue; + } + write!(body, "{}", snippets::render(snippet, n)).unwrap(); + } + snippets::wrap_program(&body) +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::{ + calibrator::{calibrate, measure_program}, + snapshot::{TraceBreakdown, TraceTotals}, + }; + + fn shape_of( + core_rows: u64, + range_rows: u64, + hasher: u64, + bitwise: u64, + memory: u64, + ) -> TraceShape { + let breakdown = TraceBreakdown { + hasher_rows: hasher, + bitwise_rows: bitwise, + memory_rows: memory, + kernel_rom_rows: 0, + ace_rows: 0, + }; + let totals = TraceTotals { + core_rows, + chiplets_rows: breakdown.chiplets_sum(), + range_rows, + }; + TraceShape::new(totals, breakdown) + } + + fn low_hasher_target() -> TraceShape { + // core/hasher ratio of ~8, well below the intrinsic core/4 floor. Memory kept modest + // (ratio core/memory ~30) so the test exercises the hasher-feasibility path without making + // it infeasible via memory overshoot into core. + shape_of(68900, 40000, 8200, 0, 2300) + } + + fn high_hasher_target() -> TraceShape { + // main/hasher ratio of ~2, above the intrinsic main/4 floor. 
+ shape_of(16000, 0, 8000, 0, 0) + } + + #[test] + fn low_hasher_target_does_not_add_hperm() { + let cal = calibrate().expect("calibrate"); + let plan = solve(&cal, &low_hasher_target()); + assert_eq!( + plan.iters("hasher"), + 0, + "when the decoder (via memory + pad) already overshoots the hasher target, no hperm iterations should be added", + ); + assert!(plan.iters("memory") > 0); + } + + #[test] + fn high_hasher_target_requires_hperm() { + let cal = calibrate().expect("calibrate"); + let plan = solve(&cal, &high_hasher_target()); + assert!( + plan.iters("hasher") > 0, + "a hasher target above the main/4 floor should require hperm iterations", + ); + } + + #[test] + fn emitted_program_matches_padded_bracket() { + let cal = calibrate().expect("calibrate"); + let target = low_hasher_target(); + let plan = solve(&cal, &target); + let source = emit(&plan); + let actual = measure_program(&source).expect("measure emitted program"); + assert_eq!( + actual.totals.padded_total(), + target.totals.padded_total(), + "padded trace length must match target bracket (got {} vs {})", + actual.totals.padded_total(), + target.totals.padded_total(), + ); + } + + #[test] + fn zero_target_yields_empty_program() { + let cal = calibrate().expect("calibrate"); + let target = shape_of(0, 0, 0, 0, 0); + let plan = solve(&cal, &target); + for snippet in SNIPPETS { + assert_eq!(plan.iters(snippet.name), 0, "{}", snippet.name); + } + let source = emit(&plan); + assert_eq!(source.trim(), "begin\nend"); + } +} diff --git a/benches/synthetic-tx-kernel/src/verifier.rs b/benches/synthetic-tx-kernel/src/verifier.rs new file mode 100644 index 0000000000..ef4e78e657 --- /dev/null +++ b/benches/synthetic-tx-kernel/src/verifier.rs @@ -0,0 +1,300 @@ +//! Verification helpers for synthetic-trace matching. +//! +//! Hard checks: +//! - `padded_core_side(actual) == padded_core_side(target)` -- the current AIR's non-chiplets-side +//! bracket, `next_pow2(max(core_rows, range_rows))`. 
//! If a future AIR split gives range its own
//! segment, this check can be revised to assert separate brackets.
//! - `padded_chiplets(actual) == padded_chiplets(target)`
//!
//! Soft reporting:
//! - unpadded totals (`core_rows`, `chiplets_rows`) within [`PER_COMPONENT_TOLERANCE`]
//! - advisory breakdown deltas (info only)
//! - warning if `range_rows` dominates

use std::fmt::{self, Display};

use crate::snapshot::TraceShape;

/// Reporting tolerance for unpadded totals; never used for pass/fail.
pub const PER_COMPONENT_TOLERANCE: f64 = 0.02;

/// Result of comparing an emitted program's measured shape against the snapshot target.
///
/// Pass/fail is decided solely by [`Self::brackets_match`]; the per-component delta
/// vectors only feed the human-readable [`Display`] report.
#[derive(Debug, Clone)]
pub struct VerificationReport {
    pub target: TraceShape,
    pub actual: TraceShape,
    /// Deltas for the top-level totals (core / chiplets / range rows).
    pub total_deltas: Vec<ComponentDelta>,
    /// Deltas for the per-chiplet breakdown (hasher / bitwise / memory).
    pub breakdown_deltas: Vec<ComponentDelta>,
}

/// How a row-count entry participates in the verifier's reporting.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum DeltaStatus {
    /// Prints `ok` if within tolerance, `out` otherwise.
    Enforced,
    /// Always prints `info`; used for rows the solver does not target.
    Informational,
}

/// Per-row-count comparison.
#[derive(Debug, Clone, Copy)]
pub struct ComponentDelta {
    pub name: &'static str,
    pub target: u64,
    pub actual: u64,
    /// Signed relative delta `(actual - target) / target`; `f64::INFINITY` when the
    /// target is 0 but the actual is not (rendered as "+∞" by the report).
    pub delta_pct: f64,
    pub within_tolerance: bool,
    pub status: DeltaStatus,
}

impl VerificationReport {
    /// Builds a report by comparing every tracked row count of `actual` against `target`.
    pub fn new(target: TraceShape, actual: TraceShape) -> Self {
        // (name, target rows, actual rows, reporting status) tuples, in display order.
        let total_rows: &[(&'static str, u64, u64, DeltaStatus)] = &[
            (
                "core_rows",
                target.totals.core_rows,
                actual.totals.core_rows,
                DeltaStatus::Enforced,
            ),
            (
                "chiplets_rows",
                target.totals.chiplets_rows,
                actual.totals.chiplets_rows,
                DeltaStatus::Enforced,
            ),
            (
                // range_rows is derived, not independently driven.
                "range_rows",
                target.totals.range_rows,
                actual.totals.range_rows,
                DeltaStatus::Informational,
            ),
        ];
        let breakdown_rows: &[(&'static str, u64, u64, DeltaStatus)] = &[
            (
                "hasher",
                target.breakdown.hasher_rows,
                actual.breakdown.hasher_rows,
                DeltaStatus::Informational,
            ),
            (
                "bitwise",
                target.breakdown.bitwise_rows,
                actual.breakdown.bitwise_rows,
                DeltaStatus::Informational,
            ),
            (
                // NOTE(review): asymmetric accessors — target side uses memory_target()
                // while actual side uses raw memory_rows. Presumably intentional (the
                // solver drives toward the derived target, the measurement reports raw
                // rows), but confirm this isn't meant to be target.breakdown.memory_rows.
                "memory",
                target.breakdown.memory_target(),
                actual.breakdown.memory_rows,
                DeltaStatus::Informational,
            ),
        ];
        Self {
            target,
            actual,
            total_deltas: total_rows.iter().map(|r| component_delta(*r)).collect(),
            breakdown_deltas: breakdown_rows.iter().map(|r| component_delta(*r)).collect(),
        }
    }

    /// True when both padded proxies match their targets exactly.
    pub fn brackets_match(&self) -> bool {
        self.target.totals.padded_core_side() == self.actual.totals.padded_core_side()
            && self.target.totals.padded_chiplets() == self.actual.totals.padded_chiplets()
    }

    /// True if `range_rows` is the largest unpadded component in either side, which means snippet
    /// balance should be revisited.
    pub fn range_dominates(&self) -> bool {
        self.target.totals.range_dominates() || self.actual.totals.range_dominates()
    }
}

/// Computes the signed relative delta for one row count and classifies it against
/// [`PER_COMPONENT_TOLERANCE`]. Tolerance classification is reporting-only.
fn component_delta((name, t, a, status): (&'static str, u64, u64, DeltaStatus)) -> ComponentDelta {
    let delta_pct = if t == 0 {
        // Zero target: exact match is 0%, any nonzero actual is an unbounded overshoot.
        if a == 0 { 0.0 } else { f64::INFINITY }
    } else {
        (a as f64 - t as f64) / t as f64
    };
    ComponentDelta {
        name,
        target: t,
        actual: a,
        delta_pct,
        within_tolerance: delta_pct.abs() <= PER_COMPONENT_TOLERANCE,
        status,
    }
}

impl Display for VerificationReport {
    /// Renders the three report sections: hard padded brackets, soft totals, and the
    /// informational breakdown, followed by a MATCH/MISS verdict line and an optional
    /// range-domination warning.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        writeln!(f, "-- hard brackets (padded power-of-two) --")?;
        write_bracket_row(
            f,
            "padded_core_side",
            self.target.totals.padded_core_side(),
            self.actual.totals.padded_core_side(),
        )?;
        write_bracket_row(
            f,
            "padded_chiplets",
            self.target.totals.padded_chiplets(),
            self.actual.totals.padded_chiplets(),
        )?;

        writeln!(f, "\n-- totals (soft: {:.0}% band) --", PER_COMPONENT_TOLERANCE * 100.0)?;
        write_delta_header(f)?;
        for d in &self.total_deltas {
            write_delta_row(f, d)?;
        }

        writeln!(f, "\n-- breakdown (info) --")?;
        write_delta_header(f)?;
        for d in &self.breakdown_deltas {
            write_delta_row(f, d)?;
        }

        writeln!(f)?;
        if self.brackets_match() {
            writeln!(f, "=> BRACKET MATCH")?;
        } else {
            writeln!(f, "=> BRACKET MISS")?;
        }
        if self.range_dominates() {
            writeln!(
                f,
                "!! WARNING: range_rows dominates — \"ignore range\" assumption is breaking"
            )?;
        }
        Ok(())
    }
}

/// Writes the column header shared by the totals and breakdown tables.
/// Column widths must stay in sync with `write_delta_row`.
fn write_delta_header(f: &mut fmt::Formatter<'_>) -> fmt::Result {
    writeln!(
        f,
        "{:<16} {:>12} {:>12} {:>10} status",
        "component", "target", "actual", "delta"
    )
}

/// Writes one table row for a [`ComponentDelta`], mapping its status to
/// `ok`/`out` (enforced) or `info` (informational).
fn write_delta_row(f: &mut fmt::Formatter<'_>, d: &ComponentDelta) -> fmt::Result {
    let delta_str = if d.delta_pct.is_finite() {
        format!("{:+6.2}%", d.delta_pct * 100.0)
    } else {
        // Zero target with nonzero actual — see component_delta.
        "+∞".to_string()
    };
    let status = match d.status {
        DeltaStatus::Enforced => {
            if d.within_tolerance {
                "ok"
            } else {
                "out"
            }
        },
        DeltaStatus::Informational => "info",
    };
    writeln!(
        f,
        "{:<16} {:>12} {:>12} {:>10} {}",
        d.name, d.target, d.actual, delta_str, status
    )
}

/// Writes one hard-bracket row; brackets are compared for exact equality only.
fn write_bracket_row(
    f: &mut fmt::Formatter<'_>,
    name: &str,
    target: u64,
    actual: u64,
) -> fmt::Result {
    let ok = if target == actual { "==" } else { "MISS" };
    writeln!(f, "{name:<16} {target:>12} {actual:>12} {ok:>10}")
}

#[cfg(test)]
mod tests {
    use super::*;
    use crate::snapshot::{TraceBreakdown, TraceTotals};

    /// Builds a minimal shape: only hasher/memory chiplets populated, chiplets total
    /// derived via chiplets_sum(), and range_rows fixed at 0.
    fn shape(core: u64, hasher: u64, memory: u64) -> TraceShape {
        let breakdown = TraceBreakdown {
            hasher_rows: hasher,
            bitwise_rows: 0,
            memory_rows: memory,
            kernel_rom_rows: 0,
            ace_rows: 0,
        };
        let totals = TraceTotals {
            core_rows: core,
            chiplets_rows: breakdown.chiplets_sum(),
            range_rows: 0,
        };
        TraceShape::new(totals, breakdown)
    }

    #[test]
    fn exact_match_is_bracket_ok_and_all_within_tolerance() {
        let t = shape(68000, 8000, 12000);
        let r = VerificationReport::new(t, t);
        assert!(r.brackets_match());
        assert!(r.total_deltas.iter().all(|d| d.within_tolerance));
        assert!(r.breakdown_deltas.iter().all(|d| d.within_tolerance));
    }

    #[test]
    fn bracket_miss_is_reported_when_core_bracket_differs() {
        // target.core=68000 → 131072; actual.core=30000 → 32768 (different bracket)
        let target = shape(68000, 8000, 12000);
        let actual = shape(30000, 2000, 1000);
        let r = VerificationReport::new(target, actual);
        assert!(!r.brackets_match());
        assert!(r.to_string().contains("BRACKET MISS"));
    }

    #[test]
    fn chiplets_bracket_can_miss_independently_of_core() {
        // core is the same (same padded bracket); chiplets_rows lands in different brackets.
        // target chiplets = 8000 + 12000 + 1 = 20001 → 32768
        // actual chiplets = 20000 + 30000 + 1 = 50001 → 65536
        let target = shape(40000, 8000, 12000);
        let actual = shape(40000, 20000, 30000);
        let r = VerificationReport::new(target, actual);
        // padded_core_side: both 40000 → 65536 (same)
        assert_eq!(target.totals.padded_core_side(), actual.totals.padded_core_side());
        // padded_chiplets differs
        assert_ne!(target.totals.padded_chiplets(), actual.totals.padded_chiplets());
        assert!(!r.brackets_match());
    }

    #[test]
    fn range_dominates_is_warned() {
        // range_rows (500) exceeds both core (100) and chiplets, so the report must
        // carry the domination warning.
        let breakdown = TraceBreakdown {
            hasher_rows: 100,
            bitwise_rows: 0,
            memory_rows: 0,
            kernel_rom_rows: 0,
            ace_rows: 0,
        };
        let totals = TraceTotals {
            core_rows: 100,
            chiplets_rows: breakdown.chiplets_sum(),
            range_rows: 500,
        };
        let t = TraceShape::new(totals, breakdown);
        let r = VerificationReport::new(t, t);
        assert!(r.range_dominates());
        assert!(r.to_string().contains("range_rows dominates"));
    }

    #[test]
    fn per_component_overshoot_stays_within_bracket() {
        // Hasher overshoots but both core and chiplets stay within their brackets.
        let target = shape(68000, 8000, 12000);
        let actual = shape(68000, 14000, 12000);
        let r = VerificationReport::new(target, actual);
        assert!(r.brackets_match());
        let hasher_delta = r.breakdown_deltas.iter().find(|d| d.name == "hasher").unwrap();
        assert!(!hasher_delta.within_tolerance);
    }
}