youknowone · youknowone · Apr 13, 2026 · Apr 9, 2026 · Apr 9, 2026 · Apr 9, 2026
diff --git a/AGENTS.md b/AGENTS.md
@@ -0,0 +1,20 @@
+# AGENTS
+
+## Performance Work
+
+Use the dedicated backend-performance workflow in [`docs/performance-workflow.md`](docs/performance-workflow.md).
+
+Key entry points:
+
+- Full deterministic regression gate: `scripts/compare_backends.sh`
+- Focused backend workloads: `scripts/compare_workloads.sh`
+- High-level API workloads: `scripts/compare_api_workloads.sh`
+- Single-backend profiling: `scripts/profile_backend.sh`
+- Optimized code inspection: `scripts/inspect_codegen.sh`
+
+Important constraints:
+
+- Never compare C and Rust sys backends in the same process. They export the same `lzma_*` symbols.
+- Treat `scripts/compare_backends.sh` as the coarse deterministic gate.
+- Use `qc` and `size` focused workloads for paths that are property-test-like or otherwise input-sensitive.
+- Keep workload shape, iteration count, and profiler command stable while iterating on a hotspot.
diff --git a/Cargo.lock.msrv b/Cargo.lock.msrv
diff --git a/Cargo.toml b/Cargo.toml
@@ -17,11 +17,11 @@ rust-version = "1.63"
 exclude = [".github/"]
 
 [workspace]
-members = ["systest", "liblzma-rs", "liblzma-rs-sys"]
+members = ["systest", "xz", "xz-sys", "perf-probe"]
 
 [dependencies]
 # Rust backend (default) - pure Rust implementation
-liblzma-sys = { package = "liblzma-rs-sys", path = "liblzma-rs-sys", optional = true }
+liblzma-sys = { package = "xz-sys", path = "xz-sys", optional = true }
 # C backend - original liblzma C library via FFI
 liblzma-c-sys = { package = "liblzma-sys", path = "liblzma-sys", version = "0.4.5", optional = true, default-features = false }
 num_cpus = { version = "1.16.0", optional = true }

diff --git a/benches/backend_comparison.rs b/benches/backend_comparison.rs
@@ -4,8 +4,20 @@ use std::ptr;
 
 use criterion::{black_box, criterion_group, criterion_main, BenchmarkId, Criterion, Throughput};
 
-use liblzma_c_sys as c_sys;
-use liblzma_sys as rs_sys;
+#[cfg(all(feature = "c-backend", feature = "rust-backend"))]
+compile_error!("backend_comparison bench must be built with exactly one backend feature");
+#[cfg(not(any(feature = "c-backend", feature = "rust-backend")))]
+compile_error!("backend_comparison bench requires either `c-backend` or `rust-backend`");
+
+#[cfg(feature = "c-backend")]
+use liblzma_c_sys as backend_sys;
+#[cfg(feature = "rust-backend")]
+use liblzma_sys as backend_sys;
+
+#[cfg(feature = "c-backend")]
+const BACKEND_NAME: &str = "c";
+#[cfg(feature = "rust-backend")]
+const BACKEND_NAME: &str = "rust";
 
 fn make_payload(size: usize) -> Vec<u8> {
     let mut x: u64 = 0x9E3779B97F4A7C15;
@@ -19,13 +31,13 @@ fn make_payload(size: usize) -> Vec<u8> {
     out
 }
 
-unsafe fn c_encode(input: &[u8]) -> Vec<u8> {
-    let bound = c_sys::lzma_stream_buffer_bound(input.len());
+unsafe fn backend_encode(input: &[u8]) -> Vec<u8> {
+    let bound = backend_sys::lzma_stream_buffer_bound(input.len());
     let mut out = vec![0u8; bound];
     let mut out_pos: usize = 0;
-    c_sys::lzma_easy_buffer_encode(
+    backend_sys::lzma_easy_buffer_encode(
         6,
-        c_sys::LZMA_CHECK_CRC64,
+        backend_sys::LZMA_CHECK_CRC64,
         ptr::null(),
         input.as_ptr(),
         input.len(),
@@ -37,50 +49,12 @@ unsafe fn c_encode(input: &[u8]) -> Vec<u8> {
     out
 }
 
-unsafe fn rs_encode(input: &[u8]) -> Vec<u8> {
-    let bound = rs_sys::lzma_stream_buffer_bound(input.len());
-    let mut out = vec![0u8; bound];
-    let mut out_pos: usize = 0;
-    rs_sys::lzma_easy_buffer_encode(
-        6,
-        rs_sys::LZMA_CHECK_CRC64,
-        ptr::null(),
-        input.as_ptr(),
-        input.len(),
-        out.as_mut_ptr(),
-        &mut out_pos,
-        out.len(),
-    );
-    out.truncate(out_pos);
-    out
-}
-
-unsafe fn c_decode(compressed: &[u8], out_size: usize) -> Vec<u8> {
-    let mut out = vec![0u8; out_size];
-    let mut memlimit = u64::MAX;
-    let mut in_pos = 0usize;
-    let mut out_pos = 0usize;
-    c_sys::lzma_stream_buffer_decode(
-        &mut memlimit,
-        0,
-        ptr::null(),
-        compressed.as_ptr(),
-        &mut in_pos,
-        compressed.len(),
-        out.as_mut_ptr(),
-        &mut out_pos,
-        out.len(),
-    );
-    out.truncate(out_pos);
-    out
-}
-
-unsafe fn rs_decode(compressed: &[u8], out_size: usize) -> Vec<u8> {
+unsafe fn backend_decode(compressed: &[u8], out_size: usize) -> Vec<u8> {
     let mut out = vec![0u8; out_size];
     let mut memlimit = u64::MAX;
     let mut in_pos = 0usize;
     let mut out_pos = 0usize;
-    rs_sys::lzma_stream_buffer_decode(
+    backend_sys::lzma_stream_buffer_decode(
         &mut memlimit,
         0,
         ptr::null(),
@@ -103,11 +77,8 @@ fn bench_encode(c: &mut Criterion) {
         let input = make_payload(size);
         group.throughput(Throughput::Bytes(size as u64));
 
-        group.bench_with_input(BenchmarkId::new("c", label), &input, |b, input| {
-            b.iter(|| unsafe { c_encode(black_box(input)) })
-        });
-        group.bench_with_input(BenchmarkId::new("rust", label), &input, |b, input| {
-            b.iter(|| unsafe { rs_encode(black_box(input)) })
+        group.bench_with_input(BenchmarkId::new(BACKEND_NAME, label), &input, |b, input| {
+            b.iter(|| unsafe { backend_encode(black_box(input)) })
         });
     }
     group.finish();
@@ -119,17 +90,13 @@ fn bench_decode(c: &mut Criterion) {
     let mut group = c.benchmark_group("decode");
     for &(size, label) in sizes {
         let input = make_payload(size);
-        let c_compressed = unsafe { c_encode(&input) };
-        let rs_compressed = unsafe { rs_encode(&input) };
+        let compressed = unsafe { backend_encode(&input) };
         group.throughput(Throughput::Bytes(size as u64));
 
-        group.bench_with_input(BenchmarkId::new("c", label), &c_compressed, |b, data| {
-            b.iter(|| unsafe { c_decode(black_box(data), size) })
-        });
         group.bench_with_input(
-            BenchmarkId::new("rust", label),
-            &rs_compressed,
-            |b, data| b.iter(|| unsafe { rs_decode(black_box(data), size) }),
+            BenchmarkId::new(BACKEND_NAME, label),
+            &compressed,
+            |b, data| b.iter(|| unsafe { backend_decode(black_box(data), size) }),
         );
     }
     group.finish();
@@ -143,11 +110,8 @@ fn bench_crc32(c: &mut Criterion) {
         let data = make_payload(size);
         group.throughput(Throughput::Bytes(size as u64));
 
-        group.bench_with_input(BenchmarkId::new("c", label), &data, |b, data| {
-            b.iter(|| unsafe { c_sys::lzma_crc32(black_box(data.as_ptr()), data.len(), 0) })
-        });
-        group.bench_with_input(BenchmarkId::new("rust", label), &data, |b, data| {
-            b.iter(|| unsafe { rs_sys::lzma_crc32(black_box(data.as_ptr()), data.len(), 0) })
+        group.bench_with_input(BenchmarkId::new(BACKEND_NAME, label), &data, |b, data| {
+            b.iter(|| unsafe { backend_sys::lzma_crc32(black_box(data.as_ptr()), data.len(), 0) })
         });
     }
     group.finish();
@@ -161,11 +125,8 @@ fn bench_crc64(c: &mut Criterion) {
         let data = make_payload(size);
         group.throughput(Throughput::Bytes(size as u64));
 
-        group.bench_with_input(BenchmarkId::new("c", label), &data, |b, data| {
-            b.iter(|| unsafe { c_sys::lzma_crc64(black_box(data.as_ptr()), data.len(), 0) })
-        });
-        group.bench_with_input(BenchmarkId::new("rust", label), &data, |b, data| {
-            b.iter(|| unsafe { rs_sys::lzma_crc64(black_box(data.as_ptr()), data.len(), 0) })
+        group.bench_with_input(BenchmarkId::new(BACKEND_NAME, label), &data, |b, data| {
+            b.iter(|| unsafe { backend_sys::lzma_crc64(black_box(data.as_ptr()), data.len(), 0) })
         });
     }
     group.finish();

diff --git a/docs/performance-workflow.md b/docs/performance-workflow.md
@@ -0,0 +1,139 @@
+# Performance Workflow
+
+This repository now has a repeatable loop for backend comparison, profiling, and code generation inspection.
+
+Important: the C and Rust sys backends must never be linked into the same process when comparing performance. Both export the same `lzma_*` symbols, so shared-process comparisons can silently resolve to the wrong implementation.
+
+## 1. Baseline correctness
+
+Keep correctness green before taking timings:
+
+```bash
+cargo test --no-default-features --features rust-backend
+cargo test --no-default-features --features c-backend
+cargo test --no-default-features --features rust-backend,c-backend --test sys_equivalence
+```
+
+## 2. Compare the full test suite
+
+Use `hyperfine` to compare end-to-end wall clock time of the deterministic root test bundle and `systest` with isolated target directories per backend:
+
+```bash
+scripts/compare_backends.sh --runs 10 --warmup 2
+```
+
+This writes:
+
+- `target/perf-results/root-tests.json`
+- `target/perf-results/root-tests.md`
+- `target/perf-results/systest.json`
+- `target/perf-results/systest.md`
+
+Those reports are the coarse regression gate. Run them after major porting or optimization work.
+
+The root bundle intentionally skips QuickCheck-based unit tests because they generate different random inputs in separate backend processes. Cover those property-style paths with the focused `qc` and `size` workloads instead.
+
+## 3. Compare focused workloads
+
+Use `perf-probe`, a small standalone binary crate that links exactly one backend at a time.
+
+Examples:
+
+```bash
+scripts/compare_workloads.sh encode --input-kind random --size 1048576 --iters 20 --warmup 3
+scripts/compare_workloads.sh decode --input-kind random --size 1048576 --iters 50 --warmup 5
+scripts/compare_workloads.sh size --input-kind random --size 1048576 --iters 400 --warmup 40
+scripts/compare_workloads.sh crc64 --size 16777216 --iters 400 --warmup 20
+scripts/compare_workloads.sh crc64 --size 1048576 --chunk-size 16 --iters 400 --warmup 40
+```
+
+This writes workload-specific `hyperfine` reports under `target/perf-results/`.
+
+For tiny inputs, increase `--iters` until one benchmarked command takes at least around a
+second. Otherwise process startup noise can dominate the comparison even when the actual
+backend work is near parity.
+
+There is still a criterion bench in [`benches/backend_comparison.rs`](../benches/backend_comparison.rs), but it now measures one backend per run. Use it only with exactly one backend feature enabled.
+
+For high-level API regressions, compare the root crate against the upstream XZ corpus:
+
+```bash
+scripts/compare_api_workloads.sh standard-files --mode all --iters 200 --warmup 20
+scripts/compare_api_workloads.sh standard-files --mode good --iters 400 --warmup 40
+scripts/compare_api_workloads.sh standard-files --mode good --name-pattern delta --iters 400 --warmup 40
+scripts/compare_api_workloads.sh qc --mode both --cases 128 --max-size 4096 --iters 200 --warmup 20
+scripts/compare_api_workloads.sh bufread-trailing --mode both --input-size 1024 --trailing-size 123 --iters 1000 --warmup 100
+```
+
+This uses [`examples/standard_files_probe.rs`](../examples/standard_files_probe.rs), which mirrors the `tests/xz.rs` `standard_files` path and writes reports to:
+
+- `target/perf-results/api-standard-files.json`
+- `target/perf-results/api-standard-files.md`
+
+The `qc` workload uses [`examples/qc_probe.rs`](../examples/qc_probe.rs) to reproduce the
+small-input repeated round-trip pattern from the root crate tests. This is useful when
+overall regressions show up in `root-tests` even though large encode/decode probes look
+good.
+
+The `bufread-trailing` workload uses
+[`examples/bufread_trailing_probe.rs`](../examples/bufread_trailing_probe.rs) to reproduce
+the `bufread::tests::compressed_and_trailing_data` path with enough in-process repetition
+to reduce test process startup noise.
+
+The `size` workload isolates the `uncompressed_size()` path from [`src/lib.rs`](../src/lib.rs),
+which corresponds to the QuickCheck-based `tests::size` unit test but with deterministic input.
+
+Use `--name-pattern <substring>` to isolate a file family inside the XZ corpus when a full-corpus comparison is too mixed to identify the remaining gap.
+
+## 4. Profile a focused workload
+
+`perf-probe` is a profiler-friendly executable that runs a single workload many times with deterministic input.
+
+Examples:
+
+```bash
+scripts/profile_backend.sh rust decode --size 1048576 --iters 800 --warmup 80
+scripts/profile_backend.sh rust size --input-kind random --size 1048576 --iters 800 --warmup 80
+scripts/profile_backend.sh rust encode --input-kind random --size 8388608 --iters 150 --warmup 20
+scripts/profile_backend.sh c crc64 --size 16777216 --iters 400
+```
+
+Useful flags passed through to `perf-probe`:
+
+- `--workload encode|decode|size|crc32|crc64`
+- `--input <path>`
+- `--compressed-input <path>`
+- `--save-output <path>`
+- `--input-kind text|random`
+- `--size <bytes>`
+- `--chunk-size <bytes>`
+- `--expected-size <bytes>`
+- `--iters <n>`
+- `--warmup <n>`
+- `--preset <level>`
+
+On macOS the script prefers `samply`; on Linux it falls back to `perf`; otherwise it runs the workload plainly with release debuginfo enabled.
+
+## 5. Inspect the generated code
+
+After a profile points to a hot Rust function, inspect its optimized output:
+
+```bash
+scripts/inspect_codegen.sh xz::lzma::lzma_encoder::lzma_encode --package xz
+scripts/inspect_codegen.sh xz::check::crc64_fast::lzma_crc64 --package xz --format llvm
+```
+
+This uses `cargo-asm` and builds under `target/codegen` by default.
+
+## 6. Iterate
+
+The expected loop is:
+
+1. Reproduce the gap with `scripts/compare_backends.sh`.
+2. Use `scripts/compare_workloads.sh` to isolate the subsystem.
+3. Capture a focused profile with `scripts/profile_backend.sh`.
+4. Pick the hottest Rust function from the profile.
+5. Inspect its assembly or LLVM IR with `scripts/inspect_codegen.sh`.
+6. Change the Rust port, then repeat the same commands.
+
+Keep the input shape, iteration count, and profiler command stable while working a hotspot so before/after numbers stay comparable.