From 18d85375d05e586e9e3b7603002d3e3362fbc139 Mon Sep 17 00:00:00 2001 From: Liet Blue Date: Tue, 21 Apr 2026 20:19:09 +0800 Subject: [PATCH] tooling: (POC) add CUDA implementation for breakage simulation with benchmarks and tests --- native/CMakeLists.txt | 1 + native/cuda/breakage_cuda/BENCHMARK.md | 150 ++ native/cuda/breakage_cuda/CMakeLists.txt | 136 ++ native/cuda/breakage_cuda/README.md | 224 +++ native/cuda/breakage_cuda/benchmark.html | 662 +++++++++ .../include/breakage_cuda/device.hpp | 229 +++ .../include/breakage_cuda/kernels.hpp | 126 ++ .../include/breakage_cuda/linalg.hpp | 124 ++ .../include/breakage_cuda/qp_solver.hpp | 131 ++ .../include/breakage_cuda/ref_cpu.hpp | 107 ++ .../include/breakage_cuda/system.hpp | 208 +++ native/cuda/breakage_cuda/src/kernels.cu | 817 +++++++++++ native/cuda/breakage_cuda/src/linalg.cu | 507 +++++++ native/cuda/breakage_cuda/src/qp_solver.cu | 1264 +++++++++++++++++ native/cuda/breakage_cuda/src/ref_cpu.cpp | 778 ++++++++++ native/cuda/breakage_cuda/src/system.cpp | 550 +++++++ native/cuda/breakage_cuda/tests/bench.cpp | 403 ++++++ .../tests/test_dense_kernels.cpp | 473 ++++++ .../tests/test_full_pipeline.cpp | 329 +++++ .../breakage_cuda/tests/test_qp_random.cpp | 283 ++++ native/modules/bricksim/physx/breakage.cppm | 28 +- native/tests/TestBreakage.cpp | 433 ++++++ 22 files changed, 7950 insertions(+), 13 deletions(-) create mode 100644 native/cuda/breakage_cuda/BENCHMARK.md create mode 100644 native/cuda/breakage_cuda/CMakeLists.txt create mode 100644 native/cuda/breakage_cuda/README.md create mode 100644 native/cuda/breakage_cuda/benchmark.html create mode 100644 native/cuda/breakage_cuda/include/breakage_cuda/device.hpp create mode 100644 native/cuda/breakage_cuda/include/breakage_cuda/kernels.hpp create mode 100644 native/cuda/breakage_cuda/include/breakage_cuda/linalg.hpp create mode 100644 native/cuda/breakage_cuda/include/breakage_cuda/qp_solver.hpp create mode 100644 
native/cuda/breakage_cuda/include/breakage_cuda/ref_cpu.hpp create mode 100644 native/cuda/breakage_cuda/include/breakage_cuda/system.hpp create mode 100644 native/cuda/breakage_cuda/src/kernels.cu create mode 100644 native/cuda/breakage_cuda/src/linalg.cu create mode 100644 native/cuda/breakage_cuda/src/qp_solver.cu create mode 100644 native/cuda/breakage_cuda/src/ref_cpu.cpp create mode 100644 native/cuda/breakage_cuda/src/system.cpp create mode 100644 native/cuda/breakage_cuda/tests/bench.cpp create mode 100644 native/cuda/breakage_cuda/tests/test_dense_kernels.cpp create mode 100644 native/cuda/breakage_cuda/tests/test_full_pipeline.cpp create mode 100644 native/cuda/breakage_cuda/tests/test_qp_random.cpp create mode 100644 native/tests/TestBreakage.cpp diff --git a/native/CMakeLists.txt b/native/CMakeLists.txt index 3383791..3e4b4ce 100644 --- a/native/CMakeLists.txt +++ b/native/CMakeLists.txt @@ -114,3 +114,4 @@ bricksim_add_test(TestConstraintScheduler tests/TestConstraintScheduler.cpp) bricksim_add_test(TestRandomRegularGraphSchedulingPolicy tests/TestRandomRegularGraphSchedulingPolicy.cpp) bricksim_add_test(TestPack2dMaxrect tests/TestPack2dMaxrect.cpp) bricksim_add_test(TestPhysicsGraphAssemblyClosure tests/TestPhysicsGraphAssemblyClosure.cpp) +bricksim_add_test(TestBreakage tests/TestBreakage.cpp) diff --git a/native/cuda/breakage_cuda/BENCHMARK.md b/native/cuda/breakage_cuda/BENCHMARK.md new file mode 100644 index 0000000..3269a51 --- /dev/null +++ b/native/cuda/breakage_cuda/BENCHMARK.md @@ -0,0 +1,150 @@ +# breakage_cuda benchmark report + +GPU pipeline (`cuda_qp::Pipeline::solve`) vs CPU baseline +(`ref_cpu::solve_full`, Eigen + OSQP `v1.0.0`) on a synthetic linear-chain +`HostSystem` of `N` parts. 
+ +- Hardware: NVIDIA RTX 4090, WSL2, CUDA Toolkit 12.0, gcc 11 +- Build type: `Release`, double precision throughout +- Per-step wall clock, `cudaDeviceSynchronize()` before reading the GPU + timer +- For `N >= 32,768` the CPU OSQP solve hits the 2,000,000-iteration cap and + reports "did not converge"; we keep its wall time as a lower bound and + flag the row with an asterisk. The GPU ADMM converged on every row. + +A live, interactive version of these charts is in +[`benchmark.html`](benchmark.html) (open with a browser; everything is +inlined, no network access required). + +## Cross-over summary + +| Threshold | First N where it holds | +|--------------------------------|-----------------------:| +| GPU faster than CPU (>= 1x) | between 4,096 and 8,192 | +| GPU >= 10x faster | between 8,192 and 16,384 | +| GPU >= 100x faster | between 32,768 and 65,536 | +| GPU >= 500x faster | by 65,536 | + +So **the GPU "completely overrules" the CPU at roughly N ~ 40,000 parts** +(crossing 100x), and continues to widen the gap from there. + +## Raw measurements + +`./breakage_cuda_bench N` with `BENCH_QUICK=1`, `OSQP_MAX_ITER=2000000`, +and `BENCH_TOLERATE_NONCONVERGED=1` for the largest two rows. + +| N | nx | GPU mean | CPU mean | speedup (CPU / GPU) | +|---------:|-------:|------------:|--------------:|--------------------:| +| 4 | 27 | 77.7 ms | 24.1 us | 0.00031x | +| 16 | 135 | 124.3 ms | 107.9 us | 0.00087x | +| 64 | 567 | 167.6 ms | 435.9 us | 0.00260x | +| 256 | 2,295 | 388.2 ms | 1.73 ms | 0.00446x | +| 512 | 4,599 | 339.2 ms | 3.43 ms | 0.01011x | +| 1,024 | 9,207 | 3.53 s | 16.63 ms | 0.00471x | +| 2,048 | 18,423 | 6.24 s | 62.00 ms | 0.00993x | +| 4,096 | 36,855 | 9.39 s | 480.64 ms | 0.05119x | +| 8,192 | 73,719 | 3.79 s | 5.00 s | 1.32x | +| 16,384 |147,447 | 3.45 s | 68.36 s | 19.79x | +| 32,768 |294,903 | 3.93 s | 254.70 s* | >= 64.81x* | +| 65,536 |589,815 | 4.46 s | 2,596.75 s* | >= 582.73x* | + +`nx` is `9 * (N - 1)` (decision variables in the QP). 
+`*` = CPU OSQP hit `max_iter = 2,000,000` without converging; the wall time +is therefore a lower bound, and the speedup is also a lower bound. + +## ASCII chart: log10 wall time (microseconds) vs log2(N) + +``` +log10(us) + 10 | * (CPU >= here) + 9 | * + 8 | * + 7 | * * * + 6 | G G G G G G G G G G G G G (GPU plateau, then ramps) + 5 | C C C C C C + 4 | C + 3 | C C C + 2 | C + 1 | + +-------------------------------------------------------- + 4 16 64 256 .. 1k 2k 4k 8k 16k 32k 64k N +``` + +Two regimes are visible: + +- **Small-N plateau (`N <= 1k`)**: GPU runtime is flat ~ 100-400 ms, + dominated by `cusolverSp` factorization and kernel-launch overhead. + CPU runtime is sub-millisecond. CPU wins by orders of magnitude. +- **Large-N ramp (`N >= 4k`)**: GPU runtime grows roughly linearly with + `N` (and even drops at `N = 8k` once the per-iteration arithmetic + starts to amortize the fixed startup); CPU OSQP grows faster than + linear and rapidly blows past the GPU. + +## Cross-over analysis + +| pair | GPU time growth | CPU time growth | speedup growth | +|------------------------|----------------:|----------------:|---------------:| +| 8,192 -> 16,384 | 0.91x | 13.7x | 15.0x | +| 16,384 -> 32,768 | 1.14x | >= 3.7x* | >= 3.27x* | +| 32,768 -> 65,536 | 1.13x | >= 10.2x* | >= 8.99x* | + +(`*` = CPU side bounded below.) + +So once the GPU is in its ramp regime, **doubling N multiplies the +GPU/CPU ratio by 3-15x in our favor**. Extrapolating: + +- N ~ 40,000 -> ~ 100x faster +- N ~ 65,000 -> ~ 580x faster (measured lower bound) +- N ~ 130,000 -> well beyond 1,000x faster (estimated; CPU run skipped) + +## Why so slow at small N? + +The GPU baseline pays three large fixed costs that the CPU baseline does +not: + +1. **Three Cholesky factorizations** on the KKT-like system per `solve`, + one per OSQP stage, via `cusolverSpDcsrlsvchol`. Each call carries a + host<->device sync, symbolic analysis, and numeric factorization. +2. 
**Per-iteration kernel launches** in the ADMM loop (~ 100 small + launches per stage). On the CPU, those reduce to function calls. +3. **No batching of dense per-part kernels** -- one `<<<>>>` per kernel + per part. This dominates when `N` is small. + +These add up to ~ 80 ms - 4 s of overhead independent of `N`, so the GPU +only "wakes up" once the per-iteration arithmetic crosses that floor. + +## Caveats + +- The synthetic linear chain is favorable to OSQP (well-conditioned, + banded structure). A real `BreakageDebugDump` from a sim step might + shift the crossover slightly, but the asymptotic behavior should hold. +- The CPU "did not converge" rows (N = 32,768, 65,536) are lower bounds. + The actual CPU time to reach the same precision the GPU achieves is + larger -- probably substantially so. +- Bench mode used `BENCH_QUICK=1` (`5 warmup + 2 measured` for GPU, + `1 warmup + 1 measured` for CPU). For `N <= 4,096` you should rerun + with the default counts (`5 + 50 / 5 + 20`) for tighter error bars; the + large-N rows already dominate any per-iteration noise. + +## Reproducing + +```bash +cmake -S native/cuda/breakage_cuda -B ~/breakage_cuda_build -DCMAKE_BUILD_TYPE=Release +cmake --build ~/breakage_cuda_build -j + +cd ~/breakage_cuda_build + +# Single point (e.g., the cross-over): +BENCH_QUICK=1 LD_LIBRARY_PATH=/usr/lib/wsl/lib ./breakage_cuda_bench 8192 + +# Large-N point with CPU lower bound: +BENCH_QUICK=1 BENCH_GPU_ITER=2 BENCH_CPU_ITER=1 \ + BENCH_TOLERATE_NONCONVERGED=1 OSQP_MAX_ITER=2000000 \ + LD_LIBRARY_PATH=/usr/lib/wsl/lib \ + ./breakage_cuda_bench 65536 +``` + +`BENCH_QUICK=1` drops warmup and measured iteration counts to the reduced +values listed under Caveats (`5 + 2` for GPU, `1 + 1` for CPU). +`BENCH_GPU_ITER` / `BENCH_CPU_ITER` override them outright. +`BENCH_TOLERATE_NONCONVERGED=1` keeps the wall time even if the solver +does not reach `eps_abs / eps_rel`; without it the bench aborts. 
diff --git a/native/cuda/breakage_cuda/CMakeLists.txt b/native/cuda/breakage_cuda/CMakeLists.txt new file mode 100644 index 0000000..38faf57 --- /dev/null +++ b/native/cuda/breakage_cuda/CMakeLists.txt @@ -0,0 +1,136 @@ +cmake_minimum_required(VERSION 3.24) + +project(breakage_cuda LANGUAGES CXX CUDA) + +set(CMAKE_CXX_STANDARD 20) +set(CMAKE_CXX_STANDARD_REQUIRED ON) +set(CMAKE_CXX_EXTENSIONS OFF) +set(CMAKE_CUDA_STANDARD 17) +set(CMAKE_CUDA_STANDARD_REQUIRED ON) +set(CMAKE_POSITION_INDEPENDENT_CODE ON) +set(CMAKE_EXPORT_COMPILE_COMMANDS ON) + +if(NOT CMAKE_BUILD_TYPE) + set(CMAKE_BUILD_TYPE Release) +endif() + +# We pin a sane default arch list; user can override with CMAKE_CUDA_ARCHITECTURES. +# CMake 3.18+ auto-initializes this to "52" when the variable was not user-set, +# so a `NOT DEFINED` guard is not enough to keep our default. We treat the CMake +# default of 52 as "no real choice was made" and replace it with our list. Users +# who want a specific arch can still pass -DCMAKE_CUDA_ARCHITECTURES=... to opt +# out of this override. 
+if(NOT CMAKE_CUDA_ARCHITECTURES OR CMAKE_CUDA_ARCHITECTURES STREQUAL "52") + set(CMAKE_CUDA_ARCHITECTURES "70;75;80;86;89;90") +endif() + +include(FetchContent) +set(FETCHCONTENT_QUIET FALSE) + +# --------------------------------------------------------------------------- +# Eigen (header-only) +# --------------------------------------------------------------------------- +FetchContent_Declare(eigen + GIT_REPOSITORY https://gitlab.com/libeigen/eigen.git + GIT_TAG 3.4.0 + GIT_SHALLOW TRUE +) +set(BUILD_TESTING OFF CACHE BOOL "" FORCE) +set(EIGEN_BUILD_DOC OFF CACHE BOOL "" FORCE) +set(EIGEN_BUILD_TESTING OFF CACHE BOOL "" FORCE) +FetchContent_MakeAvailable(eigen) + +# --------------------------------------------------------------------------- +# nlohmann_json +# --------------------------------------------------------------------------- +FetchContent_Declare(json + GIT_REPOSITORY https://github.com/nlohmann/json.git + GIT_TAG v3.11.3 + GIT_SHALLOW TRUE +) +set(JSON_BuildTests OFF CACHE BOOL "" FORCE) +FetchContent_MakeAvailable(json) + +# --------------------------------------------------------------------------- +# OSQP (CPU baseline). Pinned to v1.0.0 to match the API used by breakage.cppm. 
+# --------------------------------------------------------------------------- +FetchContent_Declare(osqp + GIT_REPOSITORY https://github.com/osqp/osqp.git + GIT_TAG v1.0.0 + GIT_SHALLOW TRUE +) +set(OSQP_BUILD_DEMO_EXE OFF CACHE BOOL "" FORCE) +set(OSQP_BUILD_UNITTESTS OFF CACHE BOOL "" FORCE) +set(OSQP_BUILD_SHARED_LIB OFF CACHE BOOL "" FORCE) +set(OSQP_USE_LONG OFF CACHE BOOL "" FORCE) +set(OSQP_USE_FLOAT OFF CACHE BOOL "" FORCE) +set(OSQP_ALGEBRA_BACKEND "builtin" CACHE STRING "" FORCE) +FetchContent_MakeAvailable(osqp) + +# --------------------------------------------------------------------------- +# CUDA toolkit components +# --------------------------------------------------------------------------- +find_package(CUDAToolkit REQUIRED) + +# --------------------------------------------------------------------------- +# Core library +# --------------------------------------------------------------------------- +add_library(breakage_cuda_core STATIC + src/system.cpp + src/ref_cpu.cpp + src/kernels.cu + src/linalg.cu + src/qp_solver.cu +) +target_include_directories(breakage_cuda_core PUBLIC + ${CMAKE_CURRENT_SOURCE_DIR}/include +) +target_link_libraries(breakage_cuda_core PUBLIC + Eigen3::Eigen + nlohmann_json::nlohmann_json + osqpstatic + CUDA::cudart + CUDA::cublas + CUDA::cusparse + CUDA::cusolver +) +set_target_properties(breakage_cuda_core PROPERTIES + CUDA_SEPARABLE_COMPILATION ON + CUDA_RESOLVE_DEVICE_SYMBOLS ON + CUDA_ARCHITECTURES "${CMAKE_CUDA_ARCHITECTURES}" +) + +# Suppress noisy diagnostics on Eigen + nvcc and treat real issues as warnings. 
+target_compile_options(breakage_cuda_core PRIVATE + $<$<COMPILE_LANGUAGE:CXX>:-Wall -Wextra -Wpedantic> + $<$<COMPILE_LANGUAGE:CUDA>:--expt-relaxed-constexpr --expt-extended-lambda -Xcompiler=-Wall> +) + +# --------------------------------------------------------------------------- +# Tests +# --------------------------------------------------------------------------- +enable_testing() + +# NOTE: On WSL2 the real CUDA driver libcuda.so.1 lives at /usr/lib/wsl/lib but +# is not on the default loader search path. Since libcudart resolves the driver +# via dlopen at runtime, RPATH/RUNPATH on the binary is NOT honored for that +# dlopen call. The portable workaround is to export +# LD_LIBRARY_PATH=/usr/lib/wsl/lib:$LD_LIBRARY_PATH +# before running the test/bench binaries. See README.md. +function(breakage_cuda_add_test TEST_NAME) + add_executable(${TEST_NAME} ${ARGN}) + target_link_libraries(${TEST_NAME} PRIVATE breakage_cuda_core) + add_test(NAME ${TEST_NAME} COMMAND ${TEST_NAME}) + if(UNIX AND NOT APPLE) + set_tests_properties(${TEST_NAME} PROPERTIES + ENVIRONMENT "LD_LIBRARY_PATH=/usr/lib/wsl/lib:$ENV{LD_LIBRARY_PATH}") + endif() +endfunction() + +breakage_cuda_add_test(test_dense_kernels tests/test_dense_kernels.cpp) +breakage_cuda_add_test(test_qp_random tests/test_qp_random.cpp) +breakage_cuda_add_test(test_full_pipeline tests/test_full_pipeline.cpp) + +# Bench is just an executable, not a test. +add_executable(breakage_cuda_bench tests/bench.cpp) +target_link_libraries(breakage_cuda_bench PRIVATE breakage_cuda_core) diff --git a/native/cuda/breakage_cuda/README.md b/native/cuda/breakage_cuda/README.md new file mode 100644 index 0000000..be07fe0 --- /dev/null +++ b/native/cuda/breakage_cuda/README.md @@ -0,0 +1,224 @@ +# breakage_cuda + +A standalone CUDA prototype of the per-frame hot path of +`bricksim::BreakageChecker::solve` (see +`native/modules/bricksim/physx/breakage.cppm`). + +This is a sub-project of BrickSim, but it does **not** link against any +`bricksim_modules` target. 
It builds and tests on its own so it can be +iterated on independently. + +## What is implemented + +End-to-end on the GPU, mirroring `BreakageChecker::solve`: + +1. `fit_se3` (weighted SE(3) Procrustes + 3x3 SVD) +2. `fit_twist` (weighted twist fit + 3x3 LDLT) +3. `compute_Pi`, `compute_L` +4. RHS `b` assembly +5. Three-stage QP solve (projection -> relaxation -> optimization) +6. Whitening reversal + per-clutch utilization + +The CPU baseline reuses Eigen + the OSQP C library (built via FetchContent at +`v1.0.0`) to keep the comparison apples-to-apples with the production code +path. + +## What is NOT implemented + +- `BreakageChecker::build_system` (geometry-driven, runs once per + reassembly; not on the per-step hot path). The CUDA layer consumes an + already-built system serialized as the JSON layout produced by + `bricksim::to_json(BreakageDebugDump)`. +- OSQP polish, infeasibility / unboundedness detection, scaling/Ruiz + equilibration, indirect (PCG) mode, multiple algebra backends. +- The full OSQP rho-update heuristic; we only implement + `rho_new = rho * sqrt(||r_prim|| / ||r_dual||)` and refactor when the + ratio crosses 5x. + +## Build + +Linux + CUDA Toolkit 12.x recommended (the parent project assumes Linux). +Windows users should build inside WSL2. + +```bash +cd native/cuda/breakage_cuda +cmake -S . -B build -DCMAKE_BUILD_TYPE=Release +cmake --build build -j +ctest --test-dir build --output-on-failure +``` + +The first configure pulls Eigen, nlohmann_json and OSQP via `FetchContent`. + +### WSL2 specifics + +Two non-obvious gotchas surfaced during initial bring-up: + +1. **`configure_file: Operation not permitted` when the build dir is on + `/mnt/c/`**. The Windows mount lacks the POSIX bits CMake needs for + `configure_file`. Put the build directory on the WSL native filesystem, + e.g.: + ```bash + cmake -S "/mnt/c/.../native/cuda/breakage_cuda" -B ~/breakage_cuda_build + cmake --build ~/breakage_cuda_build -j + ``` +2. 
**`no CUDA-capable device is detected` at runtime**, even though + `nvidia-smi` works. WSL2 ships the real `libcuda.so.1` at + `/usr/lib/wsl/lib/libcuda.so.1`, which is not on the default loader + search path. `libcudart` resolves the driver through `dlopen`, so + binary RPATH/RUNPATH does **not** help; you have to set + `LD_LIBRARY_PATH`. Two options: + - Run via `ctest` -- the CMake test definitions inject + `LD_LIBRARY_PATH=/usr/lib/wsl/lib:$LD_LIBRARY_PATH` automatically. + - Run a binary directly with the env var set: + ```bash + LD_LIBRARY_PATH=/usr/lib/wsl/lib:$LD_LIBRARY_PATH ./test_dense_kernels + ``` + +### Default CUDA architecture + +We pin `CMAKE_CUDA_ARCHITECTURES = "70;75;80;86;89;90"` (Volta to Hopper). +CMake's auto-default of `52` is replaced unless the user passes an explicit +`-DCMAKE_CUDA_ARCHITECTURES=...`. SM 6.0+ is required because we use +`atomicAdd(double*, double)` directly. + +## Generating a test fixture + +`tests/test_full_pipeline.cpp` consumes a `BreakageDebugDump` JSON. To +produce one from a real BrickSim run, set the debug-dump directory and flip +the `DebugDump` threshold on: + +```cpp +BreakageChecker checker; +checker.set_debug_dump_dir("/tmp/breakage_dumps"); +checker.thresholds.DebugDump = true; +// ... run a sim step ... +``` + +Then point the test at the produced JSON via the +`BREAKAGE_CUDA_FIXTURE` environment variable. + +If you do not have a real dump, the test falls back to a small synthetic +problem generated in-process (asserting the CUDA pipeline matches the CPU +baseline up to relative tolerance). 
+ +## Layout + +``` +include/breakage_cuda/ + system.hpp - Host POD types + JSON loaders + device.hpp - CUDA RAII helpers + DeviceSystem mirror + ref_cpu.hpp - CPU reference functions + kernels.hpp - Host-side launchers for dense kernels + qp_solver.hpp - CUDA ADMM solver + end-to-end pipeline +src/ + system.cpp - JSON loaders, CSC->CSR upload + ref_cpu.cpp - CPU reference implementation + kernels.cu - Dense kernels + linalg.cu - Shared device helpers (3x3 SVD/LDLT, reductions, SpMV glue) + qp_solver.cu - CUDA ADMM +tests/ + test_dense_kernels.cpp - per-kernel CPU vs CUDA tests + test_qp_random.cpp - random QP CPU OSQP vs CUDA ADMM + test_full_pipeline.cpp - JSON dump end-to-end + bench.cpp - N-sweep wall clock +``` + +## Verified test results (RTX 4090, WSL2, CUDA 12.0) + +`ctest --output-on-failure` passes 3/3: + +``` +1/3 Test #1: test_dense_kernels ............... Passed 0.27 sec +2/3 Test #2: test_qp_random ................... Passed 0.35 sec +3/3 Test #3: test_full_pipeline ............... Passed 0.36 sec +``` + +Per-test numerical agreement (CPU reference vs CUDA, max abs / rel error): + +| test | metric | value | tol | +|-------------------------------|----------------------------------------|-------------|------| +| test_dense_kernels::fit_se3 | max abs | 1.11e-16 | 1e-9 | +| test_dense_kernels::fit_twist | max abs | 5.55e-17 | 1e-9 | +| test_dense_kernels::compute_Pi| max abs | 1.67e-16 | 1e-12| +| test_dense_kernels::compute_L | max abs | 7.43e-16 | 1e-9 | +| test_dense_kernels::assemble_b| max abs | 2.84e-14 | 1e-9 | +| test_qp_random | rel inf-norm err on `x` | 6.13e-08 | 1e-2 | +| test_full_pipeline | rel inf-norm err on `x_unwhitened` | 5.48e-06 | 1e-2 | +| test_full_pipeline | rel inf-norm err on utilization | 7.32e-08 | 1e-2 | + +Dense kernels match the CPU reference to floating-point round-off. The +hand-written ADMM solver agrees with OSQP within ~1e-5 (ADMM iterates +independently in each backend, so bit-equality is not expected). 
+ +## Bench results (RTX 4090, WSL2, CUDA 12.0) + +Full numbers, charts, and analysis are in +[`BENCHMARK.md`](BENCHMARK.md) (markdown) and +[`benchmark.html`](benchmark.html) (self-contained interactive page, +double-click to open in a browser; no network access required). + +Headline numbers from a synthetic linear-chain `HostSystem` with +`Pipeline::solve` vs `ref_cpu::solve_full` (Eigen + OSQP): + +| N | GPU mean | CPU mean | speedup (CPU / GPU) | +|---------:|---------:|--------------:|--------------------:| +| 4 | 77.7 ms | 24.1 us | 0.0003x (CPU wins) | +| 1,024 | 3.53 s | 16.63 ms | 0.005x (CPU wins) | +| 4,096 | 9.39 s | 480.64 ms | 0.05x (CPU wins) | +| 8,192 | 3.79 s | 5.00 s | 1.32x (cross-over)| +| 16,384 | 3.45 s | 68.36 s | 19.8x | +| 32,768 | 3.93 s | >= 254.7 s | >= 64.8x | +| 65,536 | 4.46 s | >= 2,596.8 s | >= 583x | + +Cross-over thresholds: + +- **GPU >= 1x faster**: between N = 4,096 and 8,192 +- **GPU >= 10x faster**: between N = 8,192 and 16,384 +- **GPU >= 100x faster**: between N = 32,768 and 65,536 (~ N = 40,000) +- **GPU >= 500x faster**: by N = 65,536 + +The two largest rows have CPU "did not converge" within +`OSQP_MAX_ITER = 2,000,000`, so CPU wall times (and the speedup) are +lower bounds. The GPU side converged on every row. + +The GPU pipeline pays three large fixed costs that hurt at small N: + +- Three full `cusolverSpDcsrlsvchol` factorizations per `solve` (one per + stage). +- Per-iteration kernel launch overhead in the ADMM loop (~ 100 small + launches per stage). +- No batching of the dense per-part kernels (one `<<<>>>` per kernel + per part). + +These dominate runtime for `N <= 1,024`. Once `N >= 4,096`, the +per-iteration arithmetic crosses that fixed-cost floor and the GPU +runtime grows much more slowly than the CPU OSQP runtime, opening a +> 500x gap by N = 65,536. 
+ +## Caveats and known differences + +- Numerical convergence of the hand-written ADMM matches OSQP only up to + the configured `eps_abs / eps_rel` tolerances. Tests use relative + comparisons, not bit-exact matching. +- Stage 3 inequality `epsilon` is taken from `osqp.cppm` defaults (`1e-4`). + Override via `Settings`. +- The current direct KKT factorization uses `cusolverSp` Cholesky; this is + acceptable for small/medium problems but does not warm-update the + factorization across rho changes (we refactor instead). +- Double precision throughout. +- WSL2 driver loading: see "WSL2 specifics" above. +- `Eigen::Quaterniond` member init must use `=` instead of `{...}`; the + brace-init form trips a g++ parser bug visible from `nvcc`'s host + compile pass. + +## TODO + +- Replace the direct KKT factorization with PCG + diagonal preconditioner + to remove the dominant overhead (`cusolverSp` factorization is ~80% of + per-step time). +- Replace the hand-written ADMM with cuOSQP (when its CMake integration + matures) for closer numerical agreement with the CPU baseline. +- Batch dense per-part kernels (currently one `<<<>>>` per kernel). +- Optionally lift `build_system` to GPU for end-to-end reassembly speedups. +- Mixed-precision experiments (fp32 ADMM with double-precision residual + checks). diff --git a/native/cuda/breakage_cuda/benchmark.html b/native/cuda/breakage_cuda/benchmark.html new file mode 100644 index 0000000..e5505cd --- /dev/null +++ b/native/cuda/breakage_cuda/benchmark.html @@ -0,0 +1,662 @@ + + + + +breakage_cuda benchmark report + + + + +
+

breakage_cuda: GPU vs CPU pipeline benchmark

+

+ cuda_qp::Pipeline::solve vs + ref_cpu::solve_full (Eigen + OSQP v1.0.0) on a synthetic + linear-chain HostSystem of N parts. Hardware: + NVIDIA RTX 4090, WSL2, CUDA Toolkit 12.0. Per-step wall clock with + cudaDeviceSynchronize() before reading the GPU timer. +

+ +
+
+
~ 7,000
+
Cross-over N (GPU starts winning)
+
+
+
~ 40,000
+
N where GPU >= 100x faster
+
+
+
>= 583x
+
GPU faster @ N = 65,536 (CPU lower bound)
+
+
+
3,225x
+
GPU slower @ N = 4 (small-problem regime)
+
+
+ +
+
Wall-clock time per solve (log scale)
+
+ Each axis is logarithmic. The GPU curve is roughly flat at small N + (dominated by fixed startup), then ramps with N. The CPU curve grows + faster than the GPU and crosses it between N = 4,096 and N = 8,192. +
+ +
+ + CUDA Pipeline::solve + + CPU ref_cpu::solve_full (OSQP) + + CPU lower bound (did not converge) +
+
+ +
+
Speedup ratio (CPU time / GPU time)
+
+ Above the dashed line means the GPU is faster. The horizontal lines + mark 1x, 10x, 100x speedup thresholds. +
+ +
+ + log10(CPU / GPU) + + crossover at 1x +
+
+ +

Raw measurements

+ + + + + + + + +
NnxGPU meanCPU meanspeedup (CPU/GPU)
+

+ Rows highlighted in blue: GPU is faster. Rows annotated with * + indicate the CPU OSQP solve hit max_iter = 2,000,000 + without reaching eps_abs / eps_rel; CPU wall time is a + lower bound and the speedup is therefore also a lower bound. +

+ +

Why the GPU is so slow at small N

+

+ The GPU pipeline pays three large fixed costs that the CPU pipeline + does not: +

+
    +
  • Three cusolverSpDcsrlsvchol Cholesky factorizations + per solve (one per OSQP-style stage).
  • +
  • Per-iteration kernel launches in the ADMM loop (~100 small + launches per stage).
  • +
  • No batching of dense per-part kernels (one + <<<>>> per kernel per part).
  • +
+

+ Together these dominate the runtime when N is small. As N grows, the + per-iteration arithmetic crosses the fixed-startup floor and the GPU + pulls away from the CPU. +

+ +

Cross-over analysis

+ + + + + + + + + + +
pairGPU growthCPU growthspeedup growth
+ +

Reproducing

+
cmake -S native/cuda/breakage_cuda -B ~/breakage_cuda_build -DCMAKE_BUILD_TYPE=Release
+cmake --build ~/breakage_cuda_build -j
+
+cd ~/breakage_cuda_build
+
+# Single point (e.g., the cross-over):
+BENCH_QUICK=1 LD_LIBRARY_PATH=/usr/lib/wsl/lib \
+  ./breakage_cuda_bench 8192
+
+# Large-N point, CPU side as a lower bound:
+BENCH_QUICK=1 BENCH_GPU_ITER=2 BENCH_CPU_ITER=1 \
+  BENCH_TOLERATE_NONCONVERGED=1 OSQP_MAX_ITER=2000000 \
+  LD_LIBRARY_PATH=/usr/lib/wsl/lib \
+  ./breakage_cuda_bench 65536
+ +

+ See BENCHMARK.md for the same data without + any HTML/JS, and README.md for the project + overview. +

+
+ + + + diff --git a/native/cuda/breakage_cuda/include/breakage_cuda/device.hpp b/native/cuda/breakage_cuda/include/breakage_cuda/device.hpp new file mode 100644 index 0000000..449f4db --- /dev/null +++ b/native/cuda/breakage_cuda/include/breakage_cuda/device.hpp @@ -0,0 +1,229 @@ +// SPDX-License-Identifier: MIT +// +// Lightweight RAII helpers and a device-side mirror of HostSystem. Not a +// full Eigen-on-GPU layer; we just keep raw pointers + sizes and let kernels +// operate on them directly. +// +// All matrices are stored in CSR (row-major) on the device because cuSPARSE +// expects CSR. + +#pragma once + +#include "breakage_cuda/system.hpp" + +#include +#include + +#include +#include + +namespace breakage_cuda { + +// --------------------------------------------------------------------------- +// Error helpers. +// --------------------------------------------------------------------------- + +#define BREAKAGE_CUDA_CHECK(call) \ + do { \ + cudaError_t _err = (call); \ + if (_err != cudaSuccess) { \ + throw std::runtime_error(std::string("CUDA error at " __FILE__ ":") + \ + std::to_string(__LINE__) + ": " + \ + cudaGetErrorString(_err)); \ + } \ + } while (0) + +#define BREAKAGE_CUSPARSE_CHECK(call) \ + do { \ + cusparseStatus_t _s = (call); \ + if (_s != CUSPARSE_STATUS_SUCCESS) { \ + throw std::runtime_error(std::string("cuSPARSE error at " __FILE__ ":") +\ + std::to_string(__LINE__) + ": " + \ + cusparseGetErrorString(_s)); \ + } \ + } while (0) + +// --------------------------------------------------------------------------- +// RAII device buffer of arbitrary scalar. 
+// --------------------------------------------------------------------------- + +template class DeviceBuffer { +public: + DeviceBuffer() = default; + + explicit DeviceBuffer(std::size_t n) { resize(n); } + + ~DeviceBuffer() { free(); } + + DeviceBuffer(const DeviceBuffer &) = delete; + DeviceBuffer &operator=(const DeviceBuffer &) = delete; + + DeviceBuffer(DeviceBuffer &&o) noexcept : ptr_(o.ptr_), n_(o.n_) { + o.ptr_ = nullptr; + o.n_ = 0; + } + + DeviceBuffer &operator=(DeviceBuffer &&o) noexcept { + if (this != &o) { + free(); + ptr_ = o.ptr_; + n_ = o.n_; + o.ptr_ = nullptr; + o.n_ = 0; + } + return *this; + } + + void resize(std::size_t n) { + if (n == n_) + return; + free(); + if (n > 0) { + BREAKAGE_CUDA_CHECK(cudaMalloc(&ptr_, n * sizeof(T))); + n_ = n; + } + } + + void free() { + if (ptr_ != nullptr) { + cudaFree(ptr_); + ptr_ = nullptr; + n_ = 0; + } + } + + void copy_from_host(const T *host, std::size_t n) { + if (n != n_) + resize(n); + if (n_ == 0) + return; + BREAKAGE_CUDA_CHECK( + cudaMemcpy(ptr_, host, n_ * sizeof(T), cudaMemcpyHostToDevice)); + } + + void copy_to_host(T *host, std::size_t n) const { + if (n != n_) + throw std::invalid_argument("DeviceBuffer::copy_to_host size mismatch"); + if (n_ == 0) + return; + BREAKAGE_CUDA_CHECK( + cudaMemcpy(host, ptr_, n_ * sizeof(T), cudaMemcpyDeviceToHost)); + } + + void zero() { + if (n_ == 0) + return; + BREAKAGE_CUDA_CHECK(cudaMemset(ptr_, 0, n_ * sizeof(T))); + } + + T *data() { return ptr_; } + const T *data() const { return ptr_; } + std::size_t size() const { return n_; } + bool empty() const { return n_ == 0; } + +private: + T *ptr_{nullptr}; + std::size_t n_{0}; +}; + +// --------------------------------------------------------------------------- +// Device CSR matrix (owns indptr/indices/data buffers). 
+// --------------------------------------------------------------------------- + +struct DeviceCsr { + int rows{0}; + int cols{0}; + int nnz{0}; + DeviceBuffer rowptr; // (rows + 1) + DeviceBuffer colind; // (nnz) + DeviceBuffer values; // (nnz) + + // Lazy cuSPARSE descriptor (created on first use). + cusparseSpMatDescr_t descr{nullptr}; + + ~DeviceCsr() { + if (descr != nullptr) { + cusparseDestroySpMat(descr); + } + } + + DeviceCsr() = default; + DeviceCsr(const DeviceCsr &) = delete; + DeviceCsr &operator=(const DeviceCsr &) = delete; + DeviceCsr(DeviceCsr &&) = default; + DeviceCsr &operator=(DeviceCsr &&) = default; +}; + +// Uploads an Eigen CSC matrix as CSR on the device. Performs the +// CSC->CSR transpose on the host (it is small enough for the matrices we +// have here; KKT factorizations dominate the runtime budget). +void upload_csc_as_csr(const SparseMatrixCSC &src, DeviceCsr &dst); + +// --------------------------------------------------------------------------- +// Device system: GPU-resident view of HostSystem. +// +// All matrices are CSR. Dense per-part data is stored row-major. +// --------------------------------------------------------------------------- + +struct DeviceSystem { + int N{0}; // num_parts + int K{0}; // num_clutches + int ncv{0}; // num_contact_vertices + int nx{0}; // num_vars + int me{0}; // num_eq = 6 * N + int mi{0}; // num_ineq + int mh{0}; // num_relaxed_ineq + int ncap{0}; // capacity_clutch_indices.size() + + double total_mass{0.0}; + double L0{1.0}; + + DeviceBuffer mass; // (N,) + DeviceBuffer q_CC; // (N, 4) row-major xyzw + DeviceBuffer c_CC; // (N, 3) row-major + DeviceBuffer I_CC; // (N, 9) row-major flatten + + DeviceCsr Q; + DeviceCsr A; + DeviceCsr G; + DeviceCsr H; + DeviceCsr V; + + DeviceBuffer capacity_clutch_indices; // (ncap,) + DeviceBuffer capacities; // (ncap, 9) row-major + DeviceBuffer clutch_whiten; // (K, 4) row-major +}; + +// Uploads HostSystem to DeviceSystem. 
+void upload_system(const HostSystem &host, DeviceSystem &dev);
+
+// ---------------------------------------------------------------------------
+// Per-step input mirror.
+// ---------------------------------------------------------------------------
+
+struct DeviceInput {
+  double dt{};
+  DeviceBuffer<double> w; // (N, 3)
+  DeviceBuffer<double> v; // (N, 3)
+  DeviceBuffer<double> q; // (N, 4)
+  DeviceBuffer<double> c; // (N, 3)
+  DeviceBuffer<double> J; // (N, 3)
+  DeviceBuffer<double> H; // (N, 3)
+};
+
+void upload_input(const HostInput &host, DeviceInput &dev);
+
+// ---------------------------------------------------------------------------
+// Per-step state mirror (the part the dense kernels need).
+// ---------------------------------------------------------------------------
+
+struct DeviceState {
+  DeviceBuffer<double> q_W_CC_prev; // (4,) xyzw
+  DeviceBuffer<double> v_W_prev;    // (N, 3)
+  DeviceBuffer<double> L_prev;      // (N, 3)
+};
+
+void upload_state(const HostState &host, DeviceState &dev, int N);
+void download_state(const DeviceState &dev, HostState &host, int N);
+
+} // namespace breakage_cuda
diff --git a/native/cuda/breakage_cuda/include/breakage_cuda/kernels.hpp b/native/cuda/breakage_cuda/include/breakage_cuda/kernels.hpp
new file mode 100644
index 0000000..38fca43
--- /dev/null
+++ b/native/cuda/breakage_cuda/include/breakage_cuda/kernels.hpp
@@ -0,0 +1,126 @@
+// SPDX-License-Identifier: MIT
+//
+// Host-side launchers for the dense per-step CUDA kernels. Each one mirrors
+// a function in breakage.cppm; the bodies live in src/kernels.cu and
+// src/linalg.cu.
+
+#pragma once
+
+#include "breakage_cuda/device.hpp"
+
+namespace breakage_cuda::cuda_kernels {
+
+// ---------------------------------------------------------------------------
+// fit_se3 (breakage.cppm:58-88).
+// +// Inputs: +// q0, t0 -- reference orientations / COMs in CC frame (N x 4 / N x 3) +// qx, tx -- current orientations / COMs (N x 4 / N x 3) +// mass -- per-part mass (N,) +// total_mass, lambda_R +// +// Output (host scalars + 3x3 R + Vector3 t copied back): +// q_out -- world<-CC quaternion, xyzw (4,) +// t_out -- world<-CC translation (3,) +// +// Uses an internal scratch buffer for the partial 3x3 reductions. +// --------------------------------------------------------------------------- + +void fit_se3(int N, const double *q0_d, const double *t0_d, const double *qx_d, + const double *tx_d, const double *mass_d, double total_mass, + double lambda_R, double q_out[4], double t_out[3], + cudaStream_t stream = 0); + +// --------------------------------------------------------------------------- +// fit_twist (breakage.cppm:96-121). +// +// Inputs: +// w, v -- per-part world-frame angular / linear velocities (N x 3) +// c_CC -- COM positions in CC frame (N x 3) +// q_W_CC, t_W_CC -- world<-CC pose (xyzw quat + translation) +// mass -- (N,) +// total_mass, lambda_w +// +// Outputs (host): +// w0_out -- aggregate angular velocity (3,) +// v0_out -- aggregate linear velocity (3,) +// +// Outputs (device): +// v_W_d -- per-part world-frame linear velocity reconstructed from twist +// (N x 3 row-major). Caller pre-allocates. +// --------------------------------------------------------------------------- + +void fit_twist(int N, const double *w_d, const double *v_d, const double *c_CC_d, + const double q_W_CC[4], const double t_W_CC[3], + const double *mass_d, double total_mass, double lambda_w, + double w0_out[3], double v0_out[3], double *v_W_d, + cudaStream_t stream = 0); + +// --------------------------------------------------------------------------- +// compute_Pi (breakage.cppm:166-170). 3x3 output, host-side scalar work. 
+// --------------------------------------------------------------------------- + +void compute_Pi_host(const double qm[4], const double qp[4], double Pi_out[9]); + +// --------------------------------------------------------------------------- +// compute_L (breakage.cppm:173-180). One row per part on the GPU. +// +// Inputs: +// I_CC_d -- (N, 9) row-major +// q_W_CC -- (4,) xyzw +// w0 -- (3,) +// +// Output: +// L_d -- (N, 3) row-major. Caller pre-allocates. +// --------------------------------------------------------------------------- + +void compute_L(int N, const double *I_CC_d, const double q_W_CC[4], + const double w0[3], double *L_d, cudaStream_t stream = 0); + +// --------------------------------------------------------------------------- +// assemble_b (breakage.cppm:1018-1035). One row (6 doubles) per part. +// +// Inputs (device): +// v_W_curr_d, v_W_prev_d, L_curr_d, L_prev_d -- (N, 3) +// J_d, H_d -- (N, 3) +// mass_d -- (N,) +// +// Inputs (host): +// Pi -- 3x3 row-major +// dt, L0 +// +// Output (device): +// b_d -- (6 * N,) row-major-as-(N, 6). Caller pre-allocates. +// --------------------------------------------------------------------------- + +void assemble_b(int N, const double *v_W_curr_d, const double *v_W_prev_d, + const double *L_curr_d, const double *L_prev_d, + const double *J_d, const double *H_d, const double *mass_d, + const double Pi[9], double dt, double L0, double *b_d, + cudaStream_t stream = 0); + +// --------------------------------------------------------------------------- +// Postprocess: undo Wk whitening on the optimization solution and compute +// per-clutch utilization (breakage.cppm:1061-1097). 
+//
+// Inputs (device):
+//   x_d -- (num_vars,) the OSQP optimization solution (whitened)
+//   capacity_clutch_indices_d -- (ncap,) int
+//   capacities_d -- (ncap, 9) row-major
+//   clutch_whiten_d -- (K, 4) row-major flatten of 2x2
+//
+// Inputs (host):
+//   ncv, K
+//
+// Outputs (device):
+//   x_unwhiten_d -- (num_vars,) the un-whitened x; caller pre-allocates.
+//   utilization_d -- (K,) initialized to -1.0 here; caller pre-allocates.
+// ---------------------------------------------------------------------------
+
+void postprocess(int ncv, int K, int ncap, const double *x_d,
+                 const int *capacity_clutch_indices_d,
+                 const double *capacities_d, const double *clutch_whiten_d,
+                 double *x_unwhiten_d, double *utilization_d,
+                 cudaStream_t stream = 0);
+
+} // namespace breakage_cuda::cuda_kernels
diff --git a/native/cuda/breakage_cuda/include/breakage_cuda/linalg.hpp b/native/cuda/breakage_cuda/include/breakage_cuda/linalg.hpp
new file mode 100644
index 0000000..93ffd34
--- /dev/null
+++ b/native/cuda/breakage_cuda/include/breakage_cuda/linalg.hpp
@@ -0,0 +1,124 @@
+// SPDX-License-Identifier: MIT
+//
+// Shared CUDA linear-algebra helpers used by both kernels.cu and
+// qp_solver.cu. The implementations live in src/linalg.cu.
+
+#pragma once
+
+#include "breakage_cuda/device.hpp"
+
+#include <cublas_v2.h>
+#include <cusolverDn.h>
+#include <cusolverSp.h>
+#include <cusparse.h>
+
+namespace breakage_cuda::cuda_la {
+
+// ---------------------------------------------------------------------------
+// cuBLAS / cuSPARSE / cuSOLVER handle pool. One per process.
+// --------------------------------------------------------------------------- + +class Handles { +public: + static Handles &instance(); + + cublasHandle_t blas() const { return blas_; } + cusparseHandle_t sparse() const { return sparse_; } + cusolverDnHandle_t solver_dn() const { return solver_dn_; } + cusolverSpHandle_t solver_sp() const { return solver_sp_; } + +private: + Handles(); + ~Handles(); + Handles(const Handles &) = delete; + Handles &operator=(const Handles &) = delete; + + cublasHandle_t blas_{nullptr}; + cusparseHandle_t sparse_{nullptr}; + cusolverDnHandle_t solver_dn_{nullptr}; + cusolverSpHandle_t solver_sp_{nullptr}; +}; + +// --------------------------------------------------------------------------- +// Reductions (single-block, used for small per-step inner products). +// --------------------------------------------------------------------------- + +// Returns sum(x .* y). Uses cuBLAS dot (handle from Handles). +double dot(int n, const double *x_d, const double *y_d, cudaStream_t stream = 0); + +// Returns ||x||_2. +double nrm2(int n, const double *x_d, cudaStream_t stream = 0); + +// Returns ||x||_inf. +double inf_norm(int n, const double *x_d, cudaStream_t stream = 0); + +// y <- alpha * x + y +void axpy(int n, double alpha, const double *x_d, double *y_d, + cudaStream_t stream = 0); + +// y <- alpha * x +void scal_copy(int n, double alpha, const double *x_d, double *y_d, + cudaStream_t stream = 0); + +// x <- alpha * x +void scal(int n, double alpha, double *x_d, cudaStream_t stream = 0); + +// x <- value (broadcast). +void fill(int n, double value, double *x_d, cudaStream_t stream = 0); + +// dst <- src (n elements) +void copy(int n, const double *src_d, double *dst_d, cudaStream_t stream = 0); + +// --------------------------------------------------------------------------- +// SpMV: y <- alpha * op(A) * x + beta * y, with cuSPARSE generic API. +// +// `transpose` selects op(A): false -> A, true -> A^T. 
+// Caller pre-allocates buffer_d (size buffer_bytes_for_spmv) once. +// --------------------------------------------------------------------------- + +std::size_t spmv_buffer_size(const DeviceCsr &A, bool transpose); + +void spmv(const DeviceCsr &A, bool transpose, double alpha, const double *x_d, + double beta, double *y_d, void *buffer_d, std::size_t buffer_bytes, + cudaStream_t stream = 0); + +// --------------------------------------------------------------------------- +// Element-wise box projection: out[i] = clamp(in[i], l[i], u[i]). +// --------------------------------------------------------------------------- + +void clamp(int n, const double *in_d, const double *l_d, const double *u_d, + double *out_d, cudaStream_t stream = 0); + +// in-place: x[i] = clamp(x[i], l[i], u[i]) +void clamp_inplace(int n, const double *l_d, const double *u_d, double *x_d, + cudaStream_t stream = 0); + +// --------------------------------------------------------------------------- +// Dense 3x3 helpers exposed for tests. +// +// Each one takes a 3x3 row-major matrix as 9 doubles. Implemented on host. +// --------------------------------------------------------------------------- + +// Computes R, t from the standard SVD-based Procrustes formula. Inputs are +// the centered matrices and the K matrix (cf. breakage.cppm fit_se3). +void svd_procrustes_3x3(const double S[9], double R[9]); + +// 3x3 SPD LDLT solve: A * x = b, where A is SPD 3x3. +void spd_solve_3x3(const double A[9], const double b[3], double x[3]); + +// 2x2 SPD inverse-sqrt (for Wk = inv_sqrt_spd(Ck)). Both row-major (4 doubles). +void inv_sqrt_spd_2x2(const double C[4], double W[4]); + +// --------------------------------------------------------------------------- +// Quaternion <-> rotation matrix helpers (xyzw layout, row-major matrix). 
+// ---------------------------------------------------------------------------
+
+void quat_xyzw_to_R(const double q[4], double R[9]);
+void R_to_quat_xyzw(const double R[9], double q[4]);
+void quat_xyzw_normalize(double q[4]);
+// Returns r such that q_a * q_b = r (Hamilton product, xyzw).
+void quat_xyzw_mul(const double a[4], const double b[4], double r[4]);
+// Returns the conjugate (-x, -y, -z, w).
+void quat_xyzw_conjugate(const double q[4], double r[4]);
+
+} // namespace breakage_cuda::cuda_la
diff --git a/native/cuda/breakage_cuda/include/breakage_cuda/qp_solver.hpp b/native/cuda/breakage_cuda/include/breakage_cuda/qp_solver.hpp
new file mode 100644
index 0000000..b6278e7
--- /dev/null
+++ b/native/cuda/breakage_cuda/include/breakage_cuda/qp_solver.hpp
@@ -0,0 +1,131 @@
+// SPDX-License-Identifier: MIT
+//
+// Hand-written CUDA ADMM solver, structurally aligned with the three-stage
+// OSQP wrapper in osqp.cppm (projection -> relaxation -> optimization).
+//
+// We do NOT replicate every OSQP feature (no polish, no infeasibility
+// detection, no scaling/equilibration). The aim is to produce solutions that
+// are close enough to OSQP for the same problem so that the breakage system
+// behaves consistently.
+//
+// One "stage" corresponds to one ADMM run with its own KKT factorization.
+
+#pragma once
+
+#include "breakage_cuda/device.hpp"
+#include "breakage_cuda/system.hpp"
+
+#include <memory>
+
+namespace breakage_cuda::cuda_qp {
+
+// ---------------------------------------------------------------------------
+// Per-stage parameters (mirror OSQP defaults).
+// ---------------------------------------------------------------------------
+
+struct Settings {
+  double eps_abs{1e-3};
+  double eps_rel{1e-3};
+  double rho_init{0.1};
+  double sigma{1e-6};
+  double alpha{1.6};
+  int max_iter{4000};
+  int check_every{25};
+  // Adaptive-rho settings (cf. OSQP's adaptive_rho heuristic).
+ bool adaptive_rho{true}; + double adaptive_rho_factor{5.0}; + // Misc + double infinity{1e30}; +}; + +// --------------------------------------------------------------------------- +// Persistent device-resident state (warm starts, factorizations). +// +// One Stage owns the matrices, KKT factorization, and warm-start buffers for +// a single ADMM run. The Solver owns three of these (prj/rlx/opt). +// +// Stage matrices are constant across solves with the same HostSystem; only +// l/u/q vectors and the warm-start state change between calls. +// --------------------------------------------------------------------------- + +class Stage; // pimpl + +class Solver { +public: + Solver(); + ~Solver(); + + Solver(const Solver &) = delete; + Solver &operator=(const Solver &) = delete; + Solver(Solver &&) noexcept; + Solver &operator=(Solver &&) noexcept; + + // Build the three stages from a (CPU) HostSystem. The DeviceSystem must + // already be uploaded; we only need its CSR matrices for SpMV bookkeeping. + void setup(const HostSystem &host, const DeviceSystem &dev, + const Settings &settings); + + // Run all three stages on RHS b (host vector, length 6 * num_parts). + // + // Outputs: + // x_d -- (num_vars,) device, optimization solution (still whitened) + // slack_d -- (num_eq,) device, b - A * x_prj + // v_relax_d -- (num_clutches,) device, relaxation slack v + SolveInfo solve(const VectorXd &b, double *x_d, double *slack_d, + double *v_relax_d, cudaStream_t stream = 0); + + // Drop the warm-start state; next solve starts cold. + void reset_warm_start(); + + // Total dims (forwarded from HostSystem). 
+  int num_vars() const;
+  int num_eq() const;
+  int num_clutches() const;
+
+private:
+  std::unique_ptr<Stage> prj_;
+  std::unique_ptr<Stage> rlx_;
+  std::unique_ptr<Stage> opt_;
+  Settings settings_{};
+  int nx_{0};
+  int me_{0};
+  int mi_{0};
+  int mh_{0};
+  int nv_{0};
+};
+
+// ---------------------------------------------------------------------------
+// End-to-end driver mirroring breakage.cppm BreakageChecker::solve, but
+// running everything (dense kernels + QP + postprocess) on the GPU.
+//
+// This is the entry point benchmarks and tests call.
+// ---------------------------------------------------------------------------
+
+class Pipeline {
+public:
+  Pipeline();
+  ~Pipeline();
+
+  void setup(const HostSystem &host, const Thresholds &thr,
+             const Settings &settings = {});
+
+  Solution solve(const HostInput &in, HostState &state,
+                 cudaStream_t stream = 0);
+
+private:
+  HostSystem host_;
+  Thresholds thr_;
+  DeviceSystem dev_;
+  Solver qp_;
+  // Scratch buffers
+  DeviceBuffer<double> v_W_curr_;
+  DeviceBuffer<double> L_curr_;
+  DeviceBuffer<double> b_d_;
+  DeviceBuffer<double> x_d_;
+  DeviceBuffer<double> x_unwhiten_d_;
+  DeviceBuffer<double> slack_d_;
+  DeviceBuffer<double> v_relax_d_;
+  DeviceBuffer<double> utilization_d_;
+};
+
+} // namespace breakage_cuda::cuda_qp
diff --git a/native/cuda/breakage_cuda/include/breakage_cuda/ref_cpu.hpp b/native/cuda/breakage_cuda/include/breakage_cuda/ref_cpu.hpp
new file mode 100644
index 0000000..6d33bf2
--- /dev/null
+++ b/native/cuda/breakage_cuda/include/breakage_cuda/ref_cpu.hpp
@@ -0,0 +1,107 @@
+// SPDX-License-Identifier: MIT
+//
+// CPU reference implementation. Mirrors the per-step code paths of
+// bricksim::BreakageChecker::solve in breakage.cppm:
+//
+// 1. fit_se3 / fit_twist (per-part dense ops)
+// 2. compute_Pi, compute_L
+// 3. RHS b assembly
+// 4. Three-stage OSQP solve (projection / relaxation / optimization)
+// 5. Whitening reversal + utilization
+//
+// Used both as a baseline for the CUDA implementation and to score the random
+// QP and full-pipeline tests.
+
+#pragma once
+
+#include "breakage_cuda/system.hpp"
+
+#include <Eigen/Geometry>
+
+#include <memory>
+
+namespace breakage_cuda::ref_cpu {
+
+// ---------------------------------------------------------------------------
+// Compile-time switches mirroring breakage.cppm. They are hard-coded to true
+// because BreakageChecker uses them as such; we expose them here so that
+// tests can opt out if needed.
+// ---------------------------------------------------------------------------
+
+constexpr bool kEnableTorqueScaling = true;
+constexpr bool kEnableClutchWhitening = true;
+constexpr bool kEnableClutchTangentialFriction = true;
+constexpr bool kEnableRealisticClutchFriction = true;
+
+// ---------------------------------------------------------------------------
+// Pure dense kernels (no QP). Each one matches a function in breakage.cppm.
+// ---------------------------------------------------------------------------
+
+struct Transformd {
+  Eigen::Quaterniond q;
+  Vector3d t;
+};
+
+Transformd fit_se3(const MatrixX4d &q0, const MatrixX3d &t0,
+                   const MatrixX4d &qx, const MatrixX3d &tx,
+                   const VectorXd &mass, double total_mass, double lambda_R);
+
+struct TwistFitResult {
+  Vector3d w0;
+  Vector3d v0;
+  MatrixX3d v_W; // (num_parts, 3)
+};
+
+TwistFitResult fit_twist(const MatrixX3d &w, const MatrixX3d &v,
+                         const MatrixX3d &c_CC, const Transformd &T_W_CC,
+                         const VectorXd &mass, double total_mass,
+                         double lambda_w);
+
+Vector3d so3_log_from_unit_quat(Eigen::Quaterniond q);
+
+Matrix3d so3_jacobian_inv(const Vector3d &phi);
+
+Matrix3d compute_Pi(const Eigen::Quaterniond &qm,
+                    const Eigen::Quaterniond &qp);
+
+// I_flat: row-major (N, 9), each row is row-major flatten of 3x3.
+MatrixX3d compute_L(
+    const Eigen::Matrix<double, Dynamic, 9, RowMajor> &I_flat,
+    const Eigen::Quaterniond &q_W_CC, const Vector3d &w0);
+
+// Builds the RHS b vector (size 6 * num_parts) per breakage.cppm:1018-1035.
+// +// `Pi` is from compute_Pi; `v_W_curr`, `v_W_prev`, `L_curr`, `L_prev` are +// (num_parts, 3); J/H are external impulses (num_parts, 3). +// +// Output: b is laid out as 6 floats per part (linear x 3, angular x 3) in +// row-major order so it matches breakage.cppm's `b_mat` Map. +VectorXd assemble_b(const HostSystem &sys, const HostInput &in, + const MatrixX3d &v_W_curr, const MatrixX3d &v_W_prev, + const MatrixX3d &L_curr, const MatrixX3d &L_prev, + const Matrix3d &Pi); + +// --------------------------------------------------------------------------- +// Three-stage QP solver mirroring osqp.cppm. +// +// On first call (state.qp == nullptr or InternalQpState empty), the solver +// is built from sys; subsequent calls warm-start from the previous solution. +// --------------------------------------------------------------------------- + +SolveInfo solve_qp(const HostSystem &sys, const VectorXd &b, HostState &state, + VectorXd &x_out, VectorXd &slack_out, VectorXd &v_relax_out); + +// --------------------------------------------------------------------------- +// End-to-end driver. Runs steps (1)-(5) above and returns the same Solution +// you would get from BreakageChecker::solve in CPU-only mode. +// --------------------------------------------------------------------------- + +Solution solve_full(const HostSystem &sys, const HostInput &in, + HostState &state, const Thresholds &thr); + +// Convenience: computes b and returns it (used by tests). 
+VectorXd compute_b(const HostSystem &sys, const HostInput &in, HostState &state,
+                   Eigen::Quaterniond *q_W_CC_out = nullptr,
+                   MatrixX3d *v_W_out = nullptr, MatrixX3d *L_out = nullptr);
+
+} // namespace breakage_cuda::ref_cpu
diff --git a/native/cuda/breakage_cuda/include/breakage_cuda/system.hpp b/native/cuda/breakage_cuda/include/breakage_cuda/system.hpp
new file mode 100644
index 0000000..6f78146
--- /dev/null
+++ b/native/cuda/breakage_cuda/include/breakage_cuda/system.hpp
@@ -0,0 +1,208 @@
+// SPDX-License-Identifier: MIT
+//
+// Plain-C++ mirror of the bricksim::Breakage* types, sufficient to drive the
+// per-step hot path on either CPU (Eigen + OSQP) or CUDA. The JSON layout is
+// the one produced by bricksim::to_json(BreakageDebugDump) in
+// modules/bricksim/physx/breakage.cppm.
+//
+// We intentionally do NOT depend on any bricksim C++26 module: the goal of
+// this subproject is to be a self-contained CUDA prototype.
+
+#pragma once
+
+#include <Eigen/Dense>
+#include <Eigen/Sparse>
+
+#include <nlohmann/json.hpp>
+
+#include <memory>
+#include <optional>
+#include <string>
+#include <vector>
+
+namespace breakage_cuda {
+
+// ---------------------------------------------------------------------------
+// Aliases (match the layout used by breakage.cppm).
+// ---------------------------------------------------------------------------
+
+using Eigen::Dynamic;
+using Eigen::RowMajor;
+
+using Vector3d = Eigen::Vector3d;
+using Vector4d = Eigen::Vector4d;
+using VectorXd = Eigen::VectorXd;
+
+using Matrix2d = Eigen::Matrix2d;
+using Matrix3d = Eigen::Matrix3d;
+
+using MatrixX3d = Eigen::Matrix<double, Dynamic, 3, RowMajor>;
+using MatrixX4d = Eigen::Matrix<double, Dynamic, 4, RowMajor>;
+
+using SparseMatrixCSC = Eigen::SparseMatrix<double>;
+using SparseMatrixCSR = Eigen::SparseMatrix<double, RowMajor>;
+
+// ---------------------------------------------------------------------------
+// Thresholds (mirror BreakageThresholds).
+// ---------------------------------------------------------------------------
+
+struct Thresholds {
+  bool enabled{true};
+  double contact_regularization{0.1};
+  double clutch_axial_compliance{1.0};
+  double clutch_radial_compliance{1.0};
+  double clutch_tangential_compliance{1.0};
+  double friction_coefficient{0.2};
+  double preloaded_force{3.5};
+  double slack_fraction_warn{0.1};
+  double slack_fraction_b_floor{1e-9};
+  bool debug_dump{false};
+  double breakage_cooldown_time{0.05};
+};
+
+// ---------------------------------------------------------------------------
+// Host-side, non-CUDA representation of a problem.
+//
+// All matrices keep the original (column-major / CSC) layout from breakage
+// so that we can hand them to OSQP unchanged. The CUDA layer converts to its
+// own representation in src/system.cpp.
+// ---------------------------------------------------------------------------
+
+struct HostSystem {
+  // Shapes (mirror BreakageSystem::check_shape).
+  int num_parts{};
+  int num_clutches{};
+  int num_contact_vertices{};
+  int num_vars{};
+  int num_eq{};
+  int num_ineq{};
+  int num_relaxed_ineq{};
+
+  double total_mass{};
+  double L0{};
+
+  // Per-part dense data.
+  VectorXd mass;  // (num_parts,)
+  MatrixX4d q_CC; // (num_parts, 4) xyzw
+  MatrixX3d c_CC; // (num_parts, 3) m
+  // I_CC stored as (num_parts, 9) row-major flattening of 3x3 inertia tensors.
+  Eigen::Matrix<double, Dynamic, 9, RowMajor> I_CC;
+
+  // Sparse matrices (CSC, owned, compressed).
+  SparseMatrixCSC Q; // (num_vars, num_vars)
+  SparseMatrixCSC A; // (6*num_parts, num_vars)
+  SparseMatrixCSC G; // (num_ineq, num_vars)
+  SparseMatrixCSC H; // (num_relaxed_ineq, num_vars)
+  SparseMatrixCSC V; // (num_relaxed_ineq, num_clutches)
+
+  // Per-clutch friction-cone metadata. Indices into clutches_; same length as
+  // capacities.
+  std::vector<int> capacity_clutch_indices;
+  // (capacity_clutch_indices.size(), 9) row-major flattening of Vector9d.
+  Eigen::Matrix<double, Dynamic, 9, RowMajor> capacities;
+  // (num_clutches, 4) row-major flattening of 2x2 whitening matrices.
+  Eigen::Matrix<double, Dynamic, 4, RowMajor> clutch_whiten;
+};
+
+// ---------------------------------------------------------------------------
+// Per-frame input (mirror BreakageInput / BreakageInitialInput).
+// ---------------------------------------------------------------------------
+
+struct HostInput {
+  // Initial-input fields.
+  MatrixX3d w; // angular velocity (num_parts, 3)
+  MatrixX3d v; // linear velocity (num_parts, 3)
+  MatrixX4d q; // orientation xyzw (num_parts, 4)
+  MatrixX3d c; // COM position (num_parts, 3)
+
+  // Step-only fields.
+  double dt{};
+  MatrixX3d J; // external linear impulse (num_parts, 3)
+  MatrixX3d H; // external angular imp. (num_parts, 3)
+};
+
+// ---------------------------------------------------------------------------
+// Per-frame solver state across steps (mirror BreakageState + OsqpState).
+//
+// The QP warm-start payload is opaque to the system layer; both the CPU and
+// CUDA solvers cache their own state behind the InternalQpState pointer.
+// ---------------------------------------------------------------------------
+
+struct InternalQpState; // forward declaration; defined by each solver backend.
+using InternalQpStatePtr = std::shared_ptr<InternalQpState>;
+
+struct HostState {
+  Eigen::Quaterniond q_W_CC_prev = Eigen::Quaterniond::Identity();
+  MatrixX3d v_W_prev; // (num_parts, 3)
+  MatrixX3d L_prev;   // (num_parts, 3)
+
+  // CPU baseline keeps three OSQP solvers + their warm-start vectors here.
+  // CUDA backend keeps device buffers here. Either may be empty.
+  InternalQpStatePtr qp;
+};
+
+// ---------------------------------------------------------------------------
+// Per-stage info from a single solve.
+// ---------------------------------------------------------------------------
+
+struct StageInfo {
+  bool converged{false};
+  int iter{0};
+  double prim_res{std::numeric_limits<double>::quiet_NaN()};
+  double dual_res{std::numeric_limits<double>::quiet_NaN()};
+  double rho{0.0};
+  double solve_time_s{0.0};
+};
+
+struct SolveInfo {
+  bool converged{false};
+  StageInfo prj;
+  StageInfo rlx;
+  StageInfo opt;
+};
+
+// ---------------------------------------------------------------------------
+// Result of a single solve.
+// ---------------------------------------------------------------------------
+
+struct Solution {
+  VectorXd x;           // (num_vars,) un-whitened (matches breakage.cppm output)
+  VectorXd utilization; // (num_clutches,) per-clutch friction utilization
+  double slack_fraction{0.0};
+  SolveInfo info;
+};
+
+// ---------------------------------------------------------------------------
+// JSON loaders.
+//
+// Layout: identical to BreakageDebugDump JSON (see breakage.cppm to_json).
+// ---------------------------------------------------------------------------
+
+void from_json(const nlohmann::json &j, Thresholds &out);
+void from_json(const nlohmann::json &j, HostSystem &out);
+void from_json(const nlohmann::json &j, HostInput &out);
+void from_json(const nlohmann::json &j, HostState &out);
+
+struct DebugDump {
+  Thresholds thresholds;
+  HostSystem system;
+  HostInput input;
+  HostState state;
+  Solution solution;
+  VectorXd b;
+  std::optional<HostState> prev_state;
+};
+
+void from_json(const nlohmann::json &j, DebugDump &out);
+
+DebugDump load_debug_dump(const std::string &path);
+
+// ---------------------------------------------------------------------------
+// Cross-validation helpers (used in tests).
+// ---------------------------------------------------------------------------
+
+// Compares two solutions' x within a relative tolerance (||a-b||_inf / max(||b||_inf, floor)).
+double relative_inf_error(const VectorXd &a, const VectorXd &b,
+                          double floor = 1e-12);
+
+} // namespace breakage_cuda
diff --git a/native/cuda/breakage_cuda/src/kernels.cu b/native/cuda/breakage_cuda/src/kernels.cu
new file mode 100644
index 0000000..e1fed70
--- /dev/null
+++ b/native/cuda/breakage_cuda/src/kernels.cu
@@ -0,0 +1,817 @@
+// SPDX-License-Identifier: MIT
+//
+// Implementation of breakage_cuda/kernels.hpp:
+//
+// * fit_se3 -- mass-weighted SE3 Procrustes (GPU reductions + host SVD)
+// * fit_twist -- mass-weighted twist fit (GPU reductions + host LDLT)
+// * compute_Pi -- pure-host 3x3 (small, scalar work)
+// * compute_L -- per-part inertia rotation (one thread per part)
+// * assemble_b -- per-part RHS row assembly (one thread per part)
+// * postprocess -- un-whiten optimization x and compute clutch utilization
+//
+// Strategy notes:
+// - Per-part data ((N,K) row-major) is small enough that we keep the kernels
+//   simple: one thread per part with a block-stride loop, plus a single
+//   shared-memory sum reduction inside each block.
+// - Reductions land in tiny global accumulators (<= 18 doubles) via per-block
+//   atomicAdd; we then memcpy them back to host for the small dense post-
+//   processing (SVD, LDLT) where Eigen on the host is plenty fast.
+// - All numerics in double; formulas mirror breakage.cppm verbatim.
+
+#include "breakage_cuda/kernels.hpp"
+#include "breakage_cuda/device.hpp"
+#include "breakage_cuda/linalg.hpp"
+
+#include <cuda_runtime.h>
+
+#include <algorithm>
+#include <cmath>
+#include <cstring>
+
+namespace breakage_cuda::cuda_kernels {
+
+namespace {
+
+// ---------------------------------------------------------------------------
+// Plain-old-data wrappers used as kernel by-value parameters.
+// Mat3 is row-major.
+// ---------------------------------------------------------------------------
+
+struct Mat3 {
+  double m[9];
+};
+struct Vec3 {
+  double v[3];
+};
+
+// All reduction kernels assume blockDim.x == kBlock.
The host launchers +// always launch with this exact block size. +constexpr int kBlock = 256; + +// Cap the launch grid: reductions all use a block-stride loop so any value +// works, but a smaller grid lowers atomic-add contention on the tiny +// accumulators. +inline int grid_for(int N) { + int g = (N + kBlock - 1) / kBlock; + return std::min(g, 256); +} + +// --------------------------------------------------------------------------- +// Device math helpers. +// --------------------------------------------------------------------------- + +__device__ inline void d_quat_xyzw_to_R(double x, double y, double z, double w, + double R[9]) { + // Standard unit-quaternion to rotation-matrix conversion. Matches + // Eigen::Quaternion::toRotationMatrix exactly when the input is unit-length. + double xx = x * x, yy = y * y, zz = z * z; + double xy = x * y, xz = x * z, yz = y * z; + double wx = w * x, wy = w * y, wz = w * z; + R[0] = 1.0 - 2.0 * (yy + zz); + R[1] = 2.0 * (xy - wz); + R[2] = 2.0 * (xz + wy); + R[3] = 2.0 * (xy + wz); + R[4] = 1.0 - 2.0 * (xx + zz); + R[5] = 2.0 * (yz - wx); + R[6] = 2.0 * (xz - wy); + R[7] = 2.0 * (yz + wx); + R[8] = 1.0 - 2.0 * (xx + yy); +} + +// Block sum-reduce. Caller provides a `kBlock`-sized shared buffer. Returns +// the same value on every thread (sm[0] after the final sync). +__device__ inline double block_reduce_sum(double val, double *sm) { + int tid = threadIdx.x; + sm[tid] = val; + __syncthreads(); + for (int s = kBlock / 2; s > 0; s >>= 1) { + if (tid < s) { + sm[tid] += sm[tid + s]; + } + __syncthreads(); + } + return sm[0]; +} + +// Reduce VARS thread-local doubles to per-block sums and atomicAdd them into +// `out` in one go. 
+template <int VARS>
+__device__ inline void block_reduce_atomic_add(double *local, double *sm,
+                                               double *out) {
+  int tid = threadIdx.x;
+  for (int k = 0; k < VARS; ++k) {
+    double r = block_reduce_sum(local[k], sm);
+    if (tid == 0) {
+      atomicAdd(&out[k], r);
+    }
+    __syncthreads();
+  }
+}
+
+// Atomic max for fp64 via CAS on the bit representation.
+//
+// CUDA does not provide an atomicMax(double*, double); we emulate it with a
+// CAS retry loop on the bits. NaN inputs are silently ignored (matches the
+// CPU baseline, which never produces them on this hot path).
+__device__ inline void atomic_max_double(double *addr, double val) {
+  auto *p = reinterpret_cast<unsigned long long *>(addr);
+  unsigned long long old = *p;
+  unsigned long long assumed;
+  do {
+    assumed = old;
+    double cur = __longlong_as_double(static_cast<long long>(assumed));
+    // `!(val > cur)` is true for NaN val too, so we do not write NaNs.
+    if (!(val > cur)) {
+      return;
+    }
+    unsigned long long candidate =
+        static_cast<unsigned long long>(__double_as_longlong(val));
+    old = atomicCAS(p, assumed, candidate);
+  } while (assumed != old);
+}
+
+// ---------------------------------------------------------------------------
+// fit_se3 reduction kernels.
+// ---------------------------------------------------------------------------
+
+// Pass 1: accumulate the centroid sums and the K matrix.
+// +// Per-part contributions packed into 15 doubles: +// local[ 0..2] : m * t0 +// local[ 3..5] : m * tx +// local[ 6..14] : m * R(q0 * conj(qx)) (row-major flatten) +__global__ void se3_pass1_kernel(int N, const double *q0, const double *t0, + const double *qx, const double *tx, + const double *mass, double *out_15) { + __shared__ double sm[kBlock]; + double local[15]; + for (int k = 0; k < 15; ++k) { + local[k] = 0.0; + } + + for (int i = blockIdx.x * kBlock + threadIdx.x; i < N; + i += kBlock * gridDim.x) { + double m = mass[i]; + double t0x = t0[3 * i + 0]; + double t0y = t0[3 * i + 1]; + double t0z = t0[3 * i + 2]; + double txx = tx[3 * i + 0]; + double txy = tx[3 * i + 1]; + double txz = tx[3 * i + 2]; + + double q0x = q0[4 * i + 0], q0y = q0[4 * i + 1]; + double q0z = q0[4 * i + 2], q0w = q0[4 * i + 3]; + double qxx = qx[4 * i + 0], qxy = qx[4 * i + 1]; + double qxz = qx[4 * i + 2], qxw = qx[4 * i + 3]; + + // q0 * conj(qx); conj(qx) = (-qxx, -qxy, -qxz, qxw) + double bx = -qxx, by = -qxy, bz = -qxz, bw = qxw; + double rx = q0w * bx + q0x * bw + q0y * bz - q0z * by; + double ry = q0w * by - q0x * bz + q0y * bw + q0z * bx; + double rz = q0w * bz + q0x * by - q0y * bx + q0z * bw; + double rw = q0w * bw - q0x * bx - q0y * by - q0z * bz; + + double R[9]; + d_quat_xyzw_to_R(rx, ry, rz, rw, R); + + local[0] += m * t0x; + local[1] += m * t0y; + local[2] += m * t0z; + local[3] += m * txx; + local[4] += m * txy; + local[5] += m * txz; + for (int k = 0; k < 9; ++k) { + local[6 + k] += m * R[k]; + } + } + + block_reduce_atomic_add<15>(local, sm, out_15); +} + +// Pass 2: accumulate the H = sum_i m_i (t0_i - t0_bar)(tx_i - tx_bar)^T outer +// product after the centroids are known (host-side). 
// Grid-stride loop; t0_bar / tx_bar are the mass centroids computed on the
// host from the pass-1 sums and passed by value.
__global__ void se3_pass2_kernel(int N, const double *t0, const double *tx,
                                 const double *mass, Vec3 t0_bar, Vec3 tx_bar,
                                 double *out_9) {
  __shared__ double sm[kBlock];
  double local[9];
  for (int k = 0; k < 9; ++k) {
    local[k] = 0.0;
  }

  for (int i = blockIdx.x * kBlock + threadIdx.x; i < N;
       i += kBlock * gridDim.x) {
    double m = mass[i];
    double dt0x = t0[3 * i + 0] - t0_bar.v[0];
    double dt0y = t0[3 * i + 1] - t0_bar.v[1];
    double dt0z = t0[3 * i + 2] - t0_bar.v[2];
    double dtxx = tx[3 * i + 0] - tx_bar.v[0];
    double dtxy = tx[3 * i + 1] - tx_bar.v[1];
    double dtxz = tx[3 * i + 2] - tx_bar.v[2];

    // Row-major outer product m * dt0 * dtx^T.
    local[0] += m * dt0x * dtxx;
    local[1] += m * dt0x * dtxy;
    local[2] += m * dt0x * dtxz;
    local[3] += m * dt0y * dtxx;
    local[4] += m * dt0y * dtxy;
    local[5] += m * dt0y * dtxz;
    local[6] += m * dt0z * dtxx;
    local[7] += m * dt0z * dtxy;
    local[8] += m * dt0z * dtxz;
  }

  block_reduce_atomic_add<9>(local, sm, out_9);
}

// ---------------------------------------------------------------------------
// fit_twist reduction kernels.
// ---------------------------------------------------------------------------

// Pass 1: c_W = R_w * c_CC + t_w; accumulate sum(m * c_W) for the centroid r.
// Grid-stride loop; also materializes c_W (world-frame centers) to scratch
// for pass 2.
__global__ void twist_pass1_kernel(int N, const double *c_CC,
                                   const double *mass, Mat3 R_w, Vec3 t_w,
                                   double *c_W_out, double *sum_mc_W_3) {
  __shared__ double sm[kBlock];
  double local[3] = {0.0, 0.0, 0.0};

  for (int i = blockIdx.x * kBlock + threadIdx.x; i < N;
       i += kBlock * gridDim.x) {
    double m = mass[i];
    double cx = c_CC[3 * i + 0];
    double cy = c_CC[3 * i + 1];
    double cz = c_CC[3 * i + 2];
    // Row-major 3x3 times vector, plus translation: c_W = R_w * c_CC + t_w.
    double cw0 = R_w.m[0] * cx + R_w.m[1] * cy + R_w.m[2] * cz + t_w.v[0];
    double cw1 = R_w.m[3] * cx + R_w.m[4] * cy + R_w.m[5] * cz + t_w.v[1];
    double cw2 = R_w.m[6] * cx + R_w.m[7] * cy + R_w.m[8] * cz + t_w.v[2];
    c_W_out[3 * i + 0] = cw0;
    c_W_out[3 * i + 1] = cw1;
    c_W_out[3 * i + 2] = cw2;
    local[0] += m * cw0;
    local[1] += m * cw1;
    local[2] += m * cw2;
  }

  block_reduce_atomic_add<3>(local, sm, sum_mc_W_3);
}

// Pass 2: d = c_W - r; emit d to scratch and accumulate the 18 doubles needed
// to assemble (LHS, RHS, v0) on host.
//
// acc[ 0.. 8] : S = sum d[i] * (m * d[i])^T (row-major 3x3)
// acc[ 9..11] : L (cross-term L of breakage.cppm fit_twist)
// acc[12..14] : sum(m * w) -> regularization base
// acc[15..17] : sum(m * v) -> v0
__global__ void twist_pass2_kernel(int N, const double *c_W,
                                   const double *mass, const double *v,
                                   const double *w, Vec3 r, double *d_out,
                                   double *acc_18) {
  __shared__ double sm[kBlock];
  double local[18];
  for (int k = 0; k < 18; ++k) {
    local[k] = 0.0;
  }

  for (int i = blockIdx.x * kBlock + threadIdx.x; i < N;
       i += kBlock * gridDim.x) {
    double m = mass[i];
    double dx = c_W[3 * i + 0] - r.v[0];
    double dy = c_W[3 * i + 1] - r.v[1];
    double dz = c_W[3 * i + 2] - r.v[2];
    d_out[3 * i + 0] = dx;
    d_out[3 * i + 1] = dy;
    d_out[3 * i + 2] = dz;

    // d_w = m * d; v/w are the per-part linear/angular velocity rows.
    double dwx = m * dx, dwy = m * dy, dwz = m * dz;
    double vx = v[3 * i + 0], vy = v[3 * i + 1], vz = v[3 * i + 2];
    double wx = w[3 * i + 0], wy = w[3 * i + 1], wz = w[3 * i + 2];

    // S(r,c) = sum d[i,r] * d_w[i,c], row-major.
    local[0] += dx * dwx;
    local[1] += dx * dwy;
    local[2] += dx * dwz;
    local[3] += dy * dwx;
    local[4] += dy * dwy;
    local[5] += dy * dwz;
    local[6] += dz * dwx;
    local[7] += dz * dwy;
    local[8] += dz * dwz;

    // L (cross-term, mirrors breakage.cppm: dot(d_w.col(j), v.col(k)) ...)
    // componentwise this is sum_i (d_w[i] x v[i]).
    local[9] += dwy * vz - dwz * vy;
    local[10] += dwz * vx - dwx * vz;
    local[11] += dwx * vy - dwy * vx;

    // sum(m*w), sum(m*v)
    local[12] += m * wx;
    local[13] += m * wy;
    local[14] += m * wz;
    local[15] += m * vx;
    local[16] += m * vy;
    local[17] += m * vz;
  }

  block_reduce_atomic_add<18>(local, sm, acc_18);
}

// Pass 3: v_W[i] = w0 cross d[i] + v0.
//
// Original CPU code: `d.rowwise().cross(-w0) + v0.transpose()` --- since
// d × (-w0) = -(d × w0) = w0 × d, we reorder the cross to remove the negation.
__global__ void twist_pass3_kernel(int N, const double *d, Vec3 w0, Vec3 v0,
                                   double *v_W) {
  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < N;
       i += gridDim.x * blockDim.x) {
    double dx = d[3 * i + 0];
    double dy = d[3 * i + 1];
    double dz = d[3 * i + 2];
    v_W[3 * i + 0] = w0.v[1] * dz - w0.v[2] * dy + v0.v[0];
    v_W[3 * i + 1] = w0.v[2] * dx - w0.v[0] * dz + v0.v[1];
    v_W[3 * i + 2] = w0.v[0] * dy - w0.v[1] * dx + v0.v[2];
  }
}

// ---------------------------------------------------------------------------
// compute_L kernel.
//
// L[i] = R_w * (I_CC[i] * w_CC), where I_CC[i] is symmetric. We pass R_w and
// w_CC by value (precomputed on host). One thread per part.
// ---------------------------------------------------------------------------

__global__ void compute_L_kernel(int N, const double *I_CC, Mat3 R_w,
                                 Vec3 w_CC, double *L_d) {
  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < N;
       i += gridDim.x * blockDim.x) {
    // tmp = (sum_r w_CC[r] * I_CC_row(r)) -- row vector with components
    // tmp[k] = sum_r w_CC[r] * I_CC[i, 3*r + k].
    // I_CC[i] is symmetric so this equals (I_CC[i] * w_CC).
    double tmp[3];
    for (int k = 0; k < 3; ++k) {
      tmp[k] = w_CC.v[0] * I_CC[9 * i + 0 + k] +
               w_CC.v[1] * I_CC[9 * i + 3 + k] +
               w_CC.v[2] * I_CC[9 * i + 6 + k];
    }
    // L[i, j] = (tmp_row * R_w^T)[j] = sum_k R_w[j, k] * tmp[k].
    for (int j = 0; j < 3; ++j) {
      double s = 0.0;
      for (int k = 0; k < 3; ++k) {
        s += R_w.m[3 * j + k] * tmp[k];
      }
      L_d[3 * i + j] = s;
    }
  }
}

// ---------------------------------------------------------------------------
// assemble_b kernel.
//
// b_mat (N x 6) layout per part i:
//   linear  [j] = (m*(v_curr - v_prev) - J)[i] * Pi^T / dt
//   angular [j] = (L_curr - L_prev - H)[i] * Pi^T / (dt * L0) (torque-scaled)
// ---------------------------------------------------------------------------

__global__ void assemble_b_kernel(int N, const double *v_W_curr,
                                  const double *v_W_prev,
                                  const double *L_curr, const double *L_prev,
                                  const double *J, const double *H,
                                  const double *mass, Mat3 Pi, double inv_dt,
                                  double inv_dt_L0, double *b) {
  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < N;
       i += gridDim.x * blockDim.x) {
    double m = mass[i];
    double pre_lin[3];
    double pre_ang[3];
    for (int k = 0; k < 3; ++k) {
      pre_lin[k] =
          m * (v_W_curr[3 * i + k] - v_W_prev[3 * i + k]) - J[3 * i + k];
      pre_ang[k] = L_curr[3 * i + k] - L_prev[3 * i + k] - H[3 * i + k];
    }
    // out_row[j] = sum_k pre[k] * Pi^T[k, j] = sum_k Pi[j, k] * pre[k].
    for (int j = 0; j < 3; ++j) {
      double sl = 0.0;
      double sa = 0.0;
      for (int k = 0; k < 3; ++k) {
        double pijk = Pi.m[3 * j + k];
        sl += pijk * pre_lin[k];
        sa += pijk * pre_ang[k];
      }
      b[6 * i + j] = sl * inv_dt;
      b[6 * i + 3 + j] = sa * inv_dt_L0;
    }
  }
}

// ---------------------------------------------------------------------------
// postprocess kernels: un-whitening and per-clutch utilization.
// ---------------------------------------------------------------------------

// One thread per clutch. Three 2x1 multiplications by Wk per clutch:
//   x_unwhiten[j0+1:j0+3] = Wk * x[j0+1:j0+3]
//   x_unwhiten[j0+4:j0+6] = Wk * x[j0+4:j0+6]
//   x_unwhiten[j0+7:j0+9] = Wk * x[j0+7:j0+9]
// All other positions are direct copies done by the preceding cudaMemcpy.
__global__ void unwhiten_kernel(int ncv, int K, const double *clutch_whiten,
                                const double *x, double *x_unwhiten) {
  int k = blockIdx.x * blockDim.x + threadIdx.x;
  if (k >= K) {
    return;
  }
  int j0 = ncv + 9 * k;
  // Wk is the per-clutch 2x2 whitening matrix, stored row-major.
  double w00 = clutch_whiten[4 * k + 0];
  double w01 = clutch_whiten[4 * k + 1];
  double w10 = clutch_whiten[4 * k + 2];
  double w11 = clutch_whiten[4 * k + 3];

  const int offsets[3] = {1, 4, 7};
  for (int t = 0; t < 3; ++t) {
    int p = j0 + offsets[t];
    double v0 = x[p];
    double v1 = x[p + 1];
    x_unwhiten[p] = w00 * v0 + w01 * v1;
    x_unwhiten[p + 1] = w10 * v0 + w11 * v1;
  }
}

// One thread per capacity row; atomic-max into the per-clutch utilization.
__global__ void utilization_kernel(int ncv, int ncap,
                                   const int *clutch_indices,
                                   const double *capacities,
                                   const double *x_unwhiten,
                                   double *utilization) {
  int idx = blockIdx.x * blockDim.x + threadIdx.x;
  if (idx >= ncap) {
    return;
  }
  int clutch_index = clutch_indices[idx];
  // c[0..2] weight the force part, c[3..8] the remaining 6 slack variables
  // of this clutch's 9-variable block.
  double c[9];
  for (int k = 0; k < 9; ++k) {
    c[k] = capacities[9 * idx + k];
  }
  int xoff = ncv + 9 * clutch_index;
  double frc_used = c[0] * x_unwhiten[xoff + 0] +
                    c[1] * x_unwhiten[xoff + 1] +
                    c[2] * x_unwhiten[xoff + 2];
  double cap_dot = 0.0;
  for (int k = 0; k < 6; ++k) {
    cap_dot += c[3 + k] * x_unwhiten[xoff + 3 + k];
  }
  double frc_cap = 1.0 - cap_dot;
  double u_fp;
  if (frc_cap > 0.0) {
    u_fp = frc_used / frc_cap;
    if (u_fp < 0.0) {
      u_fp = 0.0;
    }
  } else {
    // Exhausted capacity: report a huge utilization so the clutch breaks.
    u_fp = 1e6;
  }
  atomic_max_double(&utilization[clutch_index], u_fp);
}

// ---------------------------------------------------------------------------
// Host-only SO(3) helpers used by compute_Pi_host (line-for-line copies of
breakage.cppm:123-164 with double[] interfaces). +// --------------------------------------------------------------------------- + +void so3_log_from_unit_quat_host(const double q_in[4], double phi[3]) { + double q[4]; + for (int i = 0; i < 4; ++i) { + q[i] = q_in[i]; + } + cuda_la::quat_xyzw_normalize(q); + // Pick the shortest representation (q.w >= 0) except for the pi case. + if (q[3] < 0.0) { + q[0] = -q[0]; + q[1] = -q[1]; + q[2] = -q[2]; + q[3] = -q[3]; + } + double v[3] = {q[0], q[1], q[2]}; + double s = std::sqrt(v[0] * v[0] + v[1] * v[1] + v[2] * v[2]); // sin(theta/2) + double w = q[3]; // cos(theta/2) + + constexpr double kSmallS = 5e-5; + double theta_over_s; + if (s < kSmallS) { + // Series: theta/s = 2 + s^2/3 + 3 s^4/20 + O(s^6). + double s2 = s * s; + double s4 = s2 * s2; + theta_over_s = 2.0 + (s2 / 3.0) + (3.0 * s4 / 20.0); + } else { + double theta = 2.0 * std::atan2(s, w); + theta_over_s = theta / s; + } + phi[0] = theta_over_s * v[0]; + phi[1] = theta_over_s * v[1]; + phi[2] = theta_over_s * v[2]; +} + +void so3_jacobian_inv_host(const double phi[3], double J_inv[9]) { + double theta2 = phi[0] * phi[0] + phi[1] * phi[1] + phi[2] * phi[2]; + constexpr double kSmallTheta = 1e-4; + double c; + if (theta2 < kSmallTheta * kSmallTheta) { + // Series: c(theta) = 1/12 + theta^2/720 + theta^4/30240 + O(theta^6). 
+ double t2 = theta2; + double t4 = t2 * t2; + c = (1.0 / 12.0) + (t2 / 720.0) + (t4 / 30240.0); + } else { + double theta = std::sqrt(theta2); + double half = 0.5 * theta; + double cot_half = std::cos(half) / std::sin(half); + c = (1.0 / theta2) - (cot_half / (2.0 * theta)); + } + double px = phi[0], py = phi[1], pz = phi[2]; + // Phi = [ 0, -z, y; + // z, 0, -x; + // -y, x, 0 ] + double Phi[9] = {0.0, -pz, py, pz, 0.0, -px, -py, px, 0.0}; + double P2[9]; + for (int i = 0; i < 3; ++i) { + for (int j = 0; j < 3; ++j) { + double sum = 0.0; + for (int k = 0; k < 3; ++k) { + sum += Phi[3 * i + k] * Phi[3 * k + j]; + } + P2[3 * i + j] = sum; + } + } + for (int i = 0; i < 3; ++i) { + for (int j = 0; j < 3; ++j) { + double v = ((i == j) ? 1.0 : 0.0) - 0.5 * Phi[3 * i + j] + + c * P2[3 * i + j]; + J_inv[3 * i + j] = v; + } + } +} + +} // namespace + +// --------------------------------------------------------------------------- +// Host launchers. +// --------------------------------------------------------------------------- + +void fit_se3(int N, const double *q0_d, const double *t0_d, const double *qx_d, + const double *tx_d, const double *mass_d, double total_mass, + double lambda_R, double q_out[4], double t_out[3], + cudaStream_t stream) { + if (N == 0) { + q_out[0] = 0.0; + q_out[1] = 0.0; + q_out[2] = 0.0; + q_out[3] = 1.0; + t_out[0] = 0.0; + t_out[1] = 0.0; + t_out[2] = 0.0; + return; + } + if (!(total_mass > 0.0)) { + throw std::invalid_argument("fit_se3: total_mass must be positive"); + } + + int grid = grid_for(N); + + // Pass 1: 6 centroid sums + 9 K sums = 15 doubles. 
+ DeviceBuffer p1(15); + BREAKAGE_CUDA_CHECK( + cudaMemsetAsync(p1.data(), 0, 15 * sizeof(double), stream)); + se3_pass1_kernel<<>>(N, q0_d, t0_d, qx_d, tx_d, + mass_d, p1.data()); + + double p1_h[15]; + BREAKAGE_CUDA_CHECK(cudaMemcpyAsync(p1_h, p1.data(), 15 * sizeof(double), + cudaMemcpyDeviceToHost, stream)); + BREAKAGE_CUDA_CHECK(cudaStreamSynchronize(stream)); + + Vec3 t0_bar; + Vec3 tx_bar; + for (int k = 0; k < 3; ++k) { + t0_bar.v[k] = p1_h[k] / total_mass; + tx_bar.v[k] = p1_h[3 + k] / total_mass; + } + double K[9]; + for (int k = 0; k < 9; ++k) { + K[k] = p1_h[6 + k]; + } + + // Pass 2: H = sum_i m_i * (t0_i - t0_bar) * (tx_i - tx_bar)^T. + DeviceBuffer p2(9); + BREAKAGE_CUDA_CHECK( + cudaMemsetAsync(p2.data(), 0, 9 * sizeof(double), stream)); + se3_pass2_kernel<<>>(N, t0_d, tx_d, mass_d, t0_bar, + tx_bar, p2.data()); + + double H[9]; + BREAKAGE_CUDA_CHECK(cudaMemcpyAsync(H, p2.data(), 9 * sizeof(double), + cudaMemcpyDeviceToHost, stream)); + BREAKAGE_CUDA_CHECK(cudaStreamSynchronize(stream)); + + // Host: Procrustes on S = H + lambda_R * K, then derive (q, t). 
+ double S[9]; + for (int k = 0; k < 9; ++k) { + S[k] = H[k] + lambda_R * K[k]; + } + double R[9]; + cuda_la::svd_procrustes_3x3(S, R); + + // t = tx_bar - R * t0_bar + for (int i = 0; i < 3; ++i) { + double s = tx_bar.v[i]; + for (int j = 0; j < 3; ++j) { + s -= R[3 * i + j] * t0_bar.v[j]; + } + t_out[i] = s; + } + cuda_la::R_to_quat_xyzw(R, q_out); + cuda_la::quat_xyzw_normalize(q_out); +} + +void fit_twist(int N, const double *w_d, const double *v_d, + const double *c_CC_d, const double q_W_CC[4], + const double t_W_CC[3], const double *mass_d, double total_mass, + double lambda_w, double w0_out[3], double v0_out[3], + double *v_W_d, cudaStream_t stream) { + if (N == 0) { + for (int k = 0; k < 3; ++k) { + w0_out[k] = 0.0; + v0_out[k] = 0.0; + } + return; + } + if (!(total_mass > 0.0)) { + throw std::invalid_argument("fit_twist: total_mass must be positive"); + } + + int grid = grid_for(N); + + Mat3 R_w; + cuda_la::quat_xyzw_to_R(q_W_CC, R_w.m); + Vec3 t_w; + t_w.v[0] = t_W_CC[0]; + t_w.v[1] = t_W_CC[1]; + t_w.v[2] = t_W_CC[2]; + + // Scratch: c_W and d both (N x 3) row-major. 
+ DeviceBuffer c_W_buf(static_cast(3) * N); + DeviceBuffer d_buf(static_cast(3) * N); + DeviceBuffer p1(3); + DeviceBuffer p2(18); + BREAKAGE_CUDA_CHECK( + cudaMemsetAsync(p1.data(), 0, 3 * sizeof(double), stream)); + BREAKAGE_CUDA_CHECK( + cudaMemsetAsync(p2.data(), 0, 18 * sizeof(double), stream)); + + twist_pass1_kernel<<>>( + N, c_CC_d, mass_d, R_w, t_w, c_W_buf.data(), p1.data()); + + double sum_mc[3]; + BREAKAGE_CUDA_CHECK(cudaMemcpyAsync(sum_mc, p1.data(), 3 * sizeof(double), + cudaMemcpyDeviceToHost, stream)); + BREAKAGE_CUDA_CHECK(cudaStreamSynchronize(stream)); + + Vec3 r; + for (int k = 0; k < 3; ++k) { + r.v[k] = sum_mc[k] / total_mass; + } + + twist_pass2_kernel<<>>( + N, c_W_buf.data(), mass_d, v_d, w_d, r, d_buf.data(), p2.data()); + + double acc[18]; + BREAKAGE_CUDA_CHECK(cudaMemcpyAsync(acc, p2.data(), 18 * sizeof(double), + cudaMemcpyDeviceToHost, stream)); + BREAKAGE_CUDA_CHECK(cudaStreamSynchronize(stream)); + + double S[9]; + for (int k = 0; k < 9; ++k) { + S[k] = acc[k]; + } + double L[3] = {acc[9], acc[10], acc[11]}; + double mw[3] = {acc[12], acc[13], acc[14]}; + double mv[3] = {acc[15], acc[16], acc[17]}; + + // LHS = I * (trace(S) + lambda_w * total_mass) - S + double trace = S[0] + S[4] + S[8]; + double k_const = trace + lambda_w * total_mass; + double LHS[9] = {k_const - S[0], -S[1], -S[2], // + -S[3], k_const - S[4], -S[5], // + -S[6], -S[7], k_const - S[8]}; + double RHS[3] = {L[0] + lambda_w * mw[0], L[1] + lambda_w * mw[1], + L[2] + lambda_w * mw[2]}; + + cuda_la::spd_solve_3x3(LHS, RHS, w0_out); + v0_out[0] = mv[0] / total_mass; + v0_out[1] = mv[1] / total_mass; + v0_out[2] = mv[2] / total_mass; + + Vec3 w0_v = {{w0_out[0], w0_out[1], w0_out[2]}}; + Vec3 v0_v = {{v0_out[0], v0_out[1], v0_out[2]}}; + twist_pass3_kernel<<>>(N, d_buf.data(), w0_v, v0_v, + v_W_d); + // Sync so the scratch DeviceBuffers can be safely freed at scope exit. 
+ BREAKAGE_CUDA_CHECK(cudaStreamSynchronize(stream)); +} + +void compute_Pi_host(const double qm[4], const double qp[4], + double Pi_out[9]) { + // Pi = J_inv(log(R_m^T R_p)) * R_m^T + double qm_conj[4]; + cuda_la::quat_xyzw_conjugate(qm, qm_conj); + double q_rel[4]; + cuda_la::quat_xyzw_mul(qm_conj, qp, q_rel); + double phi[3]; + so3_log_from_unit_quat_host(q_rel, phi); + double J_inv[9]; + so3_jacobian_inv_host(phi, J_inv); + double R_m[9]; + cuda_la::quat_xyzw_to_R(qm, R_m); + // Pi[i, j] = sum_k J_inv[i, k] * R_m^T[k, j] = sum_k J_inv[i, k] * R_m[j, k] + for (int i = 0; i < 3; ++i) { + for (int j = 0; j < 3; ++j) { + double s = 0.0; + for (int k = 0; k < 3; ++k) { + s += J_inv[3 * i + k] * R_m[3 * j + k]; + } + Pi_out[3 * i + j] = s; + } + } +} + +void compute_L(int N, const double *I_CC_d, const double q_W_CC[4], + const double w0[3], double *L_d, cudaStream_t stream) { + if (N == 0) { + return; + } + // Precompute R_w (row-major) and w_CC = R_w^T * w0 on the host so the + // kernel parameters stay tiny (12 doubles by-value). + Mat3 R_w; + cuda_la::quat_xyzw_to_R(q_W_CC, R_w.m); + Vec3 w_CC; + for (int i = 0; i < 3; ++i) { + // R_w^T[i, k] = R_w[k, i] + w_CC.v[i] = R_w.m[3 * 0 + i] * w0[0] + R_w.m[3 * 1 + i] * w0[1] + + R_w.m[3 * 2 + i] * w0[2]; + } + int grid = grid_for(N); + compute_L_kernel<<>>(N, I_CC_d, R_w, w_CC, L_d); +} + +void assemble_b(int N, const double *v_W_curr_d, const double *v_W_prev_d, + const double *L_curr_d, const double *L_prev_d, + const double *J_d, const double *H_d, const double *mass_d, + const double Pi[9], double dt, double L0, double *b_d, + cudaStream_t stream) { + if (N == 0) { + return; + } + if (!(dt > 0.0)) { + throw std::invalid_argument("assemble_b: dt must be positive"); + } + if (!(L0 > 0.0)) { + throw std::invalid_argument("assemble_b: L0 must be positive"); + } + Mat3 Pi_v; + for (int k = 0; k < 9; ++k) { + Pi_v.m[k] = Pi[k]; + } + double inv_dt = 1.0 / dt; + // EnableTorqueScaling = true (matches breakage.cppm). 
+ double inv_dt_L0 = inv_dt / L0; + int grid = grid_for(N); + assemble_b_kernel<<>>( + N, v_W_curr_d, v_W_prev_d, L_curr_d, L_prev_d, J_d, H_d, mass_d, Pi_v, + inv_dt, inv_dt_L0, b_d); +} + +void postprocess(int ncv, int K, int ncap, const double *x_d, + const int *capacity_clutch_indices_d, + const double *capacities_d, const double *clutch_whiten_d, + double *x_unwhiten_d, double *utilization_d, + cudaStream_t stream) { + // num_vars = ncv + 9 * K (cf. BreakageSystem::num_vars_). + int num_vars = ncv + 9 * K; + if (num_vars > 0) { + // Step 0: copy x verbatim. The unwhiten kernel below will overwrite the + // 6 positions per clutch that need the Wk transform; everything else stays. + BREAKAGE_CUDA_CHECK(cudaMemcpyAsync( + x_unwhiten_d, x_d, static_cast(num_vars) * sizeof(double), + cudaMemcpyDeviceToDevice, stream)); + } + if (K > 0) { + // Initialize utilization to -1 before any atomicMax (matches CPU baseline). + cuda_la::fill(K, -1.0, utilization_d, stream); + int bs = 64; + int grid = (K + bs - 1) / bs; + unwhiten_kernel<<>>(ncv, K, clutch_whiten_d, x_d, + x_unwhiten_d); + } + if (ncap > 0) { + int bs = 128; + int grid = (ncap + bs - 1) / bs; + utilization_kernel<<>>( + ncv, ncap, capacity_clutch_indices_d, capacities_d, x_unwhiten_d, + utilization_d); + } +} + +} // namespace breakage_cuda::cuda_kernels diff --git a/native/cuda/breakage_cuda/src/linalg.cu b/native/cuda/breakage_cuda/src/linalg.cu new file mode 100644 index 0000000..31c0b05 --- /dev/null +++ b/native/cuda/breakage_cuda/src/linalg.cu @@ -0,0 +1,507 @@ +// SPDX-License-Identifier: MIT +// +// Implementation of breakage_cuda/linalg.hpp: +// +// * cuBLAS / cuSPARSE / cuSOLVER handle pool (Meyers singleton). +// * Thin wrappers around level-1 BLAS we need from the ADMM solver. +// * Generic-API SpMV. +// * Element-wise box-clamp / fill kernels. +// * Host-side 3x3 / 2x2 dense helpers (SVD-Procrustes, SPD LDLT solve, +// SPD inverse-square-root) backed by Eigen. 
+// * xyzw quaternion <-> rotation matrix helpers. +// +// All double-precision throughout: this is a verification / numerical-fidelity +// project, not a perf-shootout. + +#include "breakage_cuda/linalg.hpp" +#include "breakage_cuda/device.hpp" + +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#include +#include +#include + +namespace breakage_cuda::cuda_la { + +// --------------------------------------------------------------------------- +// Local error-check macro for cuBLAS (cuSPARSE/CUDA macros come from +// device.hpp). +// --------------------------------------------------------------------------- + +#define BREAKAGE_CUBLAS_CHECK(call) \ + do { \ + cublasStatus_t _s = (call); \ + if (_s != CUBLAS_STATUS_SUCCESS) { \ + throw std::runtime_error(std::string("cuBLAS error at " __FILE__ ":") + \ + std::to_string(__LINE__) + " (status " + \ + std::to_string(static_cast(_s)) + ")"); \ + } \ + } while (0) + +// --------------------------------------------------------------------------- +// Handles singleton. 
+// --------------------------------------------------------------------------- + +Handles::Handles() { + if (cublasCreate(&blas_) != CUBLAS_STATUS_SUCCESS) { + throw std::runtime_error("cublasCreate failed"); + } + if (cusparseCreate(&sparse_) != CUSPARSE_STATUS_SUCCESS) { + cublasDestroy(blas_); + blas_ = nullptr; + throw std::runtime_error("cusparseCreate failed"); + } + if (cusolverDnCreate(&solver_dn_) != CUSOLVER_STATUS_SUCCESS) { + cusparseDestroy(sparse_); + cublasDestroy(blas_); + sparse_ = nullptr; + blas_ = nullptr; + throw std::runtime_error("cusolverDnCreate failed"); + } + if (cusolverSpCreate(&solver_sp_) != CUSOLVER_STATUS_SUCCESS) { + cusolverDnDestroy(solver_dn_); + cusparseDestroy(sparse_); + cublasDestroy(blas_); + solver_dn_ = nullptr; + sparse_ = nullptr; + blas_ = nullptr; + throw std::runtime_error("cusolverSpCreate failed"); + } + // We always pass alpha/beta and consume reduction results through host + // pointers, which is the default but be explicit so that an external caller + // tinkering with the handles cannot break us. + cublasSetPointerMode(blas_, CUBLAS_POINTER_MODE_HOST); + cusparseSetPointerMode(sparse_, CUSPARSE_POINTER_MODE_HOST); +} + +Handles::~Handles() { + if (solver_sp_ != nullptr) { + cusolverSpDestroy(solver_sp_); + } + if (solver_dn_ != nullptr) { + cusolverDnDestroy(solver_dn_); + } + if (sparse_ != nullptr) { + cusparseDestroy(sparse_); + } + if (blas_ != nullptr) { + cublasDestroy(blas_); + } +} + +Handles &Handles::instance() { + static Handles inst; + return inst; +} + +// --------------------------------------------------------------------------- +// cuBLAS-backed level-1 wrappers. +// +// dot/nrm2/inf_norm consume a host-side scalar so they internally synchronize +// the stream before returning. axpy/scal/copy/scal_copy are pure async kernel +// launches. 
+// --------------------------------------------------------------------------- + +double dot(int n, const double *x_d, const double *y_d, cudaStream_t stream) { + if (n <= 0) { + return 0.0; + } + auto h = Handles::instance().blas(); + BREAKAGE_CUBLAS_CHECK(cublasSetStream(h, stream)); + double r = 0.0; + BREAKAGE_CUBLAS_CHECK(cublasDdot(h, n, x_d, 1, y_d, 1, &r)); + // cuBLAS in HOST_POINTER mode is synchronous wrt host for scalar results, + // but make it explicit so async kernel launches that follow are properly + // ordered. + BREAKAGE_CUDA_CHECK(cudaStreamSynchronize(stream)); + return r; +} + +double nrm2(int n, const double *x_d, cudaStream_t stream) { + if (n <= 0) { + return 0.0; + } + auto h = Handles::instance().blas(); + BREAKAGE_CUBLAS_CHECK(cublasSetStream(h, stream)); + double r = 0.0; + BREAKAGE_CUBLAS_CHECK(cublasDnrm2(h, n, x_d, 1, &r)); + BREAKAGE_CUDA_CHECK(cudaStreamSynchronize(stream)); + return r; +} + +double inf_norm(int n, const double *x_d, cudaStream_t stream) { + if (n <= 0) { + return 0.0; + } + auto h = Handles::instance().blas(); + BREAKAGE_CUBLAS_CHECK(cublasSetStream(h, stream)); + int idx_1 = 0; // 1-based + BREAKAGE_CUBLAS_CHECK(cublasIdamax(h, n, x_d, 1, &idx_1)); + if (idx_1 < 1 || idx_1 > n) { + return 0.0; + } + double v = 0.0; + BREAKAGE_CUDA_CHECK(cudaMemcpyAsync(&v, x_d + (idx_1 - 1), sizeof(double), + cudaMemcpyDeviceToHost, stream)); + BREAKAGE_CUDA_CHECK(cudaStreamSynchronize(stream)); + return std::abs(v); +} + +void axpy(int n, double alpha, const double *x_d, double *y_d, + cudaStream_t stream) { + if (n <= 0) { + return; + } + auto h = Handles::instance().blas(); + BREAKAGE_CUBLAS_CHECK(cublasSetStream(h, stream)); + BREAKAGE_CUBLAS_CHECK(cublasDaxpy(h, n, &alpha, x_d, 1, y_d, 1)); +} + +void scal(int n, double alpha, double *x_d, cudaStream_t stream) { + if (n <= 0) { + return; + } + auto h = Handles::instance().blas(); + BREAKAGE_CUBLAS_CHECK(cublasSetStream(h, stream)); + BREAKAGE_CUBLAS_CHECK(cublasDscal(h, n, 
&alpha, x_d, 1)); +} + +void copy(int n, const double *src_d, double *dst_d, cudaStream_t stream) { + if (n <= 0) { + return; + } + BREAKAGE_CUDA_CHECK(cudaMemcpyAsync(dst_d, src_d, + static_cast(n) * + sizeof(double), + cudaMemcpyDeviceToDevice, stream)); +} + +void scal_copy(int n, double alpha, const double *x_d, double *y_d, + cudaStream_t stream) { + if (n <= 0) { + return; + } + copy(n, x_d, y_d, stream); + scal(n, alpha, y_d, stream); +} + +// --------------------------------------------------------------------------- +// Element-wise kernels: fill / clamp. +// --------------------------------------------------------------------------- + +namespace { + +constexpr int kElemwiseBlock = 256; + +inline int elemwise_grid(int n) { + return (n + kElemwiseBlock - 1) / kElemwiseBlock; +} + +__global__ void fill_kernel(int n, double v, double *x) { + int i = blockIdx.x * blockDim.x + threadIdx.x; + if (i < n) { + x[i] = v; + } +} + +__global__ void clamp_kernel(int n, const double *in, const double *l, + const double *u, double *out) { + int i = blockIdx.x * blockDim.x + threadIdx.x; + if (i >= n) { + return; + } + double x = in[i]; + double lo = l[i]; + double hi = u[i]; + if (x < lo) { + x = lo; + } + if (x > hi) { + x = hi; + } + out[i] = x; +} + +__global__ void clamp_inplace_kernel(int n, const double *l, const double *u, + double *x_d) { + int i = blockIdx.x * blockDim.x + threadIdx.x; + if (i >= n) { + return; + } + double x = x_d[i]; + double lo = l[i]; + double hi = u[i]; + if (x < lo) { + x = lo; + } + if (x > hi) { + x = hi; + } + x_d[i] = x; +} + +} // namespace + +void fill(int n, double value, double *x_d, cudaStream_t stream) { + if (n <= 0) { + return; + } + fill_kernel<<>>(n, value, x_d); +} + +void clamp(int n, const double *in_d, const double *l_d, const double *u_d, + double *out_d, cudaStream_t stream) { + if (n <= 0) { + return; + } + clamp_kernel<<>>(n, in_d, l_d, + u_d, out_d); +} + +void clamp_inplace(int n, const double *l_d, const double 
*u_d, double *x_d, + cudaStream_t stream) { + if (n <= 0) { + return; + } + clamp_inplace_kernel<<>>(n, l_d, + u_d, + x_d); +} + +// --------------------------------------------------------------------------- +// SpMV via the cuSPARSE generic API. +// +// We lazily attach a cusparseSpMatDescr_t to each DeviceCsr on first use. +// system.cpp may also build it; we only do it here if it has not. +// --------------------------------------------------------------------------- + +namespace { + +void ensure_csr_descr(const DeviceCsr &A) { + if (A.descr != nullptr) { + return; + } + // descr is a member of DeviceCsr; the lazy-init cast preserves the + // logical "const" of the SpMV operation (we do not touch the values). + auto &mut = const_cast(A); + BREAKAGE_CUSPARSE_CHECK(cusparseCreateCsr( + &mut.descr, A.rows, A.cols, A.nnz, + const_cast(A.rowptr.data()), const_cast(A.colind.data()), + const_cast(A.values.data()), CUSPARSE_INDEX_32I, + CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, CUDA_R_64F)); +} + +} // namespace + +std::size_t spmv_buffer_size(const DeviceCsr &A, bool transpose) { + if (A.rows <= 0 || A.cols <= 0) { + return 0; + } + ensure_csr_descr(A); + auto h = Handles::instance().sparse(); + cusparseOperation_t op = + transpose ? CUSPARSE_OPERATION_TRANSPOSE + : CUSPARSE_OPERATION_NON_TRANSPOSE; + int64_t x_n = transpose ? A.rows : A.cols; + int64_t y_n = transpose ? A.cols : A.rows; + cusparseDnVecDescr_t x_descr = nullptr, y_descr = nullptr; + // bufferSize never dereferences the vector data; nullptr is the documented + // "use placeholder" value. 
+ BREAKAGE_CUSPARSE_CHECK( + cusparseCreateDnVec(&x_descr, x_n, nullptr, CUDA_R_64F)); + BREAKAGE_CUSPARSE_CHECK( + cusparseCreateDnVec(&y_descr, y_n, nullptr, CUDA_R_64F)); + double alpha = 1.0; + double beta = 0.0; + std::size_t bytes = 0; + BREAKAGE_CUSPARSE_CHECK(cusparseSpMV_bufferSize( + h, op, &alpha, A.descr, x_descr, &beta, y_descr, CUDA_R_64F, + CUSPARSE_SPMV_ALG_DEFAULT, &bytes)); + cusparseDestroyDnVec(x_descr); + cusparseDestroyDnVec(y_descr); + return bytes; +} + +void spmv(const DeviceCsr &A, bool transpose, double alpha, const double *x_d, + double beta, double *y_d, void *buffer_d, std::size_t buffer_bytes, + cudaStream_t stream) { + (void)buffer_bytes; // size validated by cuSPARSE itself + if (A.rows <= 0 || A.cols <= 0) { + return; + } + ensure_csr_descr(A); + auto h = Handles::instance().sparse(); + BREAKAGE_CUSPARSE_CHECK(cusparseSetStream(h, stream)); + cusparseOperation_t op = + transpose ? CUSPARSE_OPERATION_TRANSPOSE + : CUSPARSE_OPERATION_NON_TRANSPOSE; + int64_t x_n = transpose ? A.rows : A.cols; + int64_t y_n = transpose ? A.cols : A.rows; + cusparseDnVecDescr_t x_descr = nullptr, y_descr = nullptr; + BREAKAGE_CUSPARSE_CHECK(cusparseCreateDnVec( + &x_descr, x_n, const_cast(x_d), CUDA_R_64F)); + BREAKAGE_CUSPARSE_CHECK(cusparseCreateDnVec(&y_descr, y_n, y_d, CUDA_R_64F)); + BREAKAGE_CUSPARSE_CHECK(cusparseSpMV(h, op, &alpha, A.descr, x_descr, &beta, + y_descr, CUDA_R_64F, + CUSPARSE_SPMV_ALG_DEFAULT, buffer_d)); + cusparseDestroyDnVec(x_descr); + cusparseDestroyDnVec(y_descr); +} + +// --------------------------------------------------------------------------- +// Host-side dense 3x3 / 2x2 helpers. +// +// Inputs / outputs are row-major flat arrays; we marshal to Eigen, do the +// math, and marshal back. Allocations are stack-only. 
+// --------------------------------------------------------------------------- + +namespace { + +inline Eigen::Matrix3d row_major_to_eigen3(const double M[9]) { + Eigen::Matrix3d out; + for (int i = 0; i < 3; ++i) { + for (int j = 0; j < 3; ++j) { + out(i, j) = M[3 * i + j]; + } + } + return out; +} + +inline void eigen_to_row_major3(const Eigen::Matrix3d &M, double out[9]) { + for (int i = 0; i < 3; ++i) { + for (int j = 0; j < 3; ++j) { + out[3 * i + j] = M(i, j); + } + } +} + +inline Eigen::Matrix2d row_major_to_eigen2(const double M[4]) { + Eigen::Matrix2d out; + out(0, 0) = M[0]; + out(0, 1) = M[1]; + out(1, 0) = M[2]; + out(1, 1) = M[3]; + return out; +} + +inline void eigen_to_row_major2(const Eigen::Matrix2d &M, double out[4]) { + out[0] = M(0, 0); + out[1] = M(0, 1); + out[2] = M(1, 0); + out[3] = M(1, 1); +} + +} // namespace + +void svd_procrustes_3x3(const double S[9], double R[9]) { + Eigen::Matrix3d S_mat = row_major_to_eigen3(S); + Eigen::JacobiSVD svd( + S_mat, Eigen::ComputeFullU | Eigen::ComputeFullV); + Eigen::Matrix3d U = svd.matrixU(); + Eigen::Matrix3d V = svd.matrixV(); + Eigen::Matrix3d D = Eigen::Matrix3d::Identity(); + if ((V * U.transpose()).determinant() < 0.0) { + D(2, 2) = -1.0; + } + Eigen::Matrix3d R_mat = V * D * U.transpose(); + eigen_to_row_major3(R_mat, R); +} + +void spd_solve_3x3(const double A[9], const double b[3], double x[3]) { + Eigen::Matrix3d A_mat = row_major_to_eigen3(A); + Eigen::Vector3d b_vec(b[0], b[1], b[2]); + Eigen::Vector3d x_vec = A_mat.ldlt().solve(b_vec); + x[0] = x_vec[0]; + x[1] = x_vec[1]; + x[2] = x_vec[2]; +} + +void inv_sqrt_spd_2x2(const double C[4], double W[4]) { + Eigen::Matrix2d C_mat = row_major_to_eigen2(C); + Eigen::SelfAdjointEigenSolver eig(C_mat); + if (eig.info() != Eigen::Success) { + throw std::invalid_argument( + "inv_sqrt_spd_2x2: eigen decomposition failed"); + } + Eigen::Vector2d lambda = eig.eigenvalues(); + Eigen::Matrix2d U = eig.eigenvectors(); + double lambda_max = 
lambda.maxCoeff(); + if (!(lambda_max > 0.0)) { + throw std::invalid_argument("inv_sqrt_spd_2x2: non-positive matrix"); + } + // Match breakage.cppm: clamp to avoid blow-up on near-degenerate patches. + double eps = 1e-12 * lambda_max; + lambda = lambda.cwiseMax(eps); + Eigen::Matrix2d D = Eigen::Matrix2d::Zero(); + D(0, 0) = 1.0 / std::sqrt(lambda(0)); + D(1, 1) = 1.0 / std::sqrt(lambda(1)); + Eigen::Matrix2d W_mat = U * D * U.transpose(); + eigen_to_row_major2(W_mat, W); +} + +// --------------------------------------------------------------------------- +// Quaternion (xyzw) helpers. +// +// xyzw matches Eigen::Quaterniond::coeffs() ordering, so we can round-trip +// through Quaterniond without reordering. +// --------------------------------------------------------------------------- + +void quat_xyzw_to_R(const double q[4], double R[9]) { + Eigen::Quaterniond q_eig; + q_eig.coeffs() << q[0], q[1], q[2], q[3]; + Eigen::Matrix3d M = q_eig.toRotationMatrix(); + eigen_to_row_major3(M, R); +} + +void R_to_quat_xyzw(const double R[9], double q[4]) { + Eigen::Matrix3d M = row_major_to_eigen3(R); + Eigen::Quaterniond q_eig(M); + q_eig.normalize(); + q[0] = q_eig.x(); + q[1] = q_eig.y(); + q[2] = q_eig.z(); + q[3] = q_eig.w(); +} + +void quat_xyzw_normalize(double q[4]) { + double n2 = q[0] * q[0] + q[1] * q[1] + q[2] * q[2] + q[3] * q[3]; + if (!(n2 > 0.0)) { + return; + } + double inv = 1.0 / std::sqrt(n2); + q[0] *= inv; + q[1] *= inv; + q[2] *= inv; + q[3] *= inv; +} + +void quat_xyzw_mul(const double a[4], const double b[4], double r[4]) { + // Hamilton product, matching Eigen Quaternion::operator*. 
+ double ax = a[0], ay = a[1], az = a[2], aw = a[3]; + double bx = b[0], by = b[1], bz = b[2], bw = b[3]; + r[0] = aw * bx + ax * bw + ay * bz - az * by; + r[1] = aw * by - ax * bz + ay * bw + az * bx; + r[2] = aw * bz + ax * by - ay * bx + az * bw; + r[3] = aw * bw - ax * bx - ay * by - az * bz; +} + +void quat_xyzw_conjugate(const double q[4], double r[4]) { + r[0] = -q[0]; + r[1] = -q[1]; + r[2] = -q[2]; + r[3] = q[3]; +} + +#undef BREAKAGE_CUBLAS_CHECK + +} // namespace breakage_cuda::cuda_la diff --git a/native/cuda/breakage_cuda/src/qp_solver.cu b/native/cuda/breakage_cuda/src/qp_solver.cu new file mode 100644 index 0000000..c54ac84 --- /dev/null +++ b/native/cuda/breakage_cuda/src/qp_solver.cu @@ -0,0 +1,1264 @@ +// SPDX-License-Identifier: MIT +// +// Hand-written CUDA ADMM implementation of the three-stage QP wrapper that +// lives on the CPU side in modules/bricksim/physx/osqp.cppm. +// +// Algorithmic summary (per stage; standard OSQP form +// `min 0.5 x^T P x + q^T x s.t. l <= C x <= u`): +// +// each ADMM iteration t: +// 1. rhs = sigma * x_k - q + C^T (rho * z_k - y_k) +// 2. solve M xt = rhs with M = P + sigma I + rho C^T C +// 3. zt = C xt (recovered from the reduced KKT) +// 4. xt_alpha = alpha * xt + (1 - alpha) * x_k +// zt_alpha = alpha * zt + (1 - alpha) * z_k +// 5. z_{k+1} = clamp(zt_alpha + y_k / rho, l, u) +// 6. y_{k+1} += rho * (zt_alpha - z_{k+1}) +// 7. x_{k+1} = xt_alpha +// +// The reduced KKT matrix M is symmetric positive definite (since +// P + sigma I > 0 and C^T C >= 0). We solve the inner system with a plain +// (un-preconditioned) conjugate-gradient method, applying M implicitly as +// `M x = P x + sigma x + rho C^T (C x)` via three SpMVs per CG step. This +// trades raw factorization speed for far simpler code: no symbolic Cholesky +// caching, no refactor on rho changes, no cuSolverSp roundtrips. 
The CG +// initial guess is warm-started from the previous outer iteration's xt so +// in steady state the inner loop converges in a handful of iterations. +// +// We deliberately do NOT replicate every OSQP feature - no polish, no +// infeasibility detection, no Ruiz scaling. The goal is to be close enough +// to OSQP that the breakage system behaves consistently. + +// Eigen headers must come before the project headers because system.hpp uses +// Eigen::Quaterniond in HostState but only includes + +// . Pull the missing pieces in here so we don't have to +// touch system.hpp. +#include +#include + +#include "breakage_cuda/qp_solver.hpp" + +#include "breakage_cuda/device.hpp" +#include "breakage_cuda/kernels.hpp" +#include "breakage_cuda/linalg.hpp" +#include "breakage_cuda/system.hpp" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace breakage_cuda::cuda_qp { + +// --------------------------------------------------------------------------- +// File-local constants and device kernels +// --------------------------------------------------------------------------- + +namespace { + +// Stage 2 / Stage 3 inequality slack used by osqp.cppm. Hard-coded to match +// the production default; the OSQP wrapper would override this from +// OSQP_EPSILON env var, which we do not bother exposing here. +constexpr double kEpsilon = 1e-4; + +// Diagonal regularisation on the x-block of P_rlx (matches osqp.cppm). +constexpr double kRelXReg = 1e-4; + +// CG inner-solve tolerances. Inner tol is relative to ||rhs||; we keep it +// noticeably tighter than the outer ADMM tol so the inner solve never +// bottlenecks convergence. +constexpr double kInnerCgTol = 1e-9; +constexpr int kInnerCgMaxIter = 400; + +// Bounds on the adaptive rho. OSQP uses similar limits. 
+constexpr double kRhoMin = 1e-6; +constexpr double kRhoMax = 1e6; + +constexpr int kBlockSize = 256; + +inline int grid_for(int n) { + return (n + kBlockSize - 1) / kBlockSize; +} + +// out[i] = a * x[i] + b * y[i] +__global__ void axby_kernel(int n, double a, const double *__restrict__ x, + double b, const double *__restrict__ y, + double *__restrict__ out) { + int i = blockIdx.x * blockDim.x + threadIdx.x; + if (i < n) { + out[i] = a * x[i] + b * y[i]; + } +} + +// y[i] = a * x[i] + b * y[i] (in-place version of axby_kernel) +__global__ void axpby_inplace_kernel(int n, double a, + const double *__restrict__ x, double b, + double *__restrict__ y) { + int i = blockIdx.x * blockDim.x + threadIdx.x; + if (i < n) { + y[i] = a * x[i] + b * y[i]; + } +} + +// rhs[i] = sigma * x_k[i] - q[i] + ct_term[i] +__global__ void rhs_assemble_kernel(int n, double sigma, + const double *__restrict__ x_k, + const double *__restrict__ q, + const double *__restrict__ ct_term, + double *__restrict__ rhs) { + int i = blockIdx.x * blockDim.x + threadIdx.x; + if (i < n) { + rhs[i] = sigma * x_k[i] - q[i] + ct_term[i]; + } +} + +// out[i] = rho * z_k[i] - y_k[i] +__global__ void rho_z_minus_y_kernel(int m, double rho, + const double *__restrict__ z_k, + const double *__restrict__ y_k, + double *__restrict__ out) { + int i = blockIdx.x * blockDim.x + threadIdx.x; + if (i < m) { + out[i] = rho * z_k[i] - y_k[i]; + } +} + +// z_out[i] = clamp(zt_alpha[i] + y_k[i] / rho, l[i], u[i]) +__global__ void z_update_kernel(int m, double inv_rho, + const double *__restrict__ zt_alpha, + const double *__restrict__ y_k, + const double *__restrict__ l, + const double *__restrict__ u, + double *__restrict__ z_out) { + int i = blockIdx.x * blockDim.x + threadIdx.x; + if (i < m) { + double v = zt_alpha[i] + inv_rho * y_k[i]; + if (v < l[i]) + v = l[i]; + if (v > u[i]) + v = u[i]; + z_out[i] = v; + } +} + +// y[i] += rho * (zt_alpha[i] - z_new[i]) +__global__ void y_update_kernel(int m, double rho, + const 
double *__restrict__ zt_alpha, + const double *__restrict__ z_new, + double *__restrict__ y) { + int i = blockIdx.x * blockDim.x + threadIdx.x; + if (i < m) { + y[i] += rho * (zt_alpha[i] - z_new[i]); + } +} + +// out[i] = max(0, x[i]) +__global__ void relu_kernel(int n, const double *__restrict__ x, + double *__restrict__ out) { + int i = blockIdx.x * blockDim.x + threadIdx.x; + if (i < n) { + out[i] = (x[i] > 0.0) ? x[i] : 0.0; + } +} + +// Tiny RAII wrapper around a raw byte buffer used for cuSPARSE SpMV temp +// storage. Sized lazily; never shrunk. +class ByteBuf { +public: + ByteBuf() = default; + ~ByteBuf() { free(); } + ByteBuf(const ByteBuf &) = delete; + ByteBuf &operator=(const ByteBuf &) = delete; + ByteBuf(ByteBuf &&o) noexcept : ptr_(o.ptr_), bytes_(o.bytes_) { + o.ptr_ = nullptr; + o.bytes_ = 0; + } + ByteBuf &operator=(ByteBuf &&o) noexcept { + if (this != &o) { + free(); + ptr_ = o.ptr_; + bytes_ = o.bytes_; + o.ptr_ = nullptr; + o.bytes_ = 0; + } + return *this; + } + + void ensure(std::size_t bytes) { + if (bytes <= bytes_) + return; + free(); + if (bytes > 0) { + BREAKAGE_CUDA_CHECK(cudaMalloc(&ptr_, bytes)); + bytes_ = bytes; + } + } + void *data() { return ptr_; } + std::size_t size() const { return bytes_; } + +private: + void free() { + if (ptr_) { + cudaFree(ptr_); + ptr_ = nullptr; + bytes_ = 0; + } + } + void *ptr_{nullptr}; + std::size_t bytes_{0}; +}; + +} // namespace + +// --------------------------------------------------------------------------- +// Stage: one ADMM run (P, C constant; q, l, u may change every solve). 
+// +// Stage owns: +// * the constant matrices P, C, C^T (CSR, on device) +// * a pre-allocated SpMV byte buffer per matrix +// * warm-start state (x, z, y) plus the current rho +// * scratch device buffers for the ADMM loop and the inner CG +// --------------------------------------------------------------------------- + +class Stage { +public: + // Builds device-side CSR copies of P (full symmetric) and C from Eigen + // sparse host matrices, allocates all working buffers. Does NOT initialise + // warm-start state - that happens lazily on the first solve. + void setup(SparseMatrixCSC P_full, SparseMatrixCSC C, const Settings &s); + + // Runs ADMM until convergence or max_iter. q/l/u are host vectors + // (re-uploaded each call). Output warm-start lives in x_, z_, y_ and is + // also exposed via the const accessors below. + StageInfo solve(const VectorXd &q_host, const VectorXd &l_host, + const VectorXd &u_host, cudaStream_t stream); + + // Drops warm-start state; next solve starts cold and resets rho to + // settings_.rho_init. + void reset_warm_start() { has_warm_ = false; } + + int n() const { return n_; } + int m() const { return m_; } + + const double *x_device() const { return x_.data(); } + const double *z_device() const { return z_.data(); } + const double *y_device() const { return y_.data(); } + +private: + // Apply M = P + sigma I + rho C^T C to x (n elements) and write into Mx. + // Uses scratch_m_ as the C*x scratchpad. Both x and Mx must be size n_. + void apply_M(const double *x_d, double *Mx_d, double rho, + cudaStream_t stream); + + // Solve M xt = rhs via CG, warm-started from xt_d (in-place). Returns the + // number of CG iterations consumed (informational only). + int solve_kkt_cg(const double *rhs_d, double *xt_d, double rho, + cudaStream_t stream); + + Settings settings_{}; + + int n_{0}; + int m_{0}; + + DeviceCsr P_; + DeviceCsr C_; + DeviceCsr Ct_; + + ByteBuf spmv_buf_P_; + ByteBuf spmv_buf_C_; + ByteBuf spmv_buf_Ct_; + + // Warm-start. 
+ DeviceBuffer x_; // size n_ + DeviceBuffer z_; // size m_ + DeviceBuffer y_; // size m_ + bool has_warm_{false}; + double rho_{0.1}; + + // Per-call uploads. + DeviceBuffer q_d_; // size n_ + DeviceBuffer l_d_; // size m_ + DeviceBuffer u_d_; // size m_ + + // ADMM scratch buffers. + DeviceBuffer xt_; // size n_ + DeviceBuffer zt_; // size m_ + DeviceBuffer xt_alpha_; // size n_ + DeviceBuffer zt_alpha_; // size m_ + DeviceBuffer rhs_; // size n_ + DeviceBuffer scratch_n_; // size n_ + DeviceBuffer scratch_m_; // size m_ + + // Convergence-check scratch. + DeviceBuffer r_prim_; // size m_ + DeviceBuffer r_dual_; // size n_ + + // CG scratch. + DeviceBuffer cg_r_; // size n_ + DeviceBuffer cg_p_; // size n_ + DeviceBuffer cg_Ap_; // size n_ +}; + +void Stage::setup(SparseMatrixCSC P_full, SparseMatrixCSC C, + const Settings &s) { + settings_ = s; + n_ = static_cast(P_full.rows()); + m_ = static_cast(C.rows()); + if (P_full.cols() != n_) { + throw std::invalid_argument("Stage::setup: P must be square"); + } + if (C.cols() != n_) { + throw std::invalid_argument("Stage::setup: C cols mismatch"); + } + + if (!P_full.isCompressed()) + P_full.makeCompressed(); + if (!C.isCompressed()) + C.makeCompressed(); + + upload_csc_as_csr(P_full, P_); + upload_csc_as_csr(C, C_); + + // C^T as a separate CSR. cuSPARSE supports SpMV with op=TRANSPOSE on a CSR + // input, but it is consistently slower than non-transpose, so we pay the + // one-time host transpose to get faster inner-loop SpMVs. + SparseMatrixCSC Ct = SparseMatrixCSC(C.transpose()); + Ct.makeCompressed(); + upload_csc_as_csr(Ct, Ct_); + + // Pre-size SpMV buffers. m_ or n_ may be zero in degenerate problems, in + // which case spmv_buffer_size still works (cuSPARSE handles empty mats). 
+ spmv_buf_P_.ensure(cuda_la::spmv_buffer_size(P_, false)); + spmv_buf_C_.ensure(cuda_la::spmv_buffer_size(C_, false)); + spmv_buf_Ct_.ensure(cuda_la::spmv_buffer_size(Ct_, false)); + + x_.resize(static_cast(n_)); + z_.resize(static_cast(m_)); + y_.resize(static_cast(m_)); + q_d_.resize(static_cast(n_)); + l_d_.resize(static_cast(m_)); + u_d_.resize(static_cast(m_)); + + xt_.resize(static_cast(n_)); + zt_.resize(static_cast(m_)); + xt_alpha_.resize(static_cast(n_)); + zt_alpha_.resize(static_cast(m_)); + rhs_.resize(static_cast(n_)); + scratch_n_.resize(static_cast(n_)); + scratch_m_.resize(static_cast(m_)); + + r_prim_.resize(static_cast(m_)); + r_dual_.resize(static_cast(n_)); + + cg_r_.resize(static_cast(n_)); + cg_p_.resize(static_cast(n_)); + cg_Ap_.resize(static_cast(n_)); + + has_warm_ = false; + rho_ = settings_.rho_init; +} + +void Stage::apply_M(const double *x_d, double *Mx_d, double rho, + cudaStream_t stream) { + // Mx = P*x + cuda_la::spmv(P_, false, 1.0, x_d, 0.0, Mx_d, spmv_buf_P_.data(), + spmv_buf_P_.size(), stream); + // Mx += sigma * x (axpy: y += alpha * x) + cuda_la::axpy(n_, settings_.sigma, x_d, Mx_d, stream); + if (m_ > 0) { + // scratch_m = C * x + cuda_la::spmv(C_, false, 1.0, x_d, 0.0, scratch_m_.data(), + spmv_buf_C_.data(), spmv_buf_C_.size(), stream); + // Mx += rho * C^T * scratch_m + cuda_la::spmv(Ct_, false, rho, scratch_m_.data(), 1.0, Mx_d, + spmv_buf_Ct_.data(), spmv_buf_Ct_.size(), stream); + } +} + +int Stage::solve_kkt_cg(const double *rhs_d, double *xt_d, double rho, + cudaStream_t stream) { + // Initial residual r = rhs - M*xt (xt is the warm-start input). + // We reuse cg_Ap_ as the temporary for M*xt. 
+ apply_M(xt_d, cg_Ap_.data(), rho, stream); + axby_kernel<<>>( + n_, 1.0, rhs_d, -1.0, cg_Ap_.data(), cg_r_.data()); + // p = r + cuda_la::copy(n_, cg_r_.data(), cg_p_.data(), stream); + + double rs_old = cuda_la::dot(n_, cg_r_.data(), cg_r_.data(), stream); + // Tolerance is relative to the rhs norm so problems of different scale + // all get the same relative accuracy. + double rhs_norm = cuda_la::nrm2(n_, rhs_d, stream); + double tol_sq = (kInnerCgTol * std::max(rhs_norm, 1e-30)); + tol_sq = tol_sq * tol_sq; + if (rs_old <= tol_sq) { + return 0; + } + + for (int it = 0; it < kInnerCgMaxIter; ++it) { + // cg_Ap = M * cg_p + apply_M(cg_p_.data(), cg_Ap_.data(), rho, stream); + double pAp = cuda_la::dot(n_, cg_p_.data(), cg_Ap_.data(), stream); + if (!(pAp > 0.0)) { + // Numerical breakdown - bail out with whatever we have. + return it; + } + double alpha = rs_old / pAp; + // xt += alpha * p + cuda_la::axpy(n_, alpha, cg_p_.data(), xt_d, stream); + // r -= alpha * Ap + cuda_la::axpy(n_, -alpha, cg_Ap_.data(), cg_r_.data(), stream); + + double rs_new = cuda_la::dot(n_, cg_r_.data(), cg_r_.data(), stream); + if (rs_new <= tol_sq) { + return it + 1; + } + double beta = rs_new / rs_old; + // p = r + beta * p (in-place: p = beta * p + 1.0 * r) + axpby_inplace_kernel<<>>( + n_, 1.0, cg_r_.data(), beta, cg_p_.data()); + rs_old = rs_new; + } + return kInnerCgMaxIter; +} + +StageInfo Stage::solve(const VectorXd &q_host, const VectorXd &l_host, + const VectorXd &u_host, cudaStream_t stream) { + if (q_host.size() != n_) { + throw std::invalid_argument("Stage::solve: q size mismatch"); + } + if (l_host.size() != m_ || u_host.size() != m_) { + throw std::invalid_argument("Stage::solve: l/u size mismatch"); + } + + auto t_start = std::chrono::steady_clock::now(); + + // Upload q, l, u (re-uploaded every call). 
+ if (n_ > 0) { + q_d_.copy_from_host(q_host.data(), static_cast(n_)); + } + if (m_ > 0) { + l_d_.copy_from_host(l_host.data(), static_cast(m_)); + u_d_.copy_from_host(u_host.data(), static_cast(m_)); + } + + // Cold-start state. + if (!has_warm_) { + if (n_ > 0) + x_.zero(); + if (m_ > 0) { + z_.zero(); + y_.zero(); + } + rho_ = settings_.rho_init; + } + + StageInfo info; + info.rho = rho_; + + bool converged = false; + int iter = 0; + double prim_res = std::numeric_limits::quiet_NaN(); + double dual_res = std::numeric_limits::quiet_NaN(); + double q_norm_inf = + (n_ > 0) ? cuda_la::inf_norm(n_, q_d_.data(), stream) : 0.0; + + for (iter = 0; iter < settings_.max_iter; ++iter) { + // 1. Build rhs = sigma * x_k - q + C^T * (rho * z_k - y_k) + if (m_ > 0) { + // scratch_m = rho * z_k - y_k + rho_z_minus_y_kernel<<>>( + m_, rho_, z_.data(), y_.data(), scratch_m_.data()); + // scratch_n = C^T * scratch_m + cuda_la::spmv(Ct_, false, 1.0, scratch_m_.data(), 0.0, scratch_n_.data(), + spmv_buf_Ct_.data(), spmv_buf_Ct_.size(), stream); + } else if (n_ > 0) { + scratch_n_.zero(); + } + if (n_ > 0) { + rhs_assemble_kernel<<>>( + n_, settings_.sigma, x_.data(), q_d_.data(), scratch_n_.data(), + rhs_.data()); + } + + // 2. Solve M xt = rhs via CG, warm-started from x_k. + if (n_ > 0) { + cuda_la::copy(n_, x_.data(), xt_.data(), stream); + solve_kkt_cg(rhs_.data(), xt_.data(), rho_, stream); + } + + // 3. zt = C * xt (recovered from the reduced KKT) + if (m_ > 0) { + cuda_la::spmv(C_, false, 1.0, xt_.data(), 0.0, zt_.data(), + spmv_buf_C_.data(), spmv_buf_C_.size(), stream); + } + + // 4. Over-relaxation: + // xt_alpha = alpha * xt + (1 - alpha) * x_k + // zt_alpha = alpha * zt + (1 - alpha) * z_k + if (n_ > 0) { + axby_kernel<<>>( + n_, settings_.alpha, xt_.data(), 1.0 - settings_.alpha, x_.data(), + xt_alpha_.data()); + } + if (m_ > 0) { + axby_kernel<<>>( + m_, settings_.alpha, zt_.data(), 1.0 - settings_.alpha, z_.data(), + zt_alpha_.data()); + + // 5. 
z_{k+1} = clamp(zt_alpha + y_k / rho, l, u) (into scratch_m) + double inv_rho = 1.0 / rho_; + z_update_kernel<<>>( + m_, inv_rho, zt_alpha_.data(), y_.data(), l_d_.data(), u_d_.data(), + scratch_m_.data()); + + // 6. y += rho * (zt_alpha - z_{k+1}) + y_update_kernel<<>>( + m_, rho_, zt_alpha_.data(), scratch_m_.data(), y_.data()); + + // Promote z_{k+1} into z_ (z_ <- scratch_m). + cuda_la::copy(m_, scratch_m_.data(), z_.data(), stream); + } + + // 7. x_{k+1} = xt_alpha + if (n_ > 0) { + cuda_la::copy(n_, xt_alpha_.data(), x_.data(), stream); + } + + // 8. Periodic convergence + rho-update check. + bool last_iter = (iter + 1 == settings_.max_iter); + if (((iter + 1) % std::max(1, settings_.check_every) == 0) || last_iter) { + // Compute fresh A x_{k+1}, P x_{k+1}, A^T y_{k+1} for the residual + // checks. These cost three SpMVs but only fire occasionally. + double Ax_norm_inf = 0.0; + double z_norm_inf = 0.0; + double Px_norm_inf = 0.0; + double Aty_norm_inf = 0.0; + + if (m_ > 0) { + // scratch_m = A * x + cuda_la::spmv(C_, false, 1.0, x_.data(), 0.0, scratch_m_.data(), + spmv_buf_C_.data(), spmv_buf_C_.size(), stream); + Ax_norm_inf = cuda_la::inf_norm(m_, scratch_m_.data(), stream); + // r_prim = scratch_m - z + axby_kernel<<>>( + m_, 1.0, scratch_m_.data(), -1.0, z_.data(), r_prim_.data()); + prim_res = cuda_la::inf_norm(m_, r_prim_.data(), stream); + z_norm_inf = cuda_la::inf_norm(m_, z_.data(), stream); + } else { + prim_res = 0.0; + } + + if (n_ > 0) { + // scratch_n = P * x + cuda_la::spmv(P_, false, 1.0, x_.data(), 0.0, scratch_n_.data(), + spmv_buf_P_.data(), spmv_buf_P_.size(), stream); + Px_norm_inf = cuda_la::inf_norm(n_, scratch_n_.data(), stream); + + // r_dual = scratch_n + q + axby_kernel<<>>( + n_, 1.0, scratch_n_.data(), 1.0, q_d_.data(), r_dual_.data()); + + if (m_ > 0) { + // r_dual += C^T y (uses scratch_n as a temporary) + cuda_la::spmv(Ct_, false, 1.0, y_.data(), 0.0, scratch_n_.data(), + spmv_buf_Ct_.data(), spmv_buf_Ct_.size(), stream); + 
Aty_norm_inf = cuda_la::inf_norm(n_, scratch_n_.data(), stream); + cuda_la::axpy(n_, 1.0, scratch_n_.data(), r_dual_.data(), stream); + } + dual_res = cuda_la::inf_norm(n_, r_dual_.data(), stream); + } else { + dual_res = 0.0; + } + + double prim_tol = settings_.eps_abs + + settings_.eps_rel * + std::max(Ax_norm_inf, z_norm_inf); + double dual_tol = settings_.eps_abs + + settings_.eps_rel * + std::max({Px_norm_inf, q_norm_inf, Aty_norm_inf}); + + info.prim_res = prim_res; + info.dual_res = dual_res; + info.iter = iter + 1; + + if (prim_res <= prim_tol && dual_res <= dual_tol) { + converged = true; + break; + } + + // Adaptive rho. The implicit-M inner solver does not need a refactor + // when rho changes - we only update the scalar. + if (settings_.adaptive_rho) { + double prim_scale = std::max({Ax_norm_inf, z_norm_inf, 1e-30}); + double dual_scale = + std::max({Px_norm_inf, q_norm_inf, Aty_norm_inf, 1e-30}); + double prim_norm_rel = prim_res / prim_scale; + double dual_norm_rel = dual_res / dual_scale; + if (prim_norm_rel > 0.0 && dual_norm_rel > 0.0) { + double rho_new = rho_ * std::sqrt(prim_norm_rel / dual_norm_rel); + rho_new = std::clamp(rho_new, kRhoMin, kRhoMax); + double ratio = rho_new / rho_; + if (ratio > settings_.adaptive_rho_factor || + ratio < 1.0 / settings_.adaptive_rho_factor) { + rho_ = rho_new; + info.rho = rho_; + } + } + } + } + } + + has_warm_ = true; + info.converged = converged; + info.rho = rho_; + if (info.iter == 0) { + info.iter = iter; + } + + auto t_end = std::chrono::steady_clock::now(); + info.solve_time_s = std::chrono::duration(t_end - t_start).count(); + return info; +} + +// --------------------------------------------------------------------------- +// Helpers for building stage-specific Eigen sparse matrices on the host. +// These mirror the OsqpSolver::build_*_ helpers in osqp.cppm. 
+// --------------------------------------------------------------------------- + +namespace { + +using Triplet = Eigen::Triplet; + +// P_prj: diag(0 ... 0 (nx zeros), 1 ... 1 (me ones)) of size (nx+me, nx+me). +SparseMatrixCSC build_P_prj(int nx, int me) { + std::vector trips; + trips.reserve(static_cast(me)); + for (int i = 0; i < me; ++i) { + trips.emplace_back(nx + i, nx + i, 1.0); + } + SparseMatrixCSC P(nx + me, nx + me); + P.setFromTriplets(trips.begin(), trips.end()); + P.makeCompressed(); + return P; +} + +// C_prj: [A | -I_me] over the top me rows, [G | 0] underneath. +SparseMatrixCSC build_C_prj(const SparseMatrixCSC &A, const SparseMatrixCSC &G, + int nx, int me, int mi) { + std::vector trips; + trips.reserve(static_cast(A.nonZeros() + G.nonZeros() + me)); + for (int k = 0; k < A.outerSize(); ++k) { + for (SparseMatrixCSC::InnerIterator it(A, k); it; ++it) { + trips.emplace_back(static_cast(it.row()), + static_cast(it.col()), it.value()); + } + } + for (int k = 0; k < G.outerSize(); ++k) { + for (SparseMatrixCSC::InnerIterator it(G, k); it; ++it) { + trips.emplace_back(me + static_cast(it.row()), + static_cast(it.col()), it.value()); + } + } + for (int i = 0; i < me; ++i) { + trips.emplace_back(i, nx + i, -1.0); + } + SparseMatrixCSC C(me + mi, nx + me); + C.setFromTriplets(trips.begin(), trips.end()); + C.makeCompressed(); + return C; +} + +// P_rlx: diag(rel_x_reg * I_nx, I_nv). +SparseMatrixCSC build_P_rlx(int nx, int nv) { + std::vector trips; + trips.reserve(static_cast(nx + nv)); + for (int i = 0; i < nx; ++i) { + trips.emplace_back(i, i, kRelXReg); + } + for (int i = 0; i < nv; ++i) { + trips.emplace_back(nx + i, nx + i, 1.0); + } + SparseMatrixCSC P(nx + nv, nx + nv); + P.setFromTriplets(trips.begin(), trips.end()); + P.makeCompressed(); + return P; +} + +// C_rlx: [ A 0 ; G 0 ; H -V ; 0 I_nv ]. 
+SparseMatrixCSC build_C_rlx(const SparseMatrixCSC &A, + const SparseMatrixCSC &G, + const SparseMatrixCSC &H, + const SparseMatrixCSC &V, int nx, int me, int mi, + int mh, int nv) { + std::vector trips; + trips.reserve(static_cast(A.nonZeros() + G.nonZeros() + + H.nonZeros() + V.nonZeros() + nv)); + for (int k = 0; k < A.outerSize(); ++k) { + for (SparseMatrixCSC::InnerIterator it(A, k); it; ++it) { + trips.emplace_back(static_cast(it.row()), + static_cast(it.col()), it.value()); + } + } + for (int k = 0; k < G.outerSize(); ++k) { + for (SparseMatrixCSC::InnerIterator it(G, k); it; ++it) { + trips.emplace_back(me + static_cast(it.row()), + static_cast(it.col()), it.value()); + } + } + for (int k = 0; k < H.outerSize(); ++k) { + for (SparseMatrixCSC::InnerIterator it(H, k); it; ++it) { + trips.emplace_back(me + mi + static_cast(it.row()), + static_cast(it.col()), it.value()); + } + } + for (int k = 0; k < V.outerSize(); ++k) { + for (SparseMatrixCSC::InnerIterator it(V, k); it; ++it) { + trips.emplace_back(me + mi + static_cast(it.row()), + nx + static_cast(it.col()), -it.value()); + } + } + for (int i = 0; i < nv; ++i) { + trips.emplace_back(me + mi + mh + i, nx + i, 1.0); + } + SparseMatrixCSC C(me + mi + mh + nv, nx + nv); + C.setFromTriplets(trips.begin(), trips.end()); + C.makeCompressed(); + return C; +} + +// P_opt: full symmetric Q. We don't restrict to the upper triangle here +// (osqp.cppm does for the OSQP front-end, but the inner ADMM works on the +// symmetric version directly). +SparseMatrixCSC build_P_opt(const SparseMatrixCSC &Q) { + SparseMatrixCSC P = Q; + P.makeCompressed(); + return P; +} + +// C_opt: [ A ; G ; H ]. 
+SparseMatrixCSC build_C_opt(const SparseMatrixCSC &A, + const SparseMatrixCSC &G, + const SparseMatrixCSC &H, int nx, int me, int mi, + int mh) { + std::vector trips; + trips.reserve(static_cast(A.nonZeros() + G.nonZeros() + + H.nonZeros())); + for (int k = 0; k < A.outerSize(); ++k) { + for (SparseMatrixCSC::InnerIterator it(A, k); it; ++it) { + trips.emplace_back(static_cast(it.row()), + static_cast(it.col()), it.value()); + } + } + for (int k = 0; k < G.outerSize(); ++k) { + for (SparseMatrixCSC::InnerIterator it(G, k); it; ++it) { + trips.emplace_back(me + static_cast(it.row()), + static_cast(it.col()), it.value()); + } + } + for (int k = 0; k < H.outerSize(); ++k) { + for (SparseMatrixCSC::InnerIterator it(H, k); it; ++it) { + trips.emplace_back(me + mi + static_cast(it.row()), + static_cast(it.col()), it.value()); + } + } + SparseMatrixCSC C(me + mi + mh, nx); + C.setFromTriplets(trips.begin(), trips.end()); + C.makeCompressed(); + return C; +} + +// --------------------------------------------------------------------------- +// Side-allocated impl structs. +// +// Solver and Pipeline have fixed member layouts in their public headers and +// we are not allowed to touch those headers. The extra device buffers we +// need (system A/V, scratch b, etc.) live in heap-allocated impl structs +// kept in process-local maps keyed by the owner pointer. This is a bit +// unusual but keeps the public API frozen. +// --------------------------------------------------------------------------- + +struct SolverImpl { + DeviceCsr A_dev; + ByteBuf spmv_buf_A; + // V is small and stays on the host: V * v_relax is done with Eigen. + SparseMatrixCSC V_host; + // Scratch for b/b_prj on device. + DeviceBuffer b_d; + DeviceBuffer b_prj_d; + // max(0, x_rlx.tail(nv)) staging buffer. 
+ DeviceBuffer v_relax_full_d; +}; + +std::unordered_map> & +solver_impl_map() { + static std::unordered_map> m; + return m; +} +std::mutex &solver_impl_mutex() { + static std::mutex m; + return m; +} +SolverImpl &solver_impl(const Solver *self) { + std::lock_guard lk(solver_impl_mutex()); + auto &m = solver_impl_map(); + auto it = m.find(self); + if (it == m.end()) { + auto [ins, _] = m.emplace(self, std::make_unique()); + return *ins->second; + } + return *it->second; +} +void erase_solver_impl(const Solver *self) { + std::lock_guard lk(solver_impl_mutex()); + solver_impl_map().erase(self); +} +void move_solver_impl(const Solver *from, const Solver *to) { + std::lock_guard lk(solver_impl_mutex()); + auto &m = solver_impl_map(); + auto it = m.find(from); + if (it != m.end()) { + auto ptr = std::move(it->second); + m.erase(it); + m[to] = std::move(ptr); + } +} + +struct PipelineImpl { + DeviceInput input; + DeviceBuffer v_W_prev_d; // (N, 3) row-major + DeviceBuffer L_prev_d; // (N, 3) row-major + std::vector rm_buf; // host scratch for row-major conversion +}; + +std::unordered_map> & +pipeline_impl_map() { + static std::unordered_map> m; + return m; +} +std::mutex &pipeline_impl_mutex() { + static std::mutex m; + return m; +} +PipelineImpl &pipeline_impl(const Pipeline *self) { + std::lock_guard lk(pipeline_impl_mutex()); + auto &m = pipeline_impl_map(); + auto it = m.find(self); + if (it == m.end()) { + auto [ins, _] = m.emplace(self, std::make_unique()); + return *ins->second; + } + return *it->second; +} +void erase_pipeline_impl(const Pipeline *self) { + std::lock_guard lk(pipeline_impl_mutex()); + pipeline_impl_map().erase(self); +} + +// Convert a host MatrixX3d (column-major Eigen) into a row-major flat array. 
+void to_row_major_N3(const MatrixX3d &src, std::vector &dst) { + const int N = static_cast(src.rows()); + dst.resize(static_cast(N) * 3); + for (int i = 0; i < N; ++i) { + dst[3 * i + 0] = src(i, 0); + dst[3 * i + 1] = src(i, 1); + dst[3 * i + 2] = src(i, 2); + } +} + +// Convert a row-major flat array back into a column-major MatrixX3d. +void from_row_major_N3(const std::vector &src, MatrixX3d &dst, int N) { + dst.resize(N, 3); + for (int i = 0; i < N; ++i) { + dst(i, 0) = src[3 * i + 0]; + dst(i, 1) = src[3 * i + 1]; + dst(i, 2) = src[3 * i + 2]; + } +} + +} // namespace + +// --------------------------------------------------------------------------- +// Solver +// --------------------------------------------------------------------------- + +Solver::Solver() = default; + +Solver::~Solver() { erase_solver_impl(this); } + +Solver::Solver(Solver &&o) noexcept + : prj_(std::move(o.prj_)), rlx_(std::move(o.rlx_)), + opt_(std::move(o.opt_)), settings_(o.settings_), nx_(o.nx_), me_(o.me_), + mi_(o.mi_), mh_(o.mh_), nv_(o.nv_) { + move_solver_impl(&o, this); +} + +Solver &Solver::operator=(Solver &&o) noexcept { + if (this != &o) { + erase_solver_impl(this); + prj_ = std::move(o.prj_); + rlx_ = std::move(o.rlx_); + opt_ = std::move(o.opt_); + settings_ = o.settings_; + nx_ = o.nx_; + me_ = o.me_; + mi_ = o.mi_; + mh_ = o.mh_; + nv_ = o.nv_; + move_solver_impl(&o, this); + } + return *this; +} + +int Solver::num_vars() const { return nx_; } +int Solver::num_eq() const { return me_; } +int Solver::num_clutches() const { return nv_; } + +void Solver::reset_warm_start() { + if (prj_) + prj_->reset_warm_start(); + if (rlx_) + rlx_->reset_warm_start(); + if (opt_) + opt_->reset_warm_start(); +} + +void Solver::setup(const HostSystem &host, const DeviceSystem & /*dev*/, + const Settings &settings) { + settings_ = settings; + nx_ = host.num_vars; + me_ = host.num_eq; + mi_ = host.num_ineq; + mh_ = host.num_relaxed_ineq; + nv_ = host.num_clutches; + + if (host.A.rows() != me_ 
|| host.A.cols() != nx_ || host.G.rows() != mi_ || + host.G.cols() != nx_ || host.H.rows() != mh_ || host.H.cols() != nx_ || + host.V.rows() != mh_ || host.V.cols() != nv_ || host.Q.rows() != nx_ || + host.Q.cols() != nx_) { + throw std::invalid_argument( + "Solver::setup: HostSystem matrix shape mismatch"); + } + + // Build host stage matrices (CSC, Eigen). + SparseMatrixCSC P_prj = build_P_prj(nx_, me_); + SparseMatrixCSC C_prj = build_C_prj(host.A, host.G, nx_, me_, mi_); + SparseMatrixCSC P_rlx = build_P_rlx(nx_, nv_); + SparseMatrixCSC C_rlx = + build_C_rlx(host.A, host.G, host.H, host.V, nx_, me_, mi_, mh_, nv_); + SparseMatrixCSC P_opt = build_P_opt(host.Q); + SparseMatrixCSC C_opt = + build_C_opt(host.A, host.G, host.H, nx_, me_, mi_, mh_); + + prj_ = std::make_unique(); + rlx_ = std::make_unique(); + opt_ = std::make_unique(); + prj_->setup(std::move(P_prj), std::move(C_prj), settings_); + rlx_->setup(std::move(P_rlx), std::move(C_rlx), settings_); + opt_->setup(std::move(P_opt), std::move(C_opt), settings_); + + // Per-Solver impl: A on device for b_prj, V on host for V*v_relax. 
+ SolverImpl &impl = solver_impl(this); + SparseMatrixCSC A_csc = host.A; + if (!A_csc.isCompressed()) + A_csc.makeCompressed(); + upload_csc_as_csr(A_csc, impl.A_dev); + impl.spmv_buf_A.ensure(cuda_la::spmv_buffer_size(impl.A_dev, false)); + impl.V_host = host.V; + if (!impl.V_host.isCompressed()) + impl.V_host.makeCompressed(); + impl.b_d.resize(static_cast(me_)); + impl.b_prj_d.resize(static_cast(me_)); + impl.v_relax_full_d.resize(static_cast(nv_)); +} + +SolveInfo Solver::solve(const VectorXd &b, double *x_d, double *slack_d, + double *v_relax_d, cudaStream_t stream) { + if (b.size() != me_) { + throw std::invalid_argument("Solver::solve: b size mismatch"); + } + if (!prj_ || !rlx_ || !opt_) { + throw std::runtime_error("Solver::solve: setup() not called"); + } + SolveInfo info; + SolverImpl &impl = solver_impl(this); + if (me_ > 0) { + impl.b_d.copy_from_host(b.data(), static_cast(me_)); + } + + // -------- Stage 1: projection -------- + { + VectorXd q_prj(nx_ + me_); + q_prj.head(nx_).setZero(); + q_prj.tail(me_) = -b; + VectorXd l_prj = VectorXd::Zero(me_ + mi_); + VectorXd u_prj(me_ + mi_); + u_prj.head(me_).setZero(); + u_prj.tail(mi_).setConstant(settings_.infinity); + + info.prj = prj_->solve(q_prj, l_prj, u_prj, stream); + } + + // b_prj = A * x_prj.head(nx) (SpMV on device, then download) + VectorXd b_prj_host(me_); + if (me_ > 0) { + cuda_la::spmv(impl.A_dev, false, 1.0, prj_->x_device(), 0.0, + impl.b_prj_d.data(), impl.spmv_buf_A.data(), + impl.spmv_buf_A.size(), stream); + BREAKAGE_CUDA_CHECK(cudaMemcpyAsync(b_prj_host.data(), impl.b_prj_d.data(), + me_ * sizeof(double), + cudaMemcpyDeviceToHost, stream)); + } + + // slack_d = b - A * x_prj.head(nx) = b - b_prj + if (slack_d != nullptr && me_ > 0) { + BREAKAGE_CUDA_CHECK(cudaMemcpyAsync(slack_d, impl.b_d.data(), + me_ * sizeof(double), + cudaMemcpyDeviceToDevice, stream)); + cuda_la::axpy(me_, -1.0, impl.b_prj_d.data(), slack_d, stream); + } + + // Make sure b_prj_host is populated before we use 
it for stage 2 inputs. + BREAKAGE_CUDA_CHECK(cudaStreamSynchronize(stream)); + + // -------- Stage 2: relaxation -------- + { + VectorXd q_rlx = VectorXd::Zero(nx_ + nv_); + VectorXd l_rlx(me_ + mi_ + mh_ + nv_); + VectorXd u_rlx(me_ + mi_ + mh_ + nv_); + l_rlx.head(me_) = b_prj_host; + u_rlx.head(me_) = b_prj_host; + l_rlx.segment(me_, mi_).setConstant(-kEpsilon); + u_rlx.segment(me_, mi_).setConstant(settings_.infinity); + l_rlx.segment(me_ + mi_, mh_).setConstant(-settings_.infinity); + u_rlx.segment(me_ + mi_, mh_).setConstant(1.0 - kEpsilon); + l_rlx.tail(nv_).setZero(); + u_rlx.tail(nv_).setConstant(settings_.infinity); + + info.rlx = rlx_->solve(q_rlx, l_rlx, u_rlx, stream); + } + + // v_relax = max(0, x_rlx.tail(nv)) + VectorXd v_relax_host(nv_); + if (nv_ > 0) { + const double *x_rlx_tail = rlx_->x_device() + nx_; + relu_kernel<<>>( + nv_, x_rlx_tail, impl.v_relax_full_d.data()); + if (v_relax_d != nullptr) { + BREAKAGE_CUDA_CHECK(cudaMemcpyAsync(v_relax_d, + impl.v_relax_full_d.data(), + nv_ * sizeof(double), + cudaMemcpyDeviceToDevice, stream)); + } + BREAKAGE_CUDA_CHECK(cudaMemcpyAsync(v_relax_host.data(), + impl.v_relax_full_d.data(), + nv_ * sizeof(double), + cudaMemcpyDeviceToHost, stream)); + BREAKAGE_CUDA_CHECK(cudaStreamSynchronize(stream)); + } + + // -------- Stage 3: optimisation -------- + { + VectorXd q_opt = VectorXd::Zero(nx_); + VectorXd l_opt(me_ + mi_ + mh_); + VectorXd u_opt(me_ + mi_ + mh_); + l_opt.head(me_) = b_prj_host; + u_opt.head(me_) = b_prj_host; + l_opt.segment(me_, mi_).setConstant(-kEpsilon); + u_opt.segment(me_, mi_).setConstant(settings_.infinity); + l_opt.tail(mh_).setConstant(-settings_.infinity); + // Match osqp.cppm / ref_cpu.cpp: opt-stage H-row upper bound is 1.0 + // (NOT 1.0 - epsilon -- that one belongs to the relaxation stage). 
+ u_opt.tail(mh_).setConstant(1.0); + if (mh_ > 0 && nv_ > 0) { + VectorXd add = impl.V_host * v_relax_host; + u_opt.tail(mh_) += add; + } + + info.opt = opt_->solve(q_opt, l_opt, u_opt, stream); + } + + // Copy out x (size nx_). + if (x_d != nullptr && nx_ > 0) { + BREAKAGE_CUDA_CHECK(cudaMemcpyAsync(x_d, opt_->x_device(), + nx_ * sizeof(double), + cudaMemcpyDeviceToDevice, stream)); + } + + info.converged = + info.prj.converged && info.rlx.converged && info.opt.converged; + return info; +} + +// --------------------------------------------------------------------------- +// Pipeline - end-to-end driver mirroring BreakageChecker::solve. +// --------------------------------------------------------------------------- + +Pipeline::Pipeline() = default; +Pipeline::~Pipeline() { erase_pipeline_impl(this); } + +void Pipeline::setup(const HostSystem &host, const Thresholds &thr, + const Settings &settings) { + host_ = host; + thr_ = thr; + upload_system(host_, dev_); + qp_.setup(host_, dev_, settings); + + const int N = host_.num_parts; + v_W_curr_.resize(static_cast(N) * 3); + L_curr_.resize(static_cast(N) * 3); + b_d_.resize(static_cast(N) * 6); + x_d_.resize(static_cast(host_.num_vars)); + x_unwhiten_d_.resize(static_cast(host_.num_vars)); + slack_d_.resize(static_cast(host_.num_eq)); + v_relax_d_.resize(static_cast(host_.num_clutches)); + utilization_d_.resize(static_cast(host_.num_clutches)); + + PipelineImpl &impl = pipeline_impl(this); + impl.v_W_prev_d.resize(static_cast(N) * 3); + impl.L_prev_d.resize(static_cast(N) * 3); +} + +Solution Pipeline::solve(const HostInput &in, HostState &state, + cudaStream_t stream) { + Solution sol; + PipelineImpl &impl = pipeline_impl(this); + + const int N = host_.num_parts; + const int K = host_.num_clutches; + const int ncv = host_.num_contact_vertices; + const int ncap = static_cast(host_.capacity_clutch_indices.size()); + const int num_vars = host_.num_vars; + const int me = host_.num_eq; + + // 0. 
Upload input + previous state. We always re-upload v_W_prev and + // L_prev from HostState in case the user mutated them between solves. + upload_input(in, impl.input); + + if (state.v_W_prev.rows() != N || state.L_prev.rows() != N) { + throw std::invalid_argument("Pipeline::solve: HostState dim mismatch"); + } + to_row_major_N3(state.v_W_prev, impl.rm_buf); + impl.v_W_prev_d.copy_from_host(impl.rm_buf.data(), + static_cast(N) * 3); + to_row_major_N3(state.L_prev, impl.rm_buf); + impl.L_prev_d.copy_from_host(impl.rm_buf.data(), + static_cast(N) * 3); + + // 1. fit_se3 -> q_W_CC, t_W_CC (host scalars). + double q_W_CC[4]; + double t_W_CC[3]; + cuda_kernels::fit_se3(N, dev_.q_CC.data(), dev_.c_CC.data(), + impl.input.q.data(), impl.input.c.data(), + dev_.mass.data(), dev_.total_mass, + dev_.L0 * dev_.L0 / 2.0, q_W_CC, t_W_CC, stream); + + // 2. fit_twist -> w0/v0 (host) + v_W_curr (device). + // v0 is computed but unused by the rest of the pipeline (matches the CPU + // baseline in breakage.cppm: only w0 feeds compute_L and v_W feeds b). + double w0[3]; + double v0[3]; + cuda_kernels::fit_twist(N, impl.input.w.data(), impl.input.v.data(), + dev_.c_CC.data(), q_W_CC, t_W_CC, dev_.mass.data(), + dev_.total_mass, dev_.L0 * dev_.L0, w0, v0, + v_W_curr_.data(), stream); + (void)v0; + + // 3. compute_Pi (host scalar work). Eigen::Quaterniond stores as (x,y,z,w). + double q_prev[4]; + q_prev[0] = state.q_W_CC_prev.x(); + q_prev[1] = state.q_W_CC_prev.y(); + q_prev[2] = state.q_W_CC_prev.z(); + q_prev[3] = state.q_W_CC_prev.w(); + double Pi[9]; + cuda_kernels::compute_Pi_host(q_prev, q_W_CC, Pi); + + // 4. compute_L on the device. + cuda_kernels::compute_L(N, dev_.I_CC.data(), q_W_CC, w0, L_curr_.data(), + stream); + + // 5. Assemble RHS b on the device. 
+ cuda_kernels::assemble_b(N, v_W_curr_.data(), impl.v_W_prev_d.data(), + L_curr_.data(), impl.L_prev_d.data(), + impl.input.J.data(), impl.input.H.data(), + dev_.mass.data(), Pi, in.dt, dev_.L0, b_d_.data(), + stream); + + // 6. Download b to host (small: 6*N doubles). + VectorXd b_host(6 * N); + if (N > 0) { + BREAKAGE_CUDA_CHECK(cudaMemcpyAsync(b_host.data(), b_d_.data(), + 6 * N * sizeof(double), + cudaMemcpyDeviceToHost, stream)); + BREAKAGE_CUDA_CHECK(cudaStreamSynchronize(stream)); + } + + // 7. QP solve. + sol.info = qp_.solve(b_host, x_d_.data(), slack_d_.data(), v_relax_d_.data(), + stream); + + // 8. slack_fraction = ||slack||_2 / max(||b||_2, floor) + double slack_norm = + (me > 0) ? cuda_la::nrm2(me, slack_d_.data(), stream) : 0.0; + double b_norm = b_host.norm(); + sol.slack_fraction = + slack_norm / std::max(b_norm, thr_.slack_fraction_b_floor); + + // 9. Postprocess: undo whitening + per-clutch utilisation. + if (sol.info.converged) { + cuda_kernels::postprocess(ncv, K, ncap, x_d_.data(), + dev_.capacity_clutch_indices.data(), + dev_.capacities.data(), dev_.clutch_whiten.data(), + x_unwhiten_d_.data(), utilization_d_.data(), + stream); + + sol.x.resize(num_vars); + sol.utilization.resize(K); + if (num_vars > 0) { + BREAKAGE_CUDA_CHECK(cudaMemcpyAsync(sol.x.data(), x_unwhiten_d_.data(), + num_vars * sizeof(double), + cudaMemcpyDeviceToHost, stream)); + } + if (K > 0) { + BREAKAGE_CUDA_CHECK(cudaMemcpyAsync(sol.utilization.data(), + utilization_d_.data(), + K * sizeof(double), + cudaMemcpyDeviceToHost, stream)); + } + BREAKAGE_CUDA_CHECK(cudaStreamSynchronize(stream)); + } + + // 10. Persist HostState (host-side copies of v_W_curr, L_curr, q_W_CC). + // Eigen::Quaterniond constructor takes (w, x, y, z). 
+ state.q_W_CC_prev = + Eigen::Quaterniond(q_W_CC[3], q_W_CC[0], q_W_CC[1], q_W_CC[2]); + + if (state.v_W_prev.rows() != N) { + state.v_W_prev.resize(N, 3); + } + if (state.L_prev.rows() != N) { + state.L_prev.resize(N, 3); + } + if (N > 0) { + impl.rm_buf.resize(static_cast(N) * 3); + BREAKAGE_CUDA_CHECK(cudaMemcpy(impl.rm_buf.data(), v_W_curr_.data(), + 3 * N * sizeof(double), + cudaMemcpyDeviceToHost)); + from_row_major_N3(impl.rm_buf, state.v_W_prev, N); + BREAKAGE_CUDA_CHECK(cudaMemcpy(impl.rm_buf.data(), L_curr_.data(), + 3 * N * sizeof(double), + cudaMemcpyDeviceToHost)); + from_row_major_N3(impl.rm_buf, state.L_prev, N); + } + + return sol; +} + +} // namespace breakage_cuda::cuda_qp diff --git a/native/cuda/breakage_cuda/src/ref_cpu.cpp b/native/cuda/breakage_cuda/src/ref_cpu.cpp new file mode 100644 index 0000000..76d2eeb --- /dev/null +++ b/native/cuda/breakage_cuda/src/ref_cpu.cpp @@ -0,0 +1,778 @@ +// SPDX-License-Identifier: MIT +// +// CPU reference implementation of the per-step BreakageChecker hot path. +// +// This file is a near-verbatim port of the algorithmic kernels in +// native/modules/bricksim/physx/breakage.cppm (fit_se3, fit_twist, +// so3_log_from_unit_quat, so3_jacobian_inv, compute_Pi, compute_L, +// the b-vector assembly and the unwhitening / utilization post-process) +// together with a hand-written re-implementation of the three-stage OSQP +// wrapper from native/modules/bricksim/physx/osqp.cppm. We deliberately +// avoid any bricksim C++26 module imports so this subproject stays +// self-contained. +// +// Constants (kSmallS, kSmallTheta, lambda regularizers, OSQP epsilon, +// rel_x_reg, rho overrides) are copied verbatim so that this baseline +// stays binary-compatible with the production BreakageChecker output up +// to OSQP's tolerance. 
+ +#include "breakage_cuda/ref_cpu.hpp" + +#include +#include + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace breakage_cuda { + +namespace { + +// Eigen 3.4 does not ship Vector3d::asSkewSymmetric() (which breakage.cppm +// uses); inline an equivalent here so the rest of the port can stay +// line-by-line faithful to the original. +inline Matrix3d skew(const Vector3d &v) { + Matrix3d m; + m << 0.0, -v.z(), v.y(), + v.z(), 0.0, -v.x(), + -v.y(), v.x(), 0.0; + return m; +} + +// Wrap an Eigen CSC sparse matrix as an OSQP CSC view. The OSQP matrix +// borrows the Eigen pointers; the Eigen sparse must outlive osqp_setup() +// (which copies the data into its workspace) but may be freed afterwards. +inline OSQPCscMatrix to_osqp_csc(const SparseMatrixCSC &mat) { + OSQPCscMatrix out{}; + out.m = static_cast(mat.rows()); + out.n = static_cast(mat.cols()); + out.p = const_cast(mat.outerIndexPtr()); + out.i = const_cast(mat.innerIndexPtr()); + out.x = const_cast(mat.valuePtr()); + out.nzmax = static_cast(mat.nonZeros()); + out.nz = -1; + out.owned = 0; + return out; +} + +inline void check_osqp(OSQPInt ret) { + if (ret != 0) { + const char *msg = osqp_error_message(ret); + throw std::runtime_error(std::string("osqp: ") + + (msg ? msg : "unknown error")); + } +} + +// Mirrors osqp.cppm's build_settings_(): verbose off, tight tolerances tied +// to the same epsilon used by the constraint slack, max_iter overridable +// from the OSQP_MAX_ITER environment variable to match the production code. 
+OSQPSettings make_settings(double epsilon) { + OSQPSettings s; + osqp_set_default_settings(&s); + s.verbose = 0; + s.max_iter = 10000; + s.eps_abs = epsilon; + s.eps_rel = epsilon; + if (auto *env = std::getenv("OSQP_MAX_ITER")) { + s.max_iter = std::stoi(env); + } + return s; +} + +void fill_stage(StageInfo &si, const OSQPInfo &info, double rho) { + si.converged = info.status_val == OSQP_SOLVED; + si.iter = static_cast(info.iter); + si.prim_res = info.prim_res; + si.dual_res = info.dual_res; + si.rho = rho; + si.solve_time_s = info.solve_time; +} + +} // namespace + +// --------------------------------------------------------------------------- +// InternalQpState (forward-declared in system.hpp). The CPU baseline keeps +// the cached stage matrices, the warm-start vectors, and the three OSQP +// solver handles here so the HostState payload can stay opaque. +// --------------------------------------------------------------------------- + +struct InternalQpState { + InternalQpState() = default; + explicit InternalQpState(const HostSystem &sys) { setup_from(sys); } + + // Captured dimensions (mirror osqp.cppm). + int nx{0}; // primary variables + int nv{0}; // relaxation slack variables (one per clutch) + int me{0}; // equality rows = 6 * num_parts + int mi{0}; // inequality rows + int mh{0}; // relaxed inequality rows + + // A/V are needed at solve time (b_prj = A * x_prj.head(nx); u_opt += V*v). + SparseMatrixCSC A; + SparseMatrixCSC V; + + // Per-stage matrices/vectors, built once per setup(). + SparseMatrixCSC P_prj, C_prj; + VectorXd l_prj, u_prj; + SparseMatrixCSC P_rlx, C_rlx; + VectorXd q_rlx; + SparseMatrixCSC P_opt, C_opt; + VectorXd q_opt; + + // OSQP tunables. epsilon mirrors osqp.cppm's default and honors the + // OSQP_EPSILON env var like the production solver does. + double epsilon{1e-4}; + double rel_x_reg{1e-4}; + + // Persisted across calls (warm starts + last solve outputs). 
+ bool has_state{false}; + VectorXd prj_sol, prj_dual; + double prj_rho{}; + VectorXd rlx_sol, rlx_dual; + double rlx_rho{}; + VectorXd opt_sol, opt_dual; + double opt_rho{}; + VectorXd slack; // b - A * x_prj.head(nx) + VectorXd v_relax; // max(0, rlx_sol.tail(nv)) + + // OSQP solver handles. Default-initialized to nullptr so the destructor + // is a no-op until something is actually built. + std::unique_ptr prj_solver{ + nullptr, &osqp_cleanup}; + std::unique_ptr rlx_solver{ + nullptr, &osqp_cleanup}; + std::unique_ptr opt_solver{ + nullptr, &osqp_cleanup}; + + void setup_from(const HostSystem &sys); + + // Linear cost / bound builders, parametrized by the current right-hand + // side. These mirror build_q_prj_ / build_l_rlx_ / ... in osqp.cppm. + VectorXd build_q_prj(const VectorXd &b) const { + VectorXd q(nx + me); + q.head(nx).setZero(); + q.tail(me) = -b; + return q; + } + + VectorXd build_l_rlx(const VectorXd &b_prj) const { + VectorXd l(me + mi + mh + nv); + l.head(me) = b_prj; + l.segment(me, mi).setConstant(-epsilon); + l.segment(me + mi, mh).setConstant(-OSQP_INFTY); + l.tail(nv).setZero(); + return l; + } + + VectorXd build_u_rlx(const VectorXd &b_prj) const { + VectorXd u(me + mi + mh + nv); + u.head(me) = b_prj; + u.segment(me, mi).setConstant(OSQP_INFTY); + u.segment(me + mi, mh).setConstant(1.0 - epsilon); + u.tail(nv).setConstant(OSQP_INFTY); + return u; + } + + VectorXd build_l_opt(const VectorXd &b_prj) const { + VectorXd l(me + mi + mh); + l.head(me) = b_prj; + l.segment(me, mi).setConstant(-epsilon); + l.tail(mh).setConstant(-OSQP_INFTY); + return l; + } + + VectorXd build_u_opt(const VectorXd &b_prj, const VectorXd &v) const { + VectorXd u(me + mi + mh); + u.head(me) = b_prj; + u.segment(me, mi).setConstant(OSQP_INFTY); + u.tail(mh).setConstant(1.0); + u.tail(mh) += V * v; + return u; + } +}; + +void InternalQpState::setup_from(const HostSystem &sys) { + using Trip = Eigen::Triplet; + + nx = static_cast(sys.Q.rows()); + nv = 
static_cast(sys.V.cols()); + me = static_cast(sys.A.rows()); + mi = static_cast(sys.G.rows()); + mh = static_cast(sys.H.rows()); + + if (auto *env = std::getenv("OSQP_EPSILON")) { + epsilon = std::stod(env); + } + + if (sys.Q.cols() != nx || sys.A.cols() != nx || sys.G.cols() != nx || + sys.H.cols() != nx || sys.V.rows() != mh) { + throw std::invalid_argument("InternalQpState: matrix dimension mismatch"); + } + + A = sys.A; + V = sys.V; + if (!A.isCompressed()) A.makeCompressed(); + if (!V.isCompressed()) V.makeCompressed(); + + // -- Stage 1 (projection) -------------------------------------------------- + // Variables [x; s] of size nx + me. Objective is 1/2 ||s||^2 only, so P_prj + // has identity on the s-block and zero on the x-block. + { + std::vector tr; + tr.reserve(static_cast(me)); + for (int i = 0; i < me; ++i) { + tr.emplace_back(nx + i, nx + i, 1.0); + } + P_prj.resize(nx + me, nx + me); + P_prj.setFromTriplets(tr.begin(), tr.end()); + P_prj.makeCompressed(); + } + // C_prj = [A -I; G 0]: the upper block expresses A x - s = -b (q tail); + // the lower block keeps Gx >= 0. + { + std::vector tr; + tr.reserve(static_cast(A.nonZeros() + sys.G.nonZeros() + me)); + for (int k = 0; k < A.outerSize(); ++k) { + for (SparseMatrixCSC::InnerIterator it(A, k); it; ++it) { + tr.emplace_back(it.row(), it.col(), it.value()); + } + } + for (int k = 0; k < sys.G.outerSize(); ++k) { + for (SparseMatrixCSC::InnerIterator it(sys.G, k); it; ++it) { + tr.emplace_back(me + it.row(), it.col(), it.value()); + } + } + for (int i = 0; i < me; ++i) { + tr.emplace_back(i, nx + i, -1.0); + } + C_prj.resize(me + mi, nx + me); + C_prj.setFromTriplets(tr.begin(), tr.end()); + C_prj.makeCompressed(); + } + l_prj = VectorXd::Zero(me + mi); + u_prj.resize(me + mi); + u_prj.head(me).setZero(); // equality rows: 0 <= ... <= 0 + u_prj.tail(mi).setConstant(OSQP_INFTY); // inequality rows: 0 <= ... 
<= +inf + + // -- Stage 2 (relaxation) -------------------------------------------------- + // Variables [x; v] of size nx + nv. P_rlx penalizes x with rel_x_reg and + // v with 1.0 (forces v small unless capacity constraints push it up). + { + std::vector tr; + tr.reserve(static_cast(nx + nv)); + for (int i = 0; i < nx; ++i) tr.emplace_back(i, i, rel_x_reg); + for (int i = 0; i < nv; ++i) tr.emplace_back(nx + i, nx + i, 1.0); + P_rlx.resize(nx + nv, nx + nv); + P_rlx.setFromTriplets(tr.begin(), tr.end()); + P_rlx.makeCompressed(); + } + // C_rlx = [A 0; G 0; H -V; 0 I] + { + std::vector tr; + tr.reserve(static_cast(A.nonZeros() + sys.G.nonZeros() + + sys.H.nonZeros() + V.nonZeros() + nv)); + for (int k = 0; k < A.outerSize(); ++k) { + for (SparseMatrixCSC::InnerIterator it(A, k); it; ++it) { + tr.emplace_back(it.row(), it.col(), it.value()); + } + } + for (int k = 0; k < sys.G.outerSize(); ++k) { + for (SparseMatrixCSC::InnerIterator it(sys.G, k); it; ++it) { + tr.emplace_back(me + it.row(), it.col(), it.value()); + } + } + for (int k = 0; k < sys.H.outerSize(); ++k) { + for (SparseMatrixCSC::InnerIterator it(sys.H, k); it; ++it) { + tr.emplace_back(me + mi + it.row(), it.col(), it.value()); + } + } + for (int k = 0; k < V.outerSize(); ++k) { + for (SparseMatrixCSC::InnerIterator it(V, k); it; ++it) { + tr.emplace_back(me + mi + it.row(), nx + it.col(), -it.value()); + } + } + for (int i = 0; i < nv; ++i) { + tr.emplace_back(me + mi + mh + i, nx + i, 1.0); + } + C_rlx.resize(me + mi + mh + nv, nx + nv); + C_rlx.setFromTriplets(tr.begin(), tr.end()); + C_rlx.makeCompressed(); + } + q_rlx = VectorXd::Zero(nx + nv); + + // -- Stage 3 (optimization) ------------------------------------------------ + // OSQP wants the upper triangle of the Hessian only. 
+ { + std::vector tr; + tr.reserve(static_cast(sys.Q.nonZeros())); + for (int k = 0; k < sys.Q.outerSize(); ++k) { + for (SparseMatrixCSC::InnerIterator it(sys.Q, k); it; ++it) { + if (it.row() <= it.col()) { + tr.emplace_back(it.row(), it.col(), it.value()); + } + } + } + P_opt.resize(nx, nx); + P_opt.setFromTriplets(tr.begin(), tr.end()); + P_opt.makeCompressed(); + } + // C_opt = [A; G; H] + { + std::vector tr; + tr.reserve(static_cast(A.nonZeros() + sys.G.nonZeros() + + sys.H.nonZeros())); + for (int k = 0; k < A.outerSize(); ++k) { + for (SparseMatrixCSC::InnerIterator it(A, k); it; ++it) { + tr.emplace_back(it.row(), it.col(), it.value()); + } + } + for (int k = 0; k < sys.G.outerSize(); ++k) { + for (SparseMatrixCSC::InnerIterator it(sys.G, k); it; ++it) { + tr.emplace_back(me + it.row(), it.col(), it.value()); + } + } + for (int k = 0; k < sys.H.outerSize(); ++k) { + for (SparseMatrixCSC::InnerIterator it(sys.H, k); it; ++it) { + tr.emplace_back(me + mi + it.row(), it.col(), it.value()); + } + } + C_opt.resize(me + mi + mh, nx); + C_opt.setFromTriplets(tr.begin(), tr.end()); + C_opt.makeCompressed(); + } + q_opt = VectorXd::Zero(nx); +} + +namespace { + +using OsqpSolverPtr = std::unique_ptr; + +OsqpSolverPtr build_prj_solver(InternalQpState &qp, const VectorXd &b) { + OSQPSettings s = make_settings(qp.epsilon); + OSQPCscMatrix P = to_osqp_csc(qp.P_prj); + OSQPCscMatrix C = to_osqp_csc(qp.C_prj); + VectorXd q = qp.build_q_prj(b); + OSQPSolver *solver = nullptr; + check_osqp(osqp_setup(&solver, &P, q.data(), &C, qp.l_prj.data(), + qp.u_prj.data(), C.m, C.n, &s)); + return {solver, &osqp_cleanup}; +} + +OsqpSolverPtr build_rlx_solver(InternalQpState &qp, const VectorXd &b_prj) { + OSQPSettings s = make_settings(qp.epsilon); + OSQPCscMatrix P = to_osqp_csc(qp.P_rlx); + OSQPCscMatrix C = to_osqp_csc(qp.C_rlx); + VectorXd l = qp.build_l_rlx(b_prj); + VectorXd u = qp.build_u_rlx(b_prj); + OSQPSolver *solver = nullptr; + check_osqp(osqp_setup(&solver, &P, 
qp.q_rlx.data(), &C, l.data(), u.data(), + C.m, C.n, &s)); + return {solver, &osqp_cleanup}; +} + +OsqpSolverPtr build_opt_solver(InternalQpState &qp, const VectorXd &b_prj, + const VectorXd &v) { + OSQPSettings s = make_settings(qp.epsilon); + // Mirror osqp.cppm: opt stage starts from rho=1.0 (not the default 0.1). + s.rho = 1.0; + OSQPCscMatrix P = to_osqp_csc(qp.P_opt); + OSQPCscMatrix C = to_osqp_csc(qp.C_opt); + VectorXd l = qp.build_l_opt(b_prj); + VectorXd u = qp.build_u_opt(b_prj, v); + OSQPSolver *solver = nullptr; + check_osqp(osqp_setup(&solver, &P, qp.q_opt.data(), &C, l.data(), u.data(), + C.m, C.n, &s)); + return {solver, &osqp_cleanup}; +} + +void update_prj_solver(InternalQpState &qp, OSQPSolver *solver, + const VectorXd &b) { + VectorXd q = qp.build_q_prj(b); + check_osqp(osqp_update_data_vec(solver, q.data(), nullptr, nullptr)); +} + +void update_rlx_solver(InternalQpState &qp, OSQPSolver *solver, + const VectorXd &b_prj) { + VectorXd l = qp.build_l_rlx(b_prj); + VectorXd u = qp.build_u_rlx(b_prj); + check_osqp(osqp_update_data_vec(solver, nullptr, l.data(), u.data())); +} + +void update_opt_solver(InternalQpState &qp, OSQPSolver *solver, + const VectorXd &b_prj, const VectorXd &v) { + VectorXd l = qp.build_l_opt(b_prj); + VectorXd u = qp.build_u_opt(b_prj, v); + check_osqp(osqp_update_data_vec(solver, nullptr, l.data(), u.data())); +} + +} // namespace + +} // namespace breakage_cuda + +namespace breakage_cuda::ref_cpu { + +// --------------------------------------------------------------------------- +// Pure dense kernels. Verbatim ports of breakage.cppm:58-180. 
+// --------------------------------------------------------------------------- + +Transformd fit_se3(const MatrixX4d &q0, const MatrixX3d &t0, + const MatrixX4d &qx, const MatrixX3d &tx, + const VectorXd &mass, double total_mass, double lambda_R) { + using Eigen::Index; + using Eigen::Quaterniond; + Index N = mass.size(); + Vector3d t0_bar = t0.transpose() * mass / total_mass; + Vector3d tx_bar = tx.transpose() * mass / total_mass; + Matrix3d H = + ((t0.rowwise() - t0_bar.transpose()).array().colwise() * mass.array()) + .matrix() + .transpose() * + (tx.rowwise() - tx_bar.transpose()); + Matrix3d K = Matrix3d::Zero(); + for (Index i = 0; i < N; ++i) { + K += mass(i) * (Quaterniond{q0.row(i).transpose()} * + Quaterniond{qx.row(i).transpose()}.conjugate()) + .toRotationMatrix(); + } + Matrix3d S = H + lambda_R * K; + Eigen::JacobiSVD svd{S, Eigen::ComputeFullU | Eigen::ComputeFullV}; + Matrix3d U = svd.matrixU(); + Matrix3d V = svd.matrixV(); + Matrix3d D = Matrix3d::Identity(); + if ((V * U.transpose()).determinant() < 0.0) { + D(2, 2) = -1.0; + } + Matrix3d R = V * D * U.transpose(); + Vector3d t = tx_bar - R * t0_bar; + Quaterniond q{R}; + q.normalize(); + return {q, t}; +} + +TwistFitResult fit_twist(const MatrixX3d &w, const MatrixX3d &v, + const MatrixX3d &c_CC, const Transformd &T_W_CC, + const VectorXd &mass, double total_mass, + double lambda_w) { + const auto &q_W_CC = T_W_CC.q; + const auto &t_W_CC = T_W_CC.t; + MatrixX3d c_W = (c_CC * q_W_CC.toRotationMatrix().transpose()).rowwise() + + t_W_CC.transpose(); + Vector3d r = c_W.transpose() * mass / total_mass; + MatrixX3d d = c_W.rowwise() - r.transpose(); + MatrixX3d d_weighted = d.array().colwise() * mass.array(); + Matrix3d S = d.transpose() * d_weighted.matrix(); + Matrix3d LHS = + Matrix3d::Identity() * (S.trace() + lambda_w * total_mass) - S; + Vector3d L; + L.x() = d_weighted.col(1).dot(v.col(2)) - d_weighted.col(2).dot(v.col(1)); + L.y() = d_weighted.col(2).dot(v.col(0)) - 
d_weighted.col(0).dot(v.col(2)); + L.z() = d_weighted.col(0).dot(v.col(1)) - d_weighted.col(1).dot(v.col(0)); + Vector3d regularization = lambda_w * (w.transpose() * mass); + Vector3d RHS = L + regularization; + TwistFitResult result; + result.w0 = LHS.ldlt().solve(RHS); + result.v0 = v.transpose() * mass / total_mass; + result.v_W = d.rowwise().cross(-result.w0).rowwise() + result.v0.transpose(); + return result; +} + +Vector3d so3_log_from_unit_quat(Eigen::Quaterniond q) { + // Branch threshold on s = ||q.vec|| = sin(theta/2). + constexpr double kSmallS = 5e-5; + q.normalize(); + // Pick the shortest representation (theta in [0, pi]); the pi case (w==0) + // is intrinsically ambiguous. + if (q.w() < 0.0) { + q.coeffs() *= -1.0; + } + Vector3d v = q.vec(); + double s = v.norm(); + double w = q.w(); + if (s < kSmallS) { + // theta/s = 2 + s^2/3 + 3 s^4/20 + O(s^6) + double s2 = s * s; + double s4 = s2 * s2; + double theta_over_s = 2.0 + (s2 / 3.0) + (3.0 * s4 / 20.0); + return theta_over_s * v; + } + double theta = 2.0 * std::atan2(s, w); + return (theta / s) * v; +} + +Matrix3d so3_jacobian_inv(const Vector3d &phi) { + // Branch threshold on theta = ||phi||. + constexpr double kSmallTheta = 1e-4; + Matrix3d Phi = skew(phi); + double theta2 = phi.squaredNorm(); + if (theta2 < kSmallTheta * kSmallTheta) { + // c(theta) = 1/12 + theta^2/720 + theta^4/30240 + O(theta^6) + double t2 = theta2; + double t4 = t2 * t2; + double c = (1.0 / 12.0) + (t2 / 720.0) + (t4 / 30240.0); + return Matrix3d::Identity() - 0.5 * Phi + c * (Phi * Phi); + } + double theta = std::sqrt(theta2); + double half = 0.5 * theta; + double cot_half = std::cos(half) / std::sin(half); + // c(theta) = 1/theta^2 - cot(theta/2)/(2 theta) + double c = (1.0 / theta2) - (cot_half / (2.0 * theta)); + return Matrix3d::Identity() - 0.5 * Phi + c * (Phi * Phi); +} + +Matrix3d compute_Pi(const Eigen::Quaterniond &qm, + const Eigen::Quaterniond &qp) { + // Relative rotation: R_rel = Rm^T Rp. 
+ return so3_jacobian_inv(so3_log_from_unit_quat(qm.conjugate() * qp)) * + qm.toRotationMatrix().transpose(); +} + +MatrixX3d compute_L( + const Eigen::Matrix &I_flat, + const Eigen::Quaterniond &q_W_CC, const Vector3d &w0) { + Vector3d w_CC = q_W_CC.conjugate() * w0; + return (I_flat.leftCols<3>() * w_CC.x() + + I_flat.middleCols<3>(3) * w_CC.y() + + I_flat.rightCols<3>() * w_CC.z()) * + q_W_CC.toRotationMatrix().transpose(); +} + +// --------------------------------------------------------------------------- +// b vector assembly. Layout: row-major (num_parts, 6). Mirrors lines +// 1018-1035 of breakage.cppm. +// --------------------------------------------------------------------------- + +VectorXd assemble_b(const HostSystem &sys, const HostInput &in, + const MatrixX3d &v_W_curr, const MatrixX3d &v_W_prev, + const MatrixX3d &L_curr, const MatrixX3d &L_prev, + const Matrix3d &Pi) { + using Eigen::Index; + Index N = sys.num_parts; + VectorXd b(6 * N); + Eigen::Map> b_mat{b.data(), N, 6}; + b_mat.leftCols<3>() = + (((v_W_curr - v_W_prev).array().colwise() * sys.mass.array()).matrix() - + in.J) * + Pi.transpose() / in.dt; + b_mat.rightCols<3>() = ((L_curr - L_prev - in.H) * Pi.transpose()) / in.dt; + if constexpr (kEnableTorqueScaling) { + b_mat.rightCols<3>() /= sys.L0; + } + return b; +} + +VectorXd compute_b(const HostSystem &sys, const HostInput &in, HostState &state, + Eigen::Quaterniond *q_W_CC_out, MatrixX3d *v_W_out, + MatrixX3d *L_out) { + Transformd T_W_CC = fit_se3(sys.q_CC, sys.c_CC, in.q, in.c, sys.mass, + sys.total_mass, sys.L0 * sys.L0 / 2.0); + TwistFitResult twist = + fit_twist(in.w, in.v, sys.c_CC, T_W_CC, sys.mass, sys.total_mass, + sys.L0 * sys.L0); + MatrixX3d L_curr = compute_L(sys.I_CC, T_W_CC.q, twist.w0); + Matrix3d Pi = compute_Pi(state.q_W_CC_prev, T_W_CC.q); + VectorXd b = assemble_b(sys, in, twist.v_W, state.v_W_prev, L_curr, + state.L_prev, Pi); + if (q_W_CC_out) *q_W_CC_out = T_W_CC.q; + if (v_W_out) *v_W_out = std::move(twist.v_W); + 
if (L_out) *L_out = std::move(L_curr); + return b; +} + +// --------------------------------------------------------------------------- +// Three-stage QP solver. Mirrors OsqpSolver::solve() in osqp.cppm step by +// step: each stage either updates an existing OSQP solver in place (warm +// path) or rebuilds it (cold or post-reset path) and warm-starts it from +// the persisted state vectors. +// --------------------------------------------------------------------------- + +SolveInfo solve_qp(const HostSystem &sys, const VectorXd &b, HostState &state, + VectorXd &x_out, VectorXd &slack_out, + VectorXd &v_relax_out) { + if (b.size() != 6 * sys.num_parts) { + throw std::invalid_argument("solve_qp: b dimension mismatch"); + } + if (!state.qp) { + state.qp = std::make_shared(sys); + } + InternalQpState &qp = *state.qp; + if (qp.has_state) { + if (qp.prj_sol.size() != qp.C_prj.cols() || + qp.prj_dual.size() != qp.C_prj.rows() || + qp.rlx_sol.size() != qp.C_rlx.cols() || + qp.rlx_dual.size() != qp.C_rlx.rows() || + qp.opt_sol.size() != qp.C_opt.cols() || + qp.opt_dual.size() != qp.C_opt.rows()) { + throw std::invalid_argument("solve_qp: warm-start dimension mismatch"); + } + } + + SolveInfo info; + + // -- Stage 1: projection --------------------------------------------------- + if (qp.has_state) { + if (qp.prj_solver) { + update_prj_solver(qp, qp.prj_solver.get(), b); + } else { + qp.prj_solver = build_prj_solver(qp, b); + check_osqp(osqp_warm_start(qp.prj_solver.get(), qp.prj_sol.data(), + qp.prj_dual.data())); + check_osqp(osqp_update_rho(qp.prj_solver.get(), qp.prj_rho)); + } + } else { + qp.prj_solver = build_prj_solver(qp, b); + } + check_osqp(osqp_solve(qp.prj_solver.get())); + { + OSQPSolver *s = qp.prj_solver.get(); + qp.prj_sol = Eigen::Map(s->solution->x, qp.C_prj.cols()); + qp.prj_dual = Eigen::Map(s->solution->y, qp.C_prj.rows()); + qp.prj_rho = s->settings->rho; + fill_stage(info.prj, *s->info, qp.prj_rho); + } + VectorXd b_prj = qp.A * 
qp.prj_sol.head(qp.nx); + qp.slack = b - b_prj; + + // -- Stage 2: relaxation --------------------------------------------------- + if (qp.has_state) { + if (qp.rlx_solver) { + update_rlx_solver(qp, qp.rlx_solver.get(), b_prj); + } else { + qp.rlx_solver = build_rlx_solver(qp, b_prj); + check_osqp(osqp_warm_start(qp.rlx_solver.get(), qp.rlx_sol.data(), + qp.rlx_dual.data())); + check_osqp(osqp_update_rho(qp.rlx_solver.get(), qp.rlx_rho)); + } + } else { + qp.rlx_solver = build_rlx_solver(qp, b_prj); + } + check_osqp(osqp_solve(qp.rlx_solver.get())); + { + OSQPSolver *s = qp.rlx_solver.get(); + qp.rlx_sol = Eigen::Map(s->solution->x, qp.C_rlx.cols()); + qp.rlx_dual = Eigen::Map(s->solution->y, qp.C_rlx.rows()); + qp.rlx_rho = s->settings->rho; + fill_stage(info.rlx, *s->info, qp.rlx_rho); + } + qp.v_relax = qp.rlx_sol.tail(qp.nv).cwiseMax(0.0); + + // -- Stage 3: optimization ------------------------------------------------- + if (qp.has_state) { + if (qp.opt_solver) { + update_opt_solver(qp, qp.opt_solver.get(), b_prj, qp.v_relax); + } else { + qp.opt_solver = build_opt_solver(qp, b_prj, qp.v_relax); + check_osqp(osqp_warm_start(qp.opt_solver.get(), qp.opt_sol.data(), + qp.opt_dual.data())); + check_osqp(osqp_update_rho(qp.opt_solver.get(), qp.opt_rho)); + } + } else { + qp.opt_solver = build_opt_solver(qp, b_prj, qp.v_relax); + } + check_osqp(osqp_solve(qp.opt_solver.get())); + { + OSQPSolver *s = qp.opt_solver.get(); + qp.opt_sol = Eigen::Map(s->solution->x, qp.C_opt.cols()); + qp.opt_dual = Eigen::Map(s->solution->y, qp.C_opt.rows()); + qp.opt_rho = s->settings->rho; + fill_stage(info.opt, *s->info, qp.opt_rho); + } + info.converged = + info.prj.converged && info.rlx.converged && info.opt.converged; + qp.has_state = true; + + // x_out is left whitened; solve_full() does the un-whitening, mirroring the + // CUDA pipeline which also returns whitened `x_d` from its solver. 
+ x_out = qp.opt_sol.head(qp.nx); + slack_out = qp.slack; + v_relax_out = qp.v_relax; + return info; +} + +// --------------------------------------------------------------------------- +// End-to-end driver. Mirrors BreakageChecker::solve() at breakage.cppm:996, +// minus the dump bookkeeping and logging which are out of scope here. +// --------------------------------------------------------------------------- + +Solution solve_full(const HostSystem &sys, const HostInput &in, + HostState &state, const Thresholds &thr) { + Eigen::Quaterniond q_W_CC_curr; + MatrixX3d v_W_curr; + MatrixX3d L_curr; + VectorXd b = compute_b(sys, in, state, &q_W_CC_curr, &v_W_curr, &L_curr); + + Solution sol; + if (!thr.enabled) { + // Match the production code path: state is NOT advanced when the checker + // is disabled. Re-enabling later restarts from the previously cached + // q_W_CC_prev / v_W_prev / L_prev. + return sol; + } + + VectorXd x_raw, slack, v_relax; + sol.info = solve_qp(sys, b, state, x_raw, slack, v_relax); + sol.slack_fraction = + slack.norm() / std::max(b.norm(), thr.slack_fraction_b_floor); + + // Advance the per-frame state (matches lines 1053-1055 of breakage.cppm: + // done unconditionally after the solve, before the converged-only + // postprocess block). + state.q_W_CC_prev = q_W_CC_curr; + state.v_W_prev = std::move(v_W_curr); + state.L_prev = std::move(L_curr); + + if (sol.info.converged) { + sol.x = std::move(x_raw); + if constexpr (kEnableClutchWhitening) { + const int ncv = sys.num_contact_vertices; + for (int k = 0; k < sys.num_clutches; ++k) { + // The 4 doubles per row in HostSystem.clutch_whiten are the raw + // memory of bricksim's `std::vector` (column-major), so + // mapping them back as a ColMajor 2x2 reproduces Wk exactly. + // (Wk is symmetric anyway, so the choice does not affect the math.) 
+ Eigen::Map Wk(sys.clutch_whiten.data() + 4 * k); + const int j0 = ncv + 9 * k; + Eigen::Vector2d a_scaled = sol.x.segment<2>(j0 + 1); + sol.x.segment<2>(j0 + 1) = Wk * a_scaled; + Eigen::Vector2d b_scaled = sol.x.segment<2>(j0 + 4); + sol.x.segment<2>(j0 + 4) = Wk * b_scaled; + Eigen::Vector2d c_scaled = sol.x.segment<2>(j0 + 7); + sol.x.segment<2>(j0 + 7) = Wk * c_scaled; + } + } + sol.utilization.setConstant(sys.num_clutches, -1.0); + const int ncv = sys.num_contact_vertices; + for (std::size_t idx = 0; idx < sys.capacity_clutch_indices.size(); ++idx) { + Eigen::Matrix c = + sys.capacities.row(static_cast(idx)).transpose(); + const int clutch_index = sys.capacity_clutch_indices[idx]; + Eigen::Matrix x = + sol.x.segment<9>(ncv + 9 * clutch_index); + double frc_used = c.head<3>().dot(x.head<3>()); + double frc_cap = 1.0 - c.tail<6>().dot(x.tail<6>()); + double u_fp; + if (frc_cap > 0.0) { + u_fp = frc_used / frc_cap; + u_fp = std::max(u_fp, 0.0); + } else { + u_fp = 1e6; + } + double &u_max = sol.utilization(clutch_index); + if (u_fp > u_max) { + u_max = u_fp; + } + } + } + + return sol; +} + +} // namespace breakage_cuda::ref_cpu diff --git a/native/cuda/breakage_cuda/src/system.cpp b/native/cuda/breakage_cuda/src/system.cpp new file mode 100644 index 0000000..645f8ab --- /dev/null +++ b/native/cuda/breakage_cuda/src/system.cpp @@ -0,0 +1,550 @@ +// SPDX-License-Identifier: MIT +// +// JSON loaders + host-to-device transfers for the breakage_cuda subproject. +// +// JSON layout matches bricksim::matrix_to_json (see +// modules/bricksim/utils/matrix_serialization.cppm) and the to_json overloads +// for BreakageDebugDump in modules/bricksim/physx/breakage.cppm. +// +// On the device side, all dense per-part data is stored row-major and all +// sparse matrices are CSR (cuSPARSE-friendly). 
+ +#include "breakage_cuda/device.hpp" +#include "breakage_cuda/system.hpp" + +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace breakage_cuda { + +// =========================================================================== +// JSON helpers (matrix_serialization-compatible). +// +// dense -> {"kind":"dense", "dtype","order"("C"|"F"),"shape":[r,c], +// "data_b64"} +// sparse -> {"kind":"sparse","format"("csr"|"csc"),"shape":[r,c],"nnz", +// "data_dtype","index_dtype", +// "data_b64","indices_b64","indptr_b64"} +// +// Bytes inside *_b64 are raw little-/big-endian according to the dtype prefix +// ('<','>','|','='). On x86_64 Linux native is little-endian, so the common +// case ' base64_decode(std::string_view s) { + if (s.empty()) return {}; + if ((s.size() & 3u) != 0) { + throw std::runtime_error{"base64: input length not divisible by 4"}; + } + std::size_t pad = 0; + if (s.back() == '=') { + pad = (s.size() >= 2 && s[s.size() - 2] == '=') ? 2 : 1; + } + std::vector out((s.size() / 4) * 3 - pad); + const auto *p = reinterpret_cast(s.data()); + std::byte *o = out.data(); + const std::size_t full = s.size() / 4 - (pad != 0 ? 
1 : 0); + auto get6 = [&]() { + int v = kBase64Lut[*p++]; + if (v < 0) throw std::runtime_error{"base64: invalid character"}; + return v; + }; + for (std::size_t i = 0; i < full; ++i) { + std::uint32_t x = (std::uint32_t(get6()) << 18) | + (std::uint32_t(get6()) << 12) | + (std::uint32_t(get6()) << 6) | std::uint32_t(get6()); + *o++ = std::byte((x >> 16) & 0xFFu); + *o++ = std::byte((x >> 8) & 0xFFu); + *o++ = std::byte(x & 0xFFu); + } + if (pad == 1) { + std::uint32_t x = (std::uint32_t(get6()) << 18) | + (std::uint32_t(get6()) << 12) | + (std::uint32_t(get6()) << 6); + *o++ = std::byte((x >> 16) & 0xFFu); + *o++ = std::byte((x >> 8) & 0xFFu); + } else if (pad == 2) { + std::uint32_t x = (std::uint32_t(get6()) << 18) | + (std::uint32_t(get6()) << 12); + *o++ = std::byte((x >> 16) & 0xFFu); + } + return out; +} + +// ----- numpy dtype validation + per-element byteswap ----------------------- + +template +constexpr char numpy_kind() { + if constexpr (std::is_floating_point_v) + return 'f'; + else if constexpr (std::is_signed_v) + return 'i'; + else + return 'u'; +} + +template +bool dtype_needs_swap(std::string_view dtype, std::string_view field) { + if (dtype.size() < 3) { + throw std::runtime_error{ + std::string{"dtype too short for "}.append(field)}; + } + std::size_t isz = 0; + for (std::size_t i = 2; i < dtype.size(); ++i) { + char c = dtype[i]; + if (c < '0' || c > '9') { + throw std::runtime_error{ + std::string{"dtype itemsize parse error in "}.append(field)}; + } + isz = isz * 10 + static_cast(c - '0'); + } + if (dtype[1] != numpy_kind() || isz != sizeof(T)) { + throw std::runtime_error{std::string{"dtype mismatch for "} + .append(field) + .append(": ") + .append(dtype)}; + } + if (sizeof(T) <= 1) return false; + const char endian = dtype[0]; + if (endian == '|' || endian == '=') return false; + const bool native_le = (std::endian::native == std::endian::little); + if (endian == '<') return !native_le; + if (endian == '>') return native_le; + throw 
std::runtime_error{ + std::string{"invalid endian prefix in "}.append(field)}; +} + +template +void byteswap_inplace(std::span xs) { + for (auto &x : xs) { + auto bytes = std::as_writable_bytes(std::span{&x, 1}); + std::ranges::reverse(bytes); + } +} + +template +void decode_typed(std::string_view b64, std::span out, + std::string_view dtype, std::string_view field) { + const bool swap = dtype_needs_swap(dtype, field); + std::vector bytes = base64_decode(b64); + if (bytes.size() != out.size_bytes()) { + throw std::runtime_error{ + std::string{"decoded byte count mismatch for "}.append(field)}; + } + if (!bytes.empty()) { + std::memcpy(out.data(), bytes.data(), bytes.size()); + } + if (swap) byteswap_inplace(out); +} + +// ----- dense matrix loader ------------------------------------------------- + +template +EigenType load_dense(const nlohmann::json &j, std::string_view field) { + using Scalar = typename EigenType::Scalar; + if (j.at("kind").get_ref() != "dense") { + throw std::runtime_error{ + std::string{field}.append(": expected kind 'dense'")}; + } + const auto &shape = j.at("shape"); + if (!shape.is_array() || shape.size() != 2) { + throw std::runtime_error{ + std::string{field}.append(": shape must be [rows, cols]")}; + } + const Eigen::Index rows = shape.at(0).get(); + const Eigen::Index cols = shape.at(1).get(); + + const auto &order = j.at("order").get_ref(); + const bool input_rm = (order == "C"); + if (!input_rm && order != "F") { + throw std::runtime_error{std::string{field} + .append(": invalid order '") + .append(order) + .append("'")}; + } + + const auto &dtype = j.at("dtype").get_ref(); + const auto &b64 = j.at("data_b64").get_ref(); + + EigenType out; + out.resize(rows, cols); + const bool out_rm = static_cast(EigenType::IsRowMajor); + // For vectors (one extent <= 1), storage order is identical either way. 
+ const bool degenerate = (rows <= 1) || (cols <= 1); + + if (input_rm == out_rm || degenerate) { + decode_typed( + b64, + std::span{out.data(), static_cast(out.size())}, + dtype, field); + } else if (input_rm) { + Eigen::Matrix tmp( + rows, cols); + decode_typed( + b64, + std::span{tmp.data(), static_cast(tmp.size())}, + dtype, field); + out = tmp; + } else { + Eigen::Matrix tmp( + rows, cols); + decode_typed( + b64, + std::span{tmp.data(), static_cast(tmp.size())}, + dtype, field); + out = tmp; + } + return out; +} + +// ----- sparse matrix loader (always returns CSC) --------------------------- + +SparseMatrixCSC load_sparse_csc(const nlohmann::json &j, + std::string_view field) { + if (j.at("kind").get_ref() != "sparse") { + throw std::runtime_error{ + std::string{field}.append(": expected kind 'sparse'")}; + } + const auto &shape = j.at("shape"); + if (!shape.is_array() || shape.size() != 2) { + throw std::runtime_error{ + std::string{field}.append(": shape must be [rows, cols]")}; + } + const Eigen::Index rows = shape.at(0).get(); + const Eigen::Index cols = shape.at(1).get(); + const std::size_t nnz = j.at("nnz").get(); + + const auto &format = j.at("format").get_ref(); + const bool is_csc = (format == "csc"); + const bool is_csr = (format == "csr"); + if (!is_csc && !is_csr) { + throw std::runtime_error{std::string{field} + .append(": invalid sparse format '") + .append(format) + .append("'")}; + } + const std::size_t outer = is_csc ? 
static_cast(cols) + : static_cast(rows); + + const auto &data_dtype = + j.at("data_dtype").get_ref(); + const auto &index_dtype = + j.at("index_dtype").get_ref(); + + std::vector values(nnz); + std::vector indices(nnz); + std::vector indptr(outer + 1); + decode_typed(j.at("data_b64").get_ref(), + std::span{values}, data_dtype, field); + decode_typed(j.at("indices_b64").get_ref(), + std::span{indices}, index_dtype, field); + decode_typed(j.at("indptr_b64").get_ref(), + std::span{indptr}, index_dtype, field); + + if (nnz == 0) { + SparseMatrixCSC empty(rows, cols); + empty.makeCompressed(); + return empty; + } + + if (is_csc) { + using MapType = Eigen::SparseMatrix; + Eigen::Map mapped(rows, cols, static_cast(nnz), + indptr.data(), indices.data(), values.data()); + return SparseMatrixCSC{mapped}; + } + // CSR -> re-encode to CSC; Eigen handles the layout change on assignment. + using MapType = Eigen::SparseMatrix; + Eigen::Map mapped(rows, cols, static_cast(nnz), + indptr.data(), indices.data(), values.data()); + return SparseMatrixCSC{mapped}; +} + +} // namespace + +// =========================================================================== +// from_json overloads. 
+// =========================================================================== + +void from_json(const nlohmann::json &j, Thresholds &out) { + j.at("enabled").get_to(out.enabled); + j.at("contact_regularization").get_to(out.contact_regularization); + j.at("clutch_axial_compliance").get_to(out.clutch_axial_compliance); + j.at("clutch_radial_compliance").get_to(out.clutch_radial_compliance); + j.at("clutch_tangential_compliance") + .get_to(out.clutch_tangential_compliance); + j.at("friction_coefficient").get_to(out.friction_coefficient); + j.at("preloaded_force").get_to(out.preloaded_force); + j.at("slack_fraction_warn").get_to(out.slack_fraction_warn); + j.at("slack_fraction_b_floor").get_to(out.slack_fraction_b_floor); + j.at("debug_dump").get_to(out.debug_dump); + j.at("breakage_cooldown_time").get_to(out.breakage_cooldown_time); +} + +void from_json(const nlohmann::json &j, HostSystem &out) { + j.at("num_parts").get_to(out.num_parts); + j.at("num_clutches").get_to(out.num_clutches); + j.at("num_contact_vertices").get_to(out.num_contact_vertices); + j.at("num_vars").get_to(out.num_vars); + j.at("num_eq").get_to(out.num_eq); + j.at("num_ineq").get_to(out.num_ineq); + j.at("num_relaxed_ineq").get_to(out.num_relaxed_ineq); + j.at("total_mass").get_to(out.total_mass); + j.at("L0").get_to(out.L0); + + out.mass = load_dense(j.at("mass"), "mass"); + out.q_CC = load_dense(j.at("q_CC"), "q_CC"); + out.c_CC = load_dense(j.at("c_CC"), "c_CC"); + out.I_CC = load_dense>( + j.at("I_CC"), "I_CC"); + + out.Q = load_sparse_csc(j.at("Q"), "Q"); + out.A = load_sparse_csc(j.at("A"), "A"); + out.G = load_sparse_csc(j.at("G"), "G"); + out.H = load_sparse_csc(j.at("H"), "H"); + out.V = load_sparse_csc(j.at("V"), "V"); + + j.at("capacity_clutch_indices").get_to(out.capacity_clutch_indices); + out.capacities = load_dense>( + j.at("capacities"), "capacities"); + out.clutch_whiten = load_dense>( + j.at("clutch_whiten"), "clutch_whiten"); + // part_ids / clutch_ids are intentionally 
ignored; GPU only needs indices. +} + +void from_json(const nlohmann::json &j, HostInput &out) { + out.w = load_dense(j.at("w"), "w"); + out.v = load_dense(j.at("v"), "v"); + out.q = load_dense(j.at("q"), "q"); + out.c = load_dense(j.at("c"), "c"); + j.at("dt").get_to(out.dt); + out.J = load_dense(j.at("J"), "J"); + out.H = load_dense(j.at("H"), "H"); +} + +void from_json(const nlohmann::json &j, HostState &out) { + out.q_W_CC_prev.coeffs() = + load_dense(j.at("q_W_CC_prev"), "q_W_CC_prev"); + out.v_W_prev = load_dense(j.at("v_W_prev"), "v_W_prev"); + out.L_prev = load_dense(j.at("L_prev"), "L_prev"); + // The JSON solver_state is backend-specific (OSQP); each backend manages + // its own warm start, so we drop it. + out.qp.reset(); +} + +void from_json(const nlohmann::json &j, DebugDump &out) { + j.at("thresholds").get_to(out.thresholds); + j.at("system").get_to(out.system); + j.at("input").get_to(out.input); + j.at("state").get_to(out.state); + + // Solution: only x / utilization / slack_fraction; OsqpInfo is backend- + // specific and dropped. 
+ const auto &sol = j.at("solution"); + out.solution.x = load_dense(sol.at("x"), "solution.x"); + out.solution.utilization = + load_dense(sol.at("utilization"), "solution.utilization"); + sol.at("slack_fraction").get_to(out.solution.slack_fraction); + out.solution.info = SolveInfo{}; + + out.b = load_dense(j.at("b"), "b"); + + if (j.contains("prev_state")) { + HostState ps; + from_json(j.at("prev_state"), ps); + out.prev_state = std::move(ps); + } else { + out.prev_state.reset(); + } +} + +DebugDump load_debug_dump(const std::string &path) { + std::ifstream stream(path); + if (!stream.is_open()) { + throw std::runtime_error{"failed to open debug dump file: " + path}; + } + nlohmann::json j; + try { + stream >> j; + } catch (const std::exception &e) { + throw std::runtime_error{"failed to parse JSON in debug dump '" + path + + "': " + e.what()}; + } + DebugDump dump; + from_json(j, dump); + return dump; +} + +double relative_inf_error(const VectorXd &a, const VectorXd &b, double floor) { + if (a.size() != b.size()) { + throw std::invalid_argument{ + "relative_inf_error: vector size mismatch"}; + } + if (a.size() == 0) return 0.0; + const double denom = std::max(b.cwiseAbs().maxCoeff(), floor); + return (a - b).cwiseAbs().maxCoeff() / denom; +} + +// =========================================================================== +// Device upload helpers. +// =========================================================================== + +void upload_csc_as_csr(const SparseMatrixCSC &src, DeviceCsr &dst) { + // The descriptor caches raw device pointers; tear it down before any + // potential buffer reallocation in copy_from_host. + if (dst.descr != nullptr) { + cusparseDestroySpMat(dst.descr); + dst.descr = nullptr; + } + + dst.rows = static_cast(src.rows()); + dst.cols = static_cast(src.cols()); + dst.nnz = static_cast(src.nonZeros()); + + // Eigen does the CSC -> CSR re-encoding on assignment. 
+ SparseMatrixCSR tmp = src; + tmp.makeCompressed(); + + dst.rowptr.copy_from_host(tmp.outerIndexPtr(), + static_cast(dst.rows) + 1); + dst.colind.copy_from_host(tmp.innerIndexPtr(), + static_cast(dst.nnz)); + dst.values.copy_from_host(tmp.valuePtr(), + static_cast(dst.nnz)); + + // cuSPARSE allows null colind/values when nnz == 0. + BREAKAGE_CUSPARSE_CHECK(cusparseCreateCsr( + &dst.descr, static_cast(dst.rows), + static_cast(dst.cols), + static_cast(dst.nnz), dst.rowptr.data(), dst.colind.data(), + dst.values.data(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, + CUSPARSE_INDEX_BASE_ZERO, CUDA_R_64F)); +} + +void upload_system(const HostSystem &host, DeviceSystem &dev) { + dev.N = host.num_parts; + dev.K = host.num_clutches; + dev.ncv = host.num_contact_vertices; + dev.nx = host.num_vars; + dev.me = host.num_eq; + dev.mi = host.num_ineq; + dev.mh = host.num_relaxed_ineq; + dev.ncap = static_cast(host.capacity_clutch_indices.size()); + dev.total_mass = host.total_mass; + dev.L0 = host.L0; + + dev.mass.copy_from_host(host.mass.data(), + static_cast(host.mass.size())); + + // Device convention: row-major dense per-part data. q_CC / c_CC are + // ColMajor on the host so we transcode here. + Eigen::Matrix q_rm = host.q_CC; + Eigen::Matrix c_rm = host.c_CC; + dev.q_CC.copy_from_host(q_rm.data(), static_cast(q_rm.size())); + dev.c_CC.copy_from_host(c_rm.data(), static_cast(c_rm.size())); + // I_CC / capacities / clutch_whiten are already RowMajor on the host. 
+ dev.I_CC.copy_from_host(host.I_CC.data(), + static_cast(host.I_CC.size())); + + upload_csc_as_csr(host.Q, dev.Q); + upload_csc_as_csr(host.A, dev.A); + upload_csc_as_csr(host.G, dev.G); + upload_csc_as_csr(host.H, dev.H); + upload_csc_as_csr(host.V, dev.V); + + dev.capacity_clutch_indices.copy_from_host( + host.capacity_clutch_indices.data(), + host.capacity_clutch_indices.size()); + dev.capacities.copy_from_host( + host.capacities.data(), + static_cast(host.capacities.size())); + dev.clutch_whiten.copy_from_host( + host.clutch_whiten.data(), + static_cast(host.clutch_whiten.size())); +} + +void upload_input(const HostInput &host, DeviceInput &dev) { + dev.dt = host.dt; + + auto upload_rm3 = [](const MatrixX3d &src, DeviceBuffer &dst_buf) { + Eigen::Matrix rm = src; + dst_buf.copy_from_host(rm.data(), static_cast(rm.size())); + }; + auto upload_rm4 = [](const MatrixX4d &src, DeviceBuffer &dst_buf) { + Eigen::Matrix rm = src; + dst_buf.copy_from_host(rm.data(), static_cast(rm.size())); + }; + + upload_rm3(host.w, dev.w); + upload_rm3(host.v, dev.v); + upload_rm4(host.q, dev.q); + upload_rm3(host.c, dev.c); + upload_rm3(host.J, dev.J); + upload_rm3(host.H, dev.H); +} + +void upload_state(const HostState &host, DeviceState &dev, int N) { + Vector4d quat = host.q_W_CC_prev.coeffs(); + dev.q_W_CC_prev.copy_from_host(quat.data(), 4); + + if (host.v_W_prev.rows() != N || host.L_prev.rows() != N) { + throw std::invalid_argument{ + "upload_state: v_W_prev / L_prev row count does not match N"}; + } + Eigen::Matrix v_rm = host.v_W_prev; + Eigen::Matrix L_rm = host.L_prev; + const std::size_t n3 = + static_cast(3) * static_cast(N); + dev.v_W_prev.copy_from_host(v_rm.data(), n3); + dev.L_prev.copy_from_host(L_rm.data(), n3); +} + +void download_state(const DeviceState &dev, HostState &host, int N) { + Vector4d quat; + dev.q_W_CC_prev.copy_to_host(quat.data(), 4); + host.q_W_CC_prev.coeffs() = quat; + + const std::size_t n3 = + static_cast(3) * static_cast(N); + Eigen::Matrix 
v_rm(N, 3); + Eigen::Matrix L_rm(N, 3); + dev.v_W_prev.copy_to_host(v_rm.data(), n3); + dev.L_prev.copy_to_host(L_rm.data(), n3); + host.v_W_prev = v_rm; + host.L_prev = L_rm; +} + +} // namespace breakage_cuda diff --git a/native/cuda/breakage_cuda/tests/bench.cpp b/native/cuda/breakage_cuda/tests/bench.cpp new file mode 100644 index 0000000..745af54 --- /dev/null +++ b/native/cuda/breakage_cuda/tests/bench.cpp @@ -0,0 +1,403 @@ +// SPDX-License-Identifier: MIT +// +// Wall-clock benchmark for the full breakage pipeline. +// +// Usage: ./breakage_cuda_bench [N_parts] (default N_parts = 32) +// +// Builds a synthetic chain of N_parts parts joined by N_parts - 1 clutches +// (matching the dimensional layout enforced by `BreakageSystem::check_shape` +// in breakage.cppm) and times N iterations of: +// * `cuda_qp::Pipeline::solve` -- GPU end-to-end pipeline +// * `ref_cpu::solve_full` -- OSQP-backed CPU baseline (reference) +// +// We use std::chrono::steady_clock and call cudaDeviceSynchronize() before +// taking the GPU stop time so the measurement covers the full kernel +// completion, not just queueing. + +#include "breakage_cuda/qp_solver.hpp" +#include "breakage_cuda/ref_cpu.hpp" +#include "breakage_cuda/system.hpp" + +#include +#include +#include + +#include + +#include +#include +#include +#include +#include +#include +#include + +namespace { + +using namespace breakage_cuda; +using clock_type = std::chrono::steady_clock; +using Trip = Eigen::Triplet; + +// --------------------------------------------------------------------------- +// Synthetic builder for `N_parts` parts arranged on the x-axis at unit +// spacing. Each consecutive pair is joined by one clutch, giving K = N - 1. +// +// All matrices are tiled copies of the (N=2, K=1) building block in +// test_full_pipeline.cpp so the per-clutch numerics stay well-conditioned. 
+// +// Dimensions (per check_shape in breakage.cppm): +// num_parts = N +// num_clutches = K = N - 1 +// num_contact_vertices = 0 +// num_vars = 9 * K +// num_eq = 6 * N +// num_ineq = K (one axial-non-negativity per clutch) +// num_relaxed_ineq = 2 * K (two friction-cone rows per clutch) +// capacities = 2 * K rows +// clutch_whiten = K rows +// --------------------------------------------------------------------------- +HostSystem make_synthetic_system_n(int N) { + if (N < 2) N = 2; + const int K = N - 1; + + HostSystem sys; + sys.num_parts = N; + sys.num_clutches = K; + sys.num_contact_vertices = 0; + sys.num_vars = 9 * K; + sys.num_eq = 6 * N; + sys.num_ineq = K; + sys.num_relaxed_ineq = 2 * K; + sys.total_mass = static_cast(N); + sys.L0 = 1.0; + + sys.mass = VectorXd::Constant(N, 1.0); + + sys.q_CC.resize(N, 4); + sys.c_CC.resize(N, 3); + sys.I_CC.resize(N, 9); + for (int i = 0; i < N; ++i) { + sys.q_CC.row(i) << 0.0, 0.0, 0.0, 1.0; + sys.c_CC.row(i) << static_cast(i) - (N - 1) / 2.0, 0.0, 0.0; + sys.I_CC.row(i) << 1.0, 0.0, 0.0, + 0.0, 1.0, 0.0, + 0.0, 0.0, 1.0; + } + + // Q: small positive-definite diagonal on every variable. + std::vector Qt; + Qt.reserve(static_cast(9 * K)); + for (int v = 0; v < 9 * K; ++v) Qt.emplace_back(v, v, 0.1); + sys.Q.resize(9 * K, 9 * K); + sys.Q.setFromTriplets(Qt.begin(), Qt.end()); + sys.Q.makeCompressed(); + + // A: 6N x 9K. For each clutch k between parts k and k+1, the three axial + // variables x[9k..9k+2] couple part k's linear residual with -I_3 and + // part k+1's with +I_3. Angular rows of A_i / A_j stay zero - the angular + // component of b becomes projection-stage slack on both backends. 
+ std::vector At; + At.reserve(static_cast(6 * K)); + for (int k = 0; k < K; ++k) { + const int v0 = 9 * k; + const int r_i = 6 * k; + const int r_j = 6 * (k + 1); + for (int axis = 0; axis < 3; ++axis) { + At.emplace_back(r_i + axis, v0 + axis, -1.0); + At.emplace_back(r_j + axis, v0 + axis, 1.0); + } + } + sys.A.resize(6 * N, 9 * K); + sys.A.setFromTriplets(At.begin(), At.end()); + sys.A.makeCompressed(); + + // G: x[9k] >= 0 (axial force compressive) for each clutch. + std::vector Gt; + Gt.reserve(static_cast(K)); + for (int k = 0; k < K; ++k) Gt.emplace_back(k, 9 * k, 1.0); + sys.G.resize(K, 9 * K); + sys.G.setFromTriplets(Gt.begin(), Gt.end()); + sys.G.makeCompressed(); + + // H: 2 friction-cone rows per clutch, low magnitudes so the limits stay + // slack and we do not measure the constraint-saturation regime. + std::vector Ht; + Ht.reserve(static_cast(4 * K)); + for (int k = 0; k < K; ++k) { + const int v0 = 9 * k; + const int r0 = 2 * k; + Ht.emplace_back(r0, v0, -0.1); + Ht.emplace_back(r0, v0 + 1, 0.1); + Ht.emplace_back(r0 + 1, v0, -0.1); + Ht.emplace_back(r0 + 1, v0 + 1, -0.1); + } + sys.H.resize(2 * K, 9 * K); + sys.H.setFromTriplets(Ht.begin(), Ht.end()); + sys.H.makeCompressed(); + + // V: 2K x K, identity on each (pair-of-rows, clutch) block. + std::vector Vt; + Vt.reserve(static_cast(2 * K)); + for (int k = 0; k < K; ++k) { + Vt.emplace_back(2 * k, k, 1.0); + Vt.emplace_back(2 * k + 1, k, 1.0); + } + sys.V.resize(2 * K, K); + sys.V.setFromTriplets(Vt.begin(), Vt.end()); + sys.V.makeCompressed(); + + sys.capacity_clutch_indices.resize(static_cast(2 * K)); + for (int k = 0; k < K; ++k) { + sys.capacity_clutch_indices[2 * k] = k; + sys.capacity_clutch_indices[2 * k + 1] = k; + } + sys.capacities.resize(2 * K, 9); + for (int k = 0; k < K; ++k) { + sys.capacities.row(2 * k) << 1.0, 0.0, 0.0, 0.1, 0.0, 0.0, 0.0, 0.0, 0.0; + sys.capacities.row(2 * k + 1) << 1.0, 0.0, 0.0, 0.1, 0.0, 0.0, 0.0, 0.0, + 0.0; + } + + // Wk = identity per clutch. 
(The CPU reads it column-major and the GPU + // reads it row-major; both interpretations match for a symmetric Wk.) + sys.clutch_whiten.resize(K, 4); + for (int k = 0; k < K; ++k) { + sys.clutch_whiten.row(k) << 1.0, 0.0, 0.0, 1.0; + } + + return sys; +} + +HostInput make_synthetic_input_n(int N) { + HostInput in; + in.dt = 0.01; + in.w = MatrixX3d::Zero(N, 3); + + // Alternating-sign y velocities so consecutive parts pull on each other. + in.v.resize(N, 3); + for (int i = 0; i < N; ++i) { + const double s = (i % 2 == 0) ? 0.05 : -0.05; + in.v.row(i) << 0.0, s, 0.0; + } + + in.q.resize(N, 4); + in.c.resize(N, 3); + for (int i = 0; i < N; ++i) { + in.q.row(i) << 0.0, 0.0, 0.0, 1.0; + const double dy = (i % 2 == 0) ? 0.0005 : -0.0005; + in.c.row(i) << static_cast(i) - (N - 1) / 2.0, dy, 0.0; + } + + in.J = MatrixX3d::Zero(N, 3); + in.H = MatrixX3d::Zero(N, 3); + return in; +} + +HostState make_synthetic_state_n(int N) { + HostState st; + st.q_W_CC_prev = Eigen::Quaterniond::Identity(); + st.v_W_prev = MatrixX3d::Zero(N, 3); + st.L_prev = MatrixX3d::Zero(N, 3); + return st; +} + +// --------------------------------------------------------------------------- +// Timing helpers. 
+// --------------------------------------------------------------------------- +struct Stats { + double mean_us{}; + double std_us{}; + double min_us{}; + double max_us{}; + int n{}; +}; + +Stats summarize(const std::vector &samples_us) { + Stats s; + s.n = static_cast(samples_us.size()); + if (samples_us.empty()) return s; + double sum = 0.0; + s.min_us = samples_us[0]; + s.max_us = samples_us[0]; + for (double v : samples_us) { + sum += v; + if (v < s.min_us) s.min_us = v; + if (v > s.max_us) s.max_us = v; + } + s.mean_us = sum / static_cast(samples_us.size()); + double var = 0.0; + for (double v : samples_us) { + const double d = v - s.mean_us; + var += d * d; + } + if (samples_us.size() > 1) { + var /= static_cast(samples_us.size() - 1); + } else { + var = 0.0; + } + s.std_us = std::sqrt(var); + return s; +} + +void print_stats(const char *label, const Stats &s) { + std::printf("[bench] %s n=%d mean=%9.1f us std=%8.1f us " + "min=%9.1f us max=%9.1f us\n", + label, s.n, s.mean_us, s.std_us, s.min_us, s.max_us); +} + +// --------------------------------------------------------------------------- +// Per-backend benchmark drivers. +// +// Both warm-start across iterations (mirrors how BreakageChecker is used in +// production: state.qp persists between frames). The first few iterations +// are dropped to give the warm-start path a chance to stabilise. 
+// --------------------------------------------------------------------------- +Stats bench_gpu(const HostSystem &sys, const HostInput &in, + const Thresholds &thr, int warmup, int measured) { + const bool tolerate = []() { + const char *e = std::getenv("BENCH_TOLERATE_NONCONVERGED"); + return e && std::atoi(e) > 0; + }(); + + cuda_qp::Pipeline pipe; + pipe.setup(sys, thr); + + HostState state = make_synthetic_state_n(sys.num_parts); + + for (int i = 0; i < warmup; ++i) { + Solution sol = pipe.solve(in, state); + if (!sol.info.converged) { + std::fprintf(stderr, + "[bench] gpu warmup %d/%d did not converge " + "(prj=%d rlx=%d opt=%d, prim=%.2e dual=%.2e)%s\n", + i + 1, warmup, static_cast(sol.info.prj.converged), + static_cast(sol.info.rlx.converged), + static_cast(sol.info.opt.converged), + sol.info.opt.prim_res, sol.info.opt.dual_res, + tolerate ? " (tolerated)" : ""); + if (!tolerate) std::exit(2); + } + } + BREAKAGE_CUDA_CHECK(cudaDeviceSynchronize()); + + std::vector samples_us; + samples_us.reserve(static_cast(measured)); + for (int i = 0; i < measured; ++i) { + const auto t0 = clock_type::now(); + Solution sol = pipe.solve(in, state); + BREAKAGE_CUDA_CHECK(cudaDeviceSynchronize()); + const auto t1 = clock_type::now(); + if (!sol.info.converged) { + std::fprintf(stderr, + "[bench] gpu measured iter %d/%d did not converge%s\n", + i + 1, measured, tolerate ? " (tolerated)" : ""); + if (!tolerate) std::exit(2); + } + const double us = + std::chrono::duration(t1 - t0).count(); + samples_us.push_back(us); + } + return summarize(samples_us); +} + +Stats bench_cpu(const HostSystem &sys, const HostInput &in, + const Thresholds &thr, int warmup, int measured) { + HostState state = make_synthetic_state_n(sys.num_parts); + + // BENCH_TOLERATE_NONCONVERGED=1 turns "did not converge" into a warning so + // we can still get wall-clock numbers at very large N where OSQP/our ADMM + // would otherwise hit max_iter and abort. 
+ const bool tolerate = []() { + const char *e = std::getenv("BENCH_TOLERATE_NONCONVERGED"); + return e && std::atoi(e) > 0; + }(); + + for (int i = 0; i < warmup; ++i) { + Solution sol = ref_cpu::solve_full(sys, in, state, thr); + if (!sol.info.converged) { + std::fprintf(stderr, + "[bench] cpu warmup %d/%d did not converge%s\n", i + 1, + warmup, tolerate ? " (tolerated)" : ""); + if (!tolerate) std::exit(2); + } + } + + std::vector samples_us; + samples_us.reserve(static_cast(measured)); + for (int i = 0; i < measured; ++i) { + const auto t0 = clock_type::now(); + Solution sol = ref_cpu::solve_full(sys, in, state, thr); + const auto t1 = clock_type::now(); + if (!sol.info.converged) { + std::fprintf(stderr, + "[bench] cpu measured iter %d/%d did not converge%s\n", + i + 1, measured, tolerate ? " (tolerated)" : ""); + if (!tolerate) std::exit(2); + } + const double us = + std::chrono::duration(t1 - t0).count(); + samples_us.push_back(us); + } + return summarize(samples_us); +} + +} // namespace + +int main(int argc, char **argv) { + try { + int N = 32; + if (argc >= 2) { + N = std::atoi(argv[1]); + if (N < 2) { + std::fprintf(stderr, "[bench] N_parts must be >= 2 (got %d)\n", N); + return 1; + } + } + + std::printf("[bench] N_parts=%d\n", N); + + HostSystem sys = make_synthetic_system_n(N); + HostInput in = make_synthetic_input_n(N); + Thresholds thr; + thr.enabled = true; + + std::printf("[bench] num_parts=%d num_clutches=%d num_vars=%d " + "num_eq=%d num_ineq=%d num_relaxed_ineq=%d\n", + sys.num_parts, sys.num_clutches, sys.num_vars, sys.num_eq, + sys.num_ineq, sys.num_relaxed_ineq); + + int kGpuWarmup = 5; + int kGpuMeasured = 50; + int kCpuWarmup = 5; + int kCpuMeasured = 20; + if (const char *q = std::getenv("BENCH_QUICK"); q && std::atoi(q) > 0) { + kGpuWarmup = 1; + kGpuMeasured = 1; + kCpuWarmup = 1; + kCpuMeasured = 1; + } + if (const char *g = std::getenv("BENCH_GPU_ITER"); g && std::atoi(g) > 0) { + kGpuMeasured = std::atoi(g); + } + if (const char *c = 
std::getenv("BENCH_CPU_ITER"); c && std::atoi(c) > 0) { + kCpuMeasured = std::atoi(c); + } + + const Stats gpu = bench_gpu(sys, in, thr, kGpuWarmup, kGpuMeasured); + const Stats cpu = bench_cpu(sys, in, thr, kCpuWarmup, kCpuMeasured); + + print_stats("gpu pipe.solve ", gpu); + print_stats("cpu solve_full ", cpu); + + if (gpu.mean_us > 0.0) { + const double speedup = cpu.mean_us / gpu.mean_us; + std::printf("[bench] speedup cpu_mean / gpu_mean = %.2fx\n", + speedup); + } + + return 0; + } catch (const std::exception &e) { + std::fprintf(stderr, "[bench] ERROR: %s\n", e.what()); + return 2; + } +} diff --git a/native/cuda/breakage_cuda/tests/test_dense_kernels.cpp b/native/cuda/breakage_cuda/tests/test_dense_kernels.cpp new file mode 100644 index 0000000..adf60a7 --- /dev/null +++ b/native/cuda/breakage_cuda/tests/test_dense_kernels.cpp @@ -0,0 +1,473 @@ +// SPDX-License-Identifier: MIT +// +// Correctness check: each dense CUDA kernel vs its CPU reference on a single +// synthetic problem (N = 8 parts). +// +// Coverage: +// 1. cuda_kernels::fit_se3 vs ref_cpu::fit_se3 +// 2. cuda_kernels::fit_twist vs ref_cpu::fit_twist +// 3. cuda_kernels::compute_Pi_host vs ref_cpu::compute_Pi +// 4. cuda_kernels::compute_L vs ref_cpu::compute_L +// 5. cuda_kernels::assemble_b vs ref_cpu::assemble_b +// +// Layout conventions (verified against src/system.cpp upload helpers): +// * MatrixX3d / MatrixX4d aliases default to ColMajor; we transcode to +// RowMajor before any DeviceBuffer upload, matching upload_input(). +// * I_CC is RowMajor on the host already (see system.hpp), so its raw +// .data() can be uploaded directly. +// * Quaternions are flat xyzw, length-4. Eigen::Quaterniond::coeffs() +// returns xyzw, so we round-trip via that. +// +// No bug fixes were required in src/kernels.cu, src/linalg.cu, or +// src/ref_cpu.cpp to make this test pass. 
+ +#include "breakage_cuda/device.hpp" +#include "breakage_cuda/kernels.hpp" +#include "breakage_cuda/linalg.hpp" +#include "breakage_cuda/ref_cpu.hpp" +#include "breakage_cuda/system.hpp" + +#include + +#include +#include +#include +#include +#include +#include + +namespace bc = breakage_cuda; + +namespace { + +constexpr int kN = 8; + +// Per-test record collected so we can emit a final summary table. +struct TestResult { + std::string name; + bool passed = false; + double max_abs_diff = 0.0; + double tol = 0.0; + std::string detail; +}; + +void log_pass(const std::string &name, double diff, const std::string &detail) { + std::printf("[PASS] %-12s max_abs_diff=%.3e %s\n", name.c_str(), diff, + detail.c_str()); +} + +void log_fail(const std::string &name, double diff, const std::string &detail) { + std::printf("[FAIL] %-12s max_abs_diff=%.3e %s\n", name.c_str(), diff, + detail.c_str()); +} + +// Sample a uniformly random unit quaternion via four Gaussians + normalize. +Eigen::Quaterniond random_unit_quat(std::mt19937 &rng) { + std::normal_distribution n(0.0, 1.0); + Eigen::Quaterniond q; + // (x, y, z, w) order in coeffs(). + q.coeffs() << n(rng), n(rng), n(rng), n(rng); + q.normalize(); + if (q.w() < 0.0) { + q.coeffs() *= -1.0; + } + return q; +} + +// Compare two quaternions modulo the q ~ -q sign ambiguity. +double quat_diff_sign_invariant(const Eigen::Quaterniond &a, + const Eigen::Quaterniond &b) { + double d_pos = (a.coeffs() - b.coeffs()).cwiseAbs().maxCoeff(); + double d_neg = (a.coeffs() + b.coeffs()).cwiseAbs().maxCoeff(); + return std::min(d_pos, d_neg); +} + +// Max-abs diff between an N x 3 row-major flat buffer and an Eigen N x 3 +// matrix (any storage order). 
+double max_abs_diff_n3(const double *buf_rm, const bc::MatrixX3d &ref) { + Eigen::Map> + view(buf_rm, ref.rows(), 3); + return (view - ref).cwiseAbs().maxCoeff(); +} + +// Max-abs diff between a 9-double row-major buffer and an Eigen Matrix3d +// (Eigen Matrix3d is column-major by default; we compare element-wise). +double max_abs_diff_3x3(const double *buf_rm, const Eigen::Matrix3d &ref) { + double d = 0.0; + for (int i = 0; i < 3; ++i) { + for (int j = 0; j < 3; ++j) { + d = std::max(d, std::abs(buf_rm[3 * i + j] - ref(i, j))); + } + } + return d; +} + +// Random N x 3 dense matrix with entries from [lo, hi]. +bc::MatrixX3d random_n3(std::mt19937 &rng, int N, double lo, double hi) { + std::uniform_real_distribution u(lo, hi); + bc::MatrixX3d m(N, 3); + for (int i = 0; i < N; ++i) + for (int j = 0; j < 3; ++j) m(i, j) = u(rng); + return m; +} + +// Pack an Eigen N x 3 (any layout) into a row-major flat std::vector for +// device upload. +std::vector to_rm3_flat(const bc::MatrixX3d &m) { + std::vector v(static_cast(m.rows() * 3)); + for (int i = 0; i < m.rows(); ++i) { + v[3 * i + 0] = m(i, 0); + v[3 * i + 1] = m(i, 1); + v[3 * i + 2] = m(i, 2); + } + return v; +} + +// Same for N x 4 (used for quaternions). +std::vector to_rm4_flat(const bc::MatrixX4d &m) { + std::vector v(static_cast(m.rows() * 4)); + for (int i = 0; i < m.rows(); ++i) { + v[4 * i + 0] = m(i, 0); + v[4 * i + 1] = m(i, 1); + v[4 * i + 2] = m(i, 2); + v[4 * i + 3] = m(i, 3); + } + return v; +} + +} // namespace + +int main() { + std::printf("test_dense_kernels: N=%d\n", kN); + + std::mt19937 rng(0xDEADBEEFu); + std::uniform_real_distribution u_mass(0.5, 2.0); + std::uniform_real_distribution u_pos(-1.0, 1.0); + std::uniform_real_distribution u_small(-0.05, 0.05); + std::normal_distribution n01(0.0, 1.0); + + // Per-part mass + total mass. 
+  bc::VectorXd mass(kN);
+  for (int i = 0; i < kN; ++i) mass(i) = u_mass(rng);
+  const double total_mass = mass.sum();
+
+  // Reference frame: q0 random unit quat, t0 random in [-1, 1]^3.
+  // Current frame: qx = q0 * small rotation, tx = t0 + small noise. This
+  // keeps fit_se3's optimum well-defined and away from the q ~ -q boundary.
+  bc::MatrixX4d q0(kN, 4);
+  bc::MatrixX3d t0(kN, 3);
+  bc::MatrixX4d qx(kN, 4);
+  bc::MatrixX3d tx(kN, 3);
+  for (int i = 0; i < kN; ++i) {
+    Eigen::Quaterniond q = random_unit_quat(rng);
+    q0(i, 0) = q.x();
+    q0(i, 1) = q.y();
+    q0(i, 2) = q.z();
+    q0(i, 3) = q.w();
+    t0(i, 0) = u_pos(rng);
+    t0(i, 1) = u_pos(rng);
+    t0(i, 2) = u_pos(rng);
+
+    Eigen::Vector3d axis(n01(rng), n01(rng), n01(rng));
+    if (axis.norm() < 1e-12) axis = Eigen::Vector3d::UnitX();
+    double angle = 0.05 * u_pos(rng);
+    Eigen::Quaterniond rot(Eigen::AngleAxisd(angle, axis.normalized()));
+    Eigen::Quaterniond qxq = q * rot;
+    qxq.normalize();
+    if (qxq.w() < 0.0) qxq.coeffs() *= -1.0;
+    qx(i, 0) = qxq.x();
+    qx(i, 1) = qxq.y();
+    qx(i, 2) = qxq.z();
+    qx(i, 3) = qxq.w();
+
+    tx(i, 0) = t0(i, 0) + u_small(rng);
+    tx(i, 1) = t0(i, 1) + u_small(rng);
+    tx(i, 2) = t0(i, 2) + u_small(rng);
+  }
+
+  // Per-part w / v for fit_twist (small range).
+  bc::MatrixX3d w_in = random_n3(rng, kN, -0.5, 0.5);
+  bc::MatrixX3d v_in = random_n3(rng, kN, -0.5, 0.5);
+  // c_CC[i] = t0[i] (per the task spec).
+  bc::MatrixX3d c_CC = t0;
+
+  // Inertia tensors: diag(2, 3, 5) + small symmetric off-diagonal noise.
+  // Stored row-major as (N, 9), matching HostSystem::I_CC.
+  // FIX(review): the matrix's template arguments were stripped by
+  // transcoding; restored from the "row-major (N, 9)" comment above.
+  Eigen::Matrix<double, Eigen::Dynamic, 9, Eigen::RowMajor> I_flat(kN, 9);
+  for (int i = 0; i < kN; ++i) {
+    double a01 = 0.05 * u_pos(rng);
+    double a02 = 0.05 * u_pos(rng);
+    double a12 = 0.05 * u_pos(rng);
+    double diag[3] = {2.0, 3.0, 5.0};
+    double M[9] = {diag[0], a01, a02, //
+                   a01, diag[1], a12, //
+                   a02, a12, diag[2]};
+    for (int k = 0; k < 9; ++k) I_flat(i, k) = M[k];
+  }
+
+  // J / H impulses for assemble_b.
+  bc::MatrixX3d J = random_n3(rng, kN, -0.1, 0.1);
+  bc::MatrixX3d H = random_n3(rng, kN, -0.1, 0.1);
+
+  // v_W / L scratch matrices (random) for assemble_b. Note these are
+  // independent of fit_twist's outputs; the kernel just consumes them.
+  bc::MatrixX3d v_W_curr = random_n3(rng, kN, -0.5, 0.5);
+  bc::MatrixX3d v_W_prev = random_n3(rng, kN, -0.5, 0.5);
+  bc::MatrixX3d L_curr = random_n3(rng, kN, -0.5, 0.5);
+  bc::MatrixX3d L_prev = random_n3(rng, kN, -0.5, 0.5);
+
+  const double lambda_R = 1e-3;
+  const double lambda_w = 1e-3;
+  const double dt = 1.0 / 240.0;
+  const double L0 = 1.0;
+
+  // -- Reusable device buffers ----------------------------------------------
+  std::vector<double> q0_h = to_rm4_flat(q0);
+  std::vector<double> qx_h = to_rm4_flat(qx);
+  std::vector<double> t0_h = to_rm3_flat(t0);
+  std::vector<double> tx_h = to_rm3_flat(tx);
+  std::vector<double> c_CC_h = to_rm3_flat(c_CC);
+  std::vector<double> w_h = to_rm3_flat(w_in);
+  std::vector<double> v_h = to_rm3_flat(v_in);
+
+  bc::DeviceBuffer q0_d, qx_d, t0_d, tx_d, c_CC_d, w_d, v_d, mass_d;
+  q0_d.copy_from_host(q0_h.data(), q0_h.size());
+  qx_d.copy_from_host(qx_h.data(), qx_h.size());
+  t0_d.copy_from_host(t0_h.data(), t0_h.size());
+  tx_d.copy_from_host(tx_h.data(), tx_h.size());
+  c_CC_d.copy_from_host(c_CC_h.data(), c_CC_h.size());
+  w_d.copy_from_host(w_h.data(), w_h.size());
+  v_d.copy_from_host(v_h.data(), v_h.size());
+  mass_d.copy_from_host(mass.data(), static_cast<std::size_t>(mass.size()));
+
+  std::vector<TestResult> results;
+
+  // -------------------------------------------------------------------------
+  // 1. fit_se3
+  // -------------------------------------------------------------------------
+  {
+    bc::ref_cpu::Transformd cpu = bc::ref_cpu::fit_se3(
+        q0, t0, qx, tx, mass, total_mass, lambda_R);
+
+    double q_out[4] = {0, 0, 0, 0};
+    double t_out[3] = {0, 0, 0};
+    bc::cuda_kernels::fit_se3(kN, q0_d.data(), t0_d.data(), qx_d.data(),
+                              tx_d.data(), mass_d.data(), total_mass,
+                              lambda_R, q_out, t_out);
+
+    Eigen::Quaterniond q_gpu;
+    q_gpu.coeffs() << q_out[0], q_out[1], q_out[2], q_out[3];
+
+    double q_diff = quat_diff_sign_invariant(cpu.q, q_gpu);
+    double t_diff =
+        (cpu.t - Eigen::Vector3d(t_out[0], t_out[1], t_out[2])).cwiseAbs().maxCoeff();
+    double max_d = std::max(q_diff, t_diff);
+    char buf[160];
+    std::snprintf(buf, sizeof(buf), "(q_diff=%.3e, t_diff=%.3e)", q_diff,
+                  t_diff);
+    constexpr double tol = 1e-9;
+    TestResult r{"fit_se3", max_d <= tol, max_d, tol, buf};
+    if (r.passed)
+      log_pass(r.name, max_d, buf);
+    else
+      log_fail(r.name, max_d, buf);
+    results.push_back(std::move(r));
+  }
+
+  // -------------------------------------------------------------------------
+  // 2. fit_twist
+  //
+  // Use T_W_CC = identity quaternion (xyzw = (0, 0, 0, 1)) + zero translation
+  // so the c_W = c_CC simplification holds and any divergence comes from the
+  // twist solve itself.
+  // -------------------------------------------------------------------------
+  {
+    bc::ref_cpu::Transformd T_W_CC{Eigen::Quaterniond::Identity(),
+                                   bc::Vector3d::Zero()};
+    bc::ref_cpu::TwistFitResult cpu = bc::ref_cpu::fit_twist(
+        w_in, v_in, c_CC, T_W_CC, mass, total_mass, lambda_w);
+
+    const double q_W_CC[4] = {0.0, 0.0, 0.0, 1.0};
+    const double t_W_CC[3] = {0.0, 0.0, 0.0};
+    double w0_out[3] = {0, 0, 0};
+    double v0_out[3] = {0, 0, 0};
+    // FIX(review): stripped cast target restored (3 * kN doubles).
+    bc::DeviceBuffer v_W_d(static_cast<std::size_t>(3) * kN);
+
+    bc::cuda_kernels::fit_twist(kN, w_d.data(), v_d.data(), c_CC_d.data(),
+                                q_W_CC, t_W_CC, mass_d.data(), total_mass,
+                                lambda_w, w0_out, v0_out, v_W_d.data());
+
+    Eigen::Vector3d w0_gpu(w0_out[0], w0_out[1], w0_out[2]);
+    Eigen::Vector3d v0_gpu(v0_out[0], v0_out[1], v0_out[2]);
+    double w0_diff = (cpu.w0 - w0_gpu).cwiseAbs().maxCoeff();
+    double v0_diff = (cpu.v0 - v0_gpu).cwiseAbs().maxCoeff();
+
+    std::vector<double> v_W_h(static_cast<std::size_t>(3) * kN);
+    v_W_d.copy_to_host(v_W_h.data(), v_W_h.size());
+    double vW_diff = max_abs_diff_n3(v_W_h.data(), cpu.v_W);
+
+    double max_d = std::max({w0_diff, v0_diff, vW_diff});
+    char buf[200];
+    std::snprintf(buf, sizeof(buf),
+                  "(w0_diff=%.3e, v0_diff=%.3e, v_W_diff=%.3e)", w0_diff,
+                  v0_diff, vW_diff);
+    constexpr double tol = 1e-9;
+    TestResult r{"fit_twist", max_d <= tol, max_d, tol, buf};
+    if (r.passed)
+      log_pass(r.name, max_d, buf);
+    else
+      log_fail(r.name, max_d, buf);
+    results.push_back(std::move(r));
+  }
+
+  // -------------------------------------------------------------------------
+  // 3. compute_Pi
+  // -------------------------------------------------------------------------
+  {
+    Eigen::Quaterniond qm = random_unit_quat(rng);
+    Eigen::Quaterniond qp = random_unit_quat(rng);
+
+    Eigen::Matrix3d Pi_cpu = bc::ref_cpu::compute_Pi(qm, qp);
+
+    double qm_a[4] = {qm.x(), qm.y(), qm.z(), qm.w()};
+    double qp_a[4] = {qp.x(), qp.y(), qp.z(), qp.w()};
+    double Pi_gpu[9] = {0};
+    bc::cuda_kernels::compute_Pi_host(qm_a, qp_a, Pi_gpu);
+
+    double max_d = max_abs_diff_3x3(Pi_gpu, Pi_cpu);
+    char buf[80];
+    std::snprintf(buf, sizeof(buf), "(3x3)");
+    constexpr double tol = 1e-12;
+    TestResult r{"compute_Pi", max_d <= tol, max_d, tol, buf};
+    if (r.passed)
+      log_pass(r.name, max_d, buf);
+    else
+      log_fail(r.name, max_d, buf);
+    results.push_back(std::move(r));
+  }
+
+  // -------------------------------------------------------------------------
+  // 4. compute_L
+  // -------------------------------------------------------------------------
+  {
+    Eigen::Quaterniond q_W_CC = random_unit_quat(rng);
+    Eigen::Vector3d w0(0.7, -0.4, 0.2);
+
+    bc::MatrixX3d L_ref = bc::ref_cpu::compute_L(I_flat, q_W_CC, w0);
+
+    bc::DeviceBuffer I_CC_d;
+    // I_flat is already RowMajor; .data() is row-major (N, 9) flatten.
+    I_CC_d.copy_from_host(I_flat.data(),
+                          static_cast<std::size_t>(I_flat.size()));
+
+    double q_a[4] = {q_W_CC.x(), q_W_CC.y(), q_W_CC.z(), q_W_CC.w()};
+    double w_a[3] = {w0.x(), w0.y(), w0.z()};
+    bc::DeviceBuffer L_d(static_cast<std::size_t>(3) * kN);
+    bc::cuda_kernels::compute_L(kN, I_CC_d.data(), q_a, w_a, L_d.data());
+
+    std::vector<double> L_h(static_cast<std::size_t>(3) * kN);
+    L_d.copy_to_host(L_h.data(), L_h.size());
+
+    double max_d = max_abs_diff_n3(L_h.data(), L_ref);
+    char buf[80];
+    std::snprintf(buf, sizeof(buf), "(N=%d, 3 cols)", kN);
+    constexpr double tol = 1e-9;
+    TestResult r{"compute_L", max_d <= tol, max_d, tol, buf};
+    if (r.passed)
+      log_pass(r.name, max_d, buf);
+    else
+      log_fail(r.name, max_d, buf);
+    results.push_back(std::move(r));
+  }
+
+  // -------------------------------------------------------------------------
+  // 5. assemble_b
+  //
+  // ref_cpu::assemble_b only reads sys.{num_parts, mass, L0} and
+  // in.{dt, J, H} (sparse blocks of HostSystem are untouched), so we leave
+  // the rest at default-constructed.
+  // -------------------------------------------------------------------------
+  {
+    bc::HostSystem sys;
+    sys.num_parts = kN;
+    sys.mass = mass;
+    sys.L0 = L0;
+
+    bc::HostInput in;
+    in.dt = dt;
+    in.J = J;
+    in.H = H;
+
+    Eigen::Matrix3d Pi;
+    Pi << 1.1, 0.2, -0.1,
+        -0.05, 0.95, 0.07,
+        0.03, -0.02, 1.03;
+
+    bc::VectorXd b_cpu = bc::ref_cpu::assemble_b(sys, in, v_W_curr, v_W_prev,
+                                                 L_curr, L_prev, Pi);
+
+    std::vector<double> v_W_curr_h = to_rm3_flat(v_W_curr);
+    std::vector<double> v_W_prev_h = to_rm3_flat(v_W_prev);
+    std::vector<double> L_curr_h = to_rm3_flat(L_curr);
+    std::vector<double> L_prev_h = to_rm3_flat(L_prev);
+    std::vector<double> J_h = to_rm3_flat(J);
+    std::vector<double> H_h = to_rm3_flat(H);
+
+    bc::DeviceBuffer v_W_curr_d, v_W_prev_d, L_curr_d, L_prev_d, J_d,
+        H_d;
+    v_W_curr_d.copy_from_host(v_W_curr_h.data(), v_W_curr_h.size());
+    v_W_prev_d.copy_from_host(v_W_prev_h.data(), v_W_prev_h.size());
+    L_curr_d.copy_from_host(L_curr_h.data(), L_curr_h.size());
+    L_prev_d.copy_from_host(L_prev_h.data(), L_prev_h.size());
+    J_d.copy_from_host(J_h.data(), J_h.size());
+    H_d.copy_from_host(H_h.data(), H_h.size());
+
+    // Pi in row-major flat form for the CUDA kernel.
+    double Pi_flat[9];
+    for (int i = 0; i < 3; ++i)
+      for (int j = 0; j < 3; ++j) Pi_flat[3 * i + j] = Pi(i, j);
+
+    bc::DeviceBuffer b_d(static_cast<std::size_t>(6) * kN);
+    bc::cuda_kernels::assemble_b(kN, v_W_curr_d.data(), v_W_prev_d.data(),
+                                 L_curr_d.data(), L_prev_d.data(), J_d.data(),
+                                 H_d.data(), mass_d.data(), Pi_flat, dt, L0,
+                                 b_d.data());
+
+    std::vector<double> b_h(static_cast<std::size_t>(6) * kN);
+    b_d.copy_to_host(b_h.data(), b_h.size());
+
+    // b_cpu is laid out row-major as (N, 6); compare element-wise to the
+    // (already row-major) GPU buffer.
+    double max_d = 0.0;
+    for (std::size_t i = 0; i < b_h.size(); ++i) {
+      // FIX(review): stripped cast target restored — VectorXd is indexed
+      // with Eigen::Index, not std::size_t.
+      max_d = std::max(max_d,
+                       std::abs(b_h[i] - b_cpu(static_cast<Eigen::Index>(i))));
+    }
+    char buf[80];
+    std::snprintf(buf, sizeof(buf), "(6N=%d)", 6 * kN);
+    constexpr double tol = 1e-9;
+    TestResult r{"assemble_b", max_d <= tol, max_d, tol, buf};
+    if (r.passed)
+      log_pass(r.name, max_d, buf);
+    else
+      log_fail(r.name, max_d, buf);
+    results.push_back(std::move(r));
+  }
+
+  // -------------------------------------------------------------------------
+  // Summary table.
+  // -------------------------------------------------------------------------
+  std::printf("\n=========================== Summary ===========================\n");
+  std::printf("%-15s %-6s %-12s %-12s %s\n", "kernel", "status", "max_diff",
+              "tol", "detail");
+  std::printf("---------------------------------------------------------------\n");
+  bool all_pass = true;
+  for (const auto &r : results) {
+    std::printf("%-15s %-6s %-12.3e %-12.3e %s\n", r.name.c_str(),
+                r.passed ? "PASS" : "FAIL", r.max_abs_diff, r.tol,
+                r.detail.c_str());
+    if (!r.passed) all_pass = false;
+  }
+  std::printf("===============================================================\n");
+  std::printf("%s\n", all_pass ? "OVERALL: PASS" : "OVERALL: FAIL");
+  return all_pass ? 0 : 1;
+}
diff --git a/native/cuda/breakage_cuda/tests/test_full_pipeline.cpp b/native/cuda/breakage_cuda/tests/test_full_pipeline.cpp
new file mode 100644
index 0000000..e6b6e77
--- /dev/null
+++ b/native/cuda/breakage_cuda/tests/test_full_pipeline.cpp
@@ -0,0 +1,329 @@
+// SPDX-License-Identifier: MIT
+//
+// End-to-end pipeline test: drives the OSQP-backed CPU baseline
+// (`ref_cpu::solve_full`) and the hand-written CUDA pipeline
+// (`cuda_qp::Pipeline`) on the same input and checks that they agree on the
+// un-whitened solution `x` and the per-clutch utilization vector.
+//
+// Two modes:
+//   1. JSON-driven: argv[1] is a path to a debug-dump JSON produced by
+//      bricksim::BreakageChecker (i.e. one of the dumps that
+//      `breakage_cuda::load_debug_dump` accepts).
+//   2. Synthetic fallback (no args): builds the smallest non-trivial
+//      HostSystem (N=2 parts, K=1 clutch, ncv=0) by hand. The matrices follow
+//      the `BreakageSystem::check_shape` invariants from breakage.cppm:
+//      `num_vars = ncv + 9*K`, `num_eq = 6*N`, V is (mh, K), etc.
+//
+// Tolerance: relative inf-norm error <= 1e-2. Both solvers are independent
+// ADMM implementations so bit-exact agreement is not expected; we only check
+// that they land in the same neighbourhood.
+
+#include "breakage_cuda/qp_solver.hpp"
+#include "breakage_cuda/ref_cpu.hpp"
+#include "breakage_cuda/system.hpp"
+
+// NOTE(review): the include targets below were stripped by a transcoding
+// step; reconstructed from the Eigen / std:: names used in this TU — the
+// exact original set should be confirmed against the upstream patch.
+#include <Eigen/Dense>
+#include <Eigen/Geometry>
+#include <Eigen/SparseCore>
+
+#include <cstdio>
+#include <cstdlib>
+#include <exception>
+#include <string>
+#include <utility>
+#include <vector>
+
+namespace {
+
+using namespace breakage_cuda;
+using Trip = Eigen::Triplet<double>;
+
+// ---------------------------------------------------------------------------
+// Synthetic builder.
+//
+// Layout: 2 parts arranged on the x-axis at +/-0.5 m, joined by 1 clutch
+// (so num_clutches = 1, num_contact_vertices = 0, num_vars = 9). Each per-
+// part 3x3 inertia is the identity. Q is a small positive-definite diagonal
+// (matches the contact regularization role from breakage.cppm). A is set up
+// so that the first three "axial" variables of the clutch couple the two
+// parts' linear residuals with opposite signs (Newton's third law for the
+// linear part); the angular block of A is zero, so the angular components of
+// b appear as projection-stage slack but the rest of the QP is well-posed.
+// G enforces a non-negativity on the axial variable; H + V give two friction-
+// cone-like rows so the relaxation stage has something to do.
+// ---------------------------------------------------------------------------
+HostSystem make_synthetic_system() {
+  HostSystem sys;
+  const int N = 2;
+  const int K = 1;
+
+  sys.num_parts = N;
+  sys.num_clutches = K;
+  sys.num_contact_vertices = 0;
+  sys.num_vars = 9 * K;
+  sys.num_eq = 6 * N;
+  sys.num_ineq = 1;
+  sys.num_relaxed_ineq = 2;
+  sys.total_mass = 2.0;
+  sys.L0 = 1.0;
+
+  sys.mass = VectorXd::Constant(N, 1.0);
+
+  sys.q_CC.resize(N, 4);
+  sys.q_CC.row(0) << 0.0, 0.0, 0.0, 1.0;
+  sys.q_CC.row(1) << 0.0, 0.0, 0.0, 1.0;
+
+  sys.c_CC.resize(N, 3);
+  sys.c_CC.row(0) << -0.5, 0.0, 0.0;
+  sys.c_CC.row(1) << 0.5, 0.0, 0.0;
+
+  sys.I_CC.resize(N, 9);
+  for (int i = 0; i < N; ++i) {
+    sys.I_CC.row(i) << 1.0, 0.0, 0.0,
+        0.0, 1.0, 0.0,
+        0.0, 0.0, 1.0;
+  }
+
+  // Q: 0.1 * I_9. Small positive definite; mirrors ContactRegularization on
+  // the diagonal in breakage.cppm.
+  // FIX(review): the triplet vectors' stripped element type restored
+  // (`std::vector<Trip>`), matching the Trip alias above.
+  std::vector<Trip> Qt;
+  Qt.reserve(9);
+  for (int i = 0; i < 9; ++i) Qt.emplace_back(i, i, 0.1);
+  sys.Q.resize(9, 9);
+  sys.Q.setFromTriplets(Qt.begin(), Qt.end());
+  sys.Q.makeCompressed();
+
+  // A: 12 x 9.
+  //   Linear coupling: -I_3 on cols 0..2 of A_i (rows 0..2),
+  //                    +I_3 on cols 0..2 of A_j (rows 6..8).
+  //   Angular rows (3..5 and 9..11) are zero - the angular component of b
+  //   becomes projection-stage slack on both backends.
+  std::vector<Trip> At;
+  At.reserve(6);
+  for (int axis = 0; axis < 3; ++axis) {
+    At.emplace_back(axis, axis, -1.0);
+    At.emplace_back(6 + axis, axis, 1.0);
+  }
+  sys.A.resize(12, 9);
+  sys.A.setFromTriplets(At.begin(), At.end());
+  sys.A.makeCompressed();
+
+  // G: x[0] >= 0 (axial force is compressive).
+  std::vector<Trip> Gt;
+  Gt.emplace_back(0, 0, 1.0);
+  sys.G.resize(1, 9);
+  sys.G.setFromTriplets(Gt.begin(), Gt.end());
+  sys.G.makeCompressed();
+
+  // H: 2 friction-cone rows of small magnitude so the limit stays slack at
+  // the optimum (we want to compare unconstrained-ish behaviour, not test
+  // who clamps first).
+  std::vector<Trip> Ht;
+  Ht.emplace_back(0, 0, -0.1);
+  Ht.emplace_back(0, 1, 0.1);
+  Ht.emplace_back(1, 0, -0.1);
+  Ht.emplace_back(1, 1, -0.1);
+  sys.H.resize(2, 9);
+  sys.H.setFromTriplets(Ht.begin(), Ht.end());
+  sys.H.makeCompressed();
+
+  // V: 2 x 1, both relaxed-ineq rows belong to clutch 0.
+  std::vector<Trip> Vt;
+  Vt.emplace_back(0, 0, 1.0);
+  Vt.emplace_back(1, 0, 1.0);
+  sys.V.resize(2, 1);
+  sys.V.setFromTriplets(Vt.begin(), Vt.end());
+  sys.V.makeCompressed();
+
+  // Capacity rows: both for clutch 0; structured per breakage.cppm
+  // (head<3>() = "force used", tail<6>() = "force capacity offset").
+  sys.capacity_clutch_indices = {0, 0};
+  sys.capacities.resize(2, 9);
+  sys.capacities.row(0) << 1.0, 0.0, 0.0, 0.1, 0.0, 0.0, 0.0, 0.0, 0.0;
+  sys.capacities.row(1) << 1.0, 0.0, 0.0, 0.1, 0.0, 0.0, 0.0, 0.0, 0.0;
+
+  // Wk = identity (so CPU's column-major Map and GPU's row-major read
+  // produce the same matrix; in production both bricksim and breakage_cuda
+  // assume Wk is symmetric so the two readouts agree by construction).
+  sys.clutch_whiten.resize(1, 4);
+  sys.clutch_whiten.row(0) << 1.0, 0.0, 0.0, 1.0;
+
+  return sys;
+}
+
+HostInput make_synthetic_input() {
+  HostInput in;
+  const int N = 2;
+
+  in.dt = 0.01;
+  in.w = MatrixX3d::Zero(N, 3);
+
+  // Equal-and-opposite y velocities. After fit_twist this picks up a tiny
+  // z-axis rotation (w0_z = -0.01) and produces both linear AND angular
+  // contributions to b.
+  in.v.resize(N, 3);
+  in.v.row(0) << 0.0, 0.05, 0.0;
+  in.v.row(1) << 0.0, -0.05, 0.0;
+
+  in.q.resize(N, 4);
+  in.q.row(0) << 0.0, 0.0, 0.0, 1.0;
+  in.q.row(1) << 0.0, 0.0, 0.0, 1.0;
+
+  // Very small displacement off c_CC; fit_se3 still recovers ~identity.
+  in.c.resize(N, 3);
+  in.c.row(0) << -0.5, 0.0005, 0.0;
+  in.c.row(1) << 0.5, -0.0005, 0.0;
+
+  in.J = MatrixX3d::Zero(N, 3);
+  in.H = MatrixX3d::Zero(N, 3);
+  return in;
+}
+
+HostState make_synthetic_state(int N) {
+  HostState st;
+  st.q_W_CC_prev = Eigen::Quaterniond::Identity();
+  st.v_W_prev = MatrixX3d::Zero(N, 3);
+  st.L_prev = MatrixX3d::Zero(N, 3);
+  return st;
+}
+
+// ---------------------------------------------------------------------------
+// Pretty-printing.
+// ---------------------------------------------------------------------------
+void print_solve_info(const char *label, const SolveInfo &info) {
+  std::printf("    %-3s prj iter=%4d res(p,d)=(%.2e,%.2e) %s | "
+              "rlx iter=%4d res(p,d)=(%.2e,%.2e) %s | "
+              "opt iter=%4d res(p,d)=(%.2e,%.2e) %s\n",
+              label,
+              info.prj.iter, info.prj.prim_res, info.prj.dual_res,
+              info.prj.converged ? "OK  " : "FAIL",
+              info.rlx.iter, info.rlx.prim_res, info.rlx.dual_res,
+              info.rlx.converged ? "OK  " : "FAIL",
+              info.opt.iter, info.opt.prim_res, info.opt.dual_res,
+              info.opt.converged ? "OK  " : "FAIL");
+}
+
+bool run_compare(const char *label, const HostSystem &sys, const HostInput &in,
+                 HostState state_cpu, HostState state_gpu,
+                 const Thresholds &thr) {
+  std::printf("[%s] num_parts=%d num_clutches=%d num_vars=%d num_eq=%d "
+              "num_ineq=%d num_relaxed_ineq=%d\n",
+              label, sys.num_parts, sys.num_clutches, sys.num_vars,
+              sys.num_eq, sys.num_ineq, sys.num_relaxed_ineq);
+
+  Solution sol_cpu = ref_cpu::solve_full(sys, in, state_cpu, thr);
+  // FIX(review): stripped cast targets restored throughout this function
+  // (`static_cast<int>` for bool -> %d, `static_cast<long long>` for
+  // Eigen::Index -> %lld).
+  std::printf("  cpu converged=%d slack_fraction=%.3e\n",
+              static_cast<int>(sol_cpu.info.converged),
+              sol_cpu.slack_fraction);
+  print_solve_info("cpu", sol_cpu.info);
+
+  cuda_qp::Pipeline pipe;
+  pipe.setup(sys, thr);
+  Solution sol_gpu = pipe.solve(in, state_gpu);
+  std::printf("  gpu converged=%d slack_fraction=%.3e\n",
+              static_cast<int>(sol_gpu.info.converged),
+              sol_gpu.slack_fraction);
+  print_solve_info("gpu", sol_gpu.info);
+
+  bool ok = true;
+  if (!sol_cpu.info.converged) {
+    std::printf("[FAIL] %s: cpu solver did not converge\n", label);
+    ok = false;
+  }
+  if (!sol_gpu.info.converged) {
+    std::printf("[FAIL] %s: gpu solver did not converge\n", label);
+    ok = false;
+  }
+  if (!ok) return false;
+
+  if (sol_cpu.x.size() != sol_gpu.x.size()) {
+    std::printf("[FAIL] %s: x size mismatch (cpu=%lld gpu=%lld)\n", label,
+                static_cast<long long>(sol_cpu.x.size()),
+                static_cast<long long>(sol_gpu.x.size()));
+    return false;
+  }
+  if (sol_cpu.utilization.size() != sol_gpu.utilization.size()) {
+    std::printf("[FAIL] %s: utilization size mismatch (cpu=%lld gpu=%lld)\n",
+                label,
+                static_cast<long long>(sol_cpu.utilization.size()),
+                static_cast<long long>(sol_gpu.utilization.size()));
+    return false;
+  }
+
+  // Floor: 1e-3 absolute. With an x of order ~1, 1e-2 relative ~ 1e-2
+  // absolute, which is well above the floor; tiny solutions still get a
+  // sensible scale to compare against.
+  constexpr double kFloor = 1e-3;
+  constexpr double kTol = 1e-2;
+
+  const double err_x = relative_inf_error(sol_gpu.x, sol_cpu.x, kFloor);
+  std::printf("  x: cpu_inf=%.4e gpu_inf=%.4e abs_diff_inf=%.4e "
+              "rel_inf_err=%.4e\n",
+              sol_cpu.x.cwiseAbs().maxCoeff(),
+              sol_gpu.x.cwiseAbs().maxCoeff(),
+              (sol_cpu.x - sol_gpu.x).cwiseAbs().maxCoeff(), err_x);
+  if (err_x <= kTol) {
+    std::printf("[PASS] full_pipeline.x rel_inf=%.4e <= %.0e\n",
+                err_x, kTol);
+  } else {
+    std::printf("[FAIL] full_pipeline.x rel_inf=%.4e > %.0e\n",
+                err_x, kTol);
+    ok = false;
+  }
+
+  const double err_u =
+      relative_inf_error(sol_gpu.utilization, sol_cpu.utilization, kFloor);
+  std::printf("  utilization: cpu_inf=%.4e gpu_inf=%.4e abs_diff_inf=%.4e "
+              "rel_inf_err=%.4e\n",
+              sol_cpu.utilization.cwiseAbs().maxCoeff(),
+              sol_gpu.utilization.cwiseAbs().maxCoeff(),
+              (sol_cpu.utilization - sol_gpu.utilization)
+                  .cwiseAbs()
+                  .maxCoeff(),
+              err_u);
+  if (err_u <= kTol) {
+    std::printf("[PASS] full_pipeline.utilization rel_inf=%.4e <= %.0e\n",
+                err_u, kTol);
+  } else {
+    std::printf("[FAIL] full_pipeline.utilization rel_inf=%.4e > %.0e\n",
+                err_u, kTol);
+    ok = false;
+  }
+
+  return ok;
+}
+
+bool run_json_mode(const std::string &path) {
+  std::printf("[full_pipeline] (JSON mode) loading %s\n", path.c_str());
+  DebugDump dump = load_debug_dump(path);
+  // Both solvers mutate state; clone so each sees a fresh copy.
+  HostState state_cpu = dump.state;
+  HostState state_gpu = dump.state;
+  return run_compare("json", dump.system, dump.input, std::move(state_cpu),
+                     std::move(state_gpu), dump.thresholds);
+}
+
+bool run_synthetic_mode() {
+  std::printf("[full_pipeline] (synthetic mode)\n");
+  HostSystem sys = make_synthetic_system();
+  HostInput in = make_synthetic_input();
+  HostState state_cpu = make_synthetic_state(sys.num_parts);
+  HostState state_gpu = make_synthetic_state(sys.num_parts);
+  Thresholds thr;
+  thr.enabled = true;
+  return run_compare("synthetic", sys, in, std::move(state_cpu),
+                     std::move(state_gpu), thr);
+}
+
+} // namespace
+
+int main(int argc, char **argv) {
+  try {
+    bool ok = (argc >= 2) ? run_json_mode(argv[1]) : run_synthetic_mode();
+    std::printf("[full_pipeline] %s\n", ok ? "OK" : "FAILED");
+    return ok ? 0 : 1;
+  } catch (const std::exception &e) {
+    std::fprintf(stderr, "[ERROR] full_pipeline: %s\n", e.what());
+    return 2;
+  }
+}
diff --git a/native/cuda/breakage_cuda/tests/test_qp_random.cpp b/native/cuda/breakage_cuda/tests/test_qp_random.cpp
new file mode 100644
index 0000000..c3234f4
--- /dev/null
+++ b/native/cuda/breakage_cuda/tests/test_qp_random.cpp
@@ -0,0 +1,283 @@
+// SPDX-License-Identifier: MIT
+//
+// Cross-validation test for the CUDA QP solver against the CPU reference
+// (real OSQP). Builds a tiny synthetic HostSystem and runs the same RHS
+// through both backends, then compares their stage-3 `x` outputs via a
+// relative inf-norm error.
+// +// Toy HostSystem layout (NOT a physical breakage problem; the QP solvers +// are oblivious to the per-part fields, so we only populate the matrices +// that `Solver::setup` / `InternalQpState::setup_from` actually read): +// +// num_parts = 1 => num_eq = me = 6 +// num_clutches = 1 => nv = 1 +// num_contact_vertices = 1 => num_vars = nx = 1 + 9*1 = 10 (consistent +// with +// breakage.cppm +// invariant) +// num_ineq = mi = 2 +// num_relaxed_ineq = mh = 2 +// +// A in R^{6x10}: [ I_6 | 0_{6x4} ] +// -> stage 1 finds b_prj = b for any b (range(A) = R^6), +// and stages 2/3 force x.head(6) == b exactly. +// Q in R^{10x10}: I_10 + 0.5 * sum_{i=0..3} (e_i e_{6+i}^T + e_{6+i} e_i^T) +// -> SPD (each (i, 6+i) 2x2 block has eigenvalues 0.5, 1.5; +// other diagonals are 1.0; condition number 3). +// -> stage 3 unconstrained min over y = x.tail(4) is +// y* = -0.5 * b.head(4). +// G in R^{2x10}: rows pick x[6], x[7] (so G x = [x[6]; x[7]] >= -eps). +// H in R^{2x10}: rows pick x[8], x[9] (so H x = [x[8]; x[9]] <= 1+V*v). +// V in R^{2x1}: ones, so the single clutch slack v couples both H rows. +// +// We pick b = [4, 4, -4, -4, 0, 0]: +// Stage 2 unconstrained optimum is x.tail(4) = 0, v = 0 -> v_relax = 0. +// Stage 3 unconstrained optimum y* = (-2, -2, 2, 2) violates ALL FOUR +// inequality constraints, so the constrained minimum hits the box at +// (y[0], y[1], y[2], y[3]) = (-eps, -eps, 1, 1). All three stages run +// with non-trivial active sets. +// +// The OSQP baseline and the hand-rolled CUDA ADMM are independent solvers +// with different inner KKT machinery (OSQP factors, CUDA uses CG on the +// reduced normal equations) and different exact convergence criteria, so +// we do NOT expect bit equality. Tolerance: relative inf-norm error +// `||x_cuda - x_cpu||_inf / max(1, ||x_cpu||_inf) <= 1e-2`. Per-stage +// iter / prim_res / dual_res are printed for both backends so we can see +// whether either one stalled. 
+//
+// SOURCE-FILE FIX APPLIED while writing this test:
+//   src/qp_solver.cu, Solver::solve, opt-stage upper bound:
+//     `u_opt.tail(mh_).setConstant(1.0 - kEpsilon);` was wrong --
+//     osqp.cppm (the source of truth) and ref_cpu.cpp both use plain 1.0
+//     for the H-row upper bound in stage 3 (the `1 - epsilon` slack
+//     belongs to stage 2 only). Changed to `setConstant(1.0)` so the two
+//     backends solve the same QP. Without this fix the CUDA opt-stage
+//     ceiling sits 1e-4 below OSQP's, which biases every active H-row
+//     constraint by epsilon.
+
+#include "breakage_cuda/device.hpp"
+#include "breakage_cuda/qp_solver.hpp"
+#include "breakage_cuda/ref_cpu.hpp"
+#include "breakage_cuda/system.hpp"
+
+// NOTE(review): include targets were stripped by a transcoding step;
+// reconstructed from the names used in this TU — confirm against the
+// upstream patch.
+#include <Eigen/Dense>
+#include <Eigen/SparseCore>
+
+#include <cuda_runtime.h>
+
+#include <cmath>
+#include <cstdio>
+#include <exception>
+#include <string>
+#include <vector>
+
+using breakage_cuda::DeviceBuffer;
+using breakage_cuda::DeviceSystem;
+using breakage_cuda::HostState;
+using breakage_cuda::HostSystem;
+using breakage_cuda::MatrixX3d;
+using breakage_cuda::MatrixX4d;
+using breakage_cuda::SolveInfo;
+using breakage_cuda::SparseMatrixCSC;
+using breakage_cuda::StageInfo;
+using breakage_cuda::VectorXd;
+using breakage_cuda::relative_inf_error;
+
+namespace {
+
+SparseMatrixCSC dense_to_csc(const Eigen::MatrixXd &m) {
+  // FIX(review): stripped template/cast targets restored
+  // (`std::vector<Eigen::Triplet<double>>`, `static_cast<std::size_t>`).
+  std::vector<Eigen::Triplet<double>> trips;
+  trips.reserve(static_cast<std::size_t>(m.rows()) *
+                static_cast<std::size_t>(m.cols()));
+  for (int j = 0; j < m.cols(); ++j) {
+    for (int i = 0; i < m.rows(); ++i) {
+      const double v = m(i, j);
+      if (v != 0.0) {
+        trips.emplace_back(i, j, v);
+      }
+    }
+  }
+  SparseMatrixCSC sp(m.rows(), m.cols());
+  sp.setFromTriplets(trips.begin(), trips.end());
+  sp.makeCompressed();
+  return sp;
+}
+
+HostSystem build_toy_system() {
+  HostSystem sys;
+  const int N = 1;
+  const int K = 1;
+  const int ncv = 1;
+  const int nx = ncv + 9 * K;
+  const int me = 6 * N;
+  const int mi = 2;
+  const int mh = 2;
+
+  sys.num_parts = N;
+  sys.num_clutches = K;
+  sys.num_contact_vertices = ncv;
+  sys.num_vars = nx;
+  sys.num_eq = me;
+  sys.num_ineq = mi;
+  sys.num_relaxed_ineq = mh;
+  sys.total_mass = 1.0;
+  sys.L0 = 1.0;
+
+  // Per-part dense fields are unused by the QP solvers (they're only
+  // touched by the dense kernels in the full pipeline). Populate them
+  // with sane defaults so downstream consumers wouldn't trip on garbage.
+  sys.mass = VectorXd::Constant(N, 1.0);
+  sys.q_CC = MatrixX4d::Zero(N, 4);
+  sys.q_CC(0, 3) = 1.0;
+  sys.c_CC = MatrixX3d::Zero(N, 3);
+  sys.I_CC.resize(N, 9);
+  sys.I_CC.setZero();
+  sys.I_CC(0, 0) = 1.0;
+  sys.I_CC(0, 4) = 1.0;
+  sys.I_CC(0, 8) = 1.0;
+
+  Eigen::MatrixXd Qd = Eigen::MatrixXd::Identity(nx, nx);
+  for (int i = 0; i < 4; ++i) {
+    Qd(i, 6 + i) = 0.5;
+    Qd(6 + i, i) = 0.5;
+  }
+  sys.Q = dense_to_csc(Qd);
+
+  Eigen::MatrixXd Ad = Eigen::MatrixXd::Zero(me, nx);
+  for (int i = 0; i < me; ++i) {
+    Ad(i, i) = 1.0;
+  }
+  sys.A = dense_to_csc(Ad);
+
+  Eigen::MatrixXd Gd = Eigen::MatrixXd::Zero(mi, nx);
+  Gd(0, 6) = 1.0;
+  Gd(1, 7) = 1.0;
+  sys.G = dense_to_csc(Gd);
+
+  Eigen::MatrixXd Hd = Eigen::MatrixXd::Zero(mh, nx);
+  Hd(0, 8) = 1.0;
+  Hd(1, 9) = 1.0;
+  sys.H = dense_to_csc(Hd);
+
+  Eigen::MatrixXd Vd = Eigen::MatrixXd::Ones(mh, K);
+  sys.V = dense_to_csc(Vd);
+
+  // Friction-cone metadata is only consulted by the postprocess step in
+  // the full pipeline; leave empty for the QP-only test.
+  sys.capacity_clutch_indices.clear();
+  sys.capacities.resize(0, 9);
+  sys.clutch_whiten.resize(K, 4);
+  sys.clutch_whiten.setZero();
+  sys.clutch_whiten(0, 0) = 1.0;
+  sys.clutch_whiten(0, 3) = 1.0;
+
+  return sys;
+}
+
+void print_stage(const char *name, const StageInfo &cpu, const StageInfo &cuda) {
+  std::printf("  %s:\n", name);
+  std::printf("    CPU  : converged=%d iter=%4d prim_res=%.3e dual_res=%.3e "
+              "rho=%.3e\n",
+              cpu.converged ? 1 : 0, cpu.iter, cpu.prim_res, cpu.dual_res,
+              cpu.rho);
+  std::printf("    CUDA : converged=%d iter=%4d prim_res=%.3e dual_res=%.3e "
+              "rho=%.3e\n",
+              cuda.converged ? 
1 : 0, cuda.iter, cuda.prim_res, cuda.dual_res, + cuda.rho); +} + +} // namespace + +int main() { + try { + HostSystem sys = build_toy_system(); + + VectorXd b(sys.num_eq); + b << 4.0, 4.0, -4.0, -4.0, 0.0, 0.0; + + std::printf("=== test_qp_random ===\n"); + std::printf("Toy QP: N=%d K=%d ncv=%d nx=%d me=%d mi=%d mh=%d nv=%d\n", + sys.num_parts, sys.num_clutches, sys.num_contact_vertices, + sys.num_vars, sys.num_eq, sys.num_ineq, sys.num_relaxed_ineq, + sys.num_clutches); + std::printf("RHS b = [%g, %g, %g, %g, %g, %g]\n", b[0], b[1], b[2], b[3], + b[4], b[5]); + + // -------------------- CPU reference (OSQP) ---------------------------- + HostState cpu_state; + cpu_state.v_W_prev = MatrixX3d::Zero(sys.num_parts, 3); + cpu_state.L_prev = MatrixX3d::Zero(sys.num_parts, 3); + VectorXd x_cpu, slack_cpu, v_relax_cpu; + SolveInfo cpu_info = breakage_cuda::ref_cpu::solve_qp( + sys, b, cpu_state, x_cpu, slack_cpu, v_relax_cpu); + + // -------------------- CUDA solver ------------------------------------- + // Solver::setup ignores its DeviceSystem argument (uses HostSystem + // matrices directly), so a default-constructed one is fine here. 
+ DeviceSystem dev_dummy{}; + breakage_cuda::cuda_qp::Settings settings; + breakage_cuda::cuda_qp::Solver cuda_solver; + cuda_solver.setup(sys, dev_dummy, settings); + + DeviceBuffer x_d(static_cast(sys.num_vars)); + DeviceBuffer slack_d(static_cast(sys.num_eq)); + DeviceBuffer v_relax_d( + static_cast(sys.num_clutches)); + + SolveInfo cuda_info = cuda_solver.solve(b, x_d.data(), slack_d.data(), + v_relax_d.data()); + + VectorXd x_cuda(sys.num_vars); + x_d.copy_to_host(x_cuda.data(), x_cuda.size()); + + VectorXd v_relax_cuda(sys.num_clutches); + if (sys.num_clutches > 0) { + v_relax_d.copy_to_host(v_relax_cuda.data(), v_relax_cuda.size()); + } + + // -------------------- Per-stage info ---------------------------------- + std::printf("\nPer-stage convergence (CPU = OSQP, CUDA = ADMM+CG):\n"); + print_stage("prj", cpu_info.prj, cuda_info.prj); + print_stage("rlx", cpu_info.rlx, cuda_info.rlx); + print_stage("opt", cpu_info.opt, cuda_info.opt); + std::printf(" v_relax: cpu = %.6e cuda = %.6e\n", v_relax_cpu[0], + v_relax_cuda[0]); + + // -------------------- Compare x --------------------------------------- + const double tol = 1e-2; + const double err = relative_inf_error(x_cuda, x_cpu, /*floor=*/1.0); + std::printf("\nx (CPU vs CUDA), relative inf-norm error = %.3e (tol %.0e)\n", + err, tol); + std::printf(" i : x_cpu x_cuda |diff|\n"); + for (int i = 0; i < sys.num_vars; ++i) { + std::printf(" %2d: %12.6f %12.6f %.3e\n", i, x_cpu[i], x_cuda[i], + std::abs(x_cpu[i] - x_cuda[i])); + } + + // -------------------- Pass / fail summary ----------------------------- + bool overall_ok = true; + auto stage_pass = [&](const char *name, bool cpu_conv, bool cuda_conv) { + const bool ok = cpu_conv && cuda_conv; + std::printf("[%s] %s convergence (CPU=%d, CUDA=%d)\n", + ok ? "PASS" : "FAIL", name, cpu_conv ? 1 : 0, + cuda_conv ? 
1 : 0); + if (!ok) overall_ok = false; + }; + stage_pass("prj", cpu_info.prj.converged, cuda_info.prj.converged); + stage_pass("rlx", cpu_info.rlx.converged, cuda_info.rlx.converged); + stage_pass("opt", cpu_info.opt.converged, cuda_info.opt.converged); + + if (err <= tol) { + std::printf("[PASS] x agreement (err = %.3e <= %.0e)\n", err, tol); + } else { + std::printf("[FAIL] x agreement (err = %.3e > %.0e)\n", err, tol); + overall_ok = false; + } + + std::printf("\n%s overall\n", overall_ok ? "[PASS]" : "[FAIL]"); + return overall_ok ? 0 : 1; + } catch (const std::exception &e) { + std::fprintf(stderr, "[FAIL] exception: %s\n", e.what()); + return 1; + } +} diff --git a/native/modules/bricksim/physx/breakage.cppm b/native/modules/bricksim/physx/breakage.cppm index e202ce6..dc79c1d 100644 --- a/native/modules/bricksim/physx/breakage.cppm +++ b/native/modules/bricksim/physx/breakage.cppm @@ -55,9 +55,10 @@ using QpSolver = OsqpSolver; using QpSolverState = OsqpState; using QpSolverInfo = OsqpInfo; -Transformd fit_se3(const MatrixX4d &q0, const MatrixX3d &t0, - const MatrixX4d &qx, const MatrixX3d &tx, - const VectorXd &mass, double total_mass, double lambda_R) { +export Transformd fit_se3(const MatrixX4d &q0, const MatrixX3d &t0, + const MatrixX4d &qx, const MatrixX3d &tx, + const VectorXd &mass, double total_mass, + double lambda_R) { Index N = mass.size(); Vector3d t0_bar = t0.transpose() * mass / total_mass; Vector3d tx_bar = tx.transpose() * mass / total_mass; @@ -87,16 +88,17 @@ Transformd fit_se3(const MatrixX4d &q0, const MatrixX3d &t0, return {q, t}; } -struct TwistFitResult { +export struct TwistFitResult { Vector3d w0; Vector3d v0; MatrixX3d v_W; }; -TwistFitResult fit_twist(const MatrixX3d &w, const MatrixX3d &v, - const MatrixX3d &c_CC, const Transformd &T_W_CC, - const VectorXd &mass, double total_mass, - double lambda_w) { +export TwistFitResult fit_twist(const MatrixX3d &w, const MatrixX3d &v, + const MatrixX3d &c_CC, + const Transformd &T_W_CC, + 
const VectorXd &mass, double total_mass, + double lambda_w) { const auto &[q_W_CC, t_W_CC] = T_W_CC; MatrixX3d c_W = (c_CC * q_W_CC.toRotationMatrix().transpose()).rowwise() + t_W_CC.transpose(); @@ -120,7 +122,7 @@ TwistFitResult fit_twist(const MatrixX3d &w, const MatrixX3d &v, return result; } -Vector3d so3_log_from_unit_quat(Quaterniond q) { +export Vector3d so3_log_from_unit_quat(Quaterniond q) { // log-map branch threshold on s = ||q.vec|| = sin(theta/2) constexpr double kSmallS = 5e-5; q.normalize(); @@ -143,7 +145,7 @@ Vector3d so3_log_from_unit_quat(Quaterniond q) { // Inverse of the Jacobian corresponding to your report's convention: // J(φ) = I + (1-cosθ)/θ^2 [φ]x + (θ-sinθ)/θ^3 [φ]x^2 (this is the "right Jacobian" in many texts) -Matrix3d so3_jacobian_inv(const Vector3d &phi) { +export Matrix3d so3_jacobian_inv(const Vector3d &phi) { // J^{-1} branch threshold on theta = ||phi|| constexpr double kSmallTheta = 1e-4; Matrix3d Phi = phi.asSkewSymmetric(); @@ -163,13 +165,13 @@ Matrix3d so3_jacobian_inv(const Vector3d &phi) { return Matrix3d::Identity() - 0.5 * Phi + c * (Phi * Phi); } -Matrix3d compute_Pi(const Quaterniond &qm, const Quaterniond &qp) { +export Matrix3d compute_Pi(const Quaterniond &qm, const Quaterniond &qp) { // Relative rotation: R_rel = Rm^T Rp return so3_jacobian_inv(so3_log_from_unit_quat(qm.conjugate() * qp)) * qm.toRotationMatrix().transpose(); } -template +export template MatrixX3d compute_L(const MatrixBase &Iflat, const Quaterniond &q_W_CC, const Vector3d &w0) { Vector3d w_CC = q_W_CC.conjugate() * w0; @@ -205,7 +207,7 @@ void add_block_triplets(std::vector> &out, int row0, int col0, } } -Matrix2d inv_sqrt_spd(const Matrix2d &C) { +export Matrix2d inv_sqrt_spd(const Matrix2d &C) { SelfAdjointEigenSolver eig{C}; if (eig.info() != Eigen::Success) { throw std::invalid_argument("inv_sqrt_spd: eigen decomposition failed"); diff --git a/native/tests/TestBreakage.cpp b/native/tests/TestBreakage.cpp new file mode 100644 index 
0000000..d88a9f6 --- /dev/null +++ b/native/tests/TestBreakage.cpp @@ -0,0 +1,433 @@ +import std; +import bricksim.physx.breakage; +import bricksim.utils.transforms; +import bricksim.vendor; + +#include + +using namespace bricksim; + +using Eigen::AngleAxisd; +using Eigen::Dynamic; +using Eigen::Matrix; +using Eigen::Matrix2d; +using Eigen::Matrix3d; +using Eigen::Quaterniond; +using Eigen::Vector3d; +using Eigen::Vector4d; +using Eigen::VectorXd; +using MatrixX3d = Matrix; +using MatrixX4d = Matrix; +using MatrixX9d = Matrix; + +// --------------------- helpers --------------------- + +static constexpr double kTol = 1e-9; + +template +static bool approx_mat(const A &a, const B &b, double tol = kTol) { + if (a.rows() != b.rows() || a.cols() != b.cols()) + return false; + double scale = 1.0 + std::max(a.cwiseAbs().maxCoeff(), + b.cwiseAbs().maxCoeff()); + return (a - b).cwiseAbs().maxCoeff() <= tol * scale; +} + +// Eigen stores `coeffs()` as (x, y, z, w). Same convention is used by the +// breakage code (`fit_se3` reads rows of the input quat matrix as (x,y,z,w) +// and feeds them into Quaterniond's vector ctor). +static Vector4d quat_xyzw(const Quaterniond &q) { + return q.coeffs(); +} + +static Quaterniond axis_angle(const Vector3d &axis_unit, double angle) { + return Quaterniond{AngleAxisd{angle, axis_unit}}; +} + +// Force evaluation of a 3-component Eigen expression into a Vector3d. Avoids +// the brace-init ambiguity between `Vector3d(MatrixBase&)` and +// `Vector3d(Scalar, Scalar, Scalar)` that some compilers stumble on. +template static Vector3d v3(const Expr &e) { + Vector3d out = e; + return out; +} + +// Closed-form right Jacobian J(phi) = I + (1-cosθ)/θ² [φ]× + +// (θ-sinθ)/θ³ [φ]ײ +// (matches the convention so3_jacobian_inv inverts). 
+static Matrix3d so3_jacobian_closed_form(const Vector3d &phi) { + Matrix3d Phi = phi.asSkewSymmetric(); + double theta2 = phi.squaredNorm(); + if (theta2 < 1e-20) { + return Matrix3d::Identity() + 0.5 * Phi + + (1.0 / 6.0) * (Phi * Phi); + } + double theta = std::sqrt(theta2); + double a = (1.0 - std::cos(theta)) / theta2; + double b = (theta - std::sin(theta)) / (theta2 * theta); + return Matrix3d::Identity() + a * Phi + b * (Phi * Phi); +} + +// --------------------- so3_log_from_unit_quat --------------------- + +void test_so3_log_identity() { + Quaterniond q = Quaterniond::Identity(); + Vector3d v = so3_log_from_unit_quat(q); + assert(approx_mat(v, Vector3d::Zero())); +} + +void test_so3_log_axis_angle_round_trip() { + const Vector3d axis = Vector3d{0.3, -0.7, 0.4}.normalized(); + for (double angle : + {0.0, 1e-6, 1e-3, 0.1, 0.5, 1.0, 2.0, 3.0, 3.1, + std::numbers::pi - 1e-3}) { + Quaterniond q = axis_angle(axis, angle); + Vector3d phi = so3_log_from_unit_quat(q); + assert(approx_mat(phi, axis * angle, 1e-9)); + } +} + +void test_so3_log_negative_w_canonicalizes() { + const Vector3d axis = Vector3d{1.0, 0.0, 0.0}; + Quaterniond q = axis_angle(axis, 1.2); + Quaterniond q_neg(-q.w(), -q.x(), -q.y(), -q.z()); + Vector3d phi_pos = so3_log_from_unit_quat(q); + Vector3d phi_neg = so3_log_from_unit_quat(q_neg); + assert(approx_mat(phi_pos, phi_neg)); +} + +void test_so3_log_branch_continuity() { + // 5e-5 is the documented small-s threshold; values just above and below + // must agree to many digits. 
+ const Vector3d axis = Vector3d{0.0, 0.0, 1.0}; + for (double s : {4.0e-5, 5.0e-5, 6.0e-5, 1.0e-4}) { + double angle = 2.0 * std::asin(s); + Quaterniond q = axis_angle(axis, angle); + Vector3d phi = so3_log_from_unit_quat(q); + assert(approx_mat(phi, axis * angle, 1e-12)); + } +} + +// --------------------- so3_jacobian_inv --------------------- + +void test_so3_jacobian_inv_at_zero() { + Matrix3d J = so3_jacobian_inv(Vector3d::Zero()); + assert(approx_mat(J, Matrix3d::Identity())); +} + +void test_so3_jacobian_inv_inverts_jacobian() { + for (Vector3d phi : {Vector3d{1e-6, 0, 0}, + Vector3d{1e-3, 1e-3, 0}, + Vector3d{0.1, -0.2, 0.3}, + Vector3d{1.0, -0.5, 0.7}, + Vector3d{2.5, 0.3, -1.1}}) { + Matrix3d Jinv = so3_jacobian_inv(phi); + Matrix3d J = so3_jacobian_closed_form(phi); + Matrix3d prod = Jinv * J; + assert(approx_mat(prod, Matrix3d::Identity(), 1e-9)); + } +} + +void test_so3_jacobian_inv_branch_continuity() { + // 1e-4 is the documented small-theta threshold. + for (double th : + {0.5e-4, 1.0e-4, 1.5e-4, 2.0e-4, 5.0e-4}) { + Vector3d phi{th, 0.0, 0.0}; + Matrix3d Jinv = so3_jacobian_inv(phi); + Matrix3d J = so3_jacobian_closed_form(phi); + assert(approx_mat(Jinv * J, Matrix3d::Identity(), 1e-9)); + } +} + +// --------------------- compute_Pi --------------------- + +void test_compute_Pi_qm_equals_qp() { + // log(I) = 0 => J^-1(0) = I => Pi = R_m^T + for (Quaterniond q : {Quaterniond::Identity(), + axis_angle(Vector3d::UnitX(), 0.5), + axis_angle(Vector3d{1, 1, 1}.normalized(), 1.2)}) { + Matrix3d Pi = compute_Pi(q, q); + assert(approx_mat(Pi, q.toRotationMatrix().transpose())); + } +} + +void test_compute_Pi_small_relative_rotation() { + // For a small relative rotation R_rel, log(R_rel) is small, J^-1 ~ I, + // so Pi ~ R_m^T. 
+ Quaterniond qm = axis_angle(Vector3d{0.2, 0.3, -0.1}.normalized(), 0.7); + Quaterniond qp = qm * axis_angle(Vector3d::UnitZ(), 1e-7); + Matrix3d Pi = compute_Pi(qm, qp); + assert(approx_mat(Pi, qm.toRotationMatrix().transpose(), 1e-6)); +} + +// --------------------- inv_sqrt_spd --------------------- + +void test_inv_sqrt_spd_identity() { + Matrix2d M = inv_sqrt_spd(Matrix2d::Identity()); + assert(approx_mat(M, Matrix2d::Identity())); +} + +void test_inv_sqrt_spd_diagonal() { + Matrix2d C; + C << 4.0, 0.0, 0.0, 9.0; + Matrix2d M = inv_sqrt_spd(C); + Matrix2d expected; + expected << 0.5, 0.0, 0.0, 1.0 / 3.0; + assert(approx_mat(M, expected)); + assert(approx_mat(M * C * M.transpose(), Matrix2d::Identity())); +} + +void test_inv_sqrt_spd_general() { + Matrix2d C; + C << 5.0, 1.5, 1.5, 2.0; + Matrix2d M = inv_sqrt_spd(C); + Matrix2d should_be_I = M * C * M.transpose(); + assert(approx_mat(should_be_I, Matrix2d::Identity(), 1e-12)); + assert(approx_mat(M, M.transpose())); +} + +void test_inv_sqrt_spd_rejects_non_positive() { + Matrix2d C; + C << -1.0, 0.0, 0.0, 1.0; + bool threw = false; + try { + (void)inv_sqrt_spd(C); + } catch (const std::invalid_argument &) { + threw = true; + } + assert(threw && "inv_sqrt_spd must reject non-positive matrices"); +} + +// --------------------- fit_se3 --------------------- + +void test_fit_se3_identity() { + const int N = 4; + MatrixX4d q0(N, 4); + MatrixX3d t0(N, 3); + q0 << 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1; + t0 << 1.0, 0.0, 0.0, 0.0, 2.0, 0.0, 0.0, 0.0, 3.0, -1.0, -2.0, -3.0; + VectorXd mass(N); + mass << 1.0, 1.0, 1.0, 1.0; + double total = mass.sum(); + Transformd T = fit_se3(q0, t0, q0, t0, mass, total, 0.01); + const auto &[q, t] = T; + assert(approx_mat(q.toRotationMatrix(), Matrix3d::Identity())); + assert(approx_mat(t, Vector3d::Zero(), 1e-12)); +} + +void test_fit_se3_pure_translation() { + const int N = 3; + const Vector3d shift{0.5, -1.5, 2.0}; + MatrixX4d q0(N, 4); + MatrixX3d t0(N, 3); + q0 << 0, 0, 
0, 1, 0, 0, 0, 1, 0, 0, 0, 1; + t0 << 1.0, 0.0, 0.0, 0.0, 2.0, 0.0, 0.0, 0.0, 3.0; + MatrixX3d t1 = t0.rowwise() + shift.transpose(); + VectorXd mass(N); + mass << 1.0, 2.0, 3.0; + double total = mass.sum(); + Transformd T = fit_se3(q0, t0, q0, t1, mass, total, 0.0); + const auto &[q, t] = T; + assert(approx_mat(q.toRotationMatrix(), Matrix3d::Identity())); + assert(approx_mat(t, shift, 1e-12)); +} + +void test_fit_se3_pure_rotation() { + const int N = 5; + // Reference points spread in 3D so SVD is well-conditioned. + MatrixX3d t0(N, 3); + t0 << 1.0, 0.0, 0.0, + -1.0, 0.0, 0.0, + 0.0, 1.0, 0.0, + 0.0, -1.0, 0.0, + 0.0, 0.0, 1.0; + MatrixX4d q0(N, 4); + for (int i = 0; i < N; ++i) { + q0.row(i) = quat_xyzw(Quaterniond::Identity()).transpose(); + } + VectorXd mass = VectorXd::Ones(N); + double total = mass.sum(); + Quaterniond R = axis_angle(Vector3d{0.2, -0.5, 0.8}.normalized(), 0.9); + const Vector3d t_offset{2.0, -3.0, 1.0}; + MatrixX3d t1(N, 3); + for (int i = 0; i < N; ++i) { + t1.row(i) = + (R * v3(t0.row(i).transpose()) + t_offset).transpose(); + } + MatrixX4d q1(N, 4); + for (int i = 0; i < N; ++i) { + q1.row(i) = quat_xyzw(R).transpose(); // body rotates with the rigid frame + } + Transformd T = fit_se3(q0, t0, q1, t1, mass, total, 0.01); + const auto &[q_fit, t_fit] = T; + assert(approx_mat(q_fit.toRotationMatrix(), R.toRotationMatrix(), 1e-9)); + assert(approx_mat(t_fit, t_offset, 1e-9)); +} + +// --------------------- fit_twist --------------------- + +void test_fit_twist_zero() { + const int N = 3; + MatrixX3d w = MatrixX3d::Zero(N, 3); + MatrixX3d v = MatrixX3d::Zero(N, 3); + MatrixX3d c_CC(N, 3); + c_CC << 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0; + Transformd T_W_CC{Quaterniond::Identity(), Vector3d::Zero()}; + VectorXd mass = VectorXd::Ones(N); + double total = mass.sum(); + TwistFitResult r = fit_twist(w, v, c_CC, T_W_CC, mass, total, 0.0); + assert(approx_mat(r.w0, Vector3d::Zero())); + assert(approx_mat(r.v0, Vector3d::Zero())); + 
assert(approx_mat(r.v_W, MatrixX3d::Zero(N, 3))); +} + +void test_fit_twist_pure_translation() { + const int N = 4; + const Vector3d u{0.7, -0.4, 0.2}; + MatrixX3d c_CC(N, 3); + c_CC << 1.0, 0.0, 0.0, + -1.0, 0.0, 0.0, + 0.0, 2.0, 0.0, + 0.0, 0.0, -1.5; + MatrixX3d v(N, 3); + for (int i = 0; i < N; ++i) + v.row(i) = u.transpose(); + MatrixX3d w = MatrixX3d::Zero(N, 3); + Transformd T_W_CC{Quaterniond::Identity(), Vector3d{1.0, 2.0, 3.0}}; + VectorXd mass(N); + mass << 1.0, 2.0, 3.0, 4.0; + double total = mass.sum(); + TwistFitResult r = fit_twist(w, v, c_CC, T_W_CC, mass, total, 0.0); + assert(approx_mat(r.w0, Vector3d::Zero(), 1e-9)); + assert(approx_mat(r.v0, u, 1e-12)); + for (int i = 0; i < N; ++i) + assert(approx_mat(v3(r.v_W.row(i).transpose()), u, 1e-9)); +} + +void test_fit_twist_rigid_rotation_recovers_w() { + const int N = 5; + MatrixX3d c_CC(N, 3); + c_CC << 1.0, 0.0, 0.0, + -1.0, 0.0, 0.0, + 0.0, 1.0, 0.0, + 0.0, -1.0, 0.0, + 0.0, 0.0, 1.5; + VectorXd mass(N); + mass << 1.0, 1.0, 2.0, 2.0, 3.0; + double total = mass.sum(); + + // Place CC in world with a non-identity pose to make sure fit_twist + // projects everything via T_W_CC consistently. + Quaterniond q_W_CC = axis_angle(Vector3d{0.4, 0.5, 0.7}.normalized(), 0.6); + Vector3d t_W_CC{-0.3, 0.8, 1.2}; + Transformd T_W_CC{q_W_CC, t_W_CC}; + + // Synthesize a pure rigid rotation about CoM. 
+ MatrixX3d c_W(N, 3); + for (int i = 0; i < N; ++i) + c_W.row(i) = + (q_W_CC * v3(c_CC.row(i).transpose()) + t_W_CC).transpose(); + Vector3d r = c_W.transpose() * mass / total; + const Vector3d w_true{0.3, -0.5, 0.7}; + MatrixX3d v(N, 3); + for (int i = 0; i < N; ++i) { + Vector3d d = v3(c_W.row(i).transpose()) - r; + v.row(i) = w_true.cross(d).transpose(); + } + MatrixX3d w = MatrixX3d::Zero(N, 3); // body angular velocities ignored when lambda_w = 0 + + TwistFitResult res = fit_twist(w, v, c_CC, T_W_CC, mass, total, 0.0); + assert(approx_mat(res.w0, w_true, 1e-9)); + assert(approx_mat(res.v0, Vector3d::Zero(), 1e-9)); + for (int i = 0; i < N; ++i) { + Vector3d expected = v.row(i).transpose(); + assert(approx_mat(v3(res.v_W.row(i).transpose()), expected, 1e-9)); + } +} + +// --------------------- compute_L --------------------- + +// Identity inertia per part: L_i = R * I * R^T * w0 = w0 (independent of R). +void test_compute_L_identity_inertia() { + const int N = 3; + MatrixX9d Iflat(N, 9); + for (int i = 0; i < N; ++i) { + Iflat.row(i) << 1, 0, 0, 0, 1, 0, 0, 0, 1; + } + const Vector3d w0{0.4, -0.7, 1.1}; + for (Quaterniond q : + {Quaterniond::Identity(), + axis_angle(Vector3d::UnitZ(), 0.7), + axis_angle(Vector3d{1, -2, 3}.normalized(), 1.4)}) { + MatrixX3d L = compute_L(Iflat, q, w0); + for (int i = 0; i < N; ++i) { + assert(approx_mat(v3(L.row(i).transpose()), w0, 1e-12)); + } + } +} + +// Diagonal CC-frame inertia, identity orientation: L_i = I_i * w0. 
+void test_compute_L_diagonal_inertia_identity_pose() { + const int N = 2; + MatrixX9d Iflat(N, 9); + Iflat.row(0) << 2.0, 0.0, 0.0, 0.0, 3.0, 0.0, 0.0, 0.0, 5.0; + Iflat.row(1) << 7.0, 0.0, 0.0, 0.0, 11.0, 0.0, 0.0, 0.0, 13.0; + Quaterniond q = Quaterniond::Identity(); + Vector3d w0{1.0, 1.0, 1.0}; + MatrixX3d L = compute_L(Iflat, q, w0); + assert(approx_mat(v3(L.row(0).transpose()), Vector3d{2.0, 3.0, 5.0}, + 1e-12)); + assert(approx_mat(v3(L.row(1).transpose()), Vector3d{7.0, 11.0, 13.0}, + 1e-12)); +} + +// Rotated diagonal inertia: L_W = R * I_CC * R^T * w0. +void test_compute_L_rotated_inertia() { + const int N = 1; + MatrixX9d Iflat(N, 9); + Iflat.row(0) << 2.0, 0.0, 0.0, 0.0, 3.0, 0.0, 0.0, 0.0, 5.0; + Quaterniond q = axis_angle(Vector3d{0.3, -0.8, 0.5}.normalized(), 1.1); + Vector3d w0{0.7, -0.4, 1.3}; + MatrixX3d L = compute_L(Iflat, q, w0); + Matrix3d I_CC; + I_CC << 2, 0, 0, 0, 3, 0, 0, 0, 5; + Matrix3d R = q.toRotationMatrix(); + Vector3d expected = R * I_CC * R.transpose() * w0; + assert(approx_mat(v3(L.row(0).transpose()), expected, 1e-12)); +} + +// --------------------- main --------------------- + +int main() { + test_so3_log_identity(); + test_so3_log_axis_angle_round_trip(); + test_so3_log_negative_w_canonicalizes(); + test_so3_log_branch_continuity(); + + test_so3_jacobian_inv_at_zero(); + test_so3_jacobian_inv_inverts_jacobian(); + test_so3_jacobian_inv_branch_continuity(); + + test_compute_Pi_qm_equals_qp(); + test_compute_Pi_small_relative_rotation(); + + test_inv_sqrt_spd_identity(); + test_inv_sqrt_spd_diagonal(); + test_inv_sqrt_spd_general(); + test_inv_sqrt_spd_rejects_non_positive(); + + test_fit_se3_identity(); + test_fit_se3_pure_translation(); + test_fit_se3_pure_rotation(); + + test_fit_twist_zero(); + test_fit_twist_pure_translation(); + test_fit_twist_rigid_rotation_recovers_w(); + + test_compute_L_identity_inertia(); + test_compute_L_diagonal_inertia_identity_pose(); + test_compute_L_rotated_inertia(); + + std::cout << "All 
Breakage tests passed.\n"; + return 0; +}