From 18d85375d05e586e9e3b7603002d3e3362fbc139 Mon Sep 17 00:00:00 2001 From: Liet Blue Date: Tue, 21 Apr 2026 20:19:09 +0800 Subject: [PATCH] tooling: (POC) add CUDA implementation for breakage simulation with benchmarks and tests --- native/CMakeLists.txt | 1 + native/cuda/breakage_cuda/BENCHMARK.md | 150 ++ native/cuda/breakage_cuda/CMakeLists.txt | 136 ++ native/cuda/breakage_cuda/README.md | 224 +++ native/cuda/breakage_cuda/benchmark.html | 662 +++++++++ .../include/breakage_cuda/device.hpp | 229 +++ .../include/breakage_cuda/kernels.hpp | 126 ++ .../include/breakage_cuda/linalg.hpp | 124 ++ .../include/breakage_cuda/qp_solver.hpp | 131 ++ .../include/breakage_cuda/ref_cpu.hpp | 107 ++ .../include/breakage_cuda/system.hpp | 208 +++ native/cuda/breakage_cuda/src/kernels.cu | 817 +++++++++++ native/cuda/breakage_cuda/src/linalg.cu | 507 +++++++ native/cuda/breakage_cuda/src/qp_solver.cu | 1264 +++++++++++++++++ native/cuda/breakage_cuda/src/ref_cpu.cpp | 778 ++++++++++ native/cuda/breakage_cuda/src/system.cpp | 550 +++++++ native/cuda/breakage_cuda/tests/bench.cpp | 403 ++++++ .../tests/test_dense_kernels.cpp | 473 ++++++ .../tests/test_full_pipeline.cpp | 329 +++++ .../breakage_cuda/tests/test_qp_random.cpp | 283 ++++ native/modules/bricksim/physx/breakage.cppm | 28 +- native/tests/TestBreakage.cpp | 433 ++++++ 22 files changed, 7950 insertions(+), 13 deletions(-) create mode 100644 native/cuda/breakage_cuda/BENCHMARK.md create mode 100644 native/cuda/breakage_cuda/CMakeLists.txt create mode 100644 native/cuda/breakage_cuda/README.md create mode 100644 native/cuda/breakage_cuda/benchmark.html create mode 100644 native/cuda/breakage_cuda/include/breakage_cuda/device.hpp create mode 100644 native/cuda/breakage_cuda/include/breakage_cuda/kernels.hpp create mode 100644 native/cuda/breakage_cuda/include/breakage_cuda/linalg.hpp create mode 100644 native/cuda/breakage_cuda/include/breakage_cuda/qp_solver.hpp create mode 100644 
native/cuda/breakage_cuda/include/breakage_cuda/ref_cpu.hpp create mode 100644 native/cuda/breakage_cuda/include/breakage_cuda/system.hpp create mode 100644 native/cuda/breakage_cuda/src/kernels.cu create mode 100644 native/cuda/breakage_cuda/src/linalg.cu create mode 100644 native/cuda/breakage_cuda/src/qp_solver.cu create mode 100644 native/cuda/breakage_cuda/src/ref_cpu.cpp create mode 100644 native/cuda/breakage_cuda/src/system.cpp create mode 100644 native/cuda/breakage_cuda/tests/bench.cpp create mode 100644 native/cuda/breakage_cuda/tests/test_dense_kernels.cpp create mode 100644 native/cuda/breakage_cuda/tests/test_full_pipeline.cpp create mode 100644 native/cuda/breakage_cuda/tests/test_qp_random.cpp create mode 100644 native/tests/TestBreakage.cpp diff --git a/native/CMakeLists.txt b/native/CMakeLists.txt index 3383791..3e4b4ce 100644 --- a/native/CMakeLists.txt +++ b/native/CMakeLists.txt @@ -114,3 +114,4 @@ bricksim_add_test(TestConstraintScheduler tests/TestConstraintScheduler.cpp) bricksim_add_test(TestRandomRegularGraphSchedulingPolicy tests/TestRandomRegularGraphSchedulingPolicy.cpp) bricksim_add_test(TestPack2dMaxrect tests/TestPack2dMaxrect.cpp) bricksim_add_test(TestPhysicsGraphAssemblyClosure tests/TestPhysicsGraphAssemblyClosure.cpp) +bricksim_add_test(TestBreakage tests/TestBreakage.cpp) diff --git a/native/cuda/breakage_cuda/BENCHMARK.md b/native/cuda/breakage_cuda/BENCHMARK.md new file mode 100644 index 0000000..3269a51 --- /dev/null +++ b/native/cuda/breakage_cuda/BENCHMARK.md @@ -0,0 +1,150 @@ +# breakage_cuda benchmark report + +GPU pipeline (`cuda_qp::Pipeline::solve`) vs CPU baseline +(`ref_cpu::solve_full`, Eigen + OSQP `v1.0.0`) on a synthetic linear-chain +`HostSystem` of `N` parts. 
+ +- Hardware: NVIDIA RTX 4090, WSL2, CUDA Toolkit 12.0, gcc 11 +- Build type: `Release`, double precision throughout +- Per-step wall clock, `cudaDeviceSynchronize()` before reading the GPU + timer +- For `N >= 32,768` the CPU OSQP solve hits the 2,000,000-iteration cap and + reports "did not converge"; we keep its wall time as a lower bound and + flag the row with an asterisk. The GPU ADMM converged on every row. + +A live, interactive version of these charts is in +[`benchmark.html`](benchmark.html) (open with a browser; everything is +inlined, no network access required). + +## Cross-over summary + +| Threshold | First N where it holds | +|--------------------------------|-----------------------:| +| GPU faster than CPU (>= 1x) | between 4,096 and 8,192 | +| GPU >= 10x faster | between 8,192 and 16,384 | +| GPU >= 100x faster | between 32,768 and 65,536 | +| GPU >= 500x faster | by 65,536 | + +So **the GPU "completely overrules" the CPU at roughly N ~ 40,000 parts** +(crossing 100x), and continues to widen the gap from there. + +## Raw measurements + +`./breakage_cuda_bench N` with `BENCH_QUICK=1`, `OSQP_MAX_ITER=2000000`, +and `BENCH_TOLERATE_NONCONVERGED=1` for the largest two rows. + +| N | nx | GPU mean | CPU mean | speedup (CPU / GPU) | +|---------:|-------:|------------:|--------------:|--------------------:| +| 4 | 27 | 77.7 ms | 24.1 us | 0.00031x | +| 16 | 135 | 124.3 ms | 107.9 us | 0.00087x | +| 64 | 567 | 167.6 ms | 435.9 us | 0.00260x | +| 256 | 2,295 | 388.2 ms | 1.73 ms | 0.00446x | +| 512 | 4,599 | 339.2 ms | 3.43 ms | 0.01011x | +| 1,024 | 9,207 | 3.53 s | 16.63 ms | 0.00471x | +| 2,048 | 18,423 | 6.24 s | 62.00 ms | 0.00993x | +| 4,096 | 36,855 | 9.39 s | 480.64 ms | 0.05119x | +| 8,192 | 73,719 | 3.79 s | 5.00 s | 1.32x | +| 16,384 |147,447 | 3.45 s | 68.36 s | 19.79x | +| 32,768 |294,903 | 3.93 s | 254.70 s* | >= 64.81x* | +| 65,536 |589,815 | 4.46 s | 2,596.75 s* | >= 582.73x* | + +`nx` is `9 * (N - 1)` (decision variables in the QP). 
+`*` = CPU OSQP hit `max_iter = 2,000,000` without converging; the wall time +is therefore a lower bound, and the speedup is also a lower bound. + +## ASCII chart: log10 wall time (microseconds) vs log2(N) + +``` +log10(us) + 10 | * (CPU >= here) + 9 | * + 8 | * + 7 | * * * + 6 | G G G G G G G G G G G G G (GPU plateau, then ramps) + 5 | C C C C C C + 4 | C + 3 | C C C + 2 | C + 1 | + +-------------------------------------------------------- + 4 16 64 256 .. 1k 2k 4k 8k 16k 32k 64k N +``` + +Two regimes are visible: + +- **Small-N plateau (`N <= 1k`)**: GPU runtime is flat ~ 100-400 ms, + dominated by `cusolverSp` factorization and kernel-launch overhead. + CPU runtime is sub-millisecond. CPU wins by orders of magnitude. +- **Large-N ramp (`N >= 4k`)**: GPU runtime grows roughly linearly with + `N` (and even drops at `N = 8k` once the per-iteration arithmetic + starts to amortize the fixed startup); CPU OSQP grows faster than + linear and rapidly blows past the GPU. + +## Cross-over analysis + +| pair | GPU time growth | CPU time growth | speedup growth | +|------------------------|----------------:|----------------:|---------------:| +| 8,192 -> 16,384 | 0.91x | 13.7x | 15.0x | +| 16,384 -> 32,768 | 1.14x | >= 3.7x* | >= 3.27x* | +| 32,768 -> 65,536 | 1.13x | >= 10.2x* | >= 8.99x* | + +(`*` = CPU side bounded below.) + +So once the GPU is in its ramp regime, **doubling N multiplies the +GPU/CPU ratio by 3-15x in our favor**. Extrapolating: + +- N ~ 40,000 -> ~ 100x faster +- N ~ 65,000 -> ~ 580x faster (measured lower bound) +- N ~ 130,000 -> well beyond 1,000x faster (estimated; CPU run skipped) + +## Why so slow at small N? + +The GPU baseline pays three large fixed costs that the CPU baseline does +not: + +1. **Three Cholesky factorizations** on the KKT-like system per `solve`, + one per OSQP stage, via `cusolverSpDcsrlsvchol`. Each call carries a + host<->device sync, symbolic analysis, and numeric factorization. +2. 
**Per-iteration kernel launches** in the ADMM loop (~ 100 small + launches per stage). On the CPU, those reduce to function calls. +3. **No batching of dense per-part kernels** -- one `<<<>>>` per kernel + per part. This dominates when `N` is small. + +These add up to ~ 80 ms - 4 s of overhead independent of `N`, so the GPU +only "wakes up" once the per-iteration arithmetic crosses that floor. + +## Caveats + +- The synthetic linear chain is favorable to OSQP (well-conditioned, + banded structure). A real `BreakageDebugDump` from a sim step might + shift the crossover slightly, but the asymptotic behavior should hold. +- The CPU "did not converge" rows (N = 32,768, 65,536) are lower bounds. + The actual CPU time to reach the same precision the GPU achieves is + larger -- probably substantially so. +- Bench mode used `BENCH_QUICK=1` (`5 warmup + 2 measured` for GPU, + `1 warmup + 1 measured` for CPU). For `N <= 4,096` you should rerun + with the default counts (`5 + 50 / 5 + 20`) for tighter error bars; the + large-N rows already dominate any per-iteration noise. + +## Reproducing + +```bash +cmake -S native/cuda/breakage_cuda -B ~/breakage_cuda_build -DCMAKE_BUILD_TYPE=Release +cmake --build ~/breakage_cuda_build -j + +cd ~/breakage_cuda_build + +# Single point (e.g., the cross-over): +BENCH_QUICK=1 LD_LIBRARY_PATH=/usr/lib/wsl/lib ./breakage_cuda_bench 8192 + +# Large-N point with CPU lower bound: +BENCH_QUICK=1 BENCH_GPU_ITER=2 BENCH_CPU_ITER=1 \ + BENCH_TOLERATE_NONCONVERGED=1 OSQP_MAX_ITER=2000000 \ + LD_LIBRARY_PATH=/usr/lib/wsl/lib \ + ./breakage_cuda_bench 65536 +``` + +`BENCH_QUICK=1` drops warmup and measured iteration counts to the reduced +values listed under Caveats (`5 + 2` for GPU, `1 + 1` for CPU). +`BENCH_GPU_ITER` / `BENCH_CPU_ITER` override them outright. +`BENCH_TOLERATE_NONCONVERGED=1` keeps the wall time even if the solver +does not reach `eps_abs / eps_rel`; without it the bench aborts. 
diff --git a/native/cuda/breakage_cuda/CMakeLists.txt b/native/cuda/breakage_cuda/CMakeLists.txt new file mode 100644 index 0000000..38faf57 --- /dev/null +++ b/native/cuda/breakage_cuda/CMakeLists.txt @@ -0,0 +1,136 @@ +cmake_minimum_required(VERSION 3.24) + +project(breakage_cuda LANGUAGES CXX CUDA) + +set(CMAKE_CXX_STANDARD 20) +set(CMAKE_CXX_STANDARD_REQUIRED ON) +set(CMAKE_CXX_EXTENSIONS OFF) +set(CMAKE_CUDA_STANDARD 17) +set(CMAKE_CUDA_STANDARD_REQUIRED ON) +set(CMAKE_POSITION_INDEPENDENT_CODE ON) +set(CMAKE_EXPORT_COMPILE_COMMANDS ON) + +if(NOT CMAKE_BUILD_TYPE) + set(CMAKE_BUILD_TYPE Release) +endif() + +# We pin a sane default arch list; user can override with CMAKE_CUDA_ARCHITECTURES. +# CMake 3.18+ auto-initializes this to "52" when the variable was not user-set, +# so a `NOT DEFINED` guard is not enough to keep our default. We treat the CMake +# default of 52 as "no real choice was made" and replace it with our list. Users +# who want a specific arch can still pass -DCMAKE_CUDA_ARCHITECTURES=... to opt +# out of this override. 
+if(NOT CMAKE_CUDA_ARCHITECTURES OR CMAKE_CUDA_ARCHITECTURES STREQUAL "52") + set(CMAKE_CUDA_ARCHITECTURES "70;75;80;86;89;90") +endif() + +include(FetchContent) +set(FETCHCONTENT_QUIET FALSE) + +# --------------------------------------------------------------------------- +# Eigen (header-only) +# --------------------------------------------------------------------------- +FetchContent_Declare(eigen + GIT_REPOSITORY https://gitlab.com/libeigen/eigen.git + GIT_TAG 3.4.0 + GIT_SHALLOW TRUE +) +set(BUILD_TESTING OFF CACHE BOOL "" FORCE) +set(EIGEN_BUILD_DOC OFF CACHE BOOL "" FORCE) +set(EIGEN_BUILD_TESTING OFF CACHE BOOL "" FORCE) +FetchContent_MakeAvailable(eigen) + +# --------------------------------------------------------------------------- +# nlohmann_json +# --------------------------------------------------------------------------- +FetchContent_Declare(json + GIT_REPOSITORY https://github.com/nlohmann/json.git + GIT_TAG v3.11.3 + GIT_SHALLOW TRUE +) +set(JSON_BuildTests OFF CACHE BOOL "" FORCE) +FetchContent_MakeAvailable(json) + +# --------------------------------------------------------------------------- +# OSQP (CPU baseline). Pinned to v1.0.0 to match the API used by breakage.cppm. 
+# --------------------------------------------------------------------------- +FetchContent_Declare(osqp + GIT_REPOSITORY https://github.com/osqp/osqp.git + GIT_TAG v1.0.0 + GIT_SHALLOW TRUE +) +set(OSQP_BUILD_DEMO_EXE OFF CACHE BOOL "" FORCE) +set(OSQP_BUILD_UNITTESTS OFF CACHE BOOL "" FORCE) +set(OSQP_BUILD_SHARED_LIB OFF CACHE BOOL "" FORCE) +set(OSQP_USE_LONG OFF CACHE BOOL "" FORCE) +set(OSQP_USE_FLOAT OFF CACHE BOOL "" FORCE) +set(OSQP_ALGEBRA_BACKEND "builtin" CACHE STRING "" FORCE) +FetchContent_MakeAvailable(osqp) + +# --------------------------------------------------------------------------- +# CUDA toolkit components +# --------------------------------------------------------------------------- +find_package(CUDAToolkit REQUIRED) + +# --------------------------------------------------------------------------- +# Core library +# --------------------------------------------------------------------------- +add_library(breakage_cuda_core STATIC + src/system.cpp + src/ref_cpu.cpp + src/kernels.cu + src/linalg.cu + src/qp_solver.cu +) +target_include_directories(breakage_cuda_core PUBLIC + ${CMAKE_CURRENT_SOURCE_DIR}/include +) +target_link_libraries(breakage_cuda_core PUBLIC + Eigen3::Eigen + nlohmann_json::nlohmann_json + osqpstatic + CUDA::cudart + CUDA::cublas + CUDA::cusparse + CUDA::cusolver +) +set_target_properties(breakage_cuda_core PROPERTIES + CUDA_SEPARABLE_COMPILATION ON + CUDA_RESOLVE_DEVICE_SYMBOLS ON + CUDA_ARCHITECTURES "${CMAKE_CUDA_ARCHITECTURES}" +) + +# Suppress noisy diagnostics on Eigen + nvcc and treat real issues as warnings. 
+target_compile_options(breakage_cuda_core PRIVATE + $<$<COMPILE_LANGUAGE:CXX>:-Wall -Wextra -Wpedantic> + $<$<COMPILE_LANGUAGE:CUDA>:--expt-relaxed-constexpr --expt-extended-lambda -Xcompiler=-Wall> +) + +# --------------------------------------------------------------------------- +# Tests +# --------------------------------------------------------------------------- +enable_testing() + +# NOTE: On WSL2 the real CUDA driver libcuda.so.1 lives at /usr/lib/wsl/lib but +# is not on the default loader search path. Since libcudart resolves the driver +# via dlopen at runtime, RPATH/RUNPATH on the binary is NOT honored for that +# dlopen call. The portable workaround is to export +# LD_LIBRARY_PATH=/usr/lib/wsl/lib:$LD_LIBRARY_PATH +# before running the test/bench binaries. See README.md. +function(breakage_cuda_add_test TEST_NAME) + add_executable(${TEST_NAME} ${ARGN}) + target_link_libraries(${TEST_NAME} PRIVATE breakage_cuda_core) + add_test(NAME ${TEST_NAME} COMMAND ${TEST_NAME}) + if(UNIX AND NOT APPLE) + set_tests_properties(${TEST_NAME} PROPERTIES + ENVIRONMENT "LD_LIBRARY_PATH=/usr/lib/wsl/lib:$ENV{LD_LIBRARY_PATH}") + endif() +endfunction() + +breakage_cuda_add_test(test_dense_kernels tests/test_dense_kernels.cpp) +breakage_cuda_add_test(test_qp_random tests/test_qp_random.cpp) +breakage_cuda_add_test(test_full_pipeline tests/test_full_pipeline.cpp) + +# Bench is just an executable, not a test. +add_executable(breakage_cuda_bench tests/bench.cpp) +target_link_libraries(breakage_cuda_bench PRIVATE breakage_cuda_core) diff --git a/native/cuda/breakage_cuda/README.md b/native/cuda/breakage_cuda/README.md new file mode 100644 index 0000000..be07fe0 --- /dev/null +++ b/native/cuda/breakage_cuda/README.md @@ -0,0 +1,224 @@ +# breakage_cuda + +A standalone CUDA prototype of the per-frame hot path of +`bricksim::BreakageChecker::solve` (see +`native/modules/bricksim/physx/breakage.cppm`). + +This is a sub-project of BrickSim, but it does **not** link against any +`bricksim_modules` target. 
It builds and tests on its own so it can be +iterated on independently. + +## What is implemented + +End-to-end on the GPU, mirroring `BreakageChecker::solve`: + +1. `fit_se3` (weighted SE(3) Procrustes + 3x3 SVD) +2. `fit_twist` (weighted twist fit + 3x3 LDLT) +3. `compute_Pi`, `compute_L` +4. RHS `b` assembly +5. Three-stage QP solve (projection -> relaxation -> optimization) +6. Whitening reversal + per-clutch utilization + +The CPU baseline reuses Eigen + the OSQP C library (built via FetchContent at +`v1.0.0`) to keep the comparison apples-to-apples with the production code +path. + +## What is NOT implemented + +- `BreakageChecker::build_system` (geometry-driven, runs once per + reassembly; not on the per-step hot path). The CUDA layer consumes an + already-built system serialized as the JSON layout produced by + `bricksim::to_json(BreakageDebugDump)`. +- OSQP polish, infeasibility / unboundedness detection, scaling/Ruiz + equilibration, indirect (PCG) mode, multiple algebra backends. +- The full OSQP rho-update heuristic; we only implement + `rho_new = rho * sqrt(||r_prim|| / ||r_dual||)` and refactor when the + ratio crosses 5x. + +## Build + +Linux + CUDA Toolkit 12.x recommended (the parent project assumes Linux). +Windows users should build inside WSL2. + +```bash +cd native/cuda/breakage_cuda +cmake -S . -B build -DCMAKE_BUILD_TYPE=Release +cmake --build build -j +ctest --test-dir build --output-on-failure +``` + +The first configure pulls Eigen, nlohmann_json and OSQP via `FetchContent`. + +### WSL2 specifics + +Two non-obvious gotchas surfaced during initial bring-up: + +1. **`configure_file: Operation not permitted` when the build dir is on + `/mnt/c/`**. The Windows mount lacks the POSIX bits CMake needs for + `configure_file`. Put the build directory on the WSL native filesystem, + e.g.: + ```bash + cmake -S "/mnt/c/.../native/cuda/breakage_cuda" -B ~/breakage_cuda_build + cmake --build ~/breakage_cuda_build -j + ``` +2. 
**`no CUDA-capable device is detected` at runtime**, even though + `nvidia-smi` works. WSL2 ships the real `libcuda.so.1` at + `/usr/lib/wsl/lib/libcuda.so.1`, which is not on the default loader + search path. `libcudart` resolves the driver through `dlopen`, so + binary RPATH/RUNPATH does **not** help; you have to set + `LD_LIBRARY_PATH`. Two options: + - Run via `ctest` -- the CMake test definitions inject + `LD_LIBRARY_PATH=/usr/lib/wsl/lib:$LD_LIBRARY_PATH` automatically. + - Run a binary directly with the env var set: + ```bash + LD_LIBRARY_PATH=/usr/lib/wsl/lib:$LD_LIBRARY_PATH ./test_dense_kernels + ``` + +### Default CUDA architecture + +We pin `CMAKE_CUDA_ARCHITECTURES = "70;75;80;86;89;90"` (Volta to Hopper). +CMake's auto-default of `52` is replaced unless the user passes an explicit +`-DCMAKE_CUDA_ARCHITECTURES=...`. SM 6.0+ is required because we use +`atomicAdd(double*, double)` directly. + +## Generating a test fixture + +`tests/test_full_pipeline.cpp` consumes a `BreakageDebugDump` JSON. To +produce one from a real BrickSim run, set the debug-dump directory and flip +the `DebugDump` threshold on: + +```cpp +BreakageChecker checker; +checker.set_debug_dump_dir("/tmp/breakage_dumps"); +checker.thresholds.DebugDump = true; +// ... run a sim step ... +``` + +Then point the test at the produced JSON via the +`BREAKAGE_CUDA_FIXTURE` environment variable. + +If you do not have a real dump, the test falls back to a small synthetic +problem generated in-process (asserting the CUDA pipeline matches the CPU +baseline up to relative tolerance). 
+ +## Layout + +``` +include/breakage_cuda/ + system.hpp - Host POD types + JSON loaders + device.hpp - CUDA RAII helpers + DeviceSystem mirror + ref_cpu.hpp - CPU reference functions + kernels.hpp - Host-side launchers for dense kernels + qp_solver.hpp - CUDA ADMM solver + end-to-end pipeline +src/ + system.cpp - JSON loaders, CSC->CSR upload + ref_cpu.cpp - CPU reference implementation + kernels.cu - Dense kernels + linalg.cu - Shared device helpers (3x3 SVD/LDLT, reductions, SpMV glue) + qp_solver.cu - CUDA ADMM +tests/ + test_dense_kernels.cpp - per-kernel CPU vs CUDA tests + test_qp_random.cpp - random QP CPU OSQP vs CUDA ADMM + test_full_pipeline.cpp - JSON dump end-to-end + bench.cpp - N-sweep wall clock +``` + +## Verified test results (RTX 4090, WSL2, CUDA 12.0) + +`ctest --output-on-failure` passes 3/3: + +``` +1/3 Test #1: test_dense_kernels ............... Passed 0.27 sec +2/3 Test #2: test_qp_random ................... Passed 0.35 sec +3/3 Test #3: test_full_pipeline ............... Passed 0.36 sec +``` + +Per-test numerical agreement (CPU reference vs CUDA, max abs / rel error): + +| test | metric | value | tol | +|-------------------------------|----------------------------------------|-------------|------| +| test_dense_kernels::fit_se3 | max abs | 1.11e-16 | 1e-9 | +| test_dense_kernels::fit_twist | max abs | 5.55e-17 | 1e-9 | +| test_dense_kernels::compute_Pi| max abs | 1.67e-16 | 1e-12| +| test_dense_kernels::compute_L | max abs | 7.43e-16 | 1e-9 | +| test_dense_kernels::assemble_b| max abs | 2.84e-14 | 1e-9 | +| test_qp_random | rel inf-norm err on `x` | 6.13e-08 | 1e-2 | +| test_full_pipeline | rel inf-norm err on `x_unwhitened` | 5.48e-06 | 1e-2 | +| test_full_pipeline | rel inf-norm err on utilization | 7.32e-08 | 1e-2 | + +Dense kernels match the CPU reference to floating-point round-off. The +hand-written ADMM solver agrees with OSQP within ~1e-5 (ADMM iterates +independently in each backend, so bit-equality is not expected). 
+ +## Bench results (RTX 4090, WSL2, CUDA 12.0) + +Full numbers, charts, and analysis are in +[`BENCHMARK.md`](BENCHMARK.md) (markdown) and +[`benchmark.html`](benchmark.html) (self-contained interactive page, +double-click to open in a browser; no network access required). + +Headline numbers from a synthetic linear-chain `HostSystem` with +`Pipeline::solve` vs `ref_cpu::solve_full` (Eigen + OSQP): + +| N | GPU mean | CPU mean | speedup (CPU / GPU) | +|---------:|---------:|--------------:|--------------------:| +| 4 | 77.7 ms | 24.1 us | 0.0003x (CPU wins) | +| 1,024 | 3.53 s | 16.63 ms | 0.005x (CPU wins) | +| 4,096 | 9.39 s | 480.64 ms | 0.05x (CPU wins) | +| 8,192 | 3.79 s | 5.00 s | 1.32x (cross-over)| +| 16,384 | 3.45 s | 68.36 s | 19.8x | +| 32,768 | 3.93 s | >= 254.7 s | >= 64.8x | +| 65,536 | 4.46 s | >= 2,596.8 s | >= 583x | + +Cross-over thresholds: + +- **GPU >= 1x faster**: between N = 4,096 and 8,192 +- **GPU >= 10x faster**: between N = 8,192 and 16,384 +- **GPU >= 100x faster**: between N = 32,768 and 65,536 (~ N = 40,000) +- **GPU >= 500x faster**: by N = 65,536 + +The two largest rows have CPU "did not converge" within +`OSQP_MAX_ITER = 2,000,000`, so CPU wall times (and the speedup) are +lower bounds. The GPU side converged on every row. + +The GPU pipeline pays three large fixed costs that hurt at small N: + +- Three full `cusolverSpDcsrlsvchol` factorizations per `solve` (one per + stage). +- Per-iteration kernel launch overhead in the ADMM loop (~ 100 small + launches per stage). +- No batching of the dense per-part kernels (one `<<<>>>` per kernel + per part). + +These dominate runtime for `N <= 1,024`. Once `N >= 4,096`, the +per-iteration arithmetic crosses that fixed-cost floor and the GPU +runtime grows much more slowly than the CPU OSQP runtime, opening a +> 500x gap by N = 65,536. 
+ +## Caveats and known differences + +- Numerical convergence of the hand-written ADMM matches OSQP only up to + the configured `eps_abs / eps_rel` tolerances. Tests use relative + comparisons, not bit-exact matching. +- Stage 3 inequality `epsilon` is taken from `osqp.cppm` defaults (`1e-4`). + Override via `Settings`. +- The current direct KKT factorization uses `cusolverSp` Cholesky; this is + acceptable for small/medium problems but does not warm-update the + factorization across rho changes (we refactor instead). +- Double precision throughout. +- WSL2 driver loading: see "WSL2 specifics" above. +- `Eigen::Quaterniond` member init must use `=` instead of `{...}`; the + brace-init form trips a g++ parser bug visible from `nvcc`'s host + compile pass. + +## TODO + +- Replace the direct KKT factorization with PCG + diagonal preconditioner + to remove the dominant overhead (`cusolverSp` factorization is ~80% of + per-step time). +- Replace the hand-written ADMM with cuOSQP (when its CMake integration + matures) for closer numerical agreement with the CPU baseline. +- Batch dense per-part kernels (currently one `<<<>>>` per kernel). +- Optionally lift `build_system` to GPU for end-to-end reassembly speedups. +- Mixed-precision experiments (fp32 ADMM with double-precision residual + checks). diff --git a/native/cuda/breakage_cuda/benchmark.html b/native/cuda/breakage_cuda/benchmark.html new file mode 100644 index 0000000..e5505cd --- /dev/null +++ b/native/cuda/breakage_cuda/benchmark.html @@ -0,0 +1,662 @@ + + + + +breakage_cuda benchmark report + + + + +
+

breakage_cuda: GPU vs CPU pipeline benchmark

+

+ cuda_qp::Pipeline::solve vs + ref_cpu::solve_full (Eigen + OSQP v1.0.0) on a synthetic + linear-chain HostSystem of N parts. Hardware: + NVIDIA RTX 4090, WSL2, CUDA Toolkit 12.0. Per-step wall clock with + cudaDeviceSynchronize() before reading the GPU timer. +

+ +
+
+
~ 7,000
+
Cross-over N (GPU starts winning)
+
+
+
~ 40,000
+
N where GPU >= 100x faster
+
+
+
>= 583x
+
GPU faster @ N = 65,536 (CPU lower bound)
+
+
+
3,225x
+
GPU slower @ N = 4 (small-problem regime)
+
+
+ +
+
Wall-clock time per solve (log scale)
+
+ Each axis is logarithmic. The GPU curve is roughly flat at small N + (dominated by fixed startup), then ramps with N. The CPU curve grows + faster than the GPU and crosses it between N = 4,096 and N = 8,192. +
+ +
+ + CUDA Pipeline::solve + + CPU ref_cpu::solve_full (OSQP) + + CPU lower bound (did not converge) +
+
+ +
+
Speedup ratio (CPU time / GPU time)
+
+ Above the dashed line means the GPU is faster. The horizontal lines + mark 1x, 10x, 100x speedup thresholds. +
+ +
+ + log10(CPU / GPU) + + crossover at 1x +
+
+ +

Raw measurements

+ + + + + + + + +
NnxGPU meanCPU meanspeedup (CPU/GPU)
+

+ Rows highlighted in blue: GPU is faster. Rows annotated with * + indicate the CPU OSQP solve hit max_iter = 2,000,000 + without reaching eps_abs / eps_rel; CPU wall time is a + lower bound and the speedup is therefore also a lower bound. +

+ +

Why the GPU is so slow at small N

+

+ The GPU pipeline pays three large fixed costs that the CPU pipeline + does not: +

+
    +
  • Three cusolverSpDcsrlsvchol Cholesky factorizations + per solve (one per OSQP-style stage).
  • +
  • Per-iteration kernel launches in the ADMM loop (~100 small + launches per stage).
  • +
  • No batching of dense per-part kernels (one + <<<>>> per kernel per part).
  • +
+

+ Together these dominate the runtime when N is small. As N grows, the + per-iteration arithmetic crosses the fixed-startup floor and the GPU + pulls away from the CPU. +

+ +

Cross-over analysis

+ + + + + + + + + + +
pairGPU growthCPU growthspeedup growth
+ +

Reproducing

+
cmake -S native/cuda/breakage_cuda -B ~/breakage_cuda_build -DCMAKE_BUILD_TYPE=Release
+cmake --build ~/breakage_cuda_build -j
+
+cd ~/breakage_cuda_build
+
+# Single point (e.g., the cross-over):
+BENCH_QUICK=1 LD_LIBRARY_PATH=/usr/lib/wsl/lib \
+  ./breakage_cuda_bench 8192
+
+# Large-N point, CPU side as a lower bound:
+BENCH_QUICK=1 BENCH_GPU_ITER=2 BENCH_CPU_ITER=1 \
+  BENCH_TOLERATE_NONCONVERGED=1 OSQP_MAX_ITER=2000000 \
+  LD_LIBRARY_PATH=/usr/lib/wsl/lib \
+  ./breakage_cuda_bench 65536
+ +

+ See BENCHMARK.md for the same data without + any HTML/JS, and README.md for the project + overview. +

+
+ + + + diff --git a/native/cuda/breakage_cuda/include/breakage_cuda/device.hpp b/native/cuda/breakage_cuda/include/breakage_cuda/device.hpp new file mode 100644 index 0000000..449f4db --- /dev/null +++ b/native/cuda/breakage_cuda/include/breakage_cuda/device.hpp @@ -0,0 +1,229 @@ +// SPDX-License-Identifier: MIT +// +// Lightweight RAII helpers and a device-side mirror of HostSystem. Not a +// full Eigen-on-GPU layer; we just keep raw pointers + sizes and let kernels +// operate on them directly. +// +// All matrices are stored in CSR (row-major) on the device because cuSPARSE +// expects CSR. + +#pragma once + +#include "breakage_cuda/system.hpp" + +#include +#include + +#include +#include + +namespace breakage_cuda { + +// --------------------------------------------------------------------------- +// Error helpers. +// --------------------------------------------------------------------------- + +#define BREAKAGE_CUDA_CHECK(call) \ + do { \ + cudaError_t _err = (call); \ + if (_err != cudaSuccess) { \ + throw std::runtime_error(std::string("CUDA error at " __FILE__ ":") + \ + std::to_string(__LINE__) + ": " + \ + cudaGetErrorString(_err)); \ + } \ + } while (0) + +#define BREAKAGE_CUSPARSE_CHECK(call) \ + do { \ + cusparseStatus_t _s = (call); \ + if (_s != CUSPARSE_STATUS_SUCCESS) { \ + throw std::runtime_error(std::string("cuSPARSE error at " __FILE__ ":") +\ + std::to_string(__LINE__) + ": " + \ + cusparseGetErrorString(_s)); \ + } \ + } while (0) + +// --------------------------------------------------------------------------- +// RAII device buffer of arbitrary scalar. 
+// --------------------------------------------------------------------------- + +template class DeviceBuffer { +public: + DeviceBuffer() = default; + + explicit DeviceBuffer(std::size_t n) { resize(n); } + + ~DeviceBuffer() { free(); } + + DeviceBuffer(const DeviceBuffer &) = delete; + DeviceBuffer &operator=(const DeviceBuffer &) = delete; + + DeviceBuffer(DeviceBuffer &&o) noexcept : ptr_(o.ptr_), n_(o.n_) { + o.ptr_ = nullptr; + o.n_ = 0; + } + + DeviceBuffer &operator=(DeviceBuffer &&o) noexcept { + if (this != &o) { + free(); + ptr_ = o.ptr_; + n_ = o.n_; + o.ptr_ = nullptr; + o.n_ = 0; + } + return *this; + } + + void resize(std::size_t n) { + if (n == n_) + return; + free(); + if (n > 0) { + BREAKAGE_CUDA_CHECK(cudaMalloc(&ptr_, n * sizeof(T))); + n_ = n; + } + } + + void free() { + if (ptr_ != nullptr) { + cudaFree(ptr_); + ptr_ = nullptr; + n_ = 0; + } + } + + void copy_from_host(const T *host, std::size_t n) { + if (n != n_) + resize(n); + if (n_ == 0) + return; + BREAKAGE_CUDA_CHECK( + cudaMemcpy(ptr_, host, n_ * sizeof(T), cudaMemcpyHostToDevice)); + } + + void copy_to_host(T *host, std::size_t n) const { + if (n != n_) + throw std::invalid_argument("DeviceBuffer::copy_to_host size mismatch"); + if (n_ == 0) + return; + BREAKAGE_CUDA_CHECK( + cudaMemcpy(host, ptr_, n_ * sizeof(T), cudaMemcpyDeviceToHost)); + } + + void zero() { + if (n_ == 0) + return; + BREAKAGE_CUDA_CHECK(cudaMemset(ptr_, 0, n_ * sizeof(T))); + } + + T *data() { return ptr_; } + const T *data() const { return ptr_; } + std::size_t size() const { return n_; } + bool empty() const { return n_ == 0; } + +private: + T *ptr_{nullptr}; + std::size_t n_{0}; +}; + +// --------------------------------------------------------------------------- +// Device CSR matrix (owns indptr/indices/data buffers). 
+// --------------------------------------------------------------------------- + +struct DeviceCsr { + int rows{0}; + int cols{0}; + int nnz{0}; + DeviceBuffer rowptr; // (rows + 1) + DeviceBuffer colind; // (nnz) + DeviceBuffer values; // (nnz) + + // Lazy cuSPARSE descriptor (created on first use). + cusparseSpMatDescr_t descr{nullptr}; + + ~DeviceCsr() { + if (descr != nullptr) { + cusparseDestroySpMat(descr); + } + } + + DeviceCsr() = default; + DeviceCsr(const DeviceCsr &) = delete; + DeviceCsr &operator=(const DeviceCsr &) = delete; + DeviceCsr(DeviceCsr &&) = default; + DeviceCsr &operator=(DeviceCsr &&) = default; +}; + +// Uploads an Eigen CSC matrix as CSR on the device. Performs the +// CSC->CSR transpose on the host (it is small enough for the matrices we +// have here; KKT factorizations dominate the runtime budget). +void upload_csc_as_csr(const SparseMatrixCSC &src, DeviceCsr &dst); + +// --------------------------------------------------------------------------- +// Device system: GPU-resident view of HostSystem. +// +// All matrices are CSR. Dense per-part data is stored row-major. +// --------------------------------------------------------------------------- + +struct DeviceSystem { + int N{0}; // num_parts + int K{0}; // num_clutches + int ncv{0}; // num_contact_vertices + int nx{0}; // num_vars + int me{0}; // num_eq = 6 * N + int mi{0}; // num_ineq + int mh{0}; // num_relaxed_ineq + int ncap{0}; // capacity_clutch_indices.size() + + double total_mass{0.0}; + double L0{1.0}; + + DeviceBuffer mass; // (N,) + DeviceBuffer q_CC; // (N, 4) row-major xyzw + DeviceBuffer c_CC; // (N, 3) row-major + DeviceBuffer I_CC; // (N, 9) row-major flatten + + DeviceCsr Q; + DeviceCsr A; + DeviceCsr G; + DeviceCsr H; + DeviceCsr V; + + DeviceBuffer capacity_clutch_indices; // (ncap,) + DeviceBuffer capacities; // (ncap, 9) row-major + DeviceBuffer clutch_whiten; // (K, 4) row-major +}; + +// Uploads HostSystem to DeviceSystem. 
+void upload_system(const HostSystem &host, DeviceSystem &dev);
+
+// ---------------------------------------------------------------------------
+// Per-step input mirror.
+// ---------------------------------------------------------------------------
+
+struct DeviceInput {
+  double dt{};
+  DeviceBuffer<double> w; // (N, 3)
+  DeviceBuffer<double> v; // (N, 3)
+  DeviceBuffer<double> q; // (N, 4)
+  DeviceBuffer<double> c; // (N, 3)
+  DeviceBuffer<double> J; // (N, 3)
+  DeviceBuffer<double> H; // (N, 3)
+};
+
+void upload_input(const HostInput &host, DeviceInput &dev);
+
+// ---------------------------------------------------------------------------
+// Per-step state mirror (the part the dense kernels need).
+// ---------------------------------------------------------------------------
+
+struct DeviceState {
+  DeviceBuffer<double> q_W_CC_prev; // (4,) xyzw
+  DeviceBuffer<double> v_W_prev;    // (N, 3)
+  DeviceBuffer<double> L_prev;      // (N, 3)
+};
+
+void upload_state(const HostState &host, DeviceState &dev, int N);
+void download_state(const DeviceState &dev, HostState &host, int N);
+
+} // namespace breakage_cuda
diff --git a/native/cuda/breakage_cuda/include/breakage_cuda/kernels.hpp b/native/cuda/breakage_cuda/include/breakage_cuda/kernels.hpp
new file mode 100644
index 0000000..38fca43
--- /dev/null
+++ b/native/cuda/breakage_cuda/include/breakage_cuda/kernels.hpp
@@ -0,0 +1,126 @@
+// SPDX-License-Identifier: MIT
+//
+// Host-side launchers for the dense per-step CUDA kernels. Each one mirrors
+// a function in breakage.cppm; the bodies live in src/kernels.cu and
+// src/linalg.cu.
+
+#pragma once
+
+#include "breakage_cuda/device.hpp"
+
+namespace breakage_cuda::cuda_kernels {
+
+// ---------------------------------------------------------------------------
+// fit_se3 (breakage.cppm:58-88).
+// +// Inputs: +// q0, t0 -- reference orientations / COMs in CC frame (N x 4 / N x 3) +// qx, tx -- current orientations / COMs (N x 4 / N x 3) +// mass -- per-part mass (N,) +// total_mass, lambda_R +// +// Output (host scalars + 3x3 R + Vector3 t copied back): +// q_out -- world<-CC quaternion, xyzw (4,) +// t_out -- world<-CC translation (3,) +// +// Uses an internal scratch buffer for the partial 3x3 reductions. +// --------------------------------------------------------------------------- + +void fit_se3(int N, const double *q0_d, const double *t0_d, const double *qx_d, + const double *tx_d, const double *mass_d, double total_mass, + double lambda_R, double q_out[4], double t_out[3], + cudaStream_t stream = 0); + +// --------------------------------------------------------------------------- +// fit_twist (breakage.cppm:96-121). +// +// Inputs: +// w, v -- per-part world-frame angular / linear velocities (N x 3) +// c_CC -- COM positions in CC frame (N x 3) +// q_W_CC, t_W_CC -- world<-CC pose (xyzw quat + translation) +// mass -- (N,) +// total_mass, lambda_w +// +// Outputs (host): +// w0_out -- aggregate angular velocity (3,) +// v0_out -- aggregate linear velocity (3,) +// +// Outputs (device): +// v_W_d -- per-part world-frame linear velocity reconstructed from twist +// (N x 3 row-major). Caller pre-allocates. +// --------------------------------------------------------------------------- + +void fit_twist(int N, const double *w_d, const double *v_d, const double *c_CC_d, + const double q_W_CC[4], const double t_W_CC[3], + const double *mass_d, double total_mass, double lambda_w, + double w0_out[3], double v0_out[3], double *v_W_d, + cudaStream_t stream = 0); + +// --------------------------------------------------------------------------- +// compute_Pi (breakage.cppm:166-170). 3x3 output, host-side scalar work. 
+// --------------------------------------------------------------------------- + +void compute_Pi_host(const double qm[4], const double qp[4], double Pi_out[9]); + +// --------------------------------------------------------------------------- +// compute_L (breakage.cppm:173-180). One row per part on the GPU. +// +// Inputs: +// I_CC_d -- (N, 9) row-major +// q_W_CC -- (4,) xyzw +// w0 -- (3,) +// +// Output: +// L_d -- (N, 3) row-major. Caller pre-allocates. +// --------------------------------------------------------------------------- + +void compute_L(int N, const double *I_CC_d, const double q_W_CC[4], + const double w0[3], double *L_d, cudaStream_t stream = 0); + +// --------------------------------------------------------------------------- +// assemble_b (breakage.cppm:1018-1035). One row (6 doubles) per part. +// +// Inputs (device): +// v_W_curr_d, v_W_prev_d, L_curr_d, L_prev_d -- (N, 3) +// J_d, H_d -- (N, 3) +// mass_d -- (N,) +// +// Inputs (host): +// Pi -- 3x3 row-major +// dt, L0 +// +// Output (device): +// b_d -- (6 * N,) row-major-as-(N, 6). Caller pre-allocates. +// --------------------------------------------------------------------------- + +void assemble_b(int N, const double *v_W_curr_d, const double *v_W_prev_d, + const double *L_curr_d, const double *L_prev_d, + const double *J_d, const double *H_d, const double *mass_d, + const double Pi[9], double dt, double L0, double *b_d, + cudaStream_t stream = 0); + +// --------------------------------------------------------------------------- +// Postprocess: undo Wk whitening on the optimization solution and compute +// per-clutch utilization (breakage.cppm:1061-1097). 
+//
+// Inputs (device):
+//   x_d -- (num_vars,) the OSQP optimization solution (whitened)
+//   capacity_clutch_indices_d -- (ncap,) int
+//   capacities_d -- (ncap, 9) row-major
+//   clutch_whiten_d -- (K, 4) row-major flatten of 2x2
+//
+// Inputs (host):
+//   ncv, K
+//
+// Outputs (device):
+//   x_unwhiten_d -- (num_vars,) the un-whitened x; caller pre-allocates.
+//   utilization_d -- (K,) initialized to -1.0 here; caller pre-allocates.
+// ---------------------------------------------------------------------------
+
+void postprocess(int ncv, int K, int ncap, const double *x_d,
+                 const int *capacity_clutch_indices_d,
+                 const double *capacities_d, const double *clutch_whiten_d,
+                 double *x_unwhiten_d, double *utilization_d,
+                 cudaStream_t stream = 0);
+
+} // namespace breakage_cuda::cuda_kernels
diff --git a/native/cuda/breakage_cuda/include/breakage_cuda/linalg.hpp b/native/cuda/breakage_cuda/include/breakage_cuda/linalg.hpp
new file mode 100644
index 0000000..93ffd34
--- /dev/null
+++ b/native/cuda/breakage_cuda/include/breakage_cuda/linalg.hpp
@@ -0,0 +1,124 @@
+// SPDX-License-Identifier: MIT
+//
+// Shared CUDA linear-algebra helpers used by both kernels.cu and
+// qp_solver.cu. The implementations live in src/linalg.cu.
+
+#pragma once
+
+#include "breakage_cuda/device.hpp"
+
+#include <cublas_v2.h>
+#include <cusolverDn.h>
+#include <cusolverSp.h>
+#include <cusparse.h>
+
+namespace breakage_cuda::cuda_la {
+
+// ---------------------------------------------------------------------------
+// cuBLAS / cuSPARSE / cuSOLVER handle pool. One per process.
+// --------------------------------------------------------------------------- + +class Handles { +public: + static Handles &instance(); + + cublasHandle_t blas() const { return blas_; } + cusparseHandle_t sparse() const { return sparse_; } + cusolverDnHandle_t solver_dn() const { return solver_dn_; } + cusolverSpHandle_t solver_sp() const { return solver_sp_; } + +private: + Handles(); + ~Handles(); + Handles(const Handles &) = delete; + Handles &operator=(const Handles &) = delete; + + cublasHandle_t blas_{nullptr}; + cusparseHandle_t sparse_{nullptr}; + cusolverDnHandle_t solver_dn_{nullptr}; + cusolverSpHandle_t solver_sp_{nullptr}; +}; + +// --------------------------------------------------------------------------- +// Reductions (single-block, used for small per-step inner products). +// --------------------------------------------------------------------------- + +// Returns sum(x .* y). Uses cuBLAS dot (handle from Handles). +double dot(int n, const double *x_d, const double *y_d, cudaStream_t stream = 0); + +// Returns ||x||_2. +double nrm2(int n, const double *x_d, cudaStream_t stream = 0); + +// Returns ||x||_inf. +double inf_norm(int n, const double *x_d, cudaStream_t stream = 0); + +// y <- alpha * x + y +void axpy(int n, double alpha, const double *x_d, double *y_d, + cudaStream_t stream = 0); + +// y <- alpha * x +void scal_copy(int n, double alpha, const double *x_d, double *y_d, + cudaStream_t stream = 0); + +// x <- alpha * x +void scal(int n, double alpha, double *x_d, cudaStream_t stream = 0); + +// x <- value (broadcast). +void fill(int n, double value, double *x_d, cudaStream_t stream = 0); + +// dst <- src (n elements) +void copy(int n, const double *src_d, double *dst_d, cudaStream_t stream = 0); + +// --------------------------------------------------------------------------- +// SpMV: y <- alpha * op(A) * x + beta * y, with cuSPARSE generic API. +// +// `transpose` selects op(A): false -> A, true -> A^T. 
+// Caller pre-allocates buffer_d (size buffer_bytes_for_spmv) once. +// --------------------------------------------------------------------------- + +std::size_t spmv_buffer_size(const DeviceCsr &A, bool transpose); + +void spmv(const DeviceCsr &A, bool transpose, double alpha, const double *x_d, + double beta, double *y_d, void *buffer_d, std::size_t buffer_bytes, + cudaStream_t stream = 0); + +// --------------------------------------------------------------------------- +// Element-wise box projection: out[i] = clamp(in[i], l[i], u[i]). +// --------------------------------------------------------------------------- + +void clamp(int n, const double *in_d, const double *l_d, const double *u_d, + double *out_d, cudaStream_t stream = 0); + +// in-place: x[i] = clamp(x[i], l[i], u[i]) +void clamp_inplace(int n, const double *l_d, const double *u_d, double *x_d, + cudaStream_t stream = 0); + +// --------------------------------------------------------------------------- +// Dense 3x3 helpers exposed for tests. +// +// Each one takes a 3x3 row-major matrix as 9 doubles. Implemented on host. +// --------------------------------------------------------------------------- + +// Computes R, t from the standard SVD-based Procrustes formula. Inputs are +// the centered matrices and the K matrix (cf. breakage.cppm fit_se3). +void svd_procrustes_3x3(const double S[9], double R[9]); + +// 3x3 SPD LDLT solve: A * x = b, where A is SPD 3x3. +void spd_solve_3x3(const double A[9], const double b[3], double x[3]); + +// 2x2 SPD inverse-sqrt (for Wk = inv_sqrt_spd(Ck)). Both row-major (4 doubles). +void inv_sqrt_spd_2x2(const double C[4], double W[4]); + +// --------------------------------------------------------------------------- +// Quaternion <-> rotation matrix helpers (xyzw layout, row-major matrix). 
+// ---------------------------------------------------------------------------
+
+void quat_xyzw_to_R(const double q[4], double R[9]);
+void R_to_quat_xyzw(const double R[9], double q[4]);
+void quat_xyzw_normalize(double q[4]);
+// Returns r such that q_a * q_b = r (Hamilton product, xyzw).
+void quat_xyzw_mul(const double a[4], const double b[4], double r[4]);
+// Returns the conjugate (-x, -y, -z, w).
+void quat_xyzw_conjugate(const double q[4], double r[4]);
+
+} // namespace breakage_cuda::cuda_la
diff --git a/native/cuda/breakage_cuda/include/breakage_cuda/qp_solver.hpp b/native/cuda/breakage_cuda/include/breakage_cuda/qp_solver.hpp
new file mode 100644
index 0000000..b6278e7
--- /dev/null
+++ b/native/cuda/breakage_cuda/include/breakage_cuda/qp_solver.hpp
@@ -0,0 +1,131 @@
+// SPDX-License-Identifier: MIT
+//
+// Hand-written CUDA ADMM solver, structurally aligned with the three-stage
+// OSQP wrapper in osqp.cppm (projection -> relaxation -> optimization).
+//
+// We do NOT replicate every OSQP feature (no polish, no infeasibility
+// detection, no scaling/equilibration). The aim is to produce solutions that
+// are close enough to OSQP for the same problem so that the breakage system
+// behaves consistently.
+//
+// One "stage" corresponds to one ADMM run with its own KKT factorization.
+
+#pragma once
+
+#include "breakage_cuda/device.hpp"
+#include "breakage_cuda/system.hpp"
+
+#include <memory>
+
+namespace breakage_cuda::cuda_qp {
+
+// ---------------------------------------------------------------------------
+// Per-stage parameters (mirror OSQP defaults).
+// ---------------------------------------------------------------------------
+
+struct Settings {
+  double eps_abs{1e-3};
+  double eps_rel{1e-3};
+  double rho_init{0.1};
+  double sigma{1e-6};
+  double alpha{1.6};
+  int max_iter{4000};
+  int check_every{25};
+  // Adaptive-rho settings (cf. OSQP's adaptive_rho heuristic).
+ bool adaptive_rho{true}; + double adaptive_rho_factor{5.0}; + // Misc + double infinity{1e30}; +}; + +// --------------------------------------------------------------------------- +// Persistent device-resident state (warm starts, factorizations). +// +// One Stage owns the matrices, KKT factorization, and warm-start buffers for +// a single ADMM run. The Solver owns three of these (prj/rlx/opt). +// +// Stage matrices are constant across solves with the same HostSystem; only +// l/u/q vectors and the warm-start state change between calls. +// --------------------------------------------------------------------------- + +class Stage; // pimpl + +class Solver { +public: + Solver(); + ~Solver(); + + Solver(const Solver &) = delete; + Solver &operator=(const Solver &) = delete; + Solver(Solver &&) noexcept; + Solver &operator=(Solver &&) noexcept; + + // Build the three stages from a (CPU) HostSystem. The DeviceSystem must + // already be uploaded; we only need its CSR matrices for SpMV bookkeeping. + void setup(const HostSystem &host, const DeviceSystem &dev, + const Settings &settings); + + // Run all three stages on RHS b (host vector, length 6 * num_parts). + // + // Outputs: + // x_d -- (num_vars,) device, optimization solution (still whitened) + // slack_d -- (num_eq,) device, b - A * x_prj + // v_relax_d -- (num_clutches,) device, relaxation slack v + SolveInfo solve(const VectorXd &b, double *x_d, double *slack_d, + double *v_relax_d, cudaStream_t stream = 0); + + // Drop the warm-start state; next solve starts cold. + void reset_warm_start(); + + // Total dims (forwarded from HostSystem). 
+  int num_vars() const;
+  int num_eq() const;
+  int num_clutches() const;
+
+private:
+  std::unique_ptr<Stage> prj_;
+  std::unique_ptr<Stage> rlx_;
+  std::unique_ptr<Stage> opt_;
+  Settings settings_{};
+  int nx_{0};
+  int me_{0};
+  int mi_{0};
+  int mh_{0};
+  int nv_{0};
+};
+
+// ---------------------------------------------------------------------------
+// End-to-end driver mirroring breakage.cppm BreakageChecker::solve, but
+// running everything (dense kernels + QP + postprocess) on the GPU.
+//
+// This is the entry point benchmarks and tests call.
+// ---------------------------------------------------------------------------
+
+class Pipeline {
+public:
+  Pipeline();
+  ~Pipeline();
+
+  void setup(const HostSystem &host, const Thresholds &thr,
+             const Settings &settings = {});
+
+  Solution solve(const HostInput &in, HostState &state,
+                 cudaStream_t stream = 0);
+
+private:
+  HostSystem host_;
+  Thresholds thr_;
+  DeviceSystem dev_;
+  Solver qp_;
+  // Scratch buffers
+  DeviceBuffer<double> v_W_curr_;
+  DeviceBuffer<double> L_curr_;
+  DeviceBuffer<double> b_d_;
+  DeviceBuffer<double> x_d_;
+  DeviceBuffer<double> x_unwhiten_d_;
+  DeviceBuffer<double> slack_d_;
+  DeviceBuffer<double> v_relax_d_;
+  DeviceBuffer<double> utilization_d_;
+};
+
+} // namespace breakage_cuda::cuda_qp
diff --git a/native/cuda/breakage_cuda/include/breakage_cuda/ref_cpu.hpp b/native/cuda/breakage_cuda/include/breakage_cuda/ref_cpu.hpp
new file mode 100644
index 0000000..6d33bf2
--- /dev/null
+++ b/native/cuda/breakage_cuda/include/breakage_cuda/ref_cpu.hpp
@@ -0,0 +1,107 @@
+// SPDX-License-Identifier: MIT
+//
+// CPU reference implementation. Mirrors the per-step code paths of
+// bricksim::BreakageChecker::solve in breakage.cppm:
+//
+// 1. fit_se3 / fit_twist (per-part dense ops)
+// 2. compute_Pi, compute_L
+// 3. RHS b assembly
+// 4. Three-stage OSQP solve (projection / relaxation / optimization)
+// 5. Whitening reversal + utilization
+//
+// Used both as a baseline for the CUDA implementation and to score the random
+// QP and full-pipeline tests.
+
+#pragma once
+
+#include "breakage_cuda/system.hpp"
+
+#include <Eigen/Geometry>
+
+#include <memory>
+
+namespace breakage_cuda::ref_cpu {
+
+// ---------------------------------------------------------------------------
+// Compile-time switches mirroring breakage.cppm. They are hard-coded to true
+// because BreakageChecker uses them as such; we expose them here so that
+// tests can opt out if needed.
+// ---------------------------------------------------------------------------
+
+constexpr bool kEnableTorqueScaling = true;
+constexpr bool kEnableClutchWhitening = true;
+constexpr bool kEnableClutchTangentialFriction = true;
+constexpr bool kEnableRealisticClutchFriction = true;
+
+// ---------------------------------------------------------------------------
+// Pure dense kernels (no QP). Each one matches a function in breakage.cppm.
+// ---------------------------------------------------------------------------
+
+struct Transformd {
+  Eigen::Quaterniond q;
+  Vector3d t;
+};
+
+Transformd fit_se3(const MatrixX4d &q0, const MatrixX3d &t0,
+                   const MatrixX4d &qx, const MatrixX3d &tx,
+                   const VectorXd &mass, double total_mass, double lambda_R);
+
+struct TwistFitResult {
+  Vector3d w0;
+  Vector3d v0;
+  MatrixX3d v_W; // (num_parts, 3)
+};
+
+TwistFitResult fit_twist(const MatrixX3d &w, const MatrixX3d &v,
+                         const MatrixX3d &c_CC, const Transformd &T_W_CC,
+                         const VectorXd &mass, double total_mass,
+                         double lambda_w);
+
+Vector3d so3_log_from_unit_quat(Eigen::Quaterniond q);
+
+Matrix3d so3_jacobian_inv(const Vector3d &phi);
+
+Matrix3d compute_Pi(const Eigen::Quaterniond &qm,
+                    const Eigen::Quaterniond &qp);
+
+// I_flat: row-major (N, 9), each row is row-major flatten of 3x3.
+MatrixX3d compute_L(
+    const Eigen::Matrix<double, Dynamic, 9, RowMajor> &I_flat,
+    const Eigen::Quaterniond &q_W_CC, const Vector3d &w0);
+
+// Builds the RHS b vector (size 6 * num_parts) per breakage.cppm:1018-1035.
+// +// `Pi` is from compute_Pi; `v_W_curr`, `v_W_prev`, `L_curr`, `L_prev` are +// (num_parts, 3); J/H are external impulses (num_parts, 3). +// +// Output: b is laid out as 6 floats per part (linear x 3, angular x 3) in +// row-major order so it matches breakage.cppm's `b_mat` Map. +VectorXd assemble_b(const HostSystem &sys, const HostInput &in, + const MatrixX3d &v_W_curr, const MatrixX3d &v_W_prev, + const MatrixX3d &L_curr, const MatrixX3d &L_prev, + const Matrix3d &Pi); + +// --------------------------------------------------------------------------- +// Three-stage QP solver mirroring osqp.cppm. +// +// On first call (state.qp == nullptr or InternalQpState empty), the solver +// is built from sys; subsequent calls warm-start from the previous solution. +// --------------------------------------------------------------------------- + +SolveInfo solve_qp(const HostSystem &sys, const VectorXd &b, HostState &state, + VectorXd &x_out, VectorXd &slack_out, VectorXd &v_relax_out); + +// --------------------------------------------------------------------------- +// End-to-end driver. Runs steps (1)-(5) above and returns the same Solution +// you would get from BreakageChecker::solve in CPU-only mode. +// --------------------------------------------------------------------------- + +Solution solve_full(const HostSystem &sys, const HostInput &in, + HostState &state, const Thresholds &thr); + +// Convenience: computes b and returns it (used by tests). 
+VectorXd compute_b(const HostSystem &sys, const HostInput &in, HostState &state,
+                   Eigen::Quaterniond *q_W_CC_out = nullptr,
+                   MatrixX3d *v_W_out = nullptr, MatrixX3d *L_out = nullptr);
+
+} // namespace breakage_cuda::ref_cpu
diff --git a/native/cuda/breakage_cuda/include/breakage_cuda/system.hpp b/native/cuda/breakage_cuda/include/breakage_cuda/system.hpp
new file mode 100644
index 0000000..6f78146
--- /dev/null
+++ b/native/cuda/breakage_cuda/include/breakage_cuda/system.hpp
@@ -0,0 +1,208 @@
+// SPDX-License-Identifier: MIT
+//
+// Plain-C++ mirror of the bricksim::Breakage* types, sufficient to drive the
+// per-step hot path on either CPU (Eigen + OSQP) or CUDA. The JSON layout is
+// the one produced by bricksim::to_json(BreakageDebugDump) in
+// modules/bricksim/physx/breakage.cppm.
+//
+// We intentionally do NOT depend on any bricksim C++26 module: the goal of
+// this subproject is to be a self-contained CUDA prototype.
+
+#pragma once
+
+#include <Eigen/Dense>
+#include <Eigen/Sparse>
+
+#include <nlohmann/json.hpp>
+
+#include <memory>
+#include <optional>
+#include <string>
+#include <vector>
+
+namespace breakage_cuda {
+
+// ---------------------------------------------------------------------------
+// Aliases (match the layout used by breakage.cppm).
+// ---------------------------------------------------------------------------
+
+using Eigen::Dynamic;
+using Eigen::RowMajor;
+
+using Vector3d = Eigen::Vector3d;
+using Vector4d = Eigen::Vector4d;
+using VectorXd = Eigen::VectorXd;
+
+using Matrix2d = Eigen::Matrix2d;
+using Matrix3d = Eigen::Matrix3d;
+
+using MatrixX3d = Eigen::Matrix<double, Dynamic, 3, RowMajor>;
+using MatrixX4d = Eigen::Matrix<double, Dynamic, 4, RowMajor>;
+
+using SparseMatrixCSC = Eigen::SparseMatrix<double>;
+using SparseMatrixCSR = Eigen::SparseMatrix<double, RowMajor>;
+
+// ---------------------------------------------------------------------------
+// Thresholds (mirror BreakageThresholds).
+// ---------------------------------------------------------------------------
+
+struct Thresholds {
+  bool enabled{true};
+  double contact_regularization{0.1};
+  double clutch_axial_compliance{1.0};
+  double clutch_radial_compliance{1.0};
+  double clutch_tangential_compliance{1.0};
+  double friction_coefficient{0.2};
+  double preloaded_force{3.5};
+  double slack_fraction_warn{0.1};
+  double slack_fraction_b_floor{1e-9};
+  bool debug_dump{false};
+  double breakage_cooldown_time{0.05};
+};
+
+// ---------------------------------------------------------------------------
+// Host-side, non-CUDA representation of a problem.
+//
+// All matrices keep the original (column-major / CSC) layout from breakage
+// so that we can hand them to OSQP unchanged. The CUDA layer converts to its
+// own representation in src/system.cpp.
+// ---------------------------------------------------------------------------
+
+struct HostSystem {
+  // Shapes (mirror BreakageSystem::check_shape).
+  int num_parts{};
+  int num_clutches{};
+  int num_contact_vertices{};
+  int num_vars{};
+  int num_eq{};
+  int num_ineq{};
+  int num_relaxed_ineq{};
+
+  double total_mass{};
+  double L0{};
+
+  // Per-part dense data.
+  VectorXd mass;  // (num_parts,)
+  MatrixX4d q_CC; // (num_parts, 4) xyzw
+  MatrixX3d c_CC; // (num_parts, 3) m
+  // I_CC stored as (num_parts, 9) row-major flattening of 3x3 inertia tensors.
+  Eigen::Matrix<double, Dynamic, 9, RowMajor> I_CC;
+
+  // Sparse matrices (CSC, owned, compressed).
+  SparseMatrixCSC Q; // (num_vars, num_vars)
+  SparseMatrixCSC A; // (6*num_parts, num_vars)
+  SparseMatrixCSC G; // (num_ineq, num_vars)
+  SparseMatrixCSC H; // (num_relaxed_ineq, num_vars)
+  SparseMatrixCSC V; // (num_relaxed_ineq, num_clutches)
+
+  // Per-clutch friction-cone metadata. Indices into clutches_; same length as
+  // capacities.
+  std::vector<int> capacity_clutch_indices;
+  // (capacity_clutch_indices.size(), 9) row-major flattening of Vector9d.
+  Eigen::Matrix<double, Dynamic, 9, RowMajor> capacities;
+  // (num_clutches, 4) row-major flattening of 2x2 whitening matrices.
+  Eigen::Matrix<double, Dynamic, 4, RowMajor> clutch_whiten;
+};
+
+// ---------------------------------------------------------------------------
+// Per-frame input (mirror BreakageInput / BreakageInitialInput).
+// ---------------------------------------------------------------------------
+
+struct HostInput {
+  // Initial-input fields.
+  MatrixX3d w; // angular velocity (num_parts, 3)
+  MatrixX3d v; // linear velocity (num_parts, 3)
+  MatrixX4d q; // orientation xyzw (num_parts, 4)
+  MatrixX3d c; // COM position (num_parts, 3)
+
+  // Step-only fields.
+  double dt{};
+  MatrixX3d J; // external linear impulse (num_parts, 3)
+  MatrixX3d H; // external angular imp. (num_parts, 3)
+};
+
+// ---------------------------------------------------------------------------
+// Per-frame solver state across steps (mirror BreakageState + OsqpState).
+//
+// The QP warm-start payload is opaque to the system layer; both the CPU and
+// CUDA solvers cache their own state behind the InternalQpState pointer.
+// ---------------------------------------------------------------------------
+
+struct InternalQpState; // forward declaration; defined by each solver backend.
+using InternalQpStatePtr = std::shared_ptr<InternalQpState>;
+
+struct HostState {
+  Eigen::Quaterniond q_W_CC_prev = Eigen::Quaterniond::Identity();
+  MatrixX3d v_W_prev; // (num_parts, 3)
+  MatrixX3d L_prev;   // (num_parts, 3)
+
+  // CPU baseline keeps three OSQP solvers + their warm-start vectors here.
+  // CUDA backend keeps device buffers here. Either may be empty.
+  InternalQpStatePtr qp;
+};
+
+// ---------------------------------------------------------------------------
+// Per-stage info from a single solve.
+// ---------------------------------------------------------------------------
+
+struct StageInfo {
+  bool converged{false};
+  int iter{0};
+  double prim_res{std::numeric_limits<double>::quiet_NaN()};
+  double dual_res{std::numeric_limits<double>::quiet_NaN()};
+  double rho{0.0};
+  double solve_time_s{0.0};
+};
+
+struct SolveInfo {
+  bool converged{false};
+  StageInfo prj;
+  StageInfo rlx;
+  StageInfo opt;
+};
+
+// ---------------------------------------------------------------------------
+// Result of a single solve.
+// ---------------------------------------------------------------------------
+
+struct Solution {
+  VectorXd x;           // (num_vars,) un-whitened (matches breakage.cppm output)
+  VectorXd utilization; // (num_clutches,) per-clutch friction utilization
+  double slack_fraction{0.0};
+  SolveInfo info;
+};
+
+// ---------------------------------------------------------------------------
+// JSON loaders.
+//
+// Layout: identical to BreakageDebugDump JSON (see breakage.cppm to_json).
+// ---------------------------------------------------------------------------
+
+void from_json(const nlohmann::json &j, Thresholds &out);
+void from_json(const nlohmann::json &j, HostSystem &out);
+void from_json(const nlohmann::json &j, HostInput &out);
+void from_json(const nlohmann::json &j, HostState &out);
+
+struct DebugDump {
+  Thresholds thresholds;
+  HostSystem system;
+  HostInput input;
+  HostState state;
+  Solution solution;
+  VectorXd b;
+  std::optional<HostState> prev_state;
+};
+
+void from_json(const nlohmann::json &j, DebugDump &out);
+
+DebugDump load_debug_dump(const std::string &path);
+
+// ---------------------------------------------------------------------------
+// Cross-validation helpers (used in tests).
+// ---------------------------------------------------------------------------
+
+// Compares two solutions' x within a relative tolerance (||a-b||_inf / max(||b||_inf, floor)).
+double relative_inf_error(const VectorXd &a, const VectorXd &b,
+                          double floor = 1e-12);
+
+} // namespace breakage_cuda
diff --git a/native/cuda/breakage_cuda/src/kernels.cu b/native/cuda/breakage_cuda/src/kernels.cu
new file mode 100644
index 0000000..e1fed70
--- /dev/null
+++ b/native/cuda/breakage_cuda/src/kernels.cu
@@ -0,0 +1,817 @@
+// SPDX-License-Identifier: MIT
+//
+// Implementation of breakage_cuda/kernels.hpp:
+//
+// * fit_se3 -- mass-weighted SE3 Procrustes (GPU reductions + host SVD)
+// * fit_twist -- mass-weighted twist fit (GPU reductions + host LDLT)
+// * compute_Pi -- pure-host 3x3 (small, scalar work)
+// * compute_L -- per-part inertia rotation (one thread per part)
+// * assemble_b -- per-part RHS row assembly (one thread per part)
+// * postprocess -- un-whiten optimization x and compute clutch utilization
+//
+// Strategy notes:
+// - Per-part data ((N,K) row-major) is small enough that we keep the kernels
+//   simple: one thread per part with a block-stride loop, plus a single
+//   shared-memory sum reduction inside each block.
+// - Reductions land in tiny global accumulators (<= 18 doubles) via per-block
+//   atomicAdd; we then memcpy them back to host for the small dense post-
+//   processing (SVD, LDLT) where Eigen on the host is plenty fast.
+// - All numerics in double; formulas mirror breakage.cppm verbatim.
+
+#include "breakage_cuda/kernels.hpp"
+#include "breakage_cuda/device.hpp"
+#include "breakage_cuda/linalg.hpp"
+
+#include <cuda_runtime.h>
+
+#include <algorithm>
+#include <cmath>
+#include <cstring>
+
+namespace breakage_cuda::cuda_kernels {
+
+namespace {
+
+// ---------------------------------------------------------------------------
+// Plain-old-data wrappers used as kernel by-value parameters.
+// Mat3 is row-major.
+// ---------------------------------------------------------------------------
+
+struct Mat3 {
+  double m[9];
+};
+struct Vec3 {
+  double v[3];
+};
+
+// All reduction kernels assume blockDim.x == kBlock.
The host launchers +// always launch with this exact block size. +constexpr int kBlock = 256; + +// Cap the launch grid: reductions all use a block-stride loop so any value +// works, but a smaller grid lowers atomic-add contention on the tiny +// accumulators. +inline int grid_for(int N) { + int g = (N + kBlock - 1) / kBlock; + return std::min(g, 256); +} + +// --------------------------------------------------------------------------- +// Device math helpers. +// --------------------------------------------------------------------------- + +__device__ inline void d_quat_xyzw_to_R(double x, double y, double z, double w, + double R[9]) { + // Standard unit-quaternion to rotation-matrix conversion. Matches + // Eigen::Quaternion::toRotationMatrix exactly when the input is unit-length. + double xx = x * x, yy = y * y, zz = z * z; + double xy = x * y, xz = x * z, yz = y * z; + double wx = w * x, wy = w * y, wz = w * z; + R[0] = 1.0 - 2.0 * (yy + zz); + R[1] = 2.0 * (xy - wz); + R[2] = 2.0 * (xz + wy); + R[3] = 2.0 * (xy + wz); + R[4] = 1.0 - 2.0 * (xx + zz); + R[5] = 2.0 * (yz - wx); + R[6] = 2.0 * (xz - wy); + R[7] = 2.0 * (yz + wx); + R[8] = 1.0 - 2.0 * (xx + yy); +} + +// Block sum-reduce. Caller provides a `kBlock`-sized shared buffer. Returns +// the same value on every thread (sm[0] after the final sync). +__device__ inline double block_reduce_sum(double val, double *sm) { + int tid = threadIdx.x; + sm[tid] = val; + __syncthreads(); + for (int s = kBlock / 2; s > 0; s >>= 1) { + if (tid < s) { + sm[tid] += sm[tid + s]; + } + __syncthreads(); + } + return sm[0]; +} + +// Reduce VARS thread-local doubles to per-block sums and atomicAdd them into +// `out` in one go. 
+template <int VARS>
+__device__ inline void block_reduce_atomic_add(double *local, double *sm,
+                                               double *out) {
+  int tid = threadIdx.x;
+  for (int k = 0; k < VARS; ++k) {
+    double r = block_reduce_sum(local[k], sm);
+    if (tid == 0) {
+      atomicAdd(&out[k], r);
+    }
+    __syncthreads();
+  }
+}
+
+// Atomic max for fp64 via CAS on the bit representation.
+//
+// CUDA does not provide an atomicMax(double*, double); we emulate it with a
+// CAS retry loop on the bits. NaN inputs are silently ignored (matches the
+// CPU baseline, which never produces them on this hot path).
+__device__ inline void atomic_max_double(double *addr, double val) {
+  auto *p = reinterpret_cast<unsigned long long *>(addr);
+  unsigned long long old = *p;
+  unsigned long long assumed;
+  do {
+    assumed = old;
+    double cur = __longlong_as_double(static_cast<long long>(assumed));
+    // `!(val > cur)` is true for NaN val too, so we do not write NaNs.
+    if (!(val > cur)) {
+      return;
+    }
+    unsigned long long candidate =
+        static_cast<unsigned long long>(__double_as_longlong(val));
+    old = atomicCAS(p, assumed, candidate);
+  } while (assumed != old);
+}
+
+// ---------------------------------------------------------------------------
+// fit_se3 reduction kernels.
+// ---------------------------------------------------------------------------
+
+// Pass 1: accumulate the centroid sums and the K matrix.
+// +// Per-part contributions packed into 15 doubles: +// local[ 0..2] : m * t0 +// local[ 3..5] : m * tx +// local[ 6..14] : m * R(q0 * conj(qx)) (row-major flatten) +__global__ void se3_pass1_kernel(int N, const double *q0, const double *t0, + const double *qx, const double *tx, + const double *mass, double *out_15) { + __shared__ double sm[kBlock]; + double local[15]; + for (int k = 0; k < 15; ++k) { + local[k] = 0.0; + } + + for (int i = blockIdx.x * kBlock + threadIdx.x; i < N; + i += kBlock * gridDim.x) { + double m = mass[i]; + double t0x = t0[3 * i + 0]; + double t0y = t0[3 * i + 1]; + double t0z = t0[3 * i + 2]; + double txx = tx[3 * i + 0]; + double txy = tx[3 * i + 1]; + double txz = tx[3 * i + 2]; + + double q0x = q0[4 * i + 0], q0y = q0[4 * i + 1]; + double q0z = q0[4 * i + 2], q0w = q0[4 * i + 3]; + double qxx = qx[4 * i + 0], qxy = qx[4 * i + 1]; + double qxz = qx[4 * i + 2], qxw = qx[4 * i + 3]; + + // q0 * conj(qx); conj(qx) = (-qxx, -qxy, -qxz, qxw) + double bx = -qxx, by = -qxy, bz = -qxz, bw = qxw; + double rx = q0w * bx + q0x * bw + q0y * bz - q0z * by; + double ry = q0w * by - q0x * bz + q0y * bw + q0z * bx; + double rz = q0w * bz + q0x * by - q0y * bx + q0z * bw; + double rw = q0w * bw - q0x * bx - q0y * by - q0z * bz; + + double R[9]; + d_quat_xyzw_to_R(rx, ry, rz, rw, R); + + local[0] += m * t0x; + local[1] += m * t0y; + local[2] += m * t0z; + local[3] += m * txx; + local[4] += m * txy; + local[5] += m * txz; + for (int k = 0; k < 9; ++k) { + local[6 + k] += m * R[k]; + } + } + + block_reduce_atomic_add<15>(local, sm, out_15); +} + +// Pass 2: accumulate the H = sum_i m_i (t0_i - t0_bar)(tx_i - tx_bar)^T outer +// product after the centroids are known (host-side). 
// Grid-stride loop; t0_bar / tx_bar are the mass centroids computed on the
// host from the pass-1 sums and passed by value.
__global__ void se3_pass2_kernel(int N, const double *t0, const double *tx,
                                 const double *mass, Vec3 t0_bar, Vec3 tx_bar,
                                 double *out_9) {
  __shared__ double sm[kBlock];
  double local[9];
  for (int k = 0; k < 9; ++k) {
    local[k] = 0.0;
  }

  for (int i = blockIdx.x * kBlock + threadIdx.x; i < N;
       i += kBlock * gridDim.x) {
    double m = mass[i];
    double dt0x = t0[3 * i + 0] - t0_bar.v[0];
    double dt0y = t0[3 * i + 1] - t0_bar.v[1];
    double dt0z = t0[3 * i + 2] - t0_bar.v[2];
    double dtxx = tx[3 * i + 0] - tx_bar.v[0];
    double dtxy = tx[3 * i + 1] - tx_bar.v[1];
    double dtxz = tx[3 * i + 2] - tx_bar.v[2];

    // Row-major outer product m * dt0 * dtx^T.
    local[0] += m * dt0x * dtxx;
    local[1] += m * dt0x * dtxy;
    local[2] += m * dt0x * dtxz;
    local[3] += m * dt0y * dtxx;
    local[4] += m * dt0y * dtxy;
    local[5] += m * dt0y * dtxz;
    local[6] += m * dt0z * dtxx;
    local[7] += m * dt0z * dtxy;
    local[8] += m * dt0z * dtxz;
  }

  block_reduce_atomic_add<9>(local, sm, out_9);
}

// ---------------------------------------------------------------------------
// fit_twist reduction kernels.
// ---------------------------------------------------------------------------

// Pass 1: c_W = R_w * c_CC + t_w; accumulate sum(m * c_W) for the centroid r.
// Grid-stride loop; also materializes c_W (world-frame centers) to scratch
// for pass 2.
__global__ void twist_pass1_kernel(int N, const double *c_CC,
                                   const double *mass, Mat3 R_w, Vec3 t_w,
                                   double *c_W_out, double *sum_mc_W_3) {
  __shared__ double sm[kBlock];
  double local[3] = {0.0, 0.0, 0.0};

  for (int i = blockIdx.x * kBlock + threadIdx.x; i < N;
       i += kBlock * gridDim.x) {
    double m = mass[i];
    double cx = c_CC[3 * i + 0];
    double cy = c_CC[3 * i + 1];
    double cz = c_CC[3 * i + 2];
    // Row-major 3x3 times vector, plus translation: c_W = R_w * c_CC + t_w.
    double cw0 = R_w.m[0] * cx + R_w.m[1] * cy + R_w.m[2] * cz + t_w.v[0];
    double cw1 = R_w.m[3] * cx + R_w.m[4] * cy + R_w.m[5] * cz + t_w.v[1];
    double cw2 = R_w.m[6] * cx + R_w.m[7] * cy + R_w.m[8] * cz + t_w.v[2];
    c_W_out[3 * i + 0] = cw0;
    c_W_out[3 * i + 1] = cw1;
    c_W_out[3 * i + 2] = cw2;
    local[0] += m * cw0;
    local[1] += m * cw1;
    local[2] += m * cw2;
  }

  block_reduce_atomic_add<3>(local, sm, sum_mc_W_3);
}

// Pass 2: d = c_W - r; emit d to scratch and accumulate the 18 doubles needed
// to assemble (LHS, RHS, v0) on host.
//
// acc[ 0.. 8] : S = sum d[i] * (m * d[i])^T (row-major 3x3)
// acc[ 9..11] : L (cross-term L of breakage.cppm fit_twist)
// acc[12..14] : sum(m * w) -> regularization base
// acc[15..17] : sum(m * v) -> v0
__global__ void twist_pass2_kernel(int N, const double *c_W,
                                   const double *mass, const double *v,
                                   const double *w, Vec3 r, double *d_out,
                                   double *acc_18) {
  __shared__ double sm[kBlock];
  double local[18];
  for (int k = 0; k < 18; ++k) {
    local[k] = 0.0;
  }

  for (int i = blockIdx.x * kBlock + threadIdx.x; i < N;
       i += kBlock * gridDim.x) {
    double m = mass[i];
    double dx = c_W[3 * i + 0] - r.v[0];
    double dy = c_W[3 * i + 1] - r.v[1];
    double dz = c_W[3 * i + 2] - r.v[2];
    d_out[3 * i + 0] = dx;
    d_out[3 * i + 1] = dy;
    d_out[3 * i + 2] = dz;

    // d_w = m * d; v/w are the per-part linear/angular velocity rows.
    double dwx = m * dx, dwy = m * dy, dwz = m * dz;
    double vx = v[3 * i + 0], vy = v[3 * i + 1], vz = v[3 * i + 2];
    double wx = w[3 * i + 0], wy = w[3 * i + 1], wz = w[3 * i + 2];

    // S(r,c) = sum d[i,r] * d_w[i,c], row-major.
    local[0] += dx * dwx;
    local[1] += dx * dwy;
    local[2] += dx * dwz;
    local[3] += dy * dwx;
    local[4] += dy * dwy;
    local[5] += dy * dwz;
    local[6] += dz * dwx;
    local[7] += dz * dwy;
    local[8] += dz * dwz;

    // L (cross-term, mirrors breakage.cppm: dot(d_w.col(j), v.col(k)) ...)
    // componentwise this is sum_i (d_w[i] x v[i]).
    local[9] += dwy * vz - dwz * vy;
    local[10] += dwz * vx - dwx * vz;
    local[11] += dwx * vy - dwy * vx;

    // sum(m*w), sum(m*v)
    local[12] += m * wx;
    local[13] += m * wy;
    local[14] += m * wz;
    local[15] += m * vx;
    local[16] += m * vy;
    local[17] += m * vz;
  }

  block_reduce_atomic_add<18>(local, sm, acc_18);
}

// Pass 3: v_W[i] = w0 cross d[i] + v0.
//
// Original CPU code: `d.rowwise().cross(-w0) + v0.transpose()` --- since
// d × (-w0) = -(d × w0) = w0 × d, we reorder the cross to remove the negation.
__global__ void twist_pass3_kernel(int N, const double *d, Vec3 w0, Vec3 v0,
                                   double *v_W) {
  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < N;
       i += gridDim.x * blockDim.x) {
    double dx = d[3 * i + 0];
    double dy = d[3 * i + 1];
    double dz = d[3 * i + 2];
    v_W[3 * i + 0] = w0.v[1] * dz - w0.v[2] * dy + v0.v[0];
    v_W[3 * i + 1] = w0.v[2] * dx - w0.v[0] * dz + v0.v[1];
    v_W[3 * i + 2] = w0.v[0] * dy - w0.v[1] * dx + v0.v[2];
  }
}

// ---------------------------------------------------------------------------
// compute_L kernel.
//
// L[i] = R_w * (I_CC[i] * w_CC), where I_CC[i] is symmetric. We pass R_w and
// w_CC by value (precomputed on host). One thread per part.
// ---------------------------------------------------------------------------

__global__ void compute_L_kernel(int N, const double *I_CC, Mat3 R_w,
                                 Vec3 w_CC, double *L_d) {
  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < N;
       i += gridDim.x * blockDim.x) {
    // tmp = (sum_r w_CC[r] * I_CC_row(r)) -- row vector with components
    // tmp[k] = sum_r w_CC[r] * I_CC[i, 3*r + k].
    // I_CC[i] is symmetric so this equals (I_CC[i] * w_CC).
    double tmp[3];
    for (int k = 0; k < 3; ++k) {
      tmp[k] = w_CC.v[0] * I_CC[9 * i + 0 + k] +
               w_CC.v[1] * I_CC[9 * i + 3 + k] +
               w_CC.v[2] * I_CC[9 * i + 6 + k];
    }
    // L[i, j] = (tmp_row * R_w^T)[j] = sum_k R_w[j, k] * tmp[k].
    for (int j = 0; j < 3; ++j) {
      double s = 0.0;
      for (int k = 0; k < 3; ++k) {
        s += R_w.m[3 * j + k] * tmp[k];
      }
      L_d[3 * i + j] = s;
    }
  }
}

// ---------------------------------------------------------------------------
// assemble_b kernel.
//
// b_mat (N x 6) layout per part i:
//   linear  [j] = (m*(v_curr - v_prev) - J)[i] * Pi^T / dt
//   angular [j] = (L_curr - L_prev - H)[i] * Pi^T / (dt * L0) (torque-scaled)
// ---------------------------------------------------------------------------

__global__ void assemble_b_kernel(int N, const double *v_W_curr,
                                  const double *v_W_prev,
                                  const double *L_curr, const double *L_prev,
                                  const double *J, const double *H,
                                  const double *mass, Mat3 Pi, double inv_dt,
                                  double inv_dt_L0, double *b) {
  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < N;
       i += gridDim.x * blockDim.x) {
    double m = mass[i];
    double pre_lin[3];
    double pre_ang[3];
    for (int k = 0; k < 3; ++k) {
      pre_lin[k] =
          m * (v_W_curr[3 * i + k] - v_W_prev[3 * i + k]) - J[3 * i + k];
      pre_ang[k] = L_curr[3 * i + k] - L_prev[3 * i + k] - H[3 * i + k];
    }
    // out_row[j] = sum_k pre[k] * Pi^T[k, j] = sum_k Pi[j, k] * pre[k].
    for (int j = 0; j < 3; ++j) {
      double sl = 0.0;
      double sa = 0.0;
      for (int k = 0; k < 3; ++k) {
        double pijk = Pi.m[3 * j + k];
        sl += pijk * pre_lin[k];
        sa += pijk * pre_ang[k];
      }
      b[6 * i + j] = sl * inv_dt;
      b[6 * i + 3 + j] = sa * inv_dt_L0;
    }
  }
}

// ---------------------------------------------------------------------------
// postprocess kernels: un-whitening and per-clutch utilization.
// ---------------------------------------------------------------------------

// One thread per clutch. Three 2x1 multiplications by Wk per clutch:
//   x_unwhiten[j0+1:j0+3] = Wk * x[j0+1:j0+3]
//   x_unwhiten[j0+4:j0+6] = Wk * x[j0+4:j0+6]
//   x_unwhiten[j0+7:j0+9] = Wk * x[j0+7:j0+9]
// All other positions are direct copies done by the preceding cudaMemcpy.
__global__ void unwhiten_kernel(int ncv, int K, const double *clutch_whiten,
                                const double *x, double *x_unwhiten) {
  int k = blockIdx.x * blockDim.x + threadIdx.x;
  if (k >= K) {
    return;
  }
  int j0 = ncv + 9 * k;
  // Wk is the per-clutch 2x2 whitening matrix, stored row-major.
  double w00 = clutch_whiten[4 * k + 0];
  double w01 = clutch_whiten[4 * k + 1];
  double w10 = clutch_whiten[4 * k + 2];
  double w11 = clutch_whiten[4 * k + 3];

  const int offsets[3] = {1, 4, 7};
  for (int t = 0; t < 3; ++t) {
    int p = j0 + offsets[t];
    double v0 = x[p];
    double v1 = x[p + 1];
    x_unwhiten[p] = w00 * v0 + w01 * v1;
    x_unwhiten[p + 1] = w10 * v0 + w11 * v1;
  }
}

// One thread per capacity row; atomic-max into the per-clutch utilization.
__global__ void utilization_kernel(int ncv, int ncap,
                                   const int *clutch_indices,
                                   const double *capacities,
                                   const double *x_unwhiten,
                                   double *utilization) {
  int idx = blockIdx.x * blockDim.x + threadIdx.x;
  if (idx >= ncap) {
    return;
  }
  int clutch_index = clutch_indices[idx];
  // c[0..2] weight the force part, c[3..8] the remaining 6 slack variables
  // of this clutch's 9-variable block.
  double c[9];
  for (int k = 0; k < 9; ++k) {
    c[k] = capacities[9 * idx + k];
  }
  int xoff = ncv + 9 * clutch_index;
  double frc_used = c[0] * x_unwhiten[xoff + 0] +
                    c[1] * x_unwhiten[xoff + 1] +
                    c[2] * x_unwhiten[xoff + 2];
  double cap_dot = 0.0;
  for (int k = 0; k < 6; ++k) {
    cap_dot += c[3 + k] * x_unwhiten[xoff + 3 + k];
  }
  double frc_cap = 1.0 - cap_dot;
  double u_fp;
  if (frc_cap > 0.0) {
    u_fp = frc_used / frc_cap;
    if (u_fp < 0.0) {
      u_fp = 0.0;
    }
  } else {
    // Exhausted capacity: report a huge utilization so the clutch breaks.
    u_fp = 1e6;
  }
  atomic_max_double(&utilization[clutch_index], u_fp);
}

// ---------------------------------------------------------------------------
// Host-only SO(3) helpers used by compute_Pi_host (line-for-line copies of
breakage.cppm:123-164 with double[] interfaces). +// --------------------------------------------------------------------------- + +void so3_log_from_unit_quat_host(const double q_in[4], double phi[3]) { + double q[4]; + for (int i = 0; i < 4; ++i) { + q[i] = q_in[i]; + } + cuda_la::quat_xyzw_normalize(q); + // Pick the shortest representation (q.w >= 0) except for the pi case. + if (q[3] < 0.0) { + q[0] = -q[0]; + q[1] = -q[1]; + q[2] = -q[2]; + q[3] = -q[3]; + } + double v[3] = {q[0], q[1], q[2]}; + double s = std::sqrt(v[0] * v[0] + v[1] * v[1] + v[2] * v[2]); // sin(theta/2) + double w = q[3]; // cos(theta/2) + + constexpr double kSmallS = 5e-5; + double theta_over_s; + if (s < kSmallS) { + // Series: theta/s = 2 + s^2/3 + 3 s^4/20 + O(s^6). + double s2 = s * s; + double s4 = s2 * s2; + theta_over_s = 2.0 + (s2 / 3.0) + (3.0 * s4 / 20.0); + } else { + double theta = 2.0 * std::atan2(s, w); + theta_over_s = theta / s; + } + phi[0] = theta_over_s * v[0]; + phi[1] = theta_over_s * v[1]; + phi[2] = theta_over_s * v[2]; +} + +void so3_jacobian_inv_host(const double phi[3], double J_inv[9]) { + double theta2 = phi[0] * phi[0] + phi[1] * phi[1] + phi[2] * phi[2]; + constexpr double kSmallTheta = 1e-4; + double c; + if (theta2 < kSmallTheta * kSmallTheta) { + // Series: c(theta) = 1/12 + theta^2/720 + theta^4/30240 + O(theta^6). 
+ double t2 = theta2; + double t4 = t2 * t2; + c = (1.0 / 12.0) + (t2 / 720.0) + (t4 / 30240.0); + } else { + double theta = std::sqrt(theta2); + double half = 0.5 * theta; + double cot_half = std::cos(half) / std::sin(half); + c = (1.0 / theta2) - (cot_half / (2.0 * theta)); + } + double px = phi[0], py = phi[1], pz = phi[2]; + // Phi = [ 0, -z, y; + // z, 0, -x; + // -y, x, 0 ] + double Phi[9] = {0.0, -pz, py, pz, 0.0, -px, -py, px, 0.0}; + double P2[9]; + for (int i = 0; i < 3; ++i) { + for (int j = 0; j < 3; ++j) { + double sum = 0.0; + for (int k = 0; k < 3; ++k) { + sum += Phi[3 * i + k] * Phi[3 * k + j]; + } + P2[3 * i + j] = sum; + } + } + for (int i = 0; i < 3; ++i) { + for (int j = 0; j < 3; ++j) { + double v = ((i == j) ? 1.0 : 0.0) - 0.5 * Phi[3 * i + j] + + c * P2[3 * i + j]; + J_inv[3 * i + j] = v; + } + } +} + +} // namespace + +// --------------------------------------------------------------------------- +// Host launchers. +// --------------------------------------------------------------------------- + +void fit_se3(int N, const double *q0_d, const double *t0_d, const double *qx_d, + const double *tx_d, const double *mass_d, double total_mass, + double lambda_R, double q_out[4], double t_out[3], + cudaStream_t stream) { + if (N == 0) { + q_out[0] = 0.0; + q_out[1] = 0.0; + q_out[2] = 0.0; + q_out[3] = 1.0; + t_out[0] = 0.0; + t_out[1] = 0.0; + t_out[2] = 0.0; + return; + } + if (!(total_mass > 0.0)) { + throw std::invalid_argument("fit_se3: total_mass must be positive"); + } + + int grid = grid_for(N); + + // Pass 1: 6 centroid sums + 9 K sums = 15 doubles. 
+ DeviceBuffer p1(15); + BREAKAGE_CUDA_CHECK( + cudaMemsetAsync(p1.data(), 0, 15 * sizeof(double), stream)); + se3_pass1_kernel<<>>(N, q0_d, t0_d, qx_d, tx_d, + mass_d, p1.data()); + + double p1_h[15]; + BREAKAGE_CUDA_CHECK(cudaMemcpyAsync(p1_h, p1.data(), 15 * sizeof(double), + cudaMemcpyDeviceToHost, stream)); + BREAKAGE_CUDA_CHECK(cudaStreamSynchronize(stream)); + + Vec3 t0_bar; + Vec3 tx_bar; + for (int k = 0; k < 3; ++k) { + t0_bar.v[k] = p1_h[k] / total_mass; + tx_bar.v[k] = p1_h[3 + k] / total_mass; + } + double K[9]; + for (int k = 0; k < 9; ++k) { + K[k] = p1_h[6 + k]; + } + + // Pass 2: H = sum_i m_i * (t0_i - t0_bar) * (tx_i - tx_bar)^T. + DeviceBuffer p2(9); + BREAKAGE_CUDA_CHECK( + cudaMemsetAsync(p2.data(), 0, 9 * sizeof(double), stream)); + se3_pass2_kernel<<>>(N, t0_d, tx_d, mass_d, t0_bar, + tx_bar, p2.data()); + + double H[9]; + BREAKAGE_CUDA_CHECK(cudaMemcpyAsync(H, p2.data(), 9 * sizeof(double), + cudaMemcpyDeviceToHost, stream)); + BREAKAGE_CUDA_CHECK(cudaStreamSynchronize(stream)); + + // Host: Procrustes on S = H + lambda_R * K, then derive (q, t). 
+ double S[9]; + for (int k = 0; k < 9; ++k) { + S[k] = H[k] + lambda_R * K[k]; + } + double R[9]; + cuda_la::svd_procrustes_3x3(S, R); + + // t = tx_bar - R * t0_bar + for (int i = 0; i < 3; ++i) { + double s = tx_bar.v[i]; + for (int j = 0; j < 3; ++j) { + s -= R[3 * i + j] * t0_bar.v[j]; + } + t_out[i] = s; + } + cuda_la::R_to_quat_xyzw(R, q_out); + cuda_la::quat_xyzw_normalize(q_out); +} + +void fit_twist(int N, const double *w_d, const double *v_d, + const double *c_CC_d, const double q_W_CC[4], + const double t_W_CC[3], const double *mass_d, double total_mass, + double lambda_w, double w0_out[3], double v0_out[3], + double *v_W_d, cudaStream_t stream) { + if (N == 0) { + for (int k = 0; k < 3; ++k) { + w0_out[k] = 0.0; + v0_out[k] = 0.0; + } + return; + } + if (!(total_mass > 0.0)) { + throw std::invalid_argument("fit_twist: total_mass must be positive"); + } + + int grid = grid_for(N); + + Mat3 R_w; + cuda_la::quat_xyzw_to_R(q_W_CC, R_w.m); + Vec3 t_w; + t_w.v[0] = t_W_CC[0]; + t_w.v[1] = t_W_CC[1]; + t_w.v[2] = t_W_CC[2]; + + // Scratch: c_W and d both (N x 3) row-major. 
+ DeviceBuffer c_W_buf(static_cast(3) * N); + DeviceBuffer d_buf(static_cast(3) * N); + DeviceBuffer p1(3); + DeviceBuffer p2(18); + BREAKAGE_CUDA_CHECK( + cudaMemsetAsync(p1.data(), 0, 3 * sizeof(double), stream)); + BREAKAGE_CUDA_CHECK( + cudaMemsetAsync(p2.data(), 0, 18 * sizeof(double), stream)); + + twist_pass1_kernel<<>>( + N, c_CC_d, mass_d, R_w, t_w, c_W_buf.data(), p1.data()); + + double sum_mc[3]; + BREAKAGE_CUDA_CHECK(cudaMemcpyAsync(sum_mc, p1.data(), 3 * sizeof(double), + cudaMemcpyDeviceToHost, stream)); + BREAKAGE_CUDA_CHECK(cudaStreamSynchronize(stream)); + + Vec3 r; + for (int k = 0; k < 3; ++k) { + r.v[k] = sum_mc[k] / total_mass; + } + + twist_pass2_kernel<<>>( + N, c_W_buf.data(), mass_d, v_d, w_d, r, d_buf.data(), p2.data()); + + double acc[18]; + BREAKAGE_CUDA_CHECK(cudaMemcpyAsync(acc, p2.data(), 18 * sizeof(double), + cudaMemcpyDeviceToHost, stream)); + BREAKAGE_CUDA_CHECK(cudaStreamSynchronize(stream)); + + double S[9]; + for (int k = 0; k < 9; ++k) { + S[k] = acc[k]; + } + double L[3] = {acc[9], acc[10], acc[11]}; + double mw[3] = {acc[12], acc[13], acc[14]}; + double mv[3] = {acc[15], acc[16], acc[17]}; + + // LHS = I * (trace(S) + lambda_w * total_mass) - S + double trace = S[0] + S[4] + S[8]; + double k_const = trace + lambda_w * total_mass; + double LHS[9] = {k_const - S[0], -S[1], -S[2], // + -S[3], k_const - S[4], -S[5], // + -S[6], -S[7], k_const - S[8]}; + double RHS[3] = {L[0] + lambda_w * mw[0], L[1] + lambda_w * mw[1], + L[2] + lambda_w * mw[2]}; + + cuda_la::spd_solve_3x3(LHS, RHS, w0_out); + v0_out[0] = mv[0] / total_mass; + v0_out[1] = mv[1] / total_mass; + v0_out[2] = mv[2] / total_mass; + + Vec3 w0_v = {{w0_out[0], w0_out[1], w0_out[2]}}; + Vec3 v0_v = {{v0_out[0], v0_out[1], v0_out[2]}}; + twist_pass3_kernel<<>>(N, d_buf.data(), w0_v, v0_v, + v_W_d); + // Sync so the scratch DeviceBuffers can be safely freed at scope exit. 
+ BREAKAGE_CUDA_CHECK(cudaStreamSynchronize(stream)); +} + +void compute_Pi_host(const double qm[4], const double qp[4], + double Pi_out[9]) { + // Pi = J_inv(log(R_m^T R_p)) * R_m^T + double qm_conj[4]; + cuda_la::quat_xyzw_conjugate(qm, qm_conj); + double q_rel[4]; + cuda_la::quat_xyzw_mul(qm_conj, qp, q_rel); + double phi[3]; + so3_log_from_unit_quat_host(q_rel, phi); + double J_inv[9]; + so3_jacobian_inv_host(phi, J_inv); + double R_m[9]; + cuda_la::quat_xyzw_to_R(qm, R_m); + // Pi[i, j] = sum_k J_inv[i, k] * R_m^T[k, j] = sum_k J_inv[i, k] * R_m[j, k] + for (int i = 0; i < 3; ++i) { + for (int j = 0; j < 3; ++j) { + double s = 0.0; + for (int k = 0; k < 3; ++k) { + s += J_inv[3 * i + k] * R_m[3 * j + k]; + } + Pi_out[3 * i + j] = s; + } + } +} + +void compute_L(int N, const double *I_CC_d, const double q_W_CC[4], + const double w0[3], double *L_d, cudaStream_t stream) { + if (N == 0) { + return; + } + // Precompute R_w (row-major) and w_CC = R_w^T * w0 on the host so the + // kernel parameters stay tiny (12 doubles by-value). + Mat3 R_w; + cuda_la::quat_xyzw_to_R(q_W_CC, R_w.m); + Vec3 w_CC; + for (int i = 0; i < 3; ++i) { + // R_w^T[i, k] = R_w[k, i] + w_CC.v[i] = R_w.m[3 * 0 + i] * w0[0] + R_w.m[3 * 1 + i] * w0[1] + + R_w.m[3 * 2 + i] * w0[2]; + } + int grid = grid_for(N); + compute_L_kernel<<>>(N, I_CC_d, R_w, w_CC, L_d); +} + +void assemble_b(int N, const double *v_W_curr_d, const double *v_W_prev_d, + const double *L_curr_d, const double *L_prev_d, + const double *J_d, const double *H_d, const double *mass_d, + const double Pi[9], double dt, double L0, double *b_d, + cudaStream_t stream) { + if (N == 0) { + return; + } + if (!(dt > 0.0)) { + throw std::invalid_argument("assemble_b: dt must be positive"); + } + if (!(L0 > 0.0)) { + throw std::invalid_argument("assemble_b: L0 must be positive"); + } + Mat3 Pi_v; + for (int k = 0; k < 9; ++k) { + Pi_v.m[k] = Pi[k]; + } + double inv_dt = 1.0 / dt; + // EnableTorqueScaling = true (matches breakage.cppm). 
+ double inv_dt_L0 = inv_dt / L0; + int grid = grid_for(N); + assemble_b_kernel<<>>( + N, v_W_curr_d, v_W_prev_d, L_curr_d, L_prev_d, J_d, H_d, mass_d, Pi_v, + inv_dt, inv_dt_L0, b_d); +} + +void postprocess(int ncv, int K, int ncap, const double *x_d, + const int *capacity_clutch_indices_d, + const double *capacities_d, const double *clutch_whiten_d, + double *x_unwhiten_d, double *utilization_d, + cudaStream_t stream) { + // num_vars = ncv + 9 * K (cf. BreakageSystem::num_vars_). + int num_vars = ncv + 9 * K; + if (num_vars > 0) { + // Step 0: copy x verbatim. The unwhiten kernel below will overwrite the + // 6 positions per clutch that need the Wk transform; everything else stays. + BREAKAGE_CUDA_CHECK(cudaMemcpyAsync( + x_unwhiten_d, x_d, static_cast(num_vars) * sizeof(double), + cudaMemcpyDeviceToDevice, stream)); + } + if (K > 0) { + // Initialize utilization to -1 before any atomicMax (matches CPU baseline). + cuda_la::fill(K, -1.0, utilization_d, stream); + int bs = 64; + int grid = (K + bs - 1) / bs; + unwhiten_kernel<<>>(ncv, K, clutch_whiten_d, x_d, + x_unwhiten_d); + } + if (ncap > 0) { + int bs = 128; + int grid = (ncap + bs - 1) / bs; + utilization_kernel<<>>( + ncv, ncap, capacity_clutch_indices_d, capacities_d, x_unwhiten_d, + utilization_d); + } +} + +} // namespace breakage_cuda::cuda_kernels diff --git a/native/cuda/breakage_cuda/src/linalg.cu b/native/cuda/breakage_cuda/src/linalg.cu new file mode 100644 index 0000000..31c0b05 --- /dev/null +++ b/native/cuda/breakage_cuda/src/linalg.cu @@ -0,0 +1,507 @@ +// SPDX-License-Identifier: MIT +// +// Implementation of breakage_cuda/linalg.hpp: +// +// * cuBLAS / cuSPARSE / cuSOLVER handle pool (Meyers singleton). +// * Thin wrappers around level-1 BLAS we need from the ADMM solver. +// * Generic-API SpMV. +// * Element-wise box-clamp / fill kernels. +// * Host-side 3x3 / 2x2 dense helpers (SVD-Procrustes, SPD LDLT solve, +// SPD inverse-square-root) backed by Eigen. 
+// * xyzw quaternion <-> rotation matrix helpers. +// +// All double-precision throughout: this is a verification / numerical-fidelity +// project, not a perf-shootout. + +#include "breakage_cuda/linalg.hpp" +#include "breakage_cuda/device.hpp" + +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#include +#include +#include + +namespace breakage_cuda::cuda_la { + +// --------------------------------------------------------------------------- +// Local error-check macro for cuBLAS (cuSPARSE/CUDA macros come from +// device.hpp). +// --------------------------------------------------------------------------- + +#define BREAKAGE_CUBLAS_CHECK(call) \ + do { \ + cublasStatus_t _s = (call); \ + if (_s != CUBLAS_STATUS_SUCCESS) { \ + throw std::runtime_error(std::string("cuBLAS error at " __FILE__ ":") + \ + std::to_string(__LINE__) + " (status " + \ + std::to_string(static_cast(_s)) + ")"); \ + } \ + } while (0) + +// --------------------------------------------------------------------------- +// Handles singleton. 
+// --------------------------------------------------------------------------- + +Handles::Handles() { + if (cublasCreate(&blas_) != CUBLAS_STATUS_SUCCESS) { + throw std::runtime_error("cublasCreate failed"); + } + if (cusparseCreate(&sparse_) != CUSPARSE_STATUS_SUCCESS) { + cublasDestroy(blas_); + blas_ = nullptr; + throw std::runtime_error("cusparseCreate failed"); + } + if (cusolverDnCreate(&solver_dn_) != CUSOLVER_STATUS_SUCCESS) { + cusparseDestroy(sparse_); + cublasDestroy(blas_); + sparse_ = nullptr; + blas_ = nullptr; + throw std::runtime_error("cusolverDnCreate failed"); + } + if (cusolverSpCreate(&solver_sp_) != CUSOLVER_STATUS_SUCCESS) { + cusolverDnDestroy(solver_dn_); + cusparseDestroy(sparse_); + cublasDestroy(blas_); + solver_dn_ = nullptr; + sparse_ = nullptr; + blas_ = nullptr; + throw std::runtime_error("cusolverSpCreate failed"); + } + // We always pass alpha/beta and consume reduction results through host + // pointers, which is the default but be explicit so that an external caller + // tinkering with the handles cannot break us. + cublasSetPointerMode(blas_, CUBLAS_POINTER_MODE_HOST); + cusparseSetPointerMode(sparse_, CUSPARSE_POINTER_MODE_HOST); +} + +Handles::~Handles() { + if (solver_sp_ != nullptr) { + cusolverSpDestroy(solver_sp_); + } + if (solver_dn_ != nullptr) { + cusolverDnDestroy(solver_dn_); + } + if (sparse_ != nullptr) { + cusparseDestroy(sparse_); + } + if (blas_ != nullptr) { + cublasDestroy(blas_); + } +} + +Handles &Handles::instance() { + static Handles inst; + return inst; +} + +// --------------------------------------------------------------------------- +// cuBLAS-backed level-1 wrappers. +// +// dot/nrm2/inf_norm consume a host-side scalar so they internally synchronize +// the stream before returning. axpy/scal/copy/scal_copy are pure async kernel +// launches. 
+// --------------------------------------------------------------------------- + +double dot(int n, const double *x_d, const double *y_d, cudaStream_t stream) { + if (n <= 0) { + return 0.0; + } + auto h = Handles::instance().blas(); + BREAKAGE_CUBLAS_CHECK(cublasSetStream(h, stream)); + double r = 0.0; + BREAKAGE_CUBLAS_CHECK(cublasDdot(h, n, x_d, 1, y_d, 1, &r)); + // cuBLAS in HOST_POINTER mode is synchronous wrt host for scalar results, + // but make it explicit so async kernel launches that follow are properly + // ordered. + BREAKAGE_CUDA_CHECK(cudaStreamSynchronize(stream)); + return r; +} + +double nrm2(int n, const double *x_d, cudaStream_t stream) { + if (n <= 0) { + return 0.0; + } + auto h = Handles::instance().blas(); + BREAKAGE_CUBLAS_CHECK(cublasSetStream(h, stream)); + double r = 0.0; + BREAKAGE_CUBLAS_CHECK(cublasDnrm2(h, n, x_d, 1, &r)); + BREAKAGE_CUDA_CHECK(cudaStreamSynchronize(stream)); + return r; +} + +double inf_norm(int n, const double *x_d, cudaStream_t stream) { + if (n <= 0) { + return 0.0; + } + auto h = Handles::instance().blas(); + BREAKAGE_CUBLAS_CHECK(cublasSetStream(h, stream)); + int idx_1 = 0; // 1-based + BREAKAGE_CUBLAS_CHECK(cublasIdamax(h, n, x_d, 1, &idx_1)); + if (idx_1 < 1 || idx_1 > n) { + return 0.0; + } + double v = 0.0; + BREAKAGE_CUDA_CHECK(cudaMemcpyAsync(&v, x_d + (idx_1 - 1), sizeof(double), + cudaMemcpyDeviceToHost, stream)); + BREAKAGE_CUDA_CHECK(cudaStreamSynchronize(stream)); + return std::abs(v); +} + +void axpy(int n, double alpha, const double *x_d, double *y_d, + cudaStream_t stream) { + if (n <= 0) { + return; + } + auto h = Handles::instance().blas(); + BREAKAGE_CUBLAS_CHECK(cublasSetStream(h, stream)); + BREAKAGE_CUBLAS_CHECK(cublasDaxpy(h, n, &alpha, x_d, 1, y_d, 1)); +} + +void scal(int n, double alpha, double *x_d, cudaStream_t stream) { + if (n <= 0) { + return; + } + auto h = Handles::instance().blas(); + BREAKAGE_CUBLAS_CHECK(cublasSetStream(h, stream)); + BREAKAGE_CUBLAS_CHECK(cublasDscal(h, n, 
&alpha, x_d, 1)); +} + +void copy(int n, const double *src_d, double *dst_d, cudaStream_t stream) { + if (n <= 0) { + return; + } + BREAKAGE_CUDA_CHECK(cudaMemcpyAsync(dst_d, src_d, + static_cast(n) * + sizeof(double), + cudaMemcpyDeviceToDevice, stream)); +} + +void scal_copy(int n, double alpha, const double *x_d, double *y_d, + cudaStream_t stream) { + if (n <= 0) { + return; + } + copy(n, x_d, y_d, stream); + scal(n, alpha, y_d, stream); +} + +// --------------------------------------------------------------------------- +// Element-wise kernels: fill / clamp. +// --------------------------------------------------------------------------- + +namespace { + +constexpr int kElemwiseBlock = 256; + +inline int elemwise_grid(int n) { + return (n + kElemwiseBlock - 1) / kElemwiseBlock; +} + +__global__ void fill_kernel(int n, double v, double *x) { + int i = blockIdx.x * blockDim.x + threadIdx.x; + if (i < n) { + x[i] = v; + } +} + +__global__ void clamp_kernel(int n, const double *in, const double *l, + const double *u, double *out) { + int i = blockIdx.x * blockDim.x + threadIdx.x; + if (i >= n) { + return; + } + double x = in[i]; + double lo = l[i]; + double hi = u[i]; + if (x < lo) { + x = lo; + } + if (x > hi) { + x = hi; + } + out[i] = x; +} + +__global__ void clamp_inplace_kernel(int n, const double *l, const double *u, + double *x_d) { + int i = blockIdx.x * blockDim.x + threadIdx.x; + if (i >= n) { + return; + } + double x = x_d[i]; + double lo = l[i]; + double hi = u[i]; + if (x < lo) { + x = lo; + } + if (x > hi) { + x = hi; + } + x_d[i] = x; +} + +} // namespace + +void fill(int n, double value, double *x_d, cudaStream_t stream) { + if (n <= 0) { + return; + } + fill_kernel<<>>(n, value, x_d); +} + +void clamp(int n, const double *in_d, const double *l_d, const double *u_d, + double *out_d, cudaStream_t stream) { + if (n <= 0) { + return; + } + clamp_kernel<<>>(n, in_d, l_d, + u_d, out_d); +} + +void clamp_inplace(int n, const double *l_d, const double 
*u_d, double *x_d, + cudaStream_t stream) { + if (n <= 0) { + return; + } + clamp_inplace_kernel<<>>(n, l_d, + u_d, + x_d); +} + +// --------------------------------------------------------------------------- +// SpMV via the cuSPARSE generic API. +// +// We lazily attach a cusparseSpMatDescr_t to each DeviceCsr on first use. +// system.cpp may also build it; we only do it here if it has not. +// --------------------------------------------------------------------------- + +namespace { + +void ensure_csr_descr(const DeviceCsr &A) { + if (A.descr != nullptr) { + return; + } + // descr is a member of DeviceCsr; the lazy-init cast preserves the + // logical "const" of the SpMV operation (we do not touch the values). + auto &mut = const_cast(A); + BREAKAGE_CUSPARSE_CHECK(cusparseCreateCsr( + &mut.descr, A.rows, A.cols, A.nnz, + const_cast(A.rowptr.data()), const_cast(A.colind.data()), + const_cast(A.values.data()), CUSPARSE_INDEX_32I, + CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, CUDA_R_64F)); +} + +} // namespace + +std::size_t spmv_buffer_size(const DeviceCsr &A, bool transpose) { + if (A.rows <= 0 || A.cols <= 0) { + return 0; + } + ensure_csr_descr(A); + auto h = Handles::instance().sparse(); + cusparseOperation_t op = + transpose ? CUSPARSE_OPERATION_TRANSPOSE + : CUSPARSE_OPERATION_NON_TRANSPOSE; + int64_t x_n = transpose ? A.rows : A.cols; + int64_t y_n = transpose ? A.cols : A.rows; + cusparseDnVecDescr_t x_descr = nullptr, y_descr = nullptr; + // bufferSize never dereferences the vector data; nullptr is the documented + // "use placeholder" value. 
+ BREAKAGE_CUSPARSE_CHECK( + cusparseCreateDnVec(&x_descr, x_n, nullptr, CUDA_R_64F)); + BREAKAGE_CUSPARSE_CHECK( + cusparseCreateDnVec(&y_descr, y_n, nullptr, CUDA_R_64F)); + double alpha = 1.0; + double beta = 0.0; + std::size_t bytes = 0; + BREAKAGE_CUSPARSE_CHECK(cusparseSpMV_bufferSize( + h, op, &alpha, A.descr, x_descr, &beta, y_descr, CUDA_R_64F, + CUSPARSE_SPMV_ALG_DEFAULT, &bytes)); + cusparseDestroyDnVec(x_descr); + cusparseDestroyDnVec(y_descr); + return bytes; +} + +void spmv(const DeviceCsr &A, bool transpose, double alpha, const double *x_d, + double beta, double *y_d, void *buffer_d, std::size_t buffer_bytes, + cudaStream_t stream) { + (void)buffer_bytes; // size validated by cuSPARSE itself + if (A.rows <= 0 || A.cols <= 0) { + return; + } + ensure_csr_descr(A); + auto h = Handles::instance().sparse(); + BREAKAGE_CUSPARSE_CHECK(cusparseSetStream(h, stream)); + cusparseOperation_t op = + transpose ? CUSPARSE_OPERATION_TRANSPOSE + : CUSPARSE_OPERATION_NON_TRANSPOSE; + int64_t x_n = transpose ? A.rows : A.cols; + int64_t y_n = transpose ? A.cols : A.rows; + cusparseDnVecDescr_t x_descr = nullptr, y_descr = nullptr; + BREAKAGE_CUSPARSE_CHECK(cusparseCreateDnVec( + &x_descr, x_n, const_cast(x_d), CUDA_R_64F)); + BREAKAGE_CUSPARSE_CHECK(cusparseCreateDnVec(&y_descr, y_n, y_d, CUDA_R_64F)); + BREAKAGE_CUSPARSE_CHECK(cusparseSpMV(h, op, &alpha, A.descr, x_descr, &beta, + y_descr, CUDA_R_64F, + CUSPARSE_SPMV_ALG_DEFAULT, buffer_d)); + cusparseDestroyDnVec(x_descr); + cusparseDestroyDnVec(y_descr); +} + +// --------------------------------------------------------------------------- +// Host-side dense 3x3 / 2x2 helpers. +// +// Inputs / outputs are row-major flat arrays; we marshal to Eigen, do the +// math, and marshal back. Allocations are stack-only. 
+// --------------------------------------------------------------------------- + +namespace { + +inline Eigen::Matrix3d row_major_to_eigen3(const double M[9]) { + Eigen::Matrix3d out; + for (int i = 0; i < 3; ++i) { + for (int j = 0; j < 3; ++j) { + out(i, j) = M[3 * i + j]; + } + } + return out; +} + +inline void eigen_to_row_major3(const Eigen::Matrix3d &M, double out[9]) { + for (int i = 0; i < 3; ++i) { + for (int j = 0; j < 3; ++j) { + out[3 * i + j] = M(i, j); + } + } +} + +inline Eigen::Matrix2d row_major_to_eigen2(const double M[4]) { + Eigen::Matrix2d out; + out(0, 0) = M[0]; + out(0, 1) = M[1]; + out(1, 0) = M[2]; + out(1, 1) = M[3]; + return out; +} + +inline void eigen_to_row_major2(const Eigen::Matrix2d &M, double out[4]) { + out[0] = M(0, 0); + out[1] = M(0, 1); + out[2] = M(1, 0); + out[3] = M(1, 1); +} + +} // namespace + +void svd_procrustes_3x3(const double S[9], double R[9]) { + Eigen::Matrix3d S_mat = row_major_to_eigen3(S); + Eigen::JacobiSVD svd( + S_mat, Eigen::ComputeFullU | Eigen::ComputeFullV); + Eigen::Matrix3d U = svd.matrixU(); + Eigen::Matrix3d V = svd.matrixV(); + Eigen::Matrix3d D = Eigen::Matrix3d::Identity(); + if ((V * U.transpose()).determinant() < 0.0) { + D(2, 2) = -1.0; + } + Eigen::Matrix3d R_mat = V * D * U.transpose(); + eigen_to_row_major3(R_mat, R); +} + +void spd_solve_3x3(const double A[9], const double b[3], double x[3]) { + Eigen::Matrix3d A_mat = row_major_to_eigen3(A); + Eigen::Vector3d b_vec(b[0], b[1], b[2]); + Eigen::Vector3d x_vec = A_mat.ldlt().solve(b_vec); + x[0] = x_vec[0]; + x[1] = x_vec[1]; + x[2] = x_vec[2]; +} + +void inv_sqrt_spd_2x2(const double C[4], double W[4]) { + Eigen::Matrix2d C_mat = row_major_to_eigen2(C); + Eigen::SelfAdjointEigenSolver eig(C_mat); + if (eig.info() != Eigen::Success) { + throw std::invalid_argument( + "inv_sqrt_spd_2x2: eigen decomposition failed"); + } + Eigen::Vector2d lambda = eig.eigenvalues(); + Eigen::Matrix2d U = eig.eigenvectors(); + double lambda_max = 
lambda.maxCoeff(); + if (!(lambda_max > 0.0)) { + throw std::invalid_argument("inv_sqrt_spd_2x2: non-positive matrix"); + } + // Match breakage.cppm: clamp to avoid blow-up on near-degenerate patches. + double eps = 1e-12 * lambda_max; + lambda = lambda.cwiseMax(eps); + Eigen::Matrix2d D = Eigen::Matrix2d::Zero(); + D(0, 0) = 1.0 / std::sqrt(lambda(0)); + D(1, 1) = 1.0 / std::sqrt(lambda(1)); + Eigen::Matrix2d W_mat = U * D * U.transpose(); + eigen_to_row_major2(W_mat, W); +} + +// --------------------------------------------------------------------------- +// Quaternion (xyzw) helpers. +// +// xyzw matches Eigen::Quaterniond::coeffs() ordering, so we can round-trip +// through Quaterniond without reordering. +// --------------------------------------------------------------------------- + +void quat_xyzw_to_R(const double q[4], double R[9]) { + Eigen::Quaterniond q_eig; + q_eig.coeffs() << q[0], q[1], q[2], q[3]; + Eigen::Matrix3d M = q_eig.toRotationMatrix(); + eigen_to_row_major3(M, R); +} + +void R_to_quat_xyzw(const double R[9], double q[4]) { + Eigen::Matrix3d M = row_major_to_eigen3(R); + Eigen::Quaterniond q_eig(M); + q_eig.normalize(); + q[0] = q_eig.x(); + q[1] = q_eig.y(); + q[2] = q_eig.z(); + q[3] = q_eig.w(); +} + +void quat_xyzw_normalize(double q[4]) { + double n2 = q[0] * q[0] + q[1] * q[1] + q[2] * q[2] + q[3] * q[3]; + if (!(n2 > 0.0)) { + return; + } + double inv = 1.0 / std::sqrt(n2); + q[0] *= inv; + q[1] *= inv; + q[2] *= inv; + q[3] *= inv; +} + +void quat_xyzw_mul(const double a[4], const double b[4], double r[4]) { + // Hamilton product, matching Eigen Quaternion::operator*. 
+ double ax = a[0], ay = a[1], az = a[2], aw = a[3]; + double bx = b[0], by = b[1], bz = b[2], bw = b[3]; + r[0] = aw * bx + ax * bw + ay * bz - az * by; + r[1] = aw * by - ax * bz + ay * bw + az * bx; + r[2] = aw * bz + ax * by - ay * bx + az * bw; + r[3] = aw * bw - ax * bx - ay * by - az * bz; +} + +void quat_xyzw_conjugate(const double q[4], double r[4]) { + r[0] = -q[0]; + r[1] = -q[1]; + r[2] = -q[2]; + r[3] = q[3]; +} + +#undef BREAKAGE_CUBLAS_CHECK + +} // namespace breakage_cuda::cuda_la diff --git a/native/cuda/breakage_cuda/src/qp_solver.cu b/native/cuda/breakage_cuda/src/qp_solver.cu new file mode 100644 index 0000000..c54ac84 --- /dev/null +++ b/native/cuda/breakage_cuda/src/qp_solver.cu @@ -0,0 +1,1264 @@ +// SPDX-License-Identifier: MIT +// +// Hand-written CUDA ADMM implementation of the three-stage QP wrapper that +// lives on the CPU side in modules/bricksim/physx/osqp.cppm. +// +// Algorithmic summary (per stage; standard OSQP form +// `min 0.5 x^T P x + q^T x s.t. l <= C x <= u`): +// +// each ADMM iteration t: +// 1. rhs = sigma * x_k - q + C^T (rho * z_k - y_k) +// 2. solve M xt = rhs with M = P + sigma I + rho C^T C +// 3. zt = C xt (recovered from the reduced KKT) +// 4. xt_alpha = alpha * xt + (1 - alpha) * x_k +// zt_alpha = alpha * zt + (1 - alpha) * z_k +// 5. z_{k+1} = clamp(zt_alpha + y_k / rho, l, u) +// 6. y_{k+1} += rho * (zt_alpha - z_{k+1}) +// 7. x_{k+1} = xt_alpha +// +// The reduced KKT matrix M is symmetric positive definite (since +// P + sigma I > 0 and C^T C >= 0). We solve the inner system with a plain +// (un-preconditioned) conjugate-gradient method, applying M implicitly as +// `M x = P x + sigma x + rho C^T (C x)` via three SpMVs per CG step. This +// trades raw factorization speed for far simpler code: no symbolic Cholesky +// caching, no refactor on rho changes, no cuSolverSp roundtrips. 
The CG +// initial guess is warm-started from the previous outer iteration's xt so +// in steady state the inner loop converges in a handful of iterations. +// +// We deliberately do NOT replicate every OSQP feature - no polish, no +// infeasibility detection, no Ruiz scaling. The goal is to be close enough +// to OSQP that the breakage system behaves consistently. + +// Eigen headers must come before the project headers because system.hpp uses +// Eigen::Quaterniond in HostState but only includes + +// . Pull the missing pieces in here so we don't have to +// touch system.hpp. +#include +#include + +#include "breakage_cuda/qp_solver.hpp" + +#include "breakage_cuda/device.hpp" +#include "breakage_cuda/kernels.hpp" +#include "breakage_cuda/linalg.hpp" +#include "breakage_cuda/system.hpp" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace breakage_cuda::cuda_qp { + +// --------------------------------------------------------------------------- +// File-local constants and device kernels +// --------------------------------------------------------------------------- + +namespace { + +// Stage 2 / Stage 3 inequality slack used by osqp.cppm. Hard-coded to match +// the production default; the OSQP wrapper would override this from +// OSQP_EPSILON env var, which we do not bother exposing here. +constexpr double kEpsilon = 1e-4; + +// Diagonal regularisation on the x-block of P_rlx (matches osqp.cppm). +constexpr double kRelXReg = 1e-4; + +// CG inner-solve tolerances. Inner tol is relative to ||rhs||; we keep it +// noticeably tighter than the outer ADMM tol so the inner solve never +// bottlenecks convergence. +constexpr double kInnerCgTol = 1e-9; +constexpr int kInnerCgMaxIter = 400; + +// Bounds on the adaptive rho. OSQP uses similar limits. 
+constexpr double kRhoMin = 1e-6; +constexpr double kRhoMax = 1e6; + +constexpr int kBlockSize = 256; + +inline int grid_for(int n) { + return (n + kBlockSize - 1) / kBlockSize; +} + +// out[i] = a * x[i] + b * y[i] +__global__ void axby_kernel(int n, double a, const double *__restrict__ x, + double b, const double *__restrict__ y, + double *__restrict__ out) { + int i = blockIdx.x * blockDim.x + threadIdx.x; + if (i < n) { + out[i] = a * x[i] + b * y[i]; + } +} + +// y[i] = a * x[i] + b * y[i] (in-place version of axby_kernel) +__global__ void axpby_inplace_kernel(int n, double a, + const double *__restrict__ x, double b, + double *__restrict__ y) { + int i = blockIdx.x * blockDim.x + threadIdx.x; + if (i < n) { + y[i] = a * x[i] + b * y[i]; + } +} + +// rhs[i] = sigma * x_k[i] - q[i] + ct_term[i] +__global__ void rhs_assemble_kernel(int n, double sigma, + const double *__restrict__ x_k, + const double *__restrict__ q, + const double *__restrict__ ct_term, + double *__restrict__ rhs) { + int i = blockIdx.x * blockDim.x + threadIdx.x; + if (i < n) { + rhs[i] = sigma * x_k[i] - q[i] + ct_term[i]; + } +} + +// out[i] = rho * z_k[i] - y_k[i] +__global__ void rho_z_minus_y_kernel(int m, double rho, + const double *__restrict__ z_k, + const double *__restrict__ y_k, + double *__restrict__ out) { + int i = blockIdx.x * blockDim.x + threadIdx.x; + if (i < m) { + out[i] = rho * z_k[i] - y_k[i]; + } +} + +// z_out[i] = clamp(zt_alpha[i] + y_k[i] / rho, l[i], u[i]) +__global__ void z_update_kernel(int m, double inv_rho, + const double *__restrict__ zt_alpha, + const double *__restrict__ y_k, + const double *__restrict__ l, + const double *__restrict__ u, + double *__restrict__ z_out) { + int i = blockIdx.x * blockDim.x + threadIdx.x; + if (i < m) { + double v = zt_alpha[i] + inv_rho * y_k[i]; + if (v < l[i]) + v = l[i]; + if (v > u[i]) + v = u[i]; + z_out[i] = v; + } +} + +// y[i] += rho * (zt_alpha[i] - z_new[i]) +__global__ void y_update_kernel(int m, double rho, + const 
double *__restrict__ zt_alpha, + const double *__restrict__ z_new, + double *__restrict__ y) { + int i = blockIdx.x * blockDim.x + threadIdx.x; + if (i < m) { + y[i] += rho * (zt_alpha[i] - z_new[i]); + } +} + +// out[i] = max(0, x[i]) +__global__ void relu_kernel(int n, const double *__restrict__ x, + double *__restrict__ out) { + int i = blockIdx.x * blockDim.x + threadIdx.x; + if (i < n) { + out[i] = (x[i] > 0.0) ? x[i] : 0.0; + } +} + +// Tiny RAII wrapper around a raw byte buffer used for cuSPARSE SpMV temp +// storage. Sized lazily; never shrunk. +class ByteBuf { +public: + ByteBuf() = default; + ~ByteBuf() { free(); } + ByteBuf(const ByteBuf &) = delete; + ByteBuf &operator=(const ByteBuf &) = delete; + ByteBuf(ByteBuf &&o) noexcept : ptr_(o.ptr_), bytes_(o.bytes_) { + o.ptr_ = nullptr; + o.bytes_ = 0; + } + ByteBuf &operator=(ByteBuf &&o) noexcept { + if (this != &o) { + free(); + ptr_ = o.ptr_; + bytes_ = o.bytes_; + o.ptr_ = nullptr; + o.bytes_ = 0; + } + return *this; + } + + void ensure(std::size_t bytes) { + if (bytes <= bytes_) + return; + free(); + if (bytes > 0) { + BREAKAGE_CUDA_CHECK(cudaMalloc(&ptr_, bytes)); + bytes_ = bytes; + } + } + void *data() { return ptr_; } + std::size_t size() const { return bytes_; } + +private: + void free() { + if (ptr_) { + cudaFree(ptr_); + ptr_ = nullptr; + bytes_ = 0; + } + } + void *ptr_{nullptr}; + std::size_t bytes_{0}; +}; + +} // namespace + +// --------------------------------------------------------------------------- +// Stage: one ADMM run (P, C constant; q, l, u may change every solve). 
+// +// Stage owns: +// * the constant matrices P, C, C^T (CSR, on device) +// * a pre-allocated SpMV byte buffer per matrix +// * warm-start state (x, z, y) plus the current rho +// * scratch device buffers for the ADMM loop and the inner CG +// --------------------------------------------------------------------------- + +class Stage { +public: + // Builds device-side CSR copies of P (full symmetric) and C from Eigen + // sparse host matrices, allocates all working buffers. Does NOT initialise + // warm-start state - that happens lazily on the first solve. + void setup(SparseMatrixCSC P_full, SparseMatrixCSC C, const Settings &s); + + // Runs ADMM until convergence or max_iter. q/l/u are host vectors + // (re-uploaded each call). Output warm-start lives in x_, z_, y_ and is + // also exposed via the const accessors below. + StageInfo solve(const VectorXd &q_host, const VectorXd &l_host, + const VectorXd &u_host, cudaStream_t stream); + + // Drops warm-start state; next solve starts cold and resets rho to + // settings_.rho_init. + void reset_warm_start() { has_warm_ = false; } + + int n() const { return n_; } + int m() const { return m_; } + + const double *x_device() const { return x_.data(); } + const double *z_device() const { return z_.data(); } + const double *y_device() const { return y_.data(); } + +private: + // Apply M = P + sigma I + rho C^T C to x (n elements) and write into Mx. + // Uses scratch_m_ as the C*x scratchpad. Both x and Mx must be size n_. + void apply_M(const double *x_d, double *Mx_d, double rho, + cudaStream_t stream); + + // Solve M xt = rhs via CG, warm-started from xt_d (in-place). Returns the + // number of CG iterations consumed (informational only). + int solve_kkt_cg(const double *rhs_d, double *xt_d, double rho, + cudaStream_t stream); + + Settings settings_{}; + + int n_{0}; + int m_{0}; + + DeviceCsr P_; + DeviceCsr C_; + DeviceCsr Ct_; + + ByteBuf spmv_buf_P_; + ByteBuf spmv_buf_C_; + ByteBuf spmv_buf_Ct_; + + // Warm-start. 
+ DeviceBuffer x_; // size n_ + DeviceBuffer z_; // size m_ + DeviceBuffer y_; // size m_ + bool has_warm_{false}; + double rho_{0.1}; + + // Per-call uploads. + DeviceBuffer q_d_; // size n_ + DeviceBuffer l_d_; // size m_ + DeviceBuffer u_d_; // size m_ + + // ADMM scratch buffers. + DeviceBuffer xt_; // size n_ + DeviceBuffer zt_; // size m_ + DeviceBuffer xt_alpha_; // size n_ + DeviceBuffer zt_alpha_; // size m_ + DeviceBuffer rhs_; // size n_ + DeviceBuffer scratch_n_; // size n_ + DeviceBuffer scratch_m_; // size m_ + + // Convergence-check scratch. + DeviceBuffer r_prim_; // size m_ + DeviceBuffer r_dual_; // size n_ + + // CG scratch. + DeviceBuffer cg_r_; // size n_ + DeviceBuffer cg_p_; // size n_ + DeviceBuffer cg_Ap_; // size n_ +}; + +void Stage::setup(SparseMatrixCSC P_full, SparseMatrixCSC C, + const Settings &s) { + settings_ = s; + n_ = static_cast(P_full.rows()); + m_ = static_cast(C.rows()); + if (P_full.cols() != n_) { + throw std::invalid_argument("Stage::setup: P must be square"); + } + if (C.cols() != n_) { + throw std::invalid_argument("Stage::setup: C cols mismatch"); + } + + if (!P_full.isCompressed()) + P_full.makeCompressed(); + if (!C.isCompressed()) + C.makeCompressed(); + + upload_csc_as_csr(P_full, P_); + upload_csc_as_csr(C, C_); + + // C^T as a separate CSR. cuSPARSE supports SpMV with op=TRANSPOSE on a CSR + // input, but it is consistently slower than non-transpose, so we pay the + // one-time host transpose to get faster inner-loop SpMVs. + SparseMatrixCSC Ct = SparseMatrixCSC(C.transpose()); + Ct.makeCompressed(); + upload_csc_as_csr(Ct, Ct_); + + // Pre-size SpMV buffers. m_ or n_ may be zero in degenerate problems, in + // which case spmv_buffer_size still works (cuSPARSE handles empty mats). 
+ spmv_buf_P_.ensure(cuda_la::spmv_buffer_size(P_, false)); + spmv_buf_C_.ensure(cuda_la::spmv_buffer_size(C_, false)); + spmv_buf_Ct_.ensure(cuda_la::spmv_buffer_size(Ct_, false)); + + x_.resize(static_cast(n_)); + z_.resize(static_cast(m_)); + y_.resize(static_cast(m_)); + q_d_.resize(static_cast(n_)); + l_d_.resize(static_cast(m_)); + u_d_.resize(static_cast(m_)); + + xt_.resize(static_cast(n_)); + zt_.resize(static_cast(m_)); + xt_alpha_.resize(static_cast(n_)); + zt_alpha_.resize(static_cast(m_)); + rhs_.resize(static_cast(n_)); + scratch_n_.resize(static_cast(n_)); + scratch_m_.resize(static_cast(m_)); + + r_prim_.resize(static_cast(m_)); + r_dual_.resize(static_cast(n_)); + + cg_r_.resize(static_cast(n_)); + cg_p_.resize(static_cast(n_)); + cg_Ap_.resize(static_cast(n_)); + + has_warm_ = false; + rho_ = settings_.rho_init; +} + +void Stage::apply_M(const double *x_d, double *Mx_d, double rho, + cudaStream_t stream) { + // Mx = P*x + cuda_la::spmv(P_, false, 1.0, x_d, 0.0, Mx_d, spmv_buf_P_.data(), + spmv_buf_P_.size(), stream); + // Mx += sigma * x (axpy: y += alpha * x) + cuda_la::axpy(n_, settings_.sigma, x_d, Mx_d, stream); + if (m_ > 0) { + // scratch_m = C * x + cuda_la::spmv(C_, false, 1.0, x_d, 0.0, scratch_m_.data(), + spmv_buf_C_.data(), spmv_buf_C_.size(), stream); + // Mx += rho * C^T * scratch_m + cuda_la::spmv(Ct_, false, rho, scratch_m_.data(), 1.0, Mx_d, + spmv_buf_Ct_.data(), spmv_buf_Ct_.size(), stream); + } +} + +int Stage::solve_kkt_cg(const double *rhs_d, double *xt_d, double rho, + cudaStream_t stream) { + // Initial residual r = rhs - M*xt (xt is the warm-start input). + // We reuse cg_Ap_ as the temporary for M*xt. 
+ apply_M(xt_d, cg_Ap_.data(), rho, stream); + axby_kernel<<>>( + n_, 1.0, rhs_d, -1.0, cg_Ap_.data(), cg_r_.data()); + // p = r + cuda_la::copy(n_, cg_r_.data(), cg_p_.data(), stream); + + double rs_old = cuda_la::dot(n_, cg_r_.data(), cg_r_.data(), stream); + // Tolerance is relative to the rhs norm so problems of different scale + // all get the same relative accuracy. + double rhs_norm = cuda_la::nrm2(n_, rhs_d, stream); + double tol_sq = (kInnerCgTol * std::max(rhs_norm, 1e-30)); + tol_sq = tol_sq * tol_sq; + if (rs_old <= tol_sq) { + return 0; + } + + for (int it = 0; it < kInnerCgMaxIter; ++it) { + // cg_Ap = M * cg_p + apply_M(cg_p_.data(), cg_Ap_.data(), rho, stream); + double pAp = cuda_la::dot(n_, cg_p_.data(), cg_Ap_.data(), stream); + if (!(pAp > 0.0)) { + // Numerical breakdown - bail out with whatever we have. + return it; + } + double alpha = rs_old / pAp; + // xt += alpha * p + cuda_la::axpy(n_, alpha, cg_p_.data(), xt_d, stream); + // r -= alpha * Ap + cuda_la::axpy(n_, -alpha, cg_Ap_.data(), cg_r_.data(), stream); + + double rs_new = cuda_la::dot(n_, cg_r_.data(), cg_r_.data(), stream); + if (rs_new <= tol_sq) { + return it + 1; + } + double beta = rs_new / rs_old; + // p = r + beta * p (in-place: p = beta * p + 1.0 * r) + axpby_inplace_kernel<<>>( + n_, 1.0, cg_r_.data(), beta, cg_p_.data()); + rs_old = rs_new; + } + return kInnerCgMaxIter; +} + +StageInfo Stage::solve(const VectorXd &q_host, const VectorXd &l_host, + const VectorXd &u_host, cudaStream_t stream) { + if (q_host.size() != n_) { + throw std::invalid_argument("Stage::solve: q size mismatch"); + } + if (l_host.size() != m_ || u_host.size() != m_) { + throw std::invalid_argument("Stage::solve: l/u size mismatch"); + } + + auto t_start = std::chrono::steady_clock::now(); + + // Upload q, l, u (re-uploaded every call). 
+ if (n_ > 0) { + q_d_.copy_from_host(q_host.data(), static_cast(n_)); + } + if (m_ > 0) { + l_d_.copy_from_host(l_host.data(), static_cast(m_)); + u_d_.copy_from_host(u_host.data(), static_cast(m_)); + } + + // Cold-start state. + if (!has_warm_) { + if (n_ > 0) + x_.zero(); + if (m_ > 0) { + z_.zero(); + y_.zero(); + } + rho_ = settings_.rho_init; + } + + StageInfo info; + info.rho = rho_; + + bool converged = false; + int iter = 0; + double prim_res = std::numeric_limits::quiet_NaN(); + double dual_res = std::numeric_limits::quiet_NaN(); + double q_norm_inf = + (n_ > 0) ? cuda_la::inf_norm(n_, q_d_.data(), stream) : 0.0; + + for (iter = 0; iter < settings_.max_iter; ++iter) { + // 1. Build rhs = sigma * x_k - q + C^T * (rho * z_k - y_k) + if (m_ > 0) { + // scratch_m = rho * z_k - y_k + rho_z_minus_y_kernel<<>>( + m_, rho_, z_.data(), y_.data(), scratch_m_.data()); + // scratch_n = C^T * scratch_m + cuda_la::spmv(Ct_, false, 1.0, scratch_m_.data(), 0.0, scratch_n_.data(), + spmv_buf_Ct_.data(), spmv_buf_Ct_.size(), stream); + } else if (n_ > 0) { + scratch_n_.zero(); + } + if (n_ > 0) { + rhs_assemble_kernel<<>>( + n_, settings_.sigma, x_.data(), q_d_.data(), scratch_n_.data(), + rhs_.data()); + } + + // 2. Solve M xt = rhs via CG, warm-started from x_k. + if (n_ > 0) { + cuda_la::copy(n_, x_.data(), xt_.data(), stream); + solve_kkt_cg(rhs_.data(), xt_.data(), rho_, stream); + } + + // 3. zt = C * xt (recovered from the reduced KKT) + if (m_ > 0) { + cuda_la::spmv(C_, false, 1.0, xt_.data(), 0.0, zt_.data(), + spmv_buf_C_.data(), spmv_buf_C_.size(), stream); + } + + // 4. Over-relaxation: + // xt_alpha = alpha * xt + (1 - alpha) * x_k + // zt_alpha = alpha * zt + (1 - alpha) * z_k + if (n_ > 0) { + axby_kernel<<>>( + n_, settings_.alpha, xt_.data(), 1.0 - settings_.alpha, x_.data(), + xt_alpha_.data()); + } + if (m_ > 0) { + axby_kernel<<>>( + m_, settings_.alpha, zt_.data(), 1.0 - settings_.alpha, z_.data(), + zt_alpha_.data()); + + // 5. 
z_{k+1} = clamp(zt_alpha + y_k / rho, l, u) (into scratch_m) + double inv_rho = 1.0 / rho_; + z_update_kernel<<>>( + m_, inv_rho, zt_alpha_.data(), y_.data(), l_d_.data(), u_d_.data(), + scratch_m_.data()); + + // 6. y += rho * (zt_alpha - z_{k+1}) + y_update_kernel<<>>( + m_, rho_, zt_alpha_.data(), scratch_m_.data(), y_.data()); + + // Promote z_{k+1} into z_ (z_ <- scratch_m). + cuda_la::copy(m_, scratch_m_.data(), z_.data(), stream); + } + + // 7. x_{k+1} = xt_alpha + if (n_ > 0) { + cuda_la::copy(n_, xt_alpha_.data(), x_.data(), stream); + } + + // 8. Periodic convergence + rho-update check. + bool last_iter = (iter + 1 == settings_.max_iter); + if (((iter + 1) % std::max(1, settings_.check_every) == 0) || last_iter) { + // Compute fresh A x_{k+1}, P x_{k+1}, A^T y_{k+1} for the residual + // checks. These cost three SpMVs but only fire occasionally. + double Ax_norm_inf = 0.0; + double z_norm_inf = 0.0; + double Px_norm_inf = 0.0; + double Aty_norm_inf = 0.0; + + if (m_ > 0) { + // scratch_m = A * x + cuda_la::spmv(C_, false, 1.0, x_.data(), 0.0, scratch_m_.data(), + spmv_buf_C_.data(), spmv_buf_C_.size(), stream); + Ax_norm_inf = cuda_la::inf_norm(m_, scratch_m_.data(), stream); + // r_prim = scratch_m - z + axby_kernel<<>>( + m_, 1.0, scratch_m_.data(), -1.0, z_.data(), r_prim_.data()); + prim_res = cuda_la::inf_norm(m_, r_prim_.data(), stream); + z_norm_inf = cuda_la::inf_norm(m_, z_.data(), stream); + } else { + prim_res = 0.0; + } + + if (n_ > 0) { + // scratch_n = P * x + cuda_la::spmv(P_, false, 1.0, x_.data(), 0.0, scratch_n_.data(), + spmv_buf_P_.data(), spmv_buf_P_.size(), stream); + Px_norm_inf = cuda_la::inf_norm(n_, scratch_n_.data(), stream); + + // r_dual = scratch_n + q + axby_kernel<<>>( + n_, 1.0, scratch_n_.data(), 1.0, q_d_.data(), r_dual_.data()); + + if (m_ > 0) { + // r_dual += C^T y (uses scratch_n as a temporary) + cuda_la::spmv(Ct_, false, 1.0, y_.data(), 0.0, scratch_n_.data(), + spmv_buf_Ct_.data(), spmv_buf_Ct_.size(), stream); + 
Aty_norm_inf = cuda_la::inf_norm(n_, scratch_n_.data(), stream); + cuda_la::axpy(n_, 1.0, scratch_n_.data(), r_dual_.data(), stream); + } + dual_res = cuda_la::inf_norm(n_, r_dual_.data(), stream); + } else { + dual_res = 0.0; + } + + double prim_tol = settings_.eps_abs + + settings_.eps_rel * + std::max(Ax_norm_inf, z_norm_inf); + double dual_tol = settings_.eps_abs + + settings_.eps_rel * + std::max({Px_norm_inf, q_norm_inf, Aty_norm_inf}); + + info.prim_res = prim_res; + info.dual_res = dual_res; + info.iter = iter + 1; + + if (prim_res <= prim_tol && dual_res <= dual_tol) { + converged = true; + break; + } + + // Adaptive rho. The implicit-M inner solver does not need a refactor + // when rho changes - we only update the scalar. + if (settings_.adaptive_rho) { + double prim_scale = std::max({Ax_norm_inf, z_norm_inf, 1e-30}); + double dual_scale = + std::max({Px_norm_inf, q_norm_inf, Aty_norm_inf, 1e-30}); + double prim_norm_rel = prim_res / prim_scale; + double dual_norm_rel = dual_res / dual_scale; + if (prim_norm_rel > 0.0 && dual_norm_rel > 0.0) { + double rho_new = rho_ * std::sqrt(prim_norm_rel / dual_norm_rel); + rho_new = std::clamp(rho_new, kRhoMin, kRhoMax); + double ratio = rho_new / rho_; + if (ratio > settings_.adaptive_rho_factor || + ratio < 1.0 / settings_.adaptive_rho_factor) { + rho_ = rho_new; + info.rho = rho_; + } + } + } + } + } + + has_warm_ = true; + info.converged = converged; + info.rho = rho_; + if (info.iter == 0) { + info.iter = iter; + } + + auto t_end = std::chrono::steady_clock::now(); + info.solve_time_s = std::chrono::duration(t_end - t_start).count(); + return info; +} + +// --------------------------------------------------------------------------- +// Helpers for building stage-specific Eigen sparse matrices on the host. +// These mirror the OsqpSolver::build_*_ helpers in osqp.cppm. 
+// --------------------------------------------------------------------------- + +namespace { + +using Triplet = Eigen::Triplet; + +// P_prj: diag(0 ... 0 (nx zeros), 1 ... 1 (me ones)) of size (nx+me, nx+me). +SparseMatrixCSC build_P_prj(int nx, int me) { + std::vector trips; + trips.reserve(static_cast(me)); + for (int i = 0; i < me; ++i) { + trips.emplace_back(nx + i, nx + i, 1.0); + } + SparseMatrixCSC P(nx + me, nx + me); + P.setFromTriplets(trips.begin(), trips.end()); + P.makeCompressed(); + return P; +} + +// C_prj: [A | -I_me] over the top me rows, [G | 0] underneath. +SparseMatrixCSC build_C_prj(const SparseMatrixCSC &A, const SparseMatrixCSC &G, + int nx, int me, int mi) { + std::vector trips; + trips.reserve(static_cast(A.nonZeros() + G.nonZeros() + me)); + for (int k = 0; k < A.outerSize(); ++k) { + for (SparseMatrixCSC::InnerIterator it(A, k); it; ++it) { + trips.emplace_back(static_cast(it.row()), + static_cast(it.col()), it.value()); + } + } + for (int k = 0; k < G.outerSize(); ++k) { + for (SparseMatrixCSC::InnerIterator it(G, k); it; ++it) { + trips.emplace_back(me + static_cast(it.row()), + static_cast(it.col()), it.value()); + } + } + for (int i = 0; i < me; ++i) { + trips.emplace_back(i, nx + i, -1.0); + } + SparseMatrixCSC C(me + mi, nx + me); + C.setFromTriplets(trips.begin(), trips.end()); + C.makeCompressed(); + return C; +} + +// P_rlx: diag(rel_x_reg * I_nx, I_nv). +SparseMatrixCSC build_P_rlx(int nx, int nv) { + std::vector trips; + trips.reserve(static_cast(nx + nv)); + for (int i = 0; i < nx; ++i) { + trips.emplace_back(i, i, kRelXReg); + } + for (int i = 0; i < nv; ++i) { + trips.emplace_back(nx + i, nx + i, 1.0); + } + SparseMatrixCSC P(nx + nv, nx + nv); + P.setFromTriplets(trips.begin(), trips.end()); + P.makeCompressed(); + return P; +} + +// C_rlx: [ A 0 ; G 0 ; H -V ; 0 I_nv ]. 
+SparseMatrixCSC build_C_rlx(const SparseMatrixCSC &A, + const SparseMatrixCSC &G, + const SparseMatrixCSC &H, + const SparseMatrixCSC &V, int nx, int me, int mi, + int mh, int nv) { + std::vector trips; + trips.reserve(static_cast(A.nonZeros() + G.nonZeros() + + H.nonZeros() + V.nonZeros() + nv)); + for (int k = 0; k < A.outerSize(); ++k) { + for (SparseMatrixCSC::InnerIterator it(A, k); it; ++it) { + trips.emplace_back(static_cast(it.row()), + static_cast(it.col()), it.value()); + } + } + for (int k = 0; k < G.outerSize(); ++k) { + for (SparseMatrixCSC::InnerIterator it(G, k); it; ++it) { + trips.emplace_back(me + static_cast(it.row()), + static_cast(it.col()), it.value()); + } + } + for (int k = 0; k < H.outerSize(); ++k) { + for (SparseMatrixCSC::InnerIterator it(H, k); it; ++it) { + trips.emplace_back(me + mi + static_cast(it.row()), + static_cast(it.col()), it.value()); + } + } + for (int k = 0; k < V.outerSize(); ++k) { + for (SparseMatrixCSC::InnerIterator it(V, k); it; ++it) { + trips.emplace_back(me + mi + static_cast(it.row()), + nx + static_cast(it.col()), -it.value()); + } + } + for (int i = 0; i < nv; ++i) { + trips.emplace_back(me + mi + mh + i, nx + i, 1.0); + } + SparseMatrixCSC C(me + mi + mh + nv, nx + nv); + C.setFromTriplets(trips.begin(), trips.end()); + C.makeCompressed(); + return C; +} + +// P_opt: full symmetric Q. We don't restrict to the upper triangle here +// (osqp.cppm does for the OSQP front-end, but the inner ADMM works on the +// symmetric version directly). +SparseMatrixCSC build_P_opt(const SparseMatrixCSC &Q) { + SparseMatrixCSC P = Q; + P.makeCompressed(); + return P; +} + +// C_opt: [ A ; G ; H ]. 
+SparseMatrixCSC build_C_opt(const SparseMatrixCSC &A, + const SparseMatrixCSC &G, + const SparseMatrixCSC &H, int nx, int me, int mi, + int mh) { + std::vector trips; + trips.reserve(static_cast(A.nonZeros() + G.nonZeros() + + H.nonZeros())); + for (int k = 0; k < A.outerSize(); ++k) { + for (SparseMatrixCSC::InnerIterator it(A, k); it; ++it) { + trips.emplace_back(static_cast(it.row()), + static_cast(it.col()), it.value()); + } + } + for (int k = 0; k < G.outerSize(); ++k) { + for (SparseMatrixCSC::InnerIterator it(G, k); it; ++it) { + trips.emplace_back(me + static_cast(it.row()), + static_cast(it.col()), it.value()); + } + } + for (int k = 0; k < H.outerSize(); ++k) { + for (SparseMatrixCSC::InnerIterator it(H, k); it; ++it) { + trips.emplace_back(me + mi + static_cast(it.row()), + static_cast(it.col()), it.value()); + } + } + SparseMatrixCSC C(me + mi + mh, nx); + C.setFromTriplets(trips.begin(), trips.end()); + C.makeCompressed(); + return C; +} + +// --------------------------------------------------------------------------- +// Side-allocated impl structs. +// +// Solver and Pipeline have fixed member layouts in their public headers and +// we are not allowed to touch those headers. The extra device buffers we +// need (system A/V, scratch b, etc.) live in heap-allocated impl structs +// kept in process-local maps keyed by the owner pointer. This is a bit +// unusual but keeps the public API frozen. +// --------------------------------------------------------------------------- + +struct SolverImpl { + DeviceCsr A_dev; + ByteBuf spmv_buf_A; + // V is small and stays on the host: V * v_relax is done with Eigen. + SparseMatrixCSC V_host; + // Scratch for b/b_prj on device. + DeviceBuffer b_d; + DeviceBuffer b_prj_d; + // max(0, x_rlx.tail(nv)) staging buffer. 
+ DeviceBuffer v_relax_full_d; +}; + +std::unordered_map> & +solver_impl_map() { + static std::unordered_map> m; + return m; +} +std::mutex &solver_impl_mutex() { + static std::mutex m; + return m; +} +SolverImpl &solver_impl(const Solver *self) { + std::lock_guard lk(solver_impl_mutex()); + auto &m = solver_impl_map(); + auto it = m.find(self); + if (it == m.end()) { + auto [ins, _] = m.emplace(self, std::make_unique()); + return *ins->second; + } + return *it->second; +} +void erase_solver_impl(const Solver *self) { + std::lock_guard lk(solver_impl_mutex()); + solver_impl_map().erase(self); +} +void move_solver_impl(const Solver *from, const Solver *to) { + std::lock_guard lk(solver_impl_mutex()); + auto &m = solver_impl_map(); + auto it = m.find(from); + if (it != m.end()) { + auto ptr = std::move(it->second); + m.erase(it); + m[to] = std::move(ptr); + } +} + +struct PipelineImpl { + DeviceInput input; + DeviceBuffer v_W_prev_d; // (N, 3) row-major + DeviceBuffer L_prev_d; // (N, 3) row-major + std::vector rm_buf; // host scratch for row-major conversion +}; + +std::unordered_map> & +pipeline_impl_map() { + static std::unordered_map> m; + return m; +} +std::mutex &pipeline_impl_mutex() { + static std::mutex m; + return m; +} +PipelineImpl &pipeline_impl(const Pipeline *self) { + std::lock_guard lk(pipeline_impl_mutex()); + auto &m = pipeline_impl_map(); + auto it = m.find(self); + if (it == m.end()) { + auto [ins, _] = m.emplace(self, std::make_unique()); + return *ins->second; + } + return *it->second; +} +void erase_pipeline_impl(const Pipeline *self) { + std::lock_guard lk(pipeline_impl_mutex()); + pipeline_impl_map().erase(self); +} + +// Convert a host MatrixX3d (column-major Eigen) into a row-major flat array. 
+void to_row_major_N3(const MatrixX3d &src, std::vector &dst) { + const int N = static_cast(src.rows()); + dst.resize(static_cast(N) * 3); + for (int i = 0; i < N; ++i) { + dst[3 * i + 0] = src(i, 0); + dst[3 * i + 1] = src(i, 1); + dst[3 * i + 2] = src(i, 2); + } +} + +// Convert a row-major flat array back into a column-major MatrixX3d. +void from_row_major_N3(const std::vector &src, MatrixX3d &dst, int N) { + dst.resize(N, 3); + for (int i = 0; i < N; ++i) { + dst(i, 0) = src[3 * i + 0]; + dst(i, 1) = src[3 * i + 1]; + dst(i, 2) = src[3 * i + 2]; + } +} + +} // namespace + +// --------------------------------------------------------------------------- +// Solver +// --------------------------------------------------------------------------- + +Solver::Solver() = default; + +Solver::~Solver() { erase_solver_impl(this); } + +Solver::Solver(Solver &&o) noexcept + : prj_(std::move(o.prj_)), rlx_(std::move(o.rlx_)), + opt_(std::move(o.opt_)), settings_(o.settings_), nx_(o.nx_), me_(o.me_), + mi_(o.mi_), mh_(o.mh_), nv_(o.nv_) { + move_solver_impl(&o, this); +} + +Solver &Solver::operator=(Solver &&o) noexcept { + if (this != &o) { + erase_solver_impl(this); + prj_ = std::move(o.prj_); + rlx_ = std::move(o.rlx_); + opt_ = std::move(o.opt_); + settings_ = o.settings_; + nx_ = o.nx_; + me_ = o.me_; + mi_ = o.mi_; + mh_ = o.mh_; + nv_ = o.nv_; + move_solver_impl(&o, this); + } + return *this; +} + +int Solver::num_vars() const { return nx_; } +int Solver::num_eq() const { return me_; } +int Solver::num_clutches() const { return nv_; } + +void Solver::reset_warm_start() { + if (prj_) + prj_->reset_warm_start(); + if (rlx_) + rlx_->reset_warm_start(); + if (opt_) + opt_->reset_warm_start(); +} + +void Solver::setup(const HostSystem &host, const DeviceSystem & /*dev*/, + const Settings &settings) { + settings_ = settings; + nx_ = host.num_vars; + me_ = host.num_eq; + mi_ = host.num_ineq; + mh_ = host.num_relaxed_ineq; + nv_ = host.num_clutches; + + if (host.A.rows() != me_ 
|| host.A.cols() != nx_ || host.G.rows() != mi_ || + host.G.cols() != nx_ || host.H.rows() != mh_ || host.H.cols() != nx_ || + host.V.rows() != mh_ || host.V.cols() != nv_ || host.Q.rows() != nx_ || + host.Q.cols() != nx_) { + throw std::invalid_argument( + "Solver::setup: HostSystem matrix shape mismatch"); + } + + // Build host stage matrices (CSC, Eigen). + SparseMatrixCSC P_prj = build_P_prj(nx_, me_); + SparseMatrixCSC C_prj = build_C_prj(host.A, host.G, nx_, me_, mi_); + SparseMatrixCSC P_rlx = build_P_rlx(nx_, nv_); + SparseMatrixCSC C_rlx = + build_C_rlx(host.A, host.G, host.H, host.V, nx_, me_, mi_, mh_, nv_); + SparseMatrixCSC P_opt = build_P_opt(host.Q); + SparseMatrixCSC C_opt = + build_C_opt(host.A, host.G, host.H, nx_, me_, mi_, mh_); + + prj_ = std::make_unique(); + rlx_ = std::make_unique(); + opt_ = std::make_unique(); + prj_->setup(std::move(P_prj), std::move(C_prj), settings_); + rlx_->setup(std::move(P_rlx), std::move(C_rlx), settings_); + opt_->setup(std::move(P_opt), std::move(C_opt), settings_); + + // Per-Solver impl: A on device for b_prj, V on host for V*v_relax. 
+ SolverImpl &impl = solver_impl(this); + SparseMatrixCSC A_csc = host.A; + if (!A_csc.isCompressed()) + A_csc.makeCompressed(); + upload_csc_as_csr(A_csc, impl.A_dev); + impl.spmv_buf_A.ensure(cuda_la::spmv_buffer_size(impl.A_dev, false)); + impl.V_host = host.V; + if (!impl.V_host.isCompressed()) + impl.V_host.makeCompressed(); + impl.b_d.resize(static_cast(me_)); + impl.b_prj_d.resize(static_cast(me_)); + impl.v_relax_full_d.resize(static_cast(nv_)); +} + +SolveInfo Solver::solve(const VectorXd &b, double *x_d, double *slack_d, + double *v_relax_d, cudaStream_t stream) { + if (b.size() != me_) { + throw std::invalid_argument("Solver::solve: b size mismatch"); + } + if (!prj_ || !rlx_ || !opt_) { + throw std::runtime_error("Solver::solve: setup() not called"); + } + SolveInfo info; + SolverImpl &impl = solver_impl(this); + if (me_ > 0) { + impl.b_d.copy_from_host(b.data(), static_cast(me_)); + } + + // -------- Stage 1: projection -------- + { + VectorXd q_prj(nx_ + me_); + q_prj.head(nx_).setZero(); + q_prj.tail(me_) = -b; + VectorXd l_prj = VectorXd::Zero(me_ + mi_); + VectorXd u_prj(me_ + mi_); + u_prj.head(me_).setZero(); + u_prj.tail(mi_).setConstant(settings_.infinity); + + info.prj = prj_->solve(q_prj, l_prj, u_prj, stream); + } + + // b_prj = A * x_prj.head(nx) (SpMV on device, then download) + VectorXd b_prj_host(me_); + if (me_ > 0) { + cuda_la::spmv(impl.A_dev, false, 1.0, prj_->x_device(), 0.0, + impl.b_prj_d.data(), impl.spmv_buf_A.data(), + impl.spmv_buf_A.size(), stream); + BREAKAGE_CUDA_CHECK(cudaMemcpyAsync(b_prj_host.data(), impl.b_prj_d.data(), + me_ * sizeof(double), + cudaMemcpyDeviceToHost, stream)); + } + + // slack_d = b - A * x_prj.head(nx) = b - b_prj + if (slack_d != nullptr && me_ > 0) { + BREAKAGE_CUDA_CHECK(cudaMemcpyAsync(slack_d, impl.b_d.data(), + me_ * sizeof(double), + cudaMemcpyDeviceToDevice, stream)); + cuda_la::axpy(me_, -1.0, impl.b_prj_d.data(), slack_d, stream); + } + + // Make sure b_prj_host is populated before we use 
it for stage 2 inputs. + BREAKAGE_CUDA_CHECK(cudaStreamSynchronize(stream)); + + // -------- Stage 2: relaxation -------- + { + VectorXd q_rlx = VectorXd::Zero(nx_ + nv_); + VectorXd l_rlx(me_ + mi_ + mh_ + nv_); + VectorXd u_rlx(me_ + mi_ + mh_ + nv_); + l_rlx.head(me_) = b_prj_host; + u_rlx.head(me_) = b_prj_host; + l_rlx.segment(me_, mi_).setConstant(-kEpsilon); + u_rlx.segment(me_, mi_).setConstant(settings_.infinity); + l_rlx.segment(me_ + mi_, mh_).setConstant(-settings_.infinity); + u_rlx.segment(me_ + mi_, mh_).setConstant(1.0 - kEpsilon); + l_rlx.tail(nv_).setZero(); + u_rlx.tail(nv_).setConstant(settings_.infinity); + + info.rlx = rlx_->solve(q_rlx, l_rlx, u_rlx, stream); + } + + // v_relax = max(0, x_rlx.tail(nv)) + VectorXd v_relax_host(nv_); + if (nv_ > 0) { + const double *x_rlx_tail = rlx_->x_device() + nx_; + relu_kernel<<>>( + nv_, x_rlx_tail, impl.v_relax_full_d.data()); + if (v_relax_d != nullptr) { + BREAKAGE_CUDA_CHECK(cudaMemcpyAsync(v_relax_d, + impl.v_relax_full_d.data(), + nv_ * sizeof(double), + cudaMemcpyDeviceToDevice, stream)); + } + BREAKAGE_CUDA_CHECK(cudaMemcpyAsync(v_relax_host.data(), + impl.v_relax_full_d.data(), + nv_ * sizeof(double), + cudaMemcpyDeviceToHost, stream)); + BREAKAGE_CUDA_CHECK(cudaStreamSynchronize(stream)); + } + + // -------- Stage 3: optimisation -------- + { + VectorXd q_opt = VectorXd::Zero(nx_); + VectorXd l_opt(me_ + mi_ + mh_); + VectorXd u_opt(me_ + mi_ + mh_); + l_opt.head(me_) = b_prj_host; + u_opt.head(me_) = b_prj_host; + l_opt.segment(me_, mi_).setConstant(-kEpsilon); + u_opt.segment(me_, mi_).setConstant(settings_.infinity); + l_opt.tail(mh_).setConstant(-settings_.infinity); + // Match osqp.cppm / ref_cpu.cpp: opt-stage H-row upper bound is 1.0 + // (NOT 1.0 - epsilon -- that one belongs to the relaxation stage). 
+ u_opt.tail(mh_).setConstant(1.0); + if (mh_ > 0 && nv_ > 0) { + VectorXd add = impl.V_host * v_relax_host; + u_opt.tail(mh_) += add; + } + + info.opt = opt_->solve(q_opt, l_opt, u_opt, stream); + } + + // Copy out x (size nx_). + if (x_d != nullptr && nx_ > 0) { + BREAKAGE_CUDA_CHECK(cudaMemcpyAsync(x_d, opt_->x_device(), + nx_ * sizeof(double), + cudaMemcpyDeviceToDevice, stream)); + } + + info.converged = + info.prj.converged && info.rlx.converged && info.opt.converged; + return info; +} + +// --------------------------------------------------------------------------- +// Pipeline - end-to-end driver mirroring BreakageChecker::solve. +// --------------------------------------------------------------------------- + +Pipeline::Pipeline() = default; +Pipeline::~Pipeline() { erase_pipeline_impl(this); } + +void Pipeline::setup(const HostSystem &host, const Thresholds &thr, + const Settings &settings) { + host_ = host; + thr_ = thr; + upload_system(host_, dev_); + qp_.setup(host_, dev_, settings); + + const int N = host_.num_parts; + v_W_curr_.resize(static_cast(N) * 3); + L_curr_.resize(static_cast(N) * 3); + b_d_.resize(static_cast(N) * 6); + x_d_.resize(static_cast(host_.num_vars)); + x_unwhiten_d_.resize(static_cast(host_.num_vars)); + slack_d_.resize(static_cast(host_.num_eq)); + v_relax_d_.resize(static_cast(host_.num_clutches)); + utilization_d_.resize(static_cast(host_.num_clutches)); + + PipelineImpl &impl = pipeline_impl(this); + impl.v_W_prev_d.resize(static_cast(N) * 3); + impl.L_prev_d.resize(static_cast(N) * 3); +} + +Solution Pipeline::solve(const HostInput &in, HostState &state, + cudaStream_t stream) { + Solution sol; + PipelineImpl &impl = pipeline_impl(this); + + const int N = host_.num_parts; + const int K = host_.num_clutches; + const int ncv = host_.num_contact_vertices; + const int ncap = static_cast(host_.capacity_clutch_indices.size()); + const int num_vars = host_.num_vars; + const int me = host_.num_eq; + + // 0. 
Upload input + previous state. We always re-upload v_W_prev and + // L_prev from HostState in case the user mutated them between solves. + upload_input(in, impl.input); + + if (state.v_W_prev.rows() != N || state.L_prev.rows() != N) { + throw std::invalid_argument("Pipeline::solve: HostState dim mismatch"); + } + to_row_major_N3(state.v_W_prev, impl.rm_buf); + impl.v_W_prev_d.copy_from_host(impl.rm_buf.data(), + static_cast(N) * 3); + to_row_major_N3(state.L_prev, impl.rm_buf); + impl.L_prev_d.copy_from_host(impl.rm_buf.data(), + static_cast(N) * 3); + + // 1. fit_se3 -> q_W_CC, t_W_CC (host scalars). + double q_W_CC[4]; + double t_W_CC[3]; + cuda_kernels::fit_se3(N, dev_.q_CC.data(), dev_.c_CC.data(), + impl.input.q.data(), impl.input.c.data(), + dev_.mass.data(), dev_.total_mass, + dev_.L0 * dev_.L0 / 2.0, q_W_CC, t_W_CC, stream); + + // 2. fit_twist -> w0/v0 (host) + v_W_curr (device). + // v0 is computed but unused by the rest of the pipeline (matches the CPU + // baseline in breakage.cppm: only w0 feeds compute_L and v_W feeds b). + double w0[3]; + double v0[3]; + cuda_kernels::fit_twist(N, impl.input.w.data(), impl.input.v.data(), + dev_.c_CC.data(), q_W_CC, t_W_CC, dev_.mass.data(), + dev_.total_mass, dev_.L0 * dev_.L0, w0, v0, + v_W_curr_.data(), stream); + (void)v0; + + // 3. compute_Pi (host scalar work). Eigen::Quaterniond stores as (x,y,z,w). + double q_prev[4]; + q_prev[0] = state.q_W_CC_prev.x(); + q_prev[1] = state.q_W_CC_prev.y(); + q_prev[2] = state.q_W_CC_prev.z(); + q_prev[3] = state.q_W_CC_prev.w(); + double Pi[9]; + cuda_kernels::compute_Pi_host(q_prev, q_W_CC, Pi); + + // 4. compute_L on the device. + cuda_kernels::compute_L(N, dev_.I_CC.data(), q_W_CC, w0, L_curr_.data(), + stream); + + // 5. Assemble RHS b on the device. 
+ cuda_kernels::assemble_b(N, v_W_curr_.data(), impl.v_W_prev_d.data(), + L_curr_.data(), impl.L_prev_d.data(), + impl.input.J.data(), impl.input.H.data(), + dev_.mass.data(), Pi, in.dt, dev_.L0, b_d_.data(), + stream); + + // 6. Download b to host (small: 6*N doubles). + VectorXd b_host(6 * N); + if (N > 0) { + BREAKAGE_CUDA_CHECK(cudaMemcpyAsync(b_host.data(), b_d_.data(), + 6 * N * sizeof(double), + cudaMemcpyDeviceToHost, stream)); + BREAKAGE_CUDA_CHECK(cudaStreamSynchronize(stream)); + } + + // 7. QP solve. + sol.info = qp_.solve(b_host, x_d_.data(), slack_d_.data(), v_relax_d_.data(), + stream); + + // 8. slack_fraction = ||slack||_2 / max(||b||_2, floor) + double slack_norm = + (me > 0) ? cuda_la::nrm2(me, slack_d_.data(), stream) : 0.0; + double b_norm = b_host.norm(); + sol.slack_fraction = + slack_norm / std::max(b_norm, thr_.slack_fraction_b_floor); + + // 9. Postprocess: undo whitening + per-clutch utilisation. + if (sol.info.converged) { + cuda_kernels::postprocess(ncv, K, ncap, x_d_.data(), + dev_.capacity_clutch_indices.data(), + dev_.capacities.data(), dev_.clutch_whiten.data(), + x_unwhiten_d_.data(), utilization_d_.data(), + stream); + + sol.x.resize(num_vars); + sol.utilization.resize(K); + if (num_vars > 0) { + BREAKAGE_CUDA_CHECK(cudaMemcpyAsync(sol.x.data(), x_unwhiten_d_.data(), + num_vars * sizeof(double), + cudaMemcpyDeviceToHost, stream)); + } + if (K > 0) { + BREAKAGE_CUDA_CHECK(cudaMemcpyAsync(sol.utilization.data(), + utilization_d_.data(), + K * sizeof(double), + cudaMemcpyDeviceToHost, stream)); + } + BREAKAGE_CUDA_CHECK(cudaStreamSynchronize(stream)); + } + + // 10. Persist HostState (host-side copies of v_W_curr, L_curr, q_W_CC). + // Eigen::Quaterniond constructor takes (w, x, y, z). 
+ state.q_W_CC_prev = + Eigen::Quaterniond(q_W_CC[3], q_W_CC[0], q_W_CC[1], q_W_CC[2]); + + if (state.v_W_prev.rows() != N) { + state.v_W_prev.resize(N, 3); + } + if (state.L_prev.rows() != N) { + state.L_prev.resize(N, 3); + } + if (N > 0) { + impl.rm_buf.resize(static_cast(N) * 3); + BREAKAGE_CUDA_CHECK(cudaMemcpy(impl.rm_buf.data(), v_W_curr_.data(), + 3 * N * sizeof(double), + cudaMemcpyDeviceToHost)); + from_row_major_N3(impl.rm_buf, state.v_W_prev, N); + BREAKAGE_CUDA_CHECK(cudaMemcpy(impl.rm_buf.data(), L_curr_.data(), + 3 * N * sizeof(double), + cudaMemcpyDeviceToHost)); + from_row_major_N3(impl.rm_buf, state.L_prev, N); + } + + return sol; +} + +} // namespace breakage_cuda::cuda_qp diff --git a/native/cuda/breakage_cuda/src/ref_cpu.cpp b/native/cuda/breakage_cuda/src/ref_cpu.cpp new file mode 100644 index 0000000..76d2eeb --- /dev/null +++ b/native/cuda/breakage_cuda/src/ref_cpu.cpp @@ -0,0 +1,778 @@ +// SPDX-License-Identifier: MIT +// +// CPU reference implementation of the per-step BreakageChecker hot path. +// +// This file is a near-verbatim port of the algorithmic kernels in +// native/modules/bricksim/physx/breakage.cppm (fit_se3, fit_twist, +// so3_log_from_unit_quat, so3_jacobian_inv, compute_Pi, compute_L, +// the b-vector assembly and the unwhitening / utilization post-process) +// together with a hand-written re-implementation of the three-stage OSQP +// wrapper from native/modules/bricksim/physx/osqp.cppm. We deliberately +// avoid any bricksim C++26 module imports so this subproject stays +// self-contained. +// +// Constants (kSmallS, kSmallTheta, lambda regularizers, OSQP epsilon, +// rel_x_reg, rho overrides) are copied verbatim so that this baseline +// stays binary-compatible with the production BreakageChecker output up +// to OSQP's tolerance. 
+ +#include "breakage_cuda/ref_cpu.hpp" + +#include +#include + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace breakage_cuda { + +namespace { + +// Eigen 3.4 does not ship Vector3d::asSkewSymmetric() (which breakage.cppm +// uses); inline an equivalent here so the rest of the port can stay +// line-by-line faithful to the original. +inline Matrix3d skew(const Vector3d &v) { + Matrix3d m; + m << 0.0, -v.z(), v.y(), + v.z(), 0.0, -v.x(), + -v.y(), v.x(), 0.0; + return m; +} + +// Wrap an Eigen CSC sparse matrix as an OSQP CSC view. The OSQP matrix +// borrows the Eigen pointers; the Eigen sparse must outlive osqp_setup() +// (which copies the data into its workspace) but may be freed afterwards. +inline OSQPCscMatrix to_osqp_csc(const SparseMatrixCSC &mat) { + OSQPCscMatrix out{}; + out.m = static_cast(mat.rows()); + out.n = static_cast(mat.cols()); + out.p = const_cast(mat.outerIndexPtr()); + out.i = const_cast(mat.innerIndexPtr()); + out.x = const_cast(mat.valuePtr()); + out.nzmax = static_cast(mat.nonZeros()); + out.nz = -1; + out.owned = 0; + return out; +} + +inline void check_osqp(OSQPInt ret) { + if (ret != 0) { + const char *msg = osqp_error_message(ret); + throw std::runtime_error(std::string("osqp: ") + + (msg ? msg : "unknown error")); + } +} + +// Mirrors osqp.cppm's build_settings_(): verbose off, tight tolerances tied +// to the same epsilon used by the constraint slack, max_iter overridable +// from the OSQP_MAX_ITER environment variable to match the production code. 
+OSQPSettings make_settings(double epsilon) { + OSQPSettings s; + osqp_set_default_settings(&s); + s.verbose = 0; + s.max_iter = 10000; + s.eps_abs = epsilon; + s.eps_rel = epsilon; + if (auto *env = std::getenv("OSQP_MAX_ITER")) { + s.max_iter = std::stoi(env); + } + return s; +} + +void fill_stage(StageInfo &si, const OSQPInfo &info, double rho) { + si.converged = info.status_val == OSQP_SOLVED; + si.iter = static_cast(info.iter); + si.prim_res = info.prim_res; + si.dual_res = info.dual_res; + si.rho = rho; + si.solve_time_s = info.solve_time; +} + +} // namespace + +// --------------------------------------------------------------------------- +// InternalQpState (forward-declared in system.hpp). The CPU baseline keeps +// the cached stage matrices, the warm-start vectors, and the three OSQP +// solver handles here so the HostState payload can stay opaque. +// --------------------------------------------------------------------------- + +struct InternalQpState { + InternalQpState() = default; + explicit InternalQpState(const HostSystem &sys) { setup_from(sys); } + + // Captured dimensions (mirror osqp.cppm). + int nx{0}; // primary variables + int nv{0}; // relaxation slack variables (one per clutch) + int me{0}; // equality rows = 6 * num_parts + int mi{0}; // inequality rows + int mh{0}; // relaxed inequality rows + + // A/V are needed at solve time (b_prj = A * x_prj.head(nx); u_opt += V*v). + SparseMatrixCSC A; + SparseMatrixCSC V; + + // Per-stage matrices/vectors, built once per setup(). + SparseMatrixCSC P_prj, C_prj; + VectorXd l_prj, u_prj; + SparseMatrixCSC P_rlx, C_rlx; + VectorXd q_rlx; + SparseMatrixCSC P_opt, C_opt; + VectorXd q_opt; + + // OSQP tunables. epsilon mirrors osqp.cppm's default and honors the + // OSQP_EPSILON env var like the production solver does. + double epsilon{1e-4}; + double rel_x_reg{1e-4}; + + // Persisted across calls (warm starts + last solve outputs). 
+ bool has_state{false}; + VectorXd prj_sol, prj_dual; + double prj_rho{}; + VectorXd rlx_sol, rlx_dual; + double rlx_rho{}; + VectorXd opt_sol, opt_dual; + double opt_rho{}; + VectorXd slack; // b - A * x_prj.head(nx) + VectorXd v_relax; // max(0, rlx_sol.tail(nv)) + + // OSQP solver handles. Default-initialized to nullptr so the destructor + // is a no-op until something is actually built. + std::unique_ptr prj_solver{ + nullptr, &osqp_cleanup}; + std::unique_ptr rlx_solver{ + nullptr, &osqp_cleanup}; + std::unique_ptr opt_solver{ + nullptr, &osqp_cleanup}; + + void setup_from(const HostSystem &sys); + + // Linear cost / bound builders, parametrized by the current right-hand + // side. These mirror build_q_prj_ / build_l_rlx_ / ... in osqp.cppm. + VectorXd build_q_prj(const VectorXd &b) const { + VectorXd q(nx + me); + q.head(nx).setZero(); + q.tail(me) = -b; + return q; + } + + VectorXd build_l_rlx(const VectorXd &b_prj) const { + VectorXd l(me + mi + mh + nv); + l.head(me) = b_prj; + l.segment(me, mi).setConstant(-epsilon); + l.segment(me + mi, mh).setConstant(-OSQP_INFTY); + l.tail(nv).setZero(); + return l; + } + + VectorXd build_u_rlx(const VectorXd &b_prj) const { + VectorXd u(me + mi + mh + nv); + u.head(me) = b_prj; + u.segment(me, mi).setConstant(OSQP_INFTY); + u.segment(me + mi, mh).setConstant(1.0 - epsilon); + u.tail(nv).setConstant(OSQP_INFTY); + return u; + } + + VectorXd build_l_opt(const VectorXd &b_prj) const { + VectorXd l(me + mi + mh); + l.head(me) = b_prj; + l.segment(me, mi).setConstant(-epsilon); + l.tail(mh).setConstant(-OSQP_INFTY); + return l; + } + + VectorXd build_u_opt(const VectorXd &b_prj, const VectorXd &v) const { + VectorXd u(me + mi + mh); + u.head(me) = b_prj; + u.segment(me, mi).setConstant(OSQP_INFTY); + u.tail(mh).setConstant(1.0); + u.tail(mh) += V * v; + return u; + } +}; + +void InternalQpState::setup_from(const HostSystem &sys) { + using Trip = Eigen::Triplet; + + nx = static_cast(sys.Q.rows()); + nv = 
static_cast(sys.V.cols()); + me = static_cast(sys.A.rows()); + mi = static_cast(sys.G.rows()); + mh = static_cast(sys.H.rows()); + + if (auto *env = std::getenv("OSQP_EPSILON")) { + epsilon = std::stod(env); + } + + if (sys.Q.cols() != nx || sys.A.cols() != nx || sys.G.cols() != nx || + sys.H.cols() != nx || sys.V.rows() != mh) { + throw std::invalid_argument("InternalQpState: matrix dimension mismatch"); + } + + A = sys.A; + V = sys.V; + if (!A.isCompressed()) A.makeCompressed(); + if (!V.isCompressed()) V.makeCompressed(); + + // -- Stage 1 (projection) -------------------------------------------------- + // Variables [x; s] of size nx + me. Objective is 1/2 ||s||^2 only, so P_prj + // has identity on the s-block and zero on the x-block. + { + std::vector tr; + tr.reserve(static_cast(me)); + for (int i = 0; i < me; ++i) { + tr.emplace_back(nx + i, nx + i, 1.0); + } + P_prj.resize(nx + me, nx + me); + P_prj.setFromTriplets(tr.begin(), tr.end()); + P_prj.makeCompressed(); + } + // C_prj = [A -I; G 0]: the upper block expresses A x - s = -b (q tail); + // the lower block keeps Gx >= 0. + { + std::vector tr; + tr.reserve(static_cast(A.nonZeros() + sys.G.nonZeros() + me)); + for (int k = 0; k < A.outerSize(); ++k) { + for (SparseMatrixCSC::InnerIterator it(A, k); it; ++it) { + tr.emplace_back(it.row(), it.col(), it.value()); + } + } + for (int k = 0; k < sys.G.outerSize(); ++k) { + for (SparseMatrixCSC::InnerIterator it(sys.G, k); it; ++it) { + tr.emplace_back(me + it.row(), it.col(), it.value()); + } + } + for (int i = 0; i < me; ++i) { + tr.emplace_back(i, nx + i, -1.0); + } + C_prj.resize(me + mi, nx + me); + C_prj.setFromTriplets(tr.begin(), tr.end()); + C_prj.makeCompressed(); + } + l_prj = VectorXd::Zero(me + mi); + u_prj.resize(me + mi); + u_prj.head(me).setZero(); // equality rows: 0 <= ... <= 0 + u_prj.tail(mi).setConstant(OSQP_INFTY); // inequality rows: 0 <= ... 
<= +inf + + // -- Stage 2 (relaxation) -------------------------------------------------- + // Variables [x; v] of size nx + nv. P_rlx penalizes x with rel_x_reg and + // v with 1.0 (forces v small unless capacity constraints push it up). + { + std::vector tr; + tr.reserve(static_cast(nx + nv)); + for (int i = 0; i < nx; ++i) tr.emplace_back(i, i, rel_x_reg); + for (int i = 0; i < nv; ++i) tr.emplace_back(nx + i, nx + i, 1.0); + P_rlx.resize(nx + nv, nx + nv); + P_rlx.setFromTriplets(tr.begin(), tr.end()); + P_rlx.makeCompressed(); + } + // C_rlx = [A 0; G 0; H -V; 0 I] + { + std::vector tr; + tr.reserve(static_cast(A.nonZeros() + sys.G.nonZeros() + + sys.H.nonZeros() + V.nonZeros() + nv)); + for (int k = 0; k < A.outerSize(); ++k) { + for (SparseMatrixCSC::InnerIterator it(A, k); it; ++it) { + tr.emplace_back(it.row(), it.col(), it.value()); + } + } + for (int k = 0; k < sys.G.outerSize(); ++k) { + for (SparseMatrixCSC::InnerIterator it(sys.G, k); it; ++it) { + tr.emplace_back(me + it.row(), it.col(), it.value()); + } + } + for (int k = 0; k < sys.H.outerSize(); ++k) { + for (SparseMatrixCSC::InnerIterator it(sys.H, k); it; ++it) { + tr.emplace_back(me + mi + it.row(), it.col(), it.value()); + } + } + for (int k = 0; k < V.outerSize(); ++k) { + for (SparseMatrixCSC::InnerIterator it(V, k); it; ++it) { + tr.emplace_back(me + mi + it.row(), nx + it.col(), -it.value()); + } + } + for (int i = 0; i < nv; ++i) { + tr.emplace_back(me + mi + mh + i, nx + i, 1.0); + } + C_rlx.resize(me + mi + mh + nv, nx + nv); + C_rlx.setFromTriplets(tr.begin(), tr.end()); + C_rlx.makeCompressed(); + } + q_rlx = VectorXd::Zero(nx + nv); + + // -- Stage 3 (optimization) ------------------------------------------------ + // OSQP wants the upper triangle of the Hessian only. 
+ { + std::vector tr; + tr.reserve(static_cast(sys.Q.nonZeros())); + for (int k = 0; k < sys.Q.outerSize(); ++k) { + for (SparseMatrixCSC::InnerIterator it(sys.Q, k); it; ++it) { + if (it.row() <= it.col()) { + tr.emplace_back(it.row(), it.col(), it.value()); + } + } + } + P_opt.resize(nx, nx); + P_opt.setFromTriplets(tr.begin(), tr.end()); + P_opt.makeCompressed(); + } + // C_opt = [A; G; H] + { + std::vector tr; + tr.reserve(static_cast(A.nonZeros() + sys.G.nonZeros() + + sys.H.nonZeros())); + for (int k = 0; k < A.outerSize(); ++k) { + for (SparseMatrixCSC::InnerIterator it(A, k); it; ++it) { + tr.emplace_back(it.row(), it.col(), it.value()); + } + } + for (int k = 0; k < sys.G.outerSize(); ++k) { + for (SparseMatrixCSC::InnerIterator it(sys.G, k); it; ++it) { + tr.emplace_back(me + it.row(), it.col(), it.value()); + } + } + for (int k = 0; k < sys.H.outerSize(); ++k) { + for (SparseMatrixCSC::InnerIterator it(sys.H, k); it; ++it) { + tr.emplace_back(me + mi + it.row(), it.col(), it.value()); + } + } + C_opt.resize(me + mi + mh, nx); + C_opt.setFromTriplets(tr.begin(), tr.end()); + C_opt.makeCompressed(); + } + q_opt = VectorXd::Zero(nx); +} + +namespace { + +using OsqpSolverPtr = std::unique_ptr; + +OsqpSolverPtr build_prj_solver(InternalQpState &qp, const VectorXd &b) { + OSQPSettings s = make_settings(qp.epsilon); + OSQPCscMatrix P = to_osqp_csc(qp.P_prj); + OSQPCscMatrix C = to_osqp_csc(qp.C_prj); + VectorXd q = qp.build_q_prj(b); + OSQPSolver *solver = nullptr; + check_osqp(osqp_setup(&solver, &P, q.data(), &C, qp.l_prj.data(), + qp.u_prj.data(), C.m, C.n, &s)); + return {solver, &osqp_cleanup}; +} + +OsqpSolverPtr build_rlx_solver(InternalQpState &qp, const VectorXd &b_prj) { + OSQPSettings s = make_settings(qp.epsilon); + OSQPCscMatrix P = to_osqp_csc(qp.P_rlx); + OSQPCscMatrix C = to_osqp_csc(qp.C_rlx); + VectorXd l = qp.build_l_rlx(b_prj); + VectorXd u = qp.build_u_rlx(b_prj); + OSQPSolver *solver = nullptr; + check_osqp(osqp_setup(&solver, &P, 
qp.q_rlx.data(), &C, l.data(), u.data(), + C.m, C.n, &s)); + return {solver, &osqp_cleanup}; +} + +OsqpSolverPtr build_opt_solver(InternalQpState &qp, const VectorXd &b_prj, + const VectorXd &v) { + OSQPSettings s = make_settings(qp.epsilon); + // Mirror osqp.cppm: opt stage starts from rho=1.0 (not the default 0.1). + s.rho = 1.0; + OSQPCscMatrix P = to_osqp_csc(qp.P_opt); + OSQPCscMatrix C = to_osqp_csc(qp.C_opt); + VectorXd l = qp.build_l_opt(b_prj); + VectorXd u = qp.build_u_opt(b_prj, v); + OSQPSolver *solver = nullptr; + check_osqp(osqp_setup(&solver, &P, qp.q_opt.data(), &C, l.data(), u.data(), + C.m, C.n, &s)); + return {solver, &osqp_cleanup}; +} + +void update_prj_solver(InternalQpState &qp, OSQPSolver *solver, + const VectorXd &b) { + VectorXd q = qp.build_q_prj(b); + check_osqp(osqp_update_data_vec(solver, q.data(), nullptr, nullptr)); +} + +void update_rlx_solver(InternalQpState &qp, OSQPSolver *solver, + const VectorXd &b_prj) { + VectorXd l = qp.build_l_rlx(b_prj); + VectorXd u = qp.build_u_rlx(b_prj); + check_osqp(osqp_update_data_vec(solver, nullptr, l.data(), u.data())); +} + +void update_opt_solver(InternalQpState &qp, OSQPSolver *solver, + const VectorXd &b_prj, const VectorXd &v) { + VectorXd l = qp.build_l_opt(b_prj); + VectorXd u = qp.build_u_opt(b_prj, v); + check_osqp(osqp_update_data_vec(solver, nullptr, l.data(), u.data())); +} + +} // namespace + +} // namespace breakage_cuda + +namespace breakage_cuda::ref_cpu { + +// --------------------------------------------------------------------------- +// Pure dense kernels. Verbatim ports of breakage.cppm:58-180. 
+// --------------------------------------------------------------------------- + +Transformd fit_se3(const MatrixX4d &q0, const MatrixX3d &t0, + const MatrixX4d &qx, const MatrixX3d &tx, + const VectorXd &mass, double total_mass, double lambda_R) { + using Eigen::Index; + using Eigen::Quaterniond; + Index N = mass.size(); + Vector3d t0_bar = t0.transpose() * mass / total_mass; + Vector3d tx_bar = tx.transpose() * mass / total_mass; + Matrix3d H = + ((t0.rowwise() - t0_bar.transpose()).array().colwise() * mass.array()) + .matrix() + .transpose() * + (tx.rowwise() - tx_bar.transpose()); + Matrix3d K = Matrix3d::Zero(); + for (Index i = 0; i < N; ++i) { + K += mass(i) * (Quaterniond{q0.row(i).transpose()} * + Quaterniond{qx.row(i).transpose()}.conjugate()) + .toRotationMatrix(); + } + Matrix3d S = H + lambda_R * K; + Eigen::JacobiSVD svd{S, Eigen::ComputeFullU | Eigen::ComputeFullV}; + Matrix3d U = svd.matrixU(); + Matrix3d V = svd.matrixV(); + Matrix3d D = Matrix3d::Identity(); + if ((V * U.transpose()).determinant() < 0.0) { + D(2, 2) = -1.0; + } + Matrix3d R = V * D * U.transpose(); + Vector3d t = tx_bar - R * t0_bar; + Quaterniond q{R}; + q.normalize(); + return {q, t}; +} + +TwistFitResult fit_twist(const MatrixX3d &w, const MatrixX3d &v, + const MatrixX3d &c_CC, const Transformd &T_W_CC, + const VectorXd &mass, double total_mass, + double lambda_w) { + const auto &q_W_CC = T_W_CC.q; + const auto &t_W_CC = T_W_CC.t; + MatrixX3d c_W = (c_CC * q_W_CC.toRotationMatrix().transpose()).rowwise() + + t_W_CC.transpose(); + Vector3d r = c_W.transpose() * mass / total_mass; + MatrixX3d d = c_W.rowwise() - r.transpose(); + MatrixX3d d_weighted = d.array().colwise() * mass.array(); + Matrix3d S = d.transpose() * d_weighted.matrix(); + Matrix3d LHS = + Matrix3d::Identity() * (S.trace() + lambda_w * total_mass) - S; + Vector3d L; + L.x() = d_weighted.col(1).dot(v.col(2)) - d_weighted.col(2).dot(v.col(1)); + L.y() = d_weighted.col(2).dot(v.col(0)) - 
d_weighted.col(0).dot(v.col(2)); + L.z() = d_weighted.col(0).dot(v.col(1)) - d_weighted.col(1).dot(v.col(0)); + Vector3d regularization = lambda_w * (w.transpose() * mass); + Vector3d RHS = L + regularization; + TwistFitResult result; + result.w0 = LHS.ldlt().solve(RHS); + result.v0 = v.transpose() * mass / total_mass; + result.v_W = d.rowwise().cross(-result.w0).rowwise() + result.v0.transpose(); + return result; +} + +Vector3d so3_log_from_unit_quat(Eigen::Quaterniond q) { + // Branch threshold on s = ||q.vec|| = sin(theta/2). + constexpr double kSmallS = 5e-5; + q.normalize(); + // Pick the shortest representation (theta in [0, pi]); the pi case (w==0) + // is intrinsically ambiguous. + if (q.w() < 0.0) { + q.coeffs() *= -1.0; + } + Vector3d v = q.vec(); + double s = v.norm(); + double w = q.w(); + if (s < kSmallS) { + // theta/s = 2 + s^2/3 + 3 s^4/20 + O(s^6) + double s2 = s * s; + double s4 = s2 * s2; + double theta_over_s = 2.0 + (s2 / 3.0) + (3.0 * s4 / 20.0); + return theta_over_s * v; + } + double theta = 2.0 * std::atan2(s, w); + return (theta / s) * v; +} + +Matrix3d so3_jacobian_inv(const Vector3d &phi) { + // Branch threshold on theta = ||phi||. + constexpr double kSmallTheta = 1e-4; + Matrix3d Phi = skew(phi); + double theta2 = phi.squaredNorm(); + if (theta2 < kSmallTheta * kSmallTheta) { + // c(theta) = 1/12 + theta^2/720 + theta^4/30240 + O(theta^6) + double t2 = theta2; + double t4 = t2 * t2; + double c = (1.0 / 12.0) + (t2 / 720.0) + (t4 / 30240.0); + return Matrix3d::Identity() - 0.5 * Phi + c * (Phi * Phi); + } + double theta = std::sqrt(theta2); + double half = 0.5 * theta; + double cot_half = std::cos(half) / std::sin(half); + // c(theta) = 1/theta^2 - cot(theta/2)/(2 theta) + double c = (1.0 / theta2) - (cot_half / (2.0 * theta)); + return Matrix3d::Identity() - 0.5 * Phi + c * (Phi * Phi); +} + +Matrix3d compute_Pi(const Eigen::Quaterniond &qm, + const Eigen::Quaterniond &qp) { + // Relative rotation: R_rel = Rm^T Rp. 
+ return so3_jacobian_inv(so3_log_from_unit_quat(qm.conjugate() * qp)) * + qm.toRotationMatrix().transpose(); +} + +MatrixX3d compute_L( + const Eigen::Matrix &I_flat, + const Eigen::Quaterniond &q_W_CC, const Vector3d &w0) { + Vector3d w_CC = q_W_CC.conjugate() * w0; + return (I_flat.leftCols<3>() * w_CC.x() + + I_flat.middleCols<3>(3) * w_CC.y() + + I_flat.rightCols<3>() * w_CC.z()) * + q_W_CC.toRotationMatrix().transpose(); +} + +// --------------------------------------------------------------------------- +// b vector assembly. Layout: row-major (num_parts, 6). Mirrors lines +// 1018-1035 of breakage.cppm. +// --------------------------------------------------------------------------- + +VectorXd assemble_b(const HostSystem &sys, const HostInput &in, + const MatrixX3d &v_W_curr, const MatrixX3d &v_W_prev, + const MatrixX3d &L_curr, const MatrixX3d &L_prev, + const Matrix3d &Pi) { + using Eigen::Index; + Index N = sys.num_parts; + VectorXd b(6 * N); + Eigen::Map> b_mat{b.data(), N, 6}; + b_mat.leftCols<3>() = + (((v_W_curr - v_W_prev).array().colwise() * sys.mass.array()).matrix() - + in.J) * + Pi.transpose() / in.dt; + b_mat.rightCols<3>() = ((L_curr - L_prev - in.H) * Pi.transpose()) / in.dt; + if constexpr (kEnableTorqueScaling) { + b_mat.rightCols<3>() /= sys.L0; + } + return b; +} + +VectorXd compute_b(const HostSystem &sys, const HostInput &in, HostState &state, + Eigen::Quaterniond *q_W_CC_out, MatrixX3d *v_W_out, + MatrixX3d *L_out) { + Transformd T_W_CC = fit_se3(sys.q_CC, sys.c_CC, in.q, in.c, sys.mass, + sys.total_mass, sys.L0 * sys.L0 / 2.0); + TwistFitResult twist = + fit_twist(in.w, in.v, sys.c_CC, T_W_CC, sys.mass, sys.total_mass, + sys.L0 * sys.L0); + MatrixX3d L_curr = compute_L(sys.I_CC, T_W_CC.q, twist.w0); + Matrix3d Pi = compute_Pi(state.q_W_CC_prev, T_W_CC.q); + VectorXd b = assemble_b(sys, in, twist.v_W, state.v_W_prev, L_curr, + state.L_prev, Pi); + if (q_W_CC_out) *q_W_CC_out = T_W_CC.q; + if (v_W_out) *v_W_out = std::move(twist.v_W); + 
if (L_out) *L_out = std::move(L_curr); + return b; +} + +// --------------------------------------------------------------------------- +// Three-stage QP solver. Mirrors OsqpSolver::solve() in osqp.cppm step by +// step: each stage either updates an existing OSQP solver in place (warm +// path) or rebuilds it (cold or post-reset path) and warm-starts it from +// the persisted state vectors. +// --------------------------------------------------------------------------- + +SolveInfo solve_qp(const HostSystem &sys, const VectorXd &b, HostState &state, + VectorXd &x_out, VectorXd &slack_out, + VectorXd &v_relax_out) { + if (b.size() != 6 * sys.num_parts) { + throw std::invalid_argument("solve_qp: b dimension mismatch"); + } + if (!state.qp) { + state.qp = std::make_shared(sys); + } + InternalQpState &qp = *state.qp; + if (qp.has_state) { + if (qp.prj_sol.size() != qp.C_prj.cols() || + qp.prj_dual.size() != qp.C_prj.rows() || + qp.rlx_sol.size() != qp.C_rlx.cols() || + qp.rlx_dual.size() != qp.C_rlx.rows() || + qp.opt_sol.size() != qp.C_opt.cols() || + qp.opt_dual.size() != qp.C_opt.rows()) { + throw std::invalid_argument("solve_qp: warm-start dimension mismatch"); + } + } + + SolveInfo info; + + // -- Stage 1: projection --------------------------------------------------- + if (qp.has_state) { + if (qp.prj_solver) { + update_prj_solver(qp, qp.prj_solver.get(), b); + } else { + qp.prj_solver = build_prj_solver(qp, b); + check_osqp(osqp_warm_start(qp.prj_solver.get(), qp.prj_sol.data(), + qp.prj_dual.data())); + check_osqp(osqp_update_rho(qp.prj_solver.get(), qp.prj_rho)); + } + } else { + qp.prj_solver = build_prj_solver(qp, b); + } + check_osqp(osqp_solve(qp.prj_solver.get())); + { + OSQPSolver *s = qp.prj_solver.get(); + qp.prj_sol = Eigen::Map(s->solution->x, qp.C_prj.cols()); + qp.prj_dual = Eigen::Map(s->solution->y, qp.C_prj.rows()); + qp.prj_rho = s->settings->rho; + fill_stage(info.prj, *s->info, qp.prj_rho); + } + VectorXd b_prj = qp.A * 
qp.prj_sol.head(qp.nx); + qp.slack = b - b_prj; + + // -- Stage 2: relaxation --------------------------------------------------- + if (qp.has_state) { + if (qp.rlx_solver) { + update_rlx_solver(qp, qp.rlx_solver.get(), b_prj); + } else { + qp.rlx_solver = build_rlx_solver(qp, b_prj); + check_osqp(osqp_warm_start(qp.rlx_solver.get(), qp.rlx_sol.data(), + qp.rlx_dual.data())); + check_osqp(osqp_update_rho(qp.rlx_solver.get(), qp.rlx_rho)); + } + } else { + qp.rlx_solver = build_rlx_solver(qp, b_prj); + } + check_osqp(osqp_solve(qp.rlx_solver.get())); + { + OSQPSolver *s = qp.rlx_solver.get(); + qp.rlx_sol = Eigen::Map(s->solution->x, qp.C_rlx.cols()); + qp.rlx_dual = Eigen::Map(s->solution->y, qp.C_rlx.rows()); + qp.rlx_rho = s->settings->rho; + fill_stage(info.rlx, *s->info, qp.rlx_rho); + } + qp.v_relax = qp.rlx_sol.tail(qp.nv).cwiseMax(0.0); + + // -- Stage 3: optimization ------------------------------------------------- + if (qp.has_state) { + if (qp.opt_solver) { + update_opt_solver(qp, qp.opt_solver.get(), b_prj, qp.v_relax); + } else { + qp.opt_solver = build_opt_solver(qp, b_prj, qp.v_relax); + check_osqp(osqp_warm_start(qp.opt_solver.get(), qp.opt_sol.data(), + qp.opt_dual.data())); + check_osqp(osqp_update_rho(qp.opt_solver.get(), qp.opt_rho)); + } + } else { + qp.opt_solver = build_opt_solver(qp, b_prj, qp.v_relax); + } + check_osqp(osqp_solve(qp.opt_solver.get())); + { + OSQPSolver *s = qp.opt_solver.get(); + qp.opt_sol = Eigen::Map(s->solution->x, qp.C_opt.cols()); + qp.opt_dual = Eigen::Map(s->solution->y, qp.C_opt.rows()); + qp.opt_rho = s->settings->rho; + fill_stage(info.opt, *s->info, qp.opt_rho); + } + info.converged = + info.prj.converged && info.rlx.converged && info.opt.converged; + qp.has_state = true; + + // x_out is left whitened; solve_full() does the un-whitening, mirroring the + // CUDA pipeline which also returns whitened `x_d` from its solver. 
+ x_out = qp.opt_sol.head(qp.nx); + slack_out = qp.slack; + v_relax_out = qp.v_relax; + return info; +} + +// --------------------------------------------------------------------------- +// End-to-end driver. Mirrors BreakageChecker::solve() at breakage.cppm:996, +// minus the dump bookkeeping and logging which are out of scope here. +// --------------------------------------------------------------------------- + +Solution solve_full(const HostSystem &sys, const HostInput &in, + HostState &state, const Thresholds &thr) { + Eigen::Quaterniond q_W_CC_curr; + MatrixX3d v_W_curr; + MatrixX3d L_curr; + VectorXd b = compute_b(sys, in, state, &q_W_CC_curr, &v_W_curr, &L_curr); + + Solution sol; + if (!thr.enabled) { + // Match the production code path: state is NOT advanced when the checker + // is disabled. Re-enabling later restarts from the previously cached + // q_W_CC_prev / v_W_prev / L_prev. + return sol; + } + + VectorXd x_raw, slack, v_relax; + sol.info = solve_qp(sys, b, state, x_raw, slack, v_relax); + sol.slack_fraction = + slack.norm() / std::max(b.norm(), thr.slack_fraction_b_floor); + + // Advance the per-frame state (matches lines 1053-1055 of breakage.cppm: + // done unconditionally after the solve, before the converged-only + // postprocess block). + state.q_W_CC_prev = q_W_CC_curr; + state.v_W_prev = std::move(v_W_curr); + state.L_prev = std::move(L_curr); + + if (sol.info.converged) { + sol.x = std::move(x_raw); + if constexpr (kEnableClutchWhitening) { + const int ncv = sys.num_contact_vertices; + for (int k = 0; k < sys.num_clutches; ++k) { + // The 4 doubles per row in HostSystem.clutch_whiten are the raw + // memory of bricksim's `std::vector` (column-major), so + // mapping them back as a ColMajor 2x2 reproduces Wk exactly. + // (Wk is symmetric anyway, so the choice does not affect the math.) 
+ Eigen::Map Wk(sys.clutch_whiten.data() + 4 * k); + const int j0 = ncv + 9 * k; + Eigen::Vector2d a_scaled = sol.x.segment<2>(j0 + 1); + sol.x.segment<2>(j0 + 1) = Wk * a_scaled; + Eigen::Vector2d b_scaled = sol.x.segment<2>(j0 + 4); + sol.x.segment<2>(j0 + 4) = Wk * b_scaled; + Eigen::Vector2d c_scaled = sol.x.segment<2>(j0 + 7); + sol.x.segment<2>(j0 + 7) = Wk * c_scaled; + } + } + sol.utilization.setConstant(sys.num_clutches, -1.0); + const int ncv = sys.num_contact_vertices; + for (std::size_t idx = 0; idx < sys.capacity_clutch_indices.size(); ++idx) { + Eigen::Matrix c = + sys.capacities.row(static_cast(idx)).transpose(); + const int clutch_index = sys.capacity_clutch_indices[idx]; + Eigen::Matrix x = + sol.x.segment<9>(ncv + 9 * clutch_index); + double frc_used = c.head<3>().dot(x.head<3>()); + double frc_cap = 1.0 - c.tail<6>().dot(x.tail<6>()); + double u_fp; + if (frc_cap > 0.0) { + u_fp = frc_used / frc_cap; + u_fp = std::max(u_fp, 0.0); + } else { + u_fp = 1e6; + } + double &u_max = sol.utilization(clutch_index); + if (u_fp > u_max) { + u_max = u_fp; + } + } + } + + return sol; +} + +} // namespace breakage_cuda::ref_cpu diff --git a/native/cuda/breakage_cuda/src/system.cpp b/native/cuda/breakage_cuda/src/system.cpp new file mode 100644 index 0000000..645f8ab --- /dev/null +++ b/native/cuda/breakage_cuda/src/system.cpp @@ -0,0 +1,550 @@ +// SPDX-License-Identifier: MIT +// +// JSON loaders + host-to-device transfers for the breakage_cuda subproject. +// +// JSON layout matches bricksim::matrix_to_json (see +// modules/bricksim/utils/matrix_serialization.cppm) and the to_json overloads +// for BreakageDebugDump in modules/bricksim/physx/breakage.cppm. +// +// On the device side, all dense per-part data is stored row-major and all +// sparse matrices are CSR (cuSPARSE-friendly). 
+ +#include "breakage_cuda/device.hpp" +#include "breakage_cuda/system.hpp" + +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace breakage_cuda { + +// =========================================================================== +// JSON helpers (matrix_serialization-compatible). +// +// dense -> {"kind":"dense", "dtype","order"("C"|"F"),"shape":[r,c], +// "data_b64"} +// sparse -> {"kind":"sparse","format"("csr"|"csc"),"shape":[r,c],"nnz", +// "data_dtype","index_dtype", +// "data_b64","indices_b64","indptr_b64"} +// +// Bytes inside *_b64 are raw little-/big-endian according to the dtype prefix +// ('<','>','|','='). On x86_64 Linux native is little-endian, so the common +// case ' base64_decode(std::string_view s) { + if (s.empty()) return {}; + if ((s.size() & 3u) != 0) { + throw std::runtime_error{"base64: input length not divisible by 4"}; + } + std::size_t pad = 0; + if (s.back() == '=') { + pad = (s.size() >= 2 && s[s.size() - 2] == '=') ? 2 : 1; + } + std::vector out((s.size() / 4) * 3 - pad); + const auto *p = reinterpret_cast(s.data()); + std::byte *o = out.data(); + const std::size_t full = s.size() / 4 - (pad != 0 ? 
1 : 0); + auto get6 = [&]() { + int v = kBase64Lut[*p++]; + if (v < 0) throw std::runtime_error{"base64: invalid character"}; + return v; + }; + for (std::size_t i = 0; i < full; ++i) { + std::uint32_t x = (std::uint32_t(get6()) << 18) | + (std::uint32_t(get6()) << 12) | + (std::uint32_t(get6()) << 6) | std::uint32_t(get6()); + *o++ = std::byte((x >> 16) & 0xFFu); + *o++ = std::byte((x >> 8) & 0xFFu); + *o++ = std::byte(x & 0xFFu); + } + if (pad == 1) { + std::uint32_t x = (std::uint32_t(get6()) << 18) | + (std::uint32_t(get6()) << 12) | + (std::uint32_t(get6()) << 6); + *o++ = std::byte((x >> 16) & 0xFFu); + *o++ = std::byte((x >> 8) & 0xFFu); + } else if (pad == 2) { + std::uint32_t x = (std::uint32_t(get6()) << 18) | + (std::uint32_t(get6()) << 12); + *o++ = std::byte((x >> 16) & 0xFFu); + } + return out; +} + +// ----- numpy dtype validation + per-element byteswap ----------------------- + +template +constexpr char numpy_kind() { + if constexpr (std::is_floating_point_v) + return 'f'; + else if constexpr (std::is_signed_v) + return 'i'; + else + return 'u'; +} + +template +bool dtype_needs_swap(std::string_view dtype, std::string_view field) { + if (dtype.size() < 3) { + throw std::runtime_error{ + std::string{"dtype too short for "}.append(field)}; + } + std::size_t isz = 0; + for (std::size_t i = 2; i < dtype.size(); ++i) { + char c = dtype[i]; + if (c < '0' || c > '9') { + throw std::runtime_error{ + std::string{"dtype itemsize parse error in "}.append(field)}; + } + isz = isz * 10 + static_cast(c - '0'); + } + if (dtype[1] != numpy_kind() || isz != sizeof(T)) { + throw std::runtime_error{std::string{"dtype mismatch for "} + .append(field) + .append(": ") + .append(dtype)}; + } + if (sizeof(T) <= 1) return false; + const char endian = dtype[0]; + if (endian == '|' || endian == '=') return false; + const bool native_le = (std::endian::native == std::endian::little); + if (endian == '<') return !native_le; + if (endian == '>') return native_le; + throw 
std::runtime_error{ + std::string{"invalid endian prefix in "}.append(field)}; +} + +template +void byteswap_inplace(std::span xs) { + for (auto &x : xs) { + auto bytes = std::as_writable_bytes(std::span{&x, 1}); + std::ranges::reverse(bytes); + } +} + +template +void decode_typed(std::string_view b64, std::span out, + std::string_view dtype, std::string_view field) { + const bool swap = dtype_needs_swap(dtype, field); + std::vector bytes = base64_decode(b64); + if (bytes.size() != out.size_bytes()) { + throw std::runtime_error{ + std::string{"decoded byte count mismatch for "}.append(field)}; + } + if (!bytes.empty()) { + std::memcpy(out.data(), bytes.data(), bytes.size()); + } + if (swap) byteswap_inplace(out); +} + +// ----- dense matrix loader ------------------------------------------------- + +template +EigenType load_dense(const nlohmann::json &j, std::string_view field) { + using Scalar = typename EigenType::Scalar; + if (j.at("kind").get_ref() != "dense") { + throw std::runtime_error{ + std::string{field}.append(": expected kind 'dense'")}; + } + const auto &shape = j.at("shape"); + if (!shape.is_array() || shape.size() != 2) { + throw std::runtime_error{ + std::string{field}.append(": shape must be [rows, cols]")}; + } + const Eigen::Index rows = shape.at(0).get(); + const Eigen::Index cols = shape.at(1).get(); + + const auto &order = j.at("order").get_ref(); + const bool input_rm = (order == "C"); + if (!input_rm && order != "F") { + throw std::runtime_error{std::string{field} + .append(": invalid order '") + .append(order) + .append("'")}; + } + + const auto &dtype = j.at("dtype").get_ref(); + const auto &b64 = j.at("data_b64").get_ref(); + + EigenType out; + out.resize(rows, cols); + const bool out_rm = static_cast(EigenType::IsRowMajor); + // For vectors (one extent <= 1), storage order is identical either way. 
+ const bool degenerate = (rows <= 1) || (cols <= 1); + + if (input_rm == out_rm || degenerate) { + decode_typed( + b64, + std::span{out.data(), static_cast(out.size())}, + dtype, field); + } else if (input_rm) { + Eigen::Matrix tmp( + rows, cols); + decode_typed( + b64, + std::span{tmp.data(), static_cast(tmp.size())}, + dtype, field); + out = tmp; + } else { + Eigen::Matrix tmp( + rows, cols); + decode_typed( + b64, + std::span{tmp.data(), static_cast(tmp.size())}, + dtype, field); + out = tmp; + } + return out; +} + +// ----- sparse matrix loader (always returns CSC) --------------------------- + +SparseMatrixCSC load_sparse_csc(const nlohmann::json &j, + std::string_view field) { + if (j.at("kind").get_ref() != "sparse") { + throw std::runtime_error{ + std::string{field}.append(": expected kind 'sparse'")}; + } + const auto &shape = j.at("shape"); + if (!shape.is_array() || shape.size() != 2) { + throw std::runtime_error{ + std::string{field}.append(": shape must be [rows, cols]")}; + } + const Eigen::Index rows = shape.at(0).get(); + const Eigen::Index cols = shape.at(1).get(); + const std::size_t nnz = j.at("nnz").get(); + + const auto &format = j.at("format").get_ref(); + const bool is_csc = (format == "csc"); + const bool is_csr = (format == "csr"); + if (!is_csc && !is_csr) { + throw std::runtime_error{std::string{field} + .append(": invalid sparse format '") + .append(format) + .append("'")}; + } + const std::size_t outer = is_csc ? 
static_cast(cols) + : static_cast(rows); + + const auto &data_dtype = + j.at("data_dtype").get_ref(); + const auto &index_dtype = + j.at("index_dtype").get_ref(); + + std::vector values(nnz); + std::vector indices(nnz); + std::vector indptr(outer + 1); + decode_typed(j.at("data_b64").get_ref(), + std::span{values}, data_dtype, field); + decode_typed(j.at("indices_b64").get_ref(), + std::span{indices}, index_dtype, field); + decode_typed(j.at("indptr_b64").get_ref(), + std::span{indptr}, index_dtype, field); + + if (nnz == 0) { + SparseMatrixCSC empty(rows, cols); + empty.makeCompressed(); + return empty; + } + + if (is_csc) { + using MapType = Eigen::SparseMatrix; + Eigen::Map mapped(rows, cols, static_cast(nnz), + indptr.data(), indices.data(), values.data()); + return SparseMatrixCSC{mapped}; + } + // CSR -> re-encode to CSC; Eigen handles the layout change on assignment. + using MapType = Eigen::SparseMatrix; + Eigen::Map mapped(rows, cols, static_cast(nnz), + indptr.data(), indices.data(), values.data()); + return SparseMatrixCSC{mapped}; +} + +} // namespace + +// =========================================================================== +// from_json overloads. 
+// =========================================================================== + +void from_json(const nlohmann::json &j, Thresholds &out) { + j.at("enabled").get_to(out.enabled); + j.at("contact_regularization").get_to(out.contact_regularization); + j.at("clutch_axial_compliance").get_to(out.clutch_axial_compliance); + j.at("clutch_radial_compliance").get_to(out.clutch_radial_compliance); + j.at("clutch_tangential_compliance") + .get_to(out.clutch_tangential_compliance); + j.at("friction_coefficient").get_to(out.friction_coefficient); + j.at("preloaded_force").get_to(out.preloaded_force); + j.at("slack_fraction_warn").get_to(out.slack_fraction_warn); + j.at("slack_fraction_b_floor").get_to(out.slack_fraction_b_floor); + j.at("debug_dump").get_to(out.debug_dump); + j.at("breakage_cooldown_time").get_to(out.breakage_cooldown_time); +} + +void from_json(const nlohmann::json &j, HostSystem &out) { + j.at("num_parts").get_to(out.num_parts); + j.at("num_clutches").get_to(out.num_clutches); + j.at("num_contact_vertices").get_to(out.num_contact_vertices); + j.at("num_vars").get_to(out.num_vars); + j.at("num_eq").get_to(out.num_eq); + j.at("num_ineq").get_to(out.num_ineq); + j.at("num_relaxed_ineq").get_to(out.num_relaxed_ineq); + j.at("total_mass").get_to(out.total_mass); + j.at("L0").get_to(out.L0); + + out.mass = load_dense(j.at("mass"), "mass"); + out.q_CC = load_dense(j.at("q_CC"), "q_CC"); + out.c_CC = load_dense(j.at("c_CC"), "c_CC"); + out.I_CC = load_dense>( + j.at("I_CC"), "I_CC"); + + out.Q = load_sparse_csc(j.at("Q"), "Q"); + out.A = load_sparse_csc(j.at("A"), "A"); + out.G = load_sparse_csc(j.at("G"), "G"); + out.H = load_sparse_csc(j.at("H"), "H"); + out.V = load_sparse_csc(j.at("V"), "V"); + + j.at("capacity_clutch_indices").get_to(out.capacity_clutch_indices); + out.capacities = load_dense>( + j.at("capacities"), "capacities"); + out.clutch_whiten = load_dense>( + j.at("clutch_whiten"), "clutch_whiten"); + // part_ids / clutch_ids are intentionally 
ignored; GPU only needs indices. +} + +void from_json(const nlohmann::json &j, HostInput &out) { + out.w = load_dense(j.at("w"), "w"); + out.v = load_dense(j.at("v"), "v"); + out.q = load_dense(j.at("q"), "q"); + out.c = load_dense(j.at("c"), "c"); + j.at("dt").get_to(out.dt); + out.J = load_dense(j.at("J"), "J"); + out.H = load_dense(j.at("H"), "H"); +} + +void from_json(const nlohmann::json &j, HostState &out) { + out.q_W_CC_prev.coeffs() = + load_dense(j.at("q_W_CC_prev"), "q_W_CC_prev"); + out.v_W_prev = load_dense(j.at("v_W_prev"), "v_W_prev"); + out.L_prev = load_dense(j.at("L_prev"), "L_prev"); + // The JSON solver_state is backend-specific (OSQP); each backend manages + // its own warm start, so we drop it. + out.qp.reset(); +} + +void from_json(const nlohmann::json &j, DebugDump &out) { + j.at("thresholds").get_to(out.thresholds); + j.at("system").get_to(out.system); + j.at("input").get_to(out.input); + j.at("state").get_to(out.state); + + // Solution: only x / utilization / slack_fraction; OsqpInfo is backend- + // specific and dropped. 
+ const auto &sol = j.at("solution"); + out.solution.x = load_dense(sol.at("x"), "solution.x"); + out.solution.utilization = + load_dense(sol.at("utilization"), "solution.utilization"); + sol.at("slack_fraction").get_to(out.solution.slack_fraction); + out.solution.info = SolveInfo{}; + + out.b = load_dense(j.at("b"), "b"); + + if (j.contains("prev_state")) { + HostState ps; + from_json(j.at("prev_state"), ps); + out.prev_state = std::move(ps); + } else { + out.prev_state.reset(); + } +} + +DebugDump load_debug_dump(const std::string &path) { + std::ifstream stream(path); + if (!stream.is_open()) { + throw std::runtime_error{"failed to open debug dump file: " + path}; + } + nlohmann::json j; + try { + stream >> j; + } catch (const std::exception &e) { + throw std::runtime_error{"failed to parse JSON in debug dump '" + path + + "': " + e.what()}; + } + DebugDump dump; + from_json(j, dump); + return dump; +} + +double relative_inf_error(const VectorXd &a, const VectorXd &b, double floor) { + if (a.size() != b.size()) { + throw std::invalid_argument{ + "relative_inf_error: vector size mismatch"}; + } + if (a.size() == 0) return 0.0; + const double denom = std::max(b.cwiseAbs().maxCoeff(), floor); + return (a - b).cwiseAbs().maxCoeff() / denom; +} + +// =========================================================================== +// Device upload helpers. +// =========================================================================== + +void upload_csc_as_csr(const SparseMatrixCSC &src, DeviceCsr &dst) { + // The descriptor caches raw device pointers; tear it down before any + // potential buffer reallocation in copy_from_host. + if (dst.descr != nullptr) { + cusparseDestroySpMat(dst.descr); + dst.descr = nullptr; + } + + dst.rows = static_cast(src.rows()); + dst.cols = static_cast(src.cols()); + dst.nnz = static_cast(src.nonZeros()); + + // Eigen does the CSC -> CSR re-encoding on assignment. 
+ SparseMatrixCSR tmp = src; + tmp.makeCompressed(); + + dst.rowptr.copy_from_host(tmp.outerIndexPtr(), + static_cast(dst.rows) + 1); + dst.colind.copy_from_host(tmp.innerIndexPtr(), + static_cast(dst.nnz)); + dst.values.copy_from_host(tmp.valuePtr(), + static_cast(dst.nnz)); + + // cuSPARSE allows null colind/values when nnz == 0. + BREAKAGE_CUSPARSE_CHECK(cusparseCreateCsr( + &dst.descr, static_cast(dst.rows), + static_cast(dst.cols), + static_cast(dst.nnz), dst.rowptr.data(), dst.colind.data(), + dst.values.data(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, + CUSPARSE_INDEX_BASE_ZERO, CUDA_R_64F)); +} + +void upload_system(const HostSystem &host, DeviceSystem &dev) { + dev.N = host.num_parts; + dev.K = host.num_clutches; + dev.ncv = host.num_contact_vertices; + dev.nx = host.num_vars; + dev.me = host.num_eq; + dev.mi = host.num_ineq; + dev.mh = host.num_relaxed_ineq; + dev.ncap = static_cast(host.capacity_clutch_indices.size()); + dev.total_mass = host.total_mass; + dev.L0 = host.L0; + + dev.mass.copy_from_host(host.mass.data(), + static_cast(host.mass.size())); + + // Device convention: row-major dense per-part data. q_CC / c_CC are + // ColMajor on the host so we transcode here. + Eigen::Matrix q_rm = host.q_CC; + Eigen::Matrix c_rm = host.c_CC; + dev.q_CC.copy_from_host(q_rm.data(), static_cast(q_rm.size())); + dev.c_CC.copy_from_host(c_rm.data(), static_cast(c_rm.size())); + // I_CC / capacities / clutch_whiten are already RowMajor on the host. 
+ dev.I_CC.copy_from_host(host.I_CC.data(), + static_cast(host.I_CC.size())); + + upload_csc_as_csr(host.Q, dev.Q); + upload_csc_as_csr(host.A, dev.A); + upload_csc_as_csr(host.G, dev.G); + upload_csc_as_csr(host.H, dev.H); + upload_csc_as_csr(host.V, dev.V); + + dev.capacity_clutch_indices.copy_from_host( + host.capacity_clutch_indices.data(), + host.capacity_clutch_indices.size()); + dev.capacities.copy_from_host( + host.capacities.data(), + static_cast(host.capacities.size())); + dev.clutch_whiten.copy_from_host( + host.clutch_whiten.data(), + static_cast(host.clutch_whiten.size())); +} + +void upload_input(const HostInput &host, DeviceInput &dev) { + dev.dt = host.dt; + + auto upload_rm3 = [](const MatrixX3d &src, DeviceBuffer &dst_buf) { + Eigen::Matrix rm = src; + dst_buf.copy_from_host(rm.data(), static_cast(rm.size())); + }; + auto upload_rm4 = [](const MatrixX4d &src, DeviceBuffer &dst_buf) { + Eigen::Matrix rm = src; + dst_buf.copy_from_host(rm.data(), static_cast(rm.size())); + }; + + upload_rm3(host.w, dev.w); + upload_rm3(host.v, dev.v); + upload_rm4(host.q, dev.q); + upload_rm3(host.c, dev.c); + upload_rm3(host.J, dev.J); + upload_rm3(host.H, dev.H); +} + +void upload_state(const HostState &host, DeviceState &dev, int N) { + Vector4d quat = host.q_W_CC_prev.coeffs(); + dev.q_W_CC_prev.copy_from_host(quat.data(), 4); + + if (host.v_W_prev.rows() != N || host.L_prev.rows() != N) { + throw std::invalid_argument{ + "upload_state: v_W_prev / L_prev row count does not match N"}; + } + Eigen::Matrix v_rm = host.v_W_prev; + Eigen::Matrix L_rm = host.L_prev; + const std::size_t n3 = + static_cast(3) * static_cast(N); + dev.v_W_prev.copy_from_host(v_rm.data(), n3); + dev.L_prev.copy_from_host(L_rm.data(), n3); +} + +void download_state(const DeviceState &dev, HostState &host, int N) { + Vector4d quat; + dev.q_W_CC_prev.copy_to_host(quat.data(), 4); + host.q_W_CC_prev.coeffs() = quat; + + const std::size_t n3 = + static_cast(3) * static_cast(N); + Eigen::Matrix 
v_rm(N, 3); + Eigen::Matrix L_rm(N, 3); + dev.v_W_prev.copy_to_host(v_rm.data(), n3); + dev.L_prev.copy_to_host(L_rm.data(), n3); + host.v_W_prev = v_rm; + host.L_prev = L_rm; +} + +} // namespace breakage_cuda diff --git a/native/cuda/breakage_cuda/tests/bench.cpp b/native/cuda/breakage_cuda/tests/bench.cpp new file mode 100644 index 0000000..745af54 --- /dev/null +++ b/native/cuda/breakage_cuda/tests/bench.cpp @@ -0,0 +1,403 @@ +// SPDX-License-Identifier: MIT +// +// Wall-clock benchmark for the full breakage pipeline. +// +// Usage: ./breakage_cuda_bench [N_parts] (default N_parts = 32) +// +// Builds a synthetic chain of N_parts parts joined by N_parts - 1 clutches +// (matching the dimensional layout enforced by `BreakageSystem::check_shape` +// in breakage.cppm) and times N iterations of: +// * `cuda_qp::Pipeline::solve` -- GPU end-to-end pipeline +// * `ref_cpu::solve_full` -- OSQP-backed CPU baseline (reference) +// +// We use std::chrono::steady_clock and call cudaDeviceSynchronize() before +// taking the GPU stop time so the measurement covers the full kernel +// completion, not just queueing. + +#include "breakage_cuda/qp_solver.hpp" +#include "breakage_cuda/ref_cpu.hpp" +#include "breakage_cuda/system.hpp" + +#include +#include +#include + +#include + +#include +#include +#include +#include +#include +#include +#include + +namespace { + +using namespace breakage_cuda; +using clock_type = std::chrono::steady_clock; +using Trip = Eigen::Triplet; + +// --------------------------------------------------------------------------- +// Synthetic builder for `N_parts` parts arranged on the x-axis at unit +// spacing. Each consecutive pair is joined by one clutch, giving K = N - 1. +// +// All matrices are tiled copies of the (N=2, K=1) building block in +// test_full_pipeline.cpp so the per-clutch numerics stay well-conditioned. 
+// +// Dimensions (per check_shape in breakage.cppm): +// num_parts = N +// num_clutches = K = N - 1 +// num_contact_vertices = 0 +// num_vars = 9 * K +// num_eq = 6 * N +// num_ineq = K (one axial-non-negativity per clutch) +// num_relaxed_ineq = 2 * K (two friction-cone rows per clutch) +// capacities = 2 * K rows +// clutch_whiten = K rows +// --------------------------------------------------------------------------- +HostSystem make_synthetic_system_n(int N) { + if (N < 2) N = 2; + const int K = N - 1; + + HostSystem sys; + sys.num_parts = N; + sys.num_clutches = K; + sys.num_contact_vertices = 0; + sys.num_vars = 9 * K; + sys.num_eq = 6 * N; + sys.num_ineq = K; + sys.num_relaxed_ineq = 2 * K; + sys.total_mass = static_cast(N); + sys.L0 = 1.0; + + sys.mass = VectorXd::Constant(N, 1.0); + + sys.q_CC.resize(N, 4); + sys.c_CC.resize(N, 3); + sys.I_CC.resize(N, 9); + for (int i = 0; i < N; ++i) { + sys.q_CC.row(i) << 0.0, 0.0, 0.0, 1.0; + sys.c_CC.row(i) << static_cast(i) - (N - 1) / 2.0, 0.0, 0.0; + sys.I_CC.row(i) << 1.0, 0.0, 0.0, + 0.0, 1.0, 0.0, + 0.0, 0.0, 1.0; + } + + // Q: small positive-definite diagonal on every variable. + std::vector Qt; + Qt.reserve(static_cast(9 * K)); + for (int v = 0; v < 9 * K; ++v) Qt.emplace_back(v, v, 0.1); + sys.Q.resize(9 * K, 9 * K); + sys.Q.setFromTriplets(Qt.begin(), Qt.end()); + sys.Q.makeCompressed(); + + // A: 6N x 9K. For each clutch k between parts k and k+1, the three axial + // variables x[9k..9k+2] couple part k's linear residual with -I_3 and + // part k+1's with +I_3. Angular rows of A_i / A_j stay zero - the angular + // component of b becomes projection-stage slack on both backends. 
+ std::vector At; + At.reserve(static_cast(6 * K)); + for (int k = 0; k < K; ++k) { + const int v0 = 9 * k; + const int r_i = 6 * k; + const int r_j = 6 * (k + 1); + for (int axis = 0; axis < 3; ++axis) { + At.emplace_back(r_i + axis, v0 + axis, -1.0); + At.emplace_back(r_j + axis, v0 + axis, 1.0); + } + } + sys.A.resize(6 * N, 9 * K); + sys.A.setFromTriplets(At.begin(), At.end()); + sys.A.makeCompressed(); + + // G: x[9k] >= 0 (axial force compressive) for each clutch. + std::vector Gt; + Gt.reserve(static_cast(K)); + for (int k = 0; k < K; ++k) Gt.emplace_back(k, 9 * k, 1.0); + sys.G.resize(K, 9 * K); + sys.G.setFromTriplets(Gt.begin(), Gt.end()); + sys.G.makeCompressed(); + + // H: 2 friction-cone rows per clutch, low magnitudes so the limits stay + // slack and we do not measure the constraint-saturation regime. + std::vector Ht; + Ht.reserve(static_cast(4 * K)); + for (int k = 0; k < K; ++k) { + const int v0 = 9 * k; + const int r0 = 2 * k; + Ht.emplace_back(r0, v0, -0.1); + Ht.emplace_back(r0, v0 + 1, 0.1); + Ht.emplace_back(r0 + 1, v0, -0.1); + Ht.emplace_back(r0 + 1, v0 + 1, -0.1); + } + sys.H.resize(2 * K, 9 * K); + sys.H.setFromTriplets(Ht.begin(), Ht.end()); + sys.H.makeCompressed(); + + // V: 2K x K, identity on each (pair-of-rows, clutch) block. + std::vector Vt; + Vt.reserve(static_cast(2 * K)); + for (int k = 0; k < K; ++k) { + Vt.emplace_back(2 * k, k, 1.0); + Vt.emplace_back(2 * k + 1, k, 1.0); + } + sys.V.resize(2 * K, K); + sys.V.setFromTriplets(Vt.begin(), Vt.end()); + sys.V.makeCompressed(); + + sys.capacity_clutch_indices.resize(static_cast(2 * K)); + for (int k = 0; k < K; ++k) { + sys.capacity_clutch_indices[2 * k] = k; + sys.capacity_clutch_indices[2 * k + 1] = k; + } + sys.capacities.resize(2 * K, 9); + for (int k = 0; k < K; ++k) { + sys.capacities.row(2 * k) << 1.0, 0.0, 0.0, 0.1, 0.0, 0.0, 0.0, 0.0, 0.0; + sys.capacities.row(2 * k + 1) << 1.0, 0.0, 0.0, 0.1, 0.0, 0.0, 0.0, 0.0, + 0.0; + } + + // Wk = identity per clutch. 
(The CPU reads it column-major and the GPU + // reads it row-major; both interpretations match for a symmetric Wk.) + sys.clutch_whiten.resize(K, 4); + for (int k = 0; k < K; ++k) { + sys.clutch_whiten.row(k) << 1.0, 0.0, 0.0, 1.0; + } + + return sys; +} + +HostInput make_synthetic_input_n(int N) { + HostInput in; + in.dt = 0.01; + in.w = MatrixX3d::Zero(N, 3); + + // Alternating-sign y velocities so consecutive parts pull on each other. + in.v.resize(N, 3); + for (int i = 0; i < N; ++i) { + const double s = (i % 2 == 0) ? 0.05 : -0.05; + in.v.row(i) << 0.0, s, 0.0; + } + + in.q.resize(N, 4); + in.c.resize(N, 3); + for (int i = 0; i < N; ++i) { + in.q.row(i) << 0.0, 0.0, 0.0, 1.0; + const double dy = (i % 2 == 0) ? 0.0005 : -0.0005; + in.c.row(i) << static_cast(i) - (N - 1) / 2.0, dy, 0.0; + } + + in.J = MatrixX3d::Zero(N, 3); + in.H = MatrixX3d::Zero(N, 3); + return in; +} + +HostState make_synthetic_state_n(int N) { + HostState st; + st.q_W_CC_prev = Eigen::Quaterniond::Identity(); + st.v_W_prev = MatrixX3d::Zero(N, 3); + st.L_prev = MatrixX3d::Zero(N, 3); + return st; +} + +// --------------------------------------------------------------------------- +// Timing helpers. 
+// --------------------------------------------------------------------------- +struct Stats { + double mean_us{}; + double std_us{}; + double min_us{}; + double max_us{}; + int n{}; +}; + +Stats summarize(const std::vector &samples_us) { + Stats s; + s.n = static_cast(samples_us.size()); + if (samples_us.empty()) return s; + double sum = 0.0; + s.min_us = samples_us[0]; + s.max_us = samples_us[0]; + for (double v : samples_us) { + sum += v; + if (v < s.min_us) s.min_us = v; + if (v > s.max_us) s.max_us = v; + } + s.mean_us = sum / static_cast(samples_us.size()); + double var = 0.0; + for (double v : samples_us) { + const double d = v - s.mean_us; + var += d * d; + } + if (samples_us.size() > 1) { + var /= static_cast(samples_us.size() - 1); + } else { + var = 0.0; + } + s.std_us = std::sqrt(var); + return s; +} + +void print_stats(const char *label, const Stats &s) { + std::printf("[bench] %s n=%d mean=%9.1f us std=%8.1f us " + "min=%9.1f us max=%9.1f us\n", + label, s.n, s.mean_us, s.std_us, s.min_us, s.max_us); +} + +// --------------------------------------------------------------------------- +// Per-backend benchmark drivers. +// +// Both warm-start across iterations (mirrors how BreakageChecker is used in +// production: state.qp persists between frames). The first few iterations +// are dropped to give the warm-start path a chance to stabilise. 
+// --------------------------------------------------------------------------- +Stats bench_gpu(const HostSystem &sys, const HostInput &in, + const Thresholds &thr, int warmup, int measured) { + const bool tolerate = []() { + const char *e = std::getenv("BENCH_TOLERATE_NONCONVERGED"); + return e && std::atoi(e) > 0; + }(); + + cuda_qp::Pipeline pipe; + pipe.setup(sys, thr); + + HostState state = make_synthetic_state_n(sys.num_parts); + + for (int i = 0; i < warmup; ++i) { + Solution sol = pipe.solve(in, state); + if (!sol.info.converged) { + std::fprintf(stderr, + "[bench] gpu warmup %d/%d did not converge " + "(prj=%d rlx=%d opt=%d, prim=%.2e dual=%.2e)%s\n", + i + 1, warmup, static_cast(sol.info.prj.converged), + static_cast(sol.info.rlx.converged), + static_cast(sol.info.opt.converged), + sol.info.opt.prim_res, sol.info.opt.dual_res, + tolerate ? " (tolerated)" : ""); + if (!tolerate) std::exit(2); + } + } + BREAKAGE_CUDA_CHECK(cudaDeviceSynchronize()); + + std::vector samples_us; + samples_us.reserve(static_cast(measured)); + for (int i = 0; i < measured; ++i) { + const auto t0 = clock_type::now(); + Solution sol = pipe.solve(in, state); + BREAKAGE_CUDA_CHECK(cudaDeviceSynchronize()); + const auto t1 = clock_type::now(); + if (!sol.info.converged) { + std::fprintf(stderr, + "[bench] gpu measured iter %d/%d did not converge%s\n", + i + 1, measured, tolerate ? " (tolerated)" : ""); + if (!tolerate) std::exit(2); + } + const double us = + std::chrono::duration(t1 - t0).count(); + samples_us.push_back(us); + } + return summarize(samples_us); +} + +Stats bench_cpu(const HostSystem &sys, const HostInput &in, + const Thresholds &thr, int warmup, int measured) { + HostState state = make_synthetic_state_n(sys.num_parts); + + // BENCH_TOLERATE_NONCONVERGED=1 turns "did not converge" into a warning so + // we can still get wall-clock numbers at very large N where OSQP/our ADMM + // would otherwise hit max_iter and abort. 
+ const bool tolerate = []() { + const char *e = std::getenv("BENCH_TOLERATE_NONCONVERGED"); + return e && std::atoi(e) > 0; + }(); + + for (int i = 0; i < warmup; ++i) { + Solution sol = ref_cpu::solve_full(sys, in, state, thr); + if (!sol.info.converged) { + std::fprintf(stderr, + "[bench] cpu warmup %d/%d did not converge%s\n", i + 1, + warmup, tolerate ? " (tolerated)" : ""); + if (!tolerate) std::exit(2); + } + } + + std::vector samples_us; + samples_us.reserve(static_cast(measured)); + for (int i = 0; i < measured; ++i) { + const auto t0 = clock_type::now(); + Solution sol = ref_cpu::solve_full(sys, in, state, thr); + const auto t1 = clock_type::now(); + if (!sol.info.converged) { + std::fprintf(stderr, + "[bench] cpu measured iter %d/%d did not converge%s\n", + i + 1, measured, tolerate ? " (tolerated)" : ""); + if (!tolerate) std::exit(2); + } + const double us = + std::chrono::duration(t1 - t0).count(); + samples_us.push_back(us); + } + return summarize(samples_us); +} + +} // namespace + +int main(int argc, char **argv) { + try { + int N = 32; + if (argc >= 2) { + N = std::atoi(argv[1]); + if (N < 2) { + std::fprintf(stderr, "[bench] N_parts must be >= 2 (got %d)\n", N); + return 1; + } + } + + std::printf("[bench] N_parts=%d\n", N); + + HostSystem sys = make_synthetic_system_n(N); + HostInput in = make_synthetic_input_n(N); + Thresholds thr; + thr.enabled = true; + + std::printf("[bench] num_parts=%d num_clutches=%d num_vars=%d " + "num_eq=%d num_ineq=%d num_relaxed_ineq=%d\n", + sys.num_parts, sys.num_clutches, sys.num_vars, sys.num_eq, + sys.num_ineq, sys.num_relaxed_ineq); + + int kGpuWarmup = 5; + int kGpuMeasured = 50; + int kCpuWarmup = 5; + int kCpuMeasured = 20; + if (const char *q = std::getenv("BENCH_QUICK"); q && std::atoi(q) > 0) { + kGpuWarmup = 1; + kGpuMeasured = 1; + kCpuWarmup = 1; + kCpuMeasured = 1; + } + if (const char *g = std::getenv("BENCH_GPU_ITER"); g && std::atoi(g) > 0) { + kGpuMeasured = std::atoi(g); + } + if (const char *c = 
std::getenv("BENCH_CPU_ITER"); c && std::atoi(c) > 0) { + kCpuMeasured = std::atoi(c); + } + + const Stats gpu = bench_gpu(sys, in, thr, kGpuWarmup, kGpuMeasured); + const Stats cpu = bench_cpu(sys, in, thr, kCpuWarmup, kCpuMeasured); + + print_stats("gpu pipe.solve ", gpu); + print_stats("cpu solve_full ", cpu); + + if (gpu.mean_us > 0.0) { + const double speedup = cpu.mean_us / gpu.mean_us; + std::printf("[bench] speedup cpu_mean / gpu_mean = %.2fx\n", + speedup); + } + + return 0; + } catch (const std::exception &e) { + std::fprintf(stderr, "[bench] ERROR: %s\n", e.what()); + return 2; + } +} diff --git a/native/cuda/breakage_cuda/tests/test_dense_kernels.cpp b/native/cuda/breakage_cuda/tests/test_dense_kernels.cpp new file mode 100644 index 0000000..adf60a7 --- /dev/null +++ b/native/cuda/breakage_cuda/tests/test_dense_kernels.cpp @@ -0,0 +1,473 @@ +// SPDX-License-Identifier: MIT +// +// Correctness check: each dense CUDA kernel vs its CPU reference on a single +// synthetic problem (N = 8 parts). +// +// Coverage: +// 1. cuda_kernels::fit_se3 vs ref_cpu::fit_se3 +// 2. cuda_kernels::fit_twist vs ref_cpu::fit_twist +// 3. cuda_kernels::compute_Pi_host vs ref_cpu::compute_Pi +// 4. cuda_kernels::compute_L vs ref_cpu::compute_L +// 5. cuda_kernels::assemble_b vs ref_cpu::assemble_b +// +// Layout conventions (verified against src/system.cpp upload helpers): +// * MatrixX3d / MatrixX4d aliases default to ColMajor; we transcode to +// RowMajor before any DeviceBuffer upload, matching upload_input(). +// * I_CC is RowMajor on the host already (see system.hpp), so its raw +// .data() can be uploaded directly. +// * Quaternions are flat xyzw, length-4. Eigen::Quaterniond::coeffs() +// returns xyzw, so we round-trip via that. +// +// No bug fixes were required in src/kernels.cu, src/linalg.cu, or +// src/ref_cpu.cpp to make this test pass. 
+ +#include "breakage_cuda/device.hpp" +#include "breakage_cuda/kernels.hpp" +#include "breakage_cuda/linalg.hpp" +#include "breakage_cuda/ref_cpu.hpp" +#include "breakage_cuda/system.hpp" + +#include + +#include +#include +#include +#include +#include +#include + +namespace bc = breakage_cuda; + +namespace { + +constexpr int kN = 8; + +// Per-test record collected so we can emit a final summary table. +struct TestResult { + std::string name; + bool passed = false; + double max_abs_diff = 0.0; + double tol = 0.0; + std::string detail; +}; + +void log_pass(const std::string &name, double diff, const std::string &detail) { + std::printf("[PASS] %-12s max_abs_diff=%.3e %s\n", name.c_str(), diff, + detail.c_str()); +} + +void log_fail(const std::string &name, double diff, const std::string &detail) { + std::printf("[FAIL] %-12s max_abs_diff=%.3e %s\n", name.c_str(), diff, + detail.c_str()); +} + +// Sample a uniformly random unit quaternion via four Gaussians + normalize. +Eigen::Quaterniond random_unit_quat(std::mt19937 &rng) { + std::normal_distribution n(0.0, 1.0); + Eigen::Quaterniond q; + // (x, y, z, w) order in coeffs(). + q.coeffs() << n(rng), n(rng), n(rng), n(rng); + q.normalize(); + if (q.w() < 0.0) { + q.coeffs() *= -1.0; + } + return q; +} + +// Compare two quaternions modulo the q ~ -q sign ambiguity. +double quat_diff_sign_invariant(const Eigen::Quaterniond &a, + const Eigen::Quaterniond &b) { + double d_pos = (a.coeffs() - b.coeffs()).cwiseAbs().maxCoeff(); + double d_neg = (a.coeffs() + b.coeffs()).cwiseAbs().maxCoeff(); + return std::min(d_pos, d_neg); +} + +// Max-abs diff between an N x 3 row-major flat buffer and an Eigen N x 3 +// matrix (any storage order). 
+double max_abs_diff_n3(const double *buf_rm, const bc::MatrixX3d &ref) { + Eigen::Map> + view(buf_rm, ref.rows(), 3); + return (view - ref).cwiseAbs().maxCoeff(); +} + +// Max-abs diff between a 9-double row-major buffer and an Eigen Matrix3d +// (Eigen Matrix3d is column-major by default; we compare element-wise). +double max_abs_diff_3x3(const double *buf_rm, const Eigen::Matrix3d &ref) { + double d = 0.0; + for (int i = 0; i < 3; ++i) { + for (int j = 0; j < 3; ++j) { + d = std::max(d, std::abs(buf_rm[3 * i + j] - ref(i, j))); + } + } + return d; +} + +// Random N x 3 dense matrix with entries from [lo, hi]. +bc::MatrixX3d random_n3(std::mt19937 &rng, int N, double lo, double hi) { + std::uniform_real_distribution u(lo, hi); + bc::MatrixX3d m(N, 3); + for (int i = 0; i < N; ++i) + for (int j = 0; j < 3; ++j) m(i, j) = u(rng); + return m; +} + +// Pack an Eigen N x 3 (any layout) into a row-major flat std::vector for +// device upload. +std::vector to_rm3_flat(const bc::MatrixX3d &m) { + std::vector v(static_cast(m.rows() * 3)); + for (int i = 0; i < m.rows(); ++i) { + v[3 * i + 0] = m(i, 0); + v[3 * i + 1] = m(i, 1); + v[3 * i + 2] = m(i, 2); + } + return v; +} + +// Same for N x 4 (used for quaternions). +std::vector to_rm4_flat(const bc::MatrixX4d &m) { + std::vector v(static_cast(m.rows() * 4)); + for (int i = 0; i < m.rows(); ++i) { + v[4 * i + 0] = m(i, 0); + v[4 * i + 1] = m(i, 1); + v[4 * i + 2] = m(i, 2); + v[4 * i + 3] = m(i, 3); + } + return v; +} + +} // namespace + +int main() { + std::printf("test_dense_kernels: N=%d\n", kN); + + std::mt19937 rng(0xDEADBEEFu); + std::uniform_real_distribution u_mass(0.5, 2.0); + std::uniform_real_distribution u_pos(-1.0, 1.0); + std::uniform_real_distribution u_small(-0.05, 0.05); + std::normal_distribution n01(0.0, 1.0); + + // Per-part mass + total mass. 
+  bc::VectorXd mass(kN);
+  for (int i = 0; i < kN; ++i) mass(i) = u_mass(rng);
+  const double total_mass = mass.sum();
+
+  // Reference frame: q0 random unit quat, t0 random in [-1, 1]^3.
+  // Current frame: qx = q0 * small rotation, tx = t0 + small noise. This
+  // keeps fit_se3's optimum well-defined and away from the q ~ -q boundary.
+  bc::MatrixX4d q0(kN, 4);
+  bc::MatrixX3d t0(kN, 3);
+  bc::MatrixX4d qx(kN, 4);
+  bc::MatrixX3d tx(kN, 3);
+  for (int i = 0; i < kN; ++i) {
+    Eigen::Quaterniond q = random_unit_quat(rng);
+    q0(i, 0) = q.x();
+    q0(i, 1) = q.y();
+    q0(i, 2) = q.z();
+    q0(i, 3) = q.w();
+    t0(i, 0) = u_pos(rng);
+    t0(i, 1) = u_pos(rng);
+    t0(i, 2) = u_pos(rng);
+
+    Eigen::Vector3d axis(n01(rng), n01(rng), n01(rng));
+    if (axis.norm() < 1e-12) axis = Eigen::Vector3d::UnitX();
+    double angle = 0.05 * u_pos(rng);
+    Eigen::Quaterniond rot(Eigen::AngleAxisd(angle, axis.normalized()));
+    Eigen::Quaterniond qxq = q * rot;
+    qxq.normalize();
+    if (qxq.w() < 0.0) qxq.coeffs() *= -1.0;
+    qx(i, 0) = qxq.x();
+    qx(i, 1) = qxq.y();
+    qx(i, 2) = qxq.z();
+    qx(i, 3) = qxq.w();
+
+    tx(i, 0) = t0(i, 0) + u_small(rng);
+    tx(i, 1) = t0(i, 1) + u_small(rng);
+    tx(i, 2) = t0(i, 2) + u_small(rng);
+  }
+
+  // Per-part w / v for fit_twist (small range).
+  bc::MatrixX3d w_in = random_n3(rng, kN, -0.5, 0.5);
+  bc::MatrixX3d v_in = random_n3(rng, kN, -0.5, 0.5);
+  // c_CC[i] = t0[i] (per the task spec).
+  bc::MatrixX3d c_CC = t0;
+
+  // Inertia tensors: diag(2, 3, 5) + small symmetric off-diagonal noise.
+  // Stored row-major as (N, 9), matching HostSystem::I_CC.
+  // FIX(review): the matrix's template arguments were stripped by
+  // transcoding; restored from the "row-major (N, 9)" comment above.
+  Eigen::Matrix<double, Eigen::Dynamic, 9, Eigen::RowMajor> I_flat(kN, 9);
+  for (int i = 0; i < kN; ++i) {
+    double a01 = 0.05 * u_pos(rng);
+    double a02 = 0.05 * u_pos(rng);
+    double a12 = 0.05 * u_pos(rng);
+    double diag[3] = {2.0, 3.0, 5.0};
+    double M[9] = {diag[0], a01, a02, //
+                   a01, diag[1], a12, //
+                   a02, a12, diag[2]};
+    for (int k = 0; k < 9; ++k) I_flat(i, k) = M[k];
+  }
+
+  // J / H impulses for assemble_b.
+  bc::MatrixX3d J = random_n3(rng, kN, -0.1, 0.1);
+  bc::MatrixX3d H = random_n3(rng, kN, -0.1, 0.1);
+
+  // v_W / L scratch matrices (random) for assemble_b. Note these are
+  // independent of fit_twist's outputs; the kernel just consumes them.
+  bc::MatrixX3d v_W_curr = random_n3(rng, kN, -0.5, 0.5);
+  bc::MatrixX3d v_W_prev = random_n3(rng, kN, -0.5, 0.5);
+  bc::MatrixX3d L_curr = random_n3(rng, kN, -0.5, 0.5);
+  bc::MatrixX3d L_prev = random_n3(rng, kN, -0.5, 0.5);
+
+  const double lambda_R = 1e-3;
+  const double lambda_w = 1e-3;
+  const double dt = 1.0 / 240.0;
+  const double L0 = 1.0;
+
+  // -- Reusable device buffers ----------------------------------------------
+  std::vector<double> q0_h = to_rm4_flat(q0);
+  std::vector<double> qx_h = to_rm4_flat(qx);
+  std::vector<double> t0_h = to_rm3_flat(t0);
+  std::vector<double> tx_h = to_rm3_flat(tx);
+  std::vector<double> c_CC_h = to_rm3_flat(c_CC);
+  std::vector<double> w_h = to_rm3_flat(w_in);
+  std::vector<double> v_h = to_rm3_flat(v_in);
+
+  bc::DeviceBuffer q0_d, qx_d, t0_d, tx_d, c_CC_d, w_d, v_d, mass_d;
+  q0_d.copy_from_host(q0_h.data(), q0_h.size());
+  qx_d.copy_from_host(qx_h.data(), qx_h.size());
+  t0_d.copy_from_host(t0_h.data(), t0_h.size());
+  tx_d.copy_from_host(tx_h.data(), tx_h.size());
+  c_CC_d.copy_from_host(c_CC_h.data(), c_CC_h.size());
+  w_d.copy_from_host(w_h.data(), w_h.size());
+  v_d.copy_from_host(v_h.data(), v_h.size());
+  mass_d.copy_from_host(mass.data(), static_cast<std::size_t>(mass.size()));
+
+  std::vector<TestResult> results;
+
+  // -------------------------------------------------------------------------
+  // 1. fit_se3
+  // -------------------------------------------------------------------------
+  {
+    bc::ref_cpu::Transformd cpu = bc::ref_cpu::fit_se3(
+        q0, t0, qx, tx, mass, total_mass, lambda_R);
+
+    double q_out[4] = {0, 0, 0, 0};
+    double t_out[3] = {0, 0, 0};
+    bc::cuda_kernels::fit_se3(kN, q0_d.data(), t0_d.data(), qx_d.data(),
+                              tx_d.data(), mass_d.data(), total_mass,
+                              lambda_R, q_out, t_out);
+
+    Eigen::Quaterniond q_gpu;
+    q_gpu.coeffs() << q_out[0], q_out[1], q_out[2], q_out[3];
+
+    double q_diff = quat_diff_sign_invariant(cpu.q, q_gpu);
+    double t_diff =
+        (cpu.t - Eigen::Vector3d(t_out[0], t_out[1], t_out[2])).cwiseAbs().maxCoeff();
+    double max_d = std::max(q_diff, t_diff);
+    char buf[160];
+    std::snprintf(buf, sizeof(buf), "(q_diff=%.3e, t_diff=%.3e)", q_diff,
+                  t_diff);
+    constexpr double tol = 1e-9;
+    TestResult r{"fit_se3", max_d <= tol, max_d, tol, buf};
+    if (r.passed)
+      log_pass(r.name, max_d, buf);
+    else
+      log_fail(r.name, max_d, buf);
+    results.push_back(std::move(r));
+  }
+
+  // -------------------------------------------------------------------------
+  // 2. fit_twist
+  //
+  // Use T_W_CC = identity quaternion (xyzw = (0, 0, 0, 1)) + zero translation
+  // so the c_W = c_CC simplification holds and any divergence comes from the
+  // twist solve itself.
+  // -------------------------------------------------------------------------
+  {
+    bc::ref_cpu::Transformd T_W_CC{Eigen::Quaterniond::Identity(),
+                                   bc::Vector3d::Zero()};
+    bc::ref_cpu::TwistFitResult cpu = bc::ref_cpu::fit_twist(
+        w_in, v_in, c_CC, T_W_CC, mass, total_mass, lambda_w);
+
+    const double q_W_CC[4] = {0.0, 0.0, 0.0, 1.0};
+    const double t_W_CC[3] = {0.0, 0.0, 0.0};
+    double w0_out[3] = {0, 0, 0};
+    double v0_out[3] = {0, 0, 0};
+    // FIX(review): stripped cast target restored (3 * kN doubles).
+    bc::DeviceBuffer v_W_d(static_cast<std::size_t>(3) * kN);
+
+    bc::cuda_kernels::fit_twist(kN, w_d.data(), v_d.data(), c_CC_d.data(),
+                                q_W_CC, t_W_CC, mass_d.data(), total_mass,
+                                lambda_w, w0_out, v0_out, v_W_d.data());
+
+    Eigen::Vector3d w0_gpu(w0_out[0], w0_out[1], w0_out[2]);
+    Eigen::Vector3d v0_gpu(v0_out[0], v0_out[1], v0_out[2]);
+    double w0_diff = (cpu.w0 - w0_gpu).cwiseAbs().maxCoeff();
+    double v0_diff = (cpu.v0 - v0_gpu).cwiseAbs().maxCoeff();
+
+    std::vector<double> v_W_h(static_cast<std::size_t>(3) * kN);
+    v_W_d.copy_to_host(v_W_h.data(), v_W_h.size());
+    double vW_diff = max_abs_diff_n3(v_W_h.data(), cpu.v_W);
+
+    double max_d = std::max({w0_diff, v0_diff, vW_diff});
+    char buf[200];
+    std::snprintf(buf, sizeof(buf),
+                  "(w0_diff=%.3e, v0_diff=%.3e, v_W_diff=%.3e)", w0_diff,
+                  v0_diff, vW_diff);
+    constexpr double tol = 1e-9;
+    TestResult r{"fit_twist", max_d <= tol, max_d, tol, buf};
+    if (r.passed)
+      log_pass(r.name, max_d, buf);
+    else
+      log_fail(r.name, max_d, buf);
+    results.push_back(std::move(r));
+  }
+
+  // -------------------------------------------------------------------------
+  // 3. compute_Pi
+  // -------------------------------------------------------------------------
+  {
+    Eigen::Quaterniond qm = random_unit_quat(rng);
+    Eigen::Quaterniond qp = random_unit_quat(rng);
+
+    Eigen::Matrix3d Pi_cpu = bc::ref_cpu::compute_Pi(qm, qp);
+
+    double qm_a[4] = {qm.x(), qm.y(), qm.z(), qm.w()};
+    double qp_a[4] = {qp.x(), qp.y(), qp.z(), qp.w()};
+    double Pi_gpu[9] = {0};
+    bc::cuda_kernels::compute_Pi_host(qm_a, qp_a, Pi_gpu);
+
+    double max_d = max_abs_diff_3x3(Pi_gpu, Pi_cpu);
+    char buf[80];
+    std::snprintf(buf, sizeof(buf), "(3x3)");
+    constexpr double tol = 1e-12;
+    TestResult r{"compute_Pi", max_d <= tol, max_d, tol, buf};
+    if (r.passed)
+      log_pass(r.name, max_d, buf);
+    else
+      log_fail(r.name, max_d, buf);
+    results.push_back(std::move(r));
+  }
+
+  // -------------------------------------------------------------------------
+  // 4. compute_L
+  // -------------------------------------------------------------------------
+  {
+    Eigen::Quaterniond q_W_CC = random_unit_quat(rng);
+    Eigen::Vector3d w0(0.7, -0.4, 0.2);
+
+    bc::MatrixX3d L_ref = bc::ref_cpu::compute_L(I_flat, q_W_CC, w0);
+
+    bc::DeviceBuffer I_CC_d;
+    // I_flat is already RowMajor; .data() is row-major (N, 9) flatten.
+    I_CC_d.copy_from_host(I_flat.data(),
+                          static_cast<std::size_t>(I_flat.size()));
+
+    double q_a[4] = {q_W_CC.x(), q_W_CC.y(), q_W_CC.z(), q_W_CC.w()};
+    double w_a[3] = {w0.x(), w0.y(), w0.z()};
+    bc::DeviceBuffer L_d(static_cast<std::size_t>(3) * kN);
+    bc::cuda_kernels::compute_L(kN, I_CC_d.data(), q_a, w_a, L_d.data());
+
+    std::vector<double> L_h(static_cast<std::size_t>(3) * kN);
+    L_d.copy_to_host(L_h.data(), L_h.size());
+
+    double max_d = max_abs_diff_n3(L_h.data(), L_ref);
+    char buf[80];
+    std::snprintf(buf, sizeof(buf), "(N=%d, 3 cols)", kN);
+    constexpr double tol = 1e-9;
+    TestResult r{"compute_L", max_d <= tol, max_d, tol, buf};
+    if (r.passed)
+      log_pass(r.name, max_d, buf);
+    else
+      log_fail(r.name, max_d, buf);
+    results.push_back(std::move(r));
+  }
+
+  // -------------------------------------------------------------------------
+  // 5. assemble_b
+  //
+  // ref_cpu::assemble_b only reads sys.{num_parts, mass, L0} and
+  // in.{dt, J, H} (sparse blocks of HostSystem are untouched), so we leave
+  // the rest at default-constructed.
+  // -------------------------------------------------------------------------
+  {
+    bc::HostSystem sys;
+    sys.num_parts = kN;
+    sys.mass = mass;
+    sys.L0 = L0;
+
+    bc::HostInput in;
+    in.dt = dt;
+    in.J = J;
+    in.H = H;
+
+    Eigen::Matrix3d Pi;
+    Pi << 1.1, 0.2, -0.1,
+        -0.05, 0.95, 0.07,
+        0.03, -0.02, 1.03;
+
+    bc::VectorXd b_cpu = bc::ref_cpu::assemble_b(sys, in, v_W_curr, v_W_prev,
+                                                 L_curr, L_prev, Pi);
+
+    std::vector<double> v_W_curr_h = to_rm3_flat(v_W_curr);
+    std::vector<double> v_W_prev_h = to_rm3_flat(v_W_prev);
+    std::vector<double> L_curr_h = to_rm3_flat(L_curr);
+    std::vector<double> L_prev_h = to_rm3_flat(L_prev);
+    std::vector<double> J_h = to_rm3_flat(J);
+    std::vector<double> H_h = to_rm3_flat(H);
+
+    bc::DeviceBuffer v_W_curr_d, v_W_prev_d, L_curr_d, L_prev_d, J_d,
+        H_d;
+    v_W_curr_d.copy_from_host(v_W_curr_h.data(), v_W_curr_h.size());
+    v_W_prev_d.copy_from_host(v_W_prev_h.data(), v_W_prev_h.size());
+    L_curr_d.copy_from_host(L_curr_h.data(), L_curr_h.size());
+    L_prev_d.copy_from_host(L_prev_h.data(), L_prev_h.size());
+    J_d.copy_from_host(J_h.data(), J_h.size());
+    H_d.copy_from_host(H_h.data(), H_h.size());
+
+    // Pi in row-major flat form for the CUDA kernel.
+    double Pi_flat[9];
+    for (int i = 0; i < 3; ++i)
+      for (int j = 0; j < 3; ++j) Pi_flat[3 * i + j] = Pi(i, j);
+
+    bc::DeviceBuffer b_d(static_cast<std::size_t>(6) * kN);
+    bc::cuda_kernels::assemble_b(kN, v_W_curr_d.data(), v_W_prev_d.data(),
+                                 L_curr_d.data(), L_prev_d.data(), J_d.data(),
+                                 H_d.data(), mass_d.data(), Pi_flat, dt, L0,
+                                 b_d.data());
+
+    std::vector<double> b_h(static_cast<std::size_t>(6) * kN);
+    b_d.copy_to_host(b_h.data(), b_h.size());
+
+    // b_cpu is laid out row-major as (N, 6); compare element-wise to the
+    // (already row-major) GPU buffer.
+    double max_d = 0.0;
+    for (std::size_t i = 0; i < b_h.size(); ++i) {
+      // FIX(review): stripped cast target restored — VectorXd is indexed
+      // with Eigen::Index, not std::size_t.
+      max_d = std::max(max_d,
+                       std::abs(b_h[i] - b_cpu(static_cast<Eigen::Index>(i))));
+    }
+    char buf[80];
+    std::snprintf(buf, sizeof(buf), "(6N=%d)", 6 * kN);
+    constexpr double tol = 1e-9;
+    TestResult r{"assemble_b", max_d <= tol, max_d, tol, buf};
+    if (r.passed)
+      log_pass(r.name, max_d, buf);
+    else
+      log_fail(r.name, max_d, buf);
+    results.push_back(std::move(r));
+  }
+
+  // -------------------------------------------------------------------------
+  // Summary table.
+  // -------------------------------------------------------------------------
+  std::printf("\n=========================== Summary ===========================\n");
+  std::printf("%-15s %-6s %-12s %-12s %s\n", "kernel", "status", "max_diff",
+              "tol", "detail");
+  std::printf("---------------------------------------------------------------\n");
+  bool all_pass = true;
+  for (const auto &r : results) {
+    std::printf("%-15s %-6s %-12.3e %-12.3e %s\n", r.name.c_str(),
+                r.passed ? "PASS" : "FAIL", r.max_abs_diff, r.tol,
+                r.detail.c_str());
+    if (!r.passed) all_pass = false;
+  }
+  std::printf("===============================================================\n");
+  std::printf("%s\n", all_pass ? "OVERALL: PASS" : "OVERALL: FAIL");
+  return all_pass ? 0 : 1;
+}
diff --git a/native/cuda/breakage_cuda/tests/test_full_pipeline.cpp b/native/cuda/breakage_cuda/tests/test_full_pipeline.cpp
new file mode 100644
index 0000000..e6b6e77
--- /dev/null
+++ b/native/cuda/breakage_cuda/tests/test_full_pipeline.cpp
@@ -0,0 +1,329 @@
+// SPDX-License-Identifier: MIT
+//
+// End-to-end pipeline test: drives the OSQP-backed CPU baseline
+// (`ref_cpu::solve_full`) and the hand-written CUDA pipeline
+// (`cuda_qp::Pipeline`) on the same input and checks that they agree on the
+// un-whitened solution `x` and the per-clutch utilization vector.
+//
+// Two modes:
+//   1. JSON-driven: argv[1] is a path to a debug-dump JSON produced by
+//      bricksim::BreakageChecker (i.e. one of the dumps that
+//      `breakage_cuda::load_debug_dump` accepts).
+//   2. Synthetic fallback (no args): builds the smallest non-trivial
+//      HostSystem (N=2 parts, K=1 clutch, ncv=0) by hand. The matrices follow
+//      the `BreakageSystem::check_shape` invariants from breakage.cppm:
+//      `num_vars = ncv + 9*K`, `num_eq = 6*N`, V is (mh, K), etc.
+//
+// Tolerance: relative inf-norm error <= 1e-2. Both solvers are independent
+// ADMM implementations so bit-exact agreement is not expected; we only check
+// that they land in the same neighbourhood.
+
+#include "breakage_cuda/qp_solver.hpp"
+#include "breakage_cuda/ref_cpu.hpp"
+#include "breakage_cuda/system.hpp"
+
+// NOTE(review): the include targets below were stripped by a transcoding
+// step; reconstructed from the Eigen / std:: names used in this TU — the
+// exact original set should be confirmed against the upstream patch.
+#include <Eigen/Dense>
+#include <Eigen/Geometry>
+#include <Eigen/SparseCore>
+
+#include <cstdio>
+#include <cstdlib>
+#include <exception>
+#include <string>
+#include <utility>
+#include <vector>
+
+namespace {
+
+using namespace breakage_cuda;
+using Trip = Eigen::Triplet<double>;
+
+// ---------------------------------------------------------------------------
+// Synthetic builder.
+//
+// Layout: 2 parts arranged on the x-axis at +/-0.5 m, joined by 1 clutch
+// (so num_clutches = 1, num_contact_vertices = 0, num_vars = 9). Each per-
+// part 3x3 inertia is the identity. Q is a small positive-definite diagonal
+// (matches the contact regularization role from breakage.cppm). A is set up
+// so that the first three "axial" variables of the clutch couple the two
+// parts' linear residuals with opposite signs (Newton's third law for the
+// linear part); the angular block of A is zero, so the angular components of
+// b appear as projection-stage slack but the rest of the QP is well-posed.
+// G enforces a non-negativity on the axial variable; H + V give two friction-
+// cone-like rows so the relaxation stage has something to do.
+// ---------------------------------------------------------------------------
+HostSystem make_synthetic_system() {
+  HostSystem sys;
+  const int N = 2;
+  const int K = 1;
+
+  sys.num_parts = N;
+  sys.num_clutches = K;
+  sys.num_contact_vertices = 0;
+  sys.num_vars = 9 * K;
+  sys.num_eq = 6 * N;
+  sys.num_ineq = 1;
+  sys.num_relaxed_ineq = 2;
+  sys.total_mass = 2.0;
+  sys.L0 = 1.0;
+
+  sys.mass = VectorXd::Constant(N, 1.0);
+
+  sys.q_CC.resize(N, 4);
+  sys.q_CC.row(0) << 0.0, 0.0, 0.0, 1.0;
+  sys.q_CC.row(1) << 0.0, 0.0, 0.0, 1.0;
+
+  sys.c_CC.resize(N, 3);
+  sys.c_CC.row(0) << -0.5, 0.0, 0.0;
+  sys.c_CC.row(1) << 0.5, 0.0, 0.0;
+
+  sys.I_CC.resize(N, 9);
+  for (int i = 0; i < N; ++i) {
+    sys.I_CC.row(i) << 1.0, 0.0, 0.0,
+        0.0, 1.0, 0.0,
+        0.0, 0.0, 1.0;
+  }
+
+  // Q: 0.1 * I_9. Small positive definite; mirrors ContactRegularization on
+  // the diagonal in breakage.cppm.
+  // FIX(review): the triplet vectors' stripped element type restored
+  // (`std::vector<Trip>`), matching the Trip alias above.
+  std::vector<Trip> Qt;
+  Qt.reserve(9);
+  for (int i = 0; i < 9; ++i) Qt.emplace_back(i, i, 0.1);
+  sys.Q.resize(9, 9);
+  sys.Q.setFromTriplets(Qt.begin(), Qt.end());
+  sys.Q.makeCompressed();
+
+  // A: 12 x 9.
+  //   Linear coupling: -I_3 on cols 0..2 of A_i (rows 0..2),
+  //                    +I_3 on cols 0..2 of A_j (rows 6..8).
+  //   Angular rows (3..5 and 9..11) are zero - the angular component of b
+  //   becomes projection-stage slack on both backends.
+  std::vector<Trip> At;
+  At.reserve(6);
+  for (int axis = 0; axis < 3; ++axis) {
+    At.emplace_back(axis, axis, -1.0);
+    At.emplace_back(6 + axis, axis, 1.0);
+  }
+  sys.A.resize(12, 9);
+  sys.A.setFromTriplets(At.begin(), At.end());
+  sys.A.makeCompressed();
+
+  // G: x[0] >= 0 (axial force is compressive).
+  std::vector<Trip> Gt;
+  Gt.emplace_back(0, 0, 1.0);
+  sys.G.resize(1, 9);
+  sys.G.setFromTriplets(Gt.begin(), Gt.end());
+  sys.G.makeCompressed();
+
+  // H: 2 friction-cone rows of small magnitude so the limit stays slack at
+  // the optimum (we want to compare unconstrained-ish behaviour, not test
+  // who clamps first).
+  std::vector<Trip> Ht;
+  Ht.emplace_back(0, 0, -0.1);
+  Ht.emplace_back(0, 1, 0.1);
+  Ht.emplace_back(1, 0, -0.1);
+  Ht.emplace_back(1, 1, -0.1);
+  sys.H.resize(2, 9);
+  sys.H.setFromTriplets(Ht.begin(), Ht.end());
+  sys.H.makeCompressed();
+
+  // V: 2 x 1, both relaxed-ineq rows belong to clutch 0.
+  std::vector<Trip> Vt;
+  Vt.emplace_back(0, 0, 1.0);
+  Vt.emplace_back(1, 0, 1.0);
+  sys.V.resize(2, 1);
+  sys.V.setFromTriplets(Vt.begin(), Vt.end());
+  sys.V.makeCompressed();
+
+  // Capacity rows: both for clutch 0; structured per breakage.cppm
+  // (head<3>() = "force used", tail<6>() = "force capacity offset").
+  sys.capacity_clutch_indices = {0, 0};
+  sys.capacities.resize(2, 9);
+  sys.capacities.row(0) << 1.0, 0.0, 0.0, 0.1, 0.0, 0.0, 0.0, 0.0, 0.0;
+  sys.capacities.row(1) << 1.0, 0.0, 0.0, 0.1, 0.0, 0.0, 0.0, 0.0, 0.0;
+
+  // Wk = identity (so CPU's column-major Map and GPU's row-major read
+  // produce the same matrix; in production both bricksim and breakage_cuda
+  // assume Wk is symmetric so the two readouts agree by construction).
+  sys.clutch_whiten.resize(1, 4);
+  sys.clutch_whiten.row(0) << 1.0, 0.0, 0.0, 1.0;
+
+  return sys;
+}
+
+HostInput make_synthetic_input() {
+  HostInput in;
+  const int N = 2;
+
+  in.dt = 0.01;
+  in.w = MatrixX3d::Zero(N, 3);
+
+  // Equal-and-opposite y velocities. After fit_twist this picks up a tiny
+  // z-axis rotation (w0_z = -0.01) and produces both linear AND angular
+  // contributions to b.
+  in.v.resize(N, 3);
+  in.v.row(0) << 0.0, 0.05, 0.0;
+  in.v.row(1) << 0.0, -0.05, 0.0;
+
+  in.q.resize(N, 4);
+  in.q.row(0) << 0.0, 0.0, 0.0, 1.0;
+  in.q.row(1) << 0.0, 0.0, 0.0, 1.0;
+
+  // Very small displacement off c_CC; fit_se3 still recovers ~identity.
+  in.c.resize(N, 3);
+  in.c.row(0) << -0.5, 0.0005, 0.0;
+  in.c.row(1) << 0.5, -0.0005, 0.0;
+
+  in.J = MatrixX3d::Zero(N, 3);
+  in.H = MatrixX3d::Zero(N, 3);
+  return in;
+}
+
+HostState make_synthetic_state(int N) {
+  HostState st;
+  st.q_W_CC_prev = Eigen::Quaterniond::Identity();
+  st.v_W_prev = MatrixX3d::Zero(N, 3);
+  st.L_prev = MatrixX3d::Zero(N, 3);
+  return st;
+}
+
+// ---------------------------------------------------------------------------
+// Pretty-printing.
+// ---------------------------------------------------------------------------
+void print_solve_info(const char *label, const SolveInfo &info) {
+  std::printf("    %-3s prj iter=%4d res(p,d)=(%.2e,%.2e) %s | "
+              "rlx iter=%4d res(p,d)=(%.2e,%.2e) %s | "
+              "opt iter=%4d res(p,d)=(%.2e,%.2e) %s\n",
+              label,
+              info.prj.iter, info.prj.prim_res, info.prj.dual_res,
+              info.prj.converged ? "OK  " : "FAIL",
+              info.rlx.iter, info.rlx.prim_res, info.rlx.dual_res,
+              info.rlx.converged ? "OK  " : "FAIL",
+              info.opt.iter, info.opt.prim_res, info.opt.dual_res,
+              info.opt.converged ? "OK  " : "FAIL");
+}
+
+bool run_compare(const char *label, const HostSystem &sys, const HostInput &in,
+                 HostState state_cpu, HostState state_gpu,
+                 const Thresholds &thr) {
+  std::printf("[%s] num_parts=%d num_clutches=%d num_vars=%d num_eq=%d "
+              "num_ineq=%d num_relaxed_ineq=%d\n",
+              label, sys.num_parts, sys.num_clutches, sys.num_vars,
+              sys.num_eq, sys.num_ineq, sys.num_relaxed_ineq);
+
+  Solution sol_cpu = ref_cpu::solve_full(sys, in, state_cpu, thr);
+  // FIX(review): stripped cast targets restored throughout this function
+  // (`static_cast<int>` for bool -> %d, `static_cast<long long>` for
+  // Eigen::Index -> %lld).
+  std::printf("  cpu converged=%d slack_fraction=%.3e\n",
+              static_cast<int>(sol_cpu.info.converged),
+              sol_cpu.slack_fraction);
+  print_solve_info("cpu", sol_cpu.info);
+
+  cuda_qp::Pipeline pipe;
+  pipe.setup(sys, thr);
+  Solution sol_gpu = pipe.solve(in, state_gpu);
+  std::printf("  gpu converged=%d slack_fraction=%.3e\n",
+              static_cast<int>(sol_gpu.info.converged),
+              sol_gpu.slack_fraction);
+  print_solve_info("gpu", sol_gpu.info);
+
+  bool ok = true;
+  if (!sol_cpu.info.converged) {
+    std::printf("[FAIL] %s: cpu solver did not converge\n", label);
+    ok = false;
+  }
+  if (!sol_gpu.info.converged) {
+    std::printf("[FAIL] %s: gpu solver did not converge\n", label);
+    ok = false;
+  }
+  if (!ok) return false;
+
+  if (sol_cpu.x.size() != sol_gpu.x.size()) {
+    std::printf("[FAIL] %s: x size mismatch (cpu=%lld gpu=%lld)\n", label,
+                static_cast<long long>(sol_cpu.x.size()),
+                static_cast<long long>(sol_gpu.x.size()));
+    return false;
+  }
+  if (sol_cpu.utilization.size() != sol_gpu.utilization.size()) {
+    std::printf("[FAIL] %s: utilization size mismatch (cpu=%lld gpu=%lld)\n",
+                label,
+                static_cast<long long>(sol_cpu.utilization.size()),
+                static_cast<long long>(sol_gpu.utilization.size()));
+    return false;
+  }
+
+  // Floor: 1e-3 absolute. With an x of order ~1, 1e-2 relative ~ 1e-2
+  // absolute, which is well above the floor; tiny solutions still get a
+  // sensible scale to compare against.
+  constexpr double kFloor = 1e-3;
+  constexpr double kTol = 1e-2;
+
+  const double err_x = relative_inf_error(sol_gpu.x, sol_cpu.x, kFloor);
+  std::printf("  x: cpu_inf=%.4e gpu_inf=%.4e abs_diff_inf=%.4e "
+              "rel_inf_err=%.4e\n",
+              sol_cpu.x.cwiseAbs().maxCoeff(),
+              sol_gpu.x.cwiseAbs().maxCoeff(),
+              (sol_cpu.x - sol_gpu.x).cwiseAbs().maxCoeff(), err_x);
+  if (err_x <= kTol) {
+    std::printf("[PASS] full_pipeline.x rel_inf=%.4e <= %.0e\n",
+                err_x, kTol);
+  } else {
+    std::printf("[FAIL] full_pipeline.x rel_inf=%.4e > %.0e\n",
+                err_x, kTol);
+    ok = false;
+  }
+
+  const double err_u =
+      relative_inf_error(sol_gpu.utilization, sol_cpu.utilization, kFloor);
+  std::printf("  utilization: cpu_inf=%.4e gpu_inf=%.4e abs_diff_inf=%.4e "
+              "rel_inf_err=%.4e\n",
+              sol_cpu.utilization.cwiseAbs().maxCoeff(),
+              sol_gpu.utilization.cwiseAbs().maxCoeff(),
+              (sol_cpu.utilization - sol_gpu.utilization)
+                  .cwiseAbs()
+                  .maxCoeff(),
+              err_u);
+  if (err_u <= kTol) {
+    std::printf("[PASS] full_pipeline.utilization rel_inf=%.4e <= %.0e\n",
+                err_u, kTol);
+  } else {
+    std::printf("[FAIL] full_pipeline.utilization rel_inf=%.4e > %.0e\n",
+                err_u, kTol);
+    ok = false;
+  }
+
+  return ok;
+}
+
+bool run_json_mode(const std::string &path) {
+  std::printf("[full_pipeline] (JSON mode) loading %s\n", path.c_str());
+  DebugDump dump = load_debug_dump(path);
+  // Both solvers mutate state; clone so each sees a fresh copy.
+  HostState state_cpu = dump.state;
+  HostState state_gpu = dump.state;
+  return run_compare("json", dump.system, dump.input, std::move(state_cpu),
+                     std::move(state_gpu), dump.thresholds);
+}
+
+bool run_synthetic_mode() {
+  std::printf("[full_pipeline] (synthetic mode)\n");
+  HostSystem sys = make_synthetic_system();
+  HostInput in = make_synthetic_input();
+  HostState state_cpu = make_synthetic_state(sys.num_parts);
+  HostState state_gpu = make_synthetic_state(sys.num_parts);
+  Thresholds thr;
+  thr.enabled = true;
+  return run_compare("synthetic", sys, in, std::move(state_cpu),
+                     std::move(state_gpu), thr);
+}
+
+} // namespace
+
+int main(int argc, char **argv) {
+  try {
+    bool ok = (argc >= 2) ? run_json_mode(argv[1]) : run_synthetic_mode();
+    std::printf("[full_pipeline] %s\n", ok ? "OK" : "FAILED");
+    return ok ? 0 : 1;
+  } catch (const std::exception &e) {
+    std::fprintf(stderr, "[ERROR] full_pipeline: %s\n", e.what());
+    return 2;
+  }
+}
diff --git a/native/cuda/breakage_cuda/tests/test_qp_random.cpp b/native/cuda/breakage_cuda/tests/test_qp_random.cpp
new file mode 100644
index 0000000..c3234f4
--- /dev/null
+++ b/native/cuda/breakage_cuda/tests/test_qp_random.cpp
@@ -0,0 +1,283 @@
+// SPDX-License-Identifier: MIT
+//
+// Cross-validation test for the CUDA QP solver against the CPU reference
+// (real OSQP). Builds a tiny synthetic HostSystem and runs the same RHS
+// through both backends, then compares their stage-3 `x` outputs via a
+// relative inf-norm error.
+// +// Toy HostSystem layout (NOT a physical breakage problem; the QP solvers +// are oblivious to the per-part fields, so we only populate the matrices +// that `Solver::setup` / `InternalQpState::setup_from` actually read): +// +// num_parts = 1 => num_eq = me = 6 +// num_clutches = 1 => nv = 1 +// num_contact_vertices = 1 => num_vars = nx = 1 + 9*1 = 10 (consistent +// with +// breakage.cppm +// invariant) +// num_ineq = mi = 2 +// num_relaxed_ineq = mh = 2 +// +// A in R^{6x10}: [ I_6 | 0_{6x4} ] +// -> stage 1 finds b_prj = b for any b (range(A) = R^6), +// and stages 2/3 force x.head(6) == b exactly. +// Q in R^{10x10}: I_10 + 0.5 * sum_{i=0..3} (e_i e_{6+i}^T + e_{6+i} e_i^T) +// -> SPD (each (i, 6+i) 2x2 block has eigenvalues 0.5, 1.5; +// other diagonals are 1.0; condition number 3). +// -> stage 3 unconstrained min over y = x.tail(4) is +// y* = -0.5 * b.head(4). +// G in R^{2x10}: rows pick x[6], x[7] (so G x = [x[6]; x[7]] >= -eps). +// H in R^{2x10}: rows pick x[8], x[9] (so H x = [x[8]; x[9]] <= 1+V*v). +// V in R^{2x1}: ones, so the single clutch slack v couples both H rows. +// +// We pick b = [4, 4, -4, -4, 0, 0]: +// Stage 2 unconstrained optimum is x.tail(4) = 0, v = 0 -> v_relax = 0. +// Stage 3 unconstrained optimum y* = (-2, -2, 2, 2) violates ALL FOUR +// inequality constraints, so the constrained minimum hits the box at +// (y[0], y[1], y[2], y[3]) = (-eps, -eps, 1, 1). All three stages run +// with non-trivial active sets. +// +// The OSQP baseline and the hand-rolled CUDA ADMM are independent solvers +// with different inner KKT machinery (OSQP factors, CUDA uses CG on the +// reduced normal equations) and different exact convergence criteria, so +// we do NOT expect bit equality. Tolerance: relative inf-norm error +// `||x_cuda - x_cpu||_inf / max(1, ||x_cpu||_inf) <= 1e-2`. Per-stage +// iter / prim_res / dual_res are printed for both backends so we can see +// whether either one stalled. 
+//
+// SOURCE-FILE FIX APPLIED while writing this test:
+//   src/qp_solver.cu, Solver::solve, opt-stage upper bound:
+//     `u_opt.tail(mh_).setConstant(1.0 - kEpsilon);` was wrong --
+//     osqp.cppm (the source of truth) and ref_cpu.cpp both use plain 1.0
+//     for the H-row upper bound in stage 3 (the `1 - epsilon` slack
+//     belongs to stage 2 only). Changed to `setConstant(1.0)` so the two
+//     backends solve the same QP. Without this fix the CUDA opt-stage
+//     ceiling sits 1e-4 below OSQP's, which biases every active H-row
+//     constraint by epsilon.
+
+#include "breakage_cuda/device.hpp"
+#include "breakage_cuda/qp_solver.hpp"
+#include "breakage_cuda/ref_cpu.hpp"
+#include "breakage_cuda/system.hpp"
+
+// NOTE(review): include targets were stripped by a transcoding step;
+// reconstructed from the names used in this TU — confirm against the
+// upstream patch.
+#include <Eigen/Dense>
+#include <Eigen/SparseCore>
+
+#include <cuda_runtime.h>
+
+#include <cmath>
+#include <cstdio>
+#include <exception>
+#include <string>
+#include <vector>
+
+using breakage_cuda::DeviceBuffer;
+using breakage_cuda::DeviceSystem;
+using breakage_cuda::HostState;
+using breakage_cuda::HostSystem;
+using breakage_cuda::MatrixX3d;
+using breakage_cuda::MatrixX4d;
+using breakage_cuda::SolveInfo;
+using breakage_cuda::SparseMatrixCSC;
+using breakage_cuda::StageInfo;
+using breakage_cuda::VectorXd;
+using breakage_cuda::relative_inf_error;
+
+namespace {
+
+SparseMatrixCSC dense_to_csc(const Eigen::MatrixXd &m) {
+  // FIX(review): stripped template/cast targets restored
+  // (`std::vector<Eigen::Triplet<double>>`, `static_cast<std::size_t>`).
+  std::vector<Eigen::Triplet<double>> trips;
+  trips.reserve(static_cast<std::size_t>(m.rows()) *
+                static_cast<std::size_t>(m.cols()));
+  for (int j = 0; j < m.cols(); ++j) {
+    for (int i = 0; i < m.rows(); ++i) {
+      const double v = m(i, j);
+      if (v != 0.0) {
+        trips.emplace_back(i, j, v);
+      }
+    }
+  }
+  SparseMatrixCSC sp(m.rows(), m.cols());
+  sp.setFromTriplets(trips.begin(), trips.end());
+  sp.makeCompressed();
+  return sp;
+}
+
+HostSystem build_toy_system() {
+  HostSystem sys;
+  const int N = 1;
+  const int K = 1;
+  const int ncv = 1;
+  const int nx = ncv + 9 * K;
+  const int me = 6 * N;
+  const int mi = 2;
+  const int mh = 2;
+
+  sys.num_parts = N;
+  sys.num_clutches = K;
+  sys.num_contact_vertices = ncv;
+  sys.num_vars = nx;
+  sys.num_eq = me;
+  sys.num_ineq = mi;
+  sys.num_relaxed_ineq = mh;
+  sys.total_mass = 1.0;
+  sys.L0 = 1.0;
+
+  // Per-part dense fields are unused by the QP solvers (they're only
+  // touched by the dense kernels in the full pipeline). Populate them
+  // with sane defaults so downstream consumers wouldn't trip on garbage.
+  sys.mass = VectorXd::Constant(N, 1.0);
+  sys.q_CC = MatrixX4d::Zero(N, 4);
+  sys.q_CC(0, 3) = 1.0;
+  sys.c_CC = MatrixX3d::Zero(N, 3);
+  sys.I_CC.resize(N, 9);
+  sys.I_CC.setZero();
+  sys.I_CC(0, 0) = 1.0;
+  sys.I_CC(0, 4) = 1.0;
+  sys.I_CC(0, 8) = 1.0;
+
+  Eigen::MatrixXd Qd = Eigen::MatrixXd::Identity(nx, nx);
+  for (int i = 0; i < 4; ++i) {
+    Qd(i, 6 + i) = 0.5;
+    Qd(6 + i, i) = 0.5;
+  }
+  sys.Q = dense_to_csc(Qd);
+
+  Eigen::MatrixXd Ad = Eigen::MatrixXd::Zero(me, nx);
+  for (int i = 0; i < me; ++i) {
+    Ad(i, i) = 1.0;
+  }
+  sys.A = dense_to_csc(Ad);
+
+  Eigen::MatrixXd Gd = Eigen::MatrixXd::Zero(mi, nx);
+  Gd(0, 6) = 1.0;
+  Gd(1, 7) = 1.0;
+  sys.G = dense_to_csc(Gd);
+
+  Eigen::MatrixXd Hd = Eigen::MatrixXd::Zero(mh, nx);
+  Hd(0, 8) = 1.0;
+  Hd(1, 9) = 1.0;
+  sys.H = dense_to_csc(Hd);
+
+  Eigen::MatrixXd Vd = Eigen::MatrixXd::Ones(mh, K);
+  sys.V = dense_to_csc(Vd);
+
+  // Friction-cone metadata is only consulted by the postprocess step in
+  // the full pipeline; leave empty for the QP-only test.
+  sys.capacity_clutch_indices.clear();
+  sys.capacities.resize(0, 9);
+  sys.clutch_whiten.resize(K, 4);
+  sys.clutch_whiten.setZero();
+  sys.clutch_whiten(0, 0) = 1.0;
+  sys.clutch_whiten(0, 3) = 1.0;
+
+  return sys;
+}
+
+void print_stage(const char *name, const StageInfo &cpu, const StageInfo &cuda) {
+  std::printf("  %s:\n", name);
+  std::printf("    CPU  : converged=%d iter=%4d prim_res=%.3e dual_res=%.3e "
+              "rho=%.3e\n",
+              cpu.converged ? 1 : 0, cpu.iter, cpu.prim_res, cpu.dual_res,
+              cpu.rho);
+  std::printf("    CUDA : converged=%d iter=%4d prim_res=%.3e dual_res=%.3e "
+              "rho=%.3e\n",
+              cuda.converged ? 
1 : 0, cuda.iter, cuda.prim_res, cuda.dual_res, + cuda.rho); +} + +} // namespace + +int main() { + try { + HostSystem sys = build_toy_system(); + + VectorXd b(sys.num_eq); + b << 4.0, 4.0, -4.0, -4.0, 0.0, 0.0; + + std::printf("=== test_qp_random ===\n"); + std::printf("Toy QP: N=%d K=%d ncv=%d nx=%d me=%d mi=%d mh=%d nv=%d\n", + sys.num_parts, sys.num_clutches, sys.num_contact_vertices, + sys.num_vars, sys.num_eq, sys.num_ineq, sys.num_relaxed_ineq, + sys.num_clutches); + std::printf("RHS b = [%g, %g, %g, %g, %g, %g]\n", b[0], b[1], b[2], b[3], + b[4], b[5]); + + // -------------------- CPU reference (OSQP) ---------------------------- + HostState cpu_state; + cpu_state.v_W_prev = MatrixX3d::Zero(sys.num_parts, 3); + cpu_state.L_prev = MatrixX3d::Zero(sys.num_parts, 3); + VectorXd x_cpu, slack_cpu, v_relax_cpu; + SolveInfo cpu_info = breakage_cuda::ref_cpu::solve_qp( + sys, b, cpu_state, x_cpu, slack_cpu, v_relax_cpu); + + // -------------------- CUDA solver ------------------------------------- + // Solver::setup ignores its DeviceSystem argument (uses HostSystem + // matrices directly), so a default-constructed one is fine here. 
+ DeviceSystem dev_dummy{}; + breakage_cuda::cuda_qp::Settings settings; + breakage_cuda::cuda_qp::Solver cuda_solver; + cuda_solver.setup(sys, dev_dummy, settings); + + DeviceBuffer x_d(static_cast(sys.num_vars)); + DeviceBuffer slack_d(static_cast(sys.num_eq)); + DeviceBuffer v_relax_d( + static_cast(sys.num_clutches)); + + SolveInfo cuda_info = cuda_solver.solve(b, x_d.data(), slack_d.data(), + v_relax_d.data()); + + VectorXd x_cuda(sys.num_vars); + x_d.copy_to_host(x_cuda.data(), x_cuda.size()); + + VectorXd v_relax_cuda(sys.num_clutches); + if (sys.num_clutches > 0) { + v_relax_d.copy_to_host(v_relax_cuda.data(), v_relax_cuda.size()); + } + + // -------------------- Per-stage info ---------------------------------- + std::printf("\nPer-stage convergence (CPU = OSQP, CUDA = ADMM+CG):\n"); + print_stage("prj", cpu_info.prj, cuda_info.prj); + print_stage("rlx", cpu_info.rlx, cuda_info.rlx); + print_stage("opt", cpu_info.opt, cuda_info.opt); + std::printf(" v_relax: cpu = %.6e cuda = %.6e\n", v_relax_cpu[0], + v_relax_cuda[0]); + + // -------------------- Compare x --------------------------------------- + const double tol = 1e-2; + const double err = relative_inf_error(x_cuda, x_cpu, /*floor=*/1.0); + std::printf("\nx (CPU vs CUDA), relative inf-norm error = %.3e (tol %.0e)\n", + err, tol); + std::printf(" i : x_cpu x_cuda |diff|\n"); + for (int i = 0; i < sys.num_vars; ++i) { + std::printf(" %2d: %12.6f %12.6f %.3e\n", i, x_cpu[i], x_cuda[i], + std::abs(x_cpu[i] - x_cuda[i])); + } + + // -------------------- Pass / fail summary ----------------------------- + bool overall_ok = true; + auto stage_pass = [&](const char *name, bool cpu_conv, bool cuda_conv) { + const bool ok = cpu_conv && cuda_conv; + std::printf("[%s] %s convergence (CPU=%d, CUDA=%d)\n", + ok ? "PASS" : "FAIL", name, cpu_conv ? 1 : 0, + cuda_conv ? 
1 : 0); + if (!ok) overall_ok = false; + }; + stage_pass("prj", cpu_info.prj.converged, cuda_info.prj.converged); + stage_pass("rlx", cpu_info.rlx.converged, cuda_info.rlx.converged); + stage_pass("opt", cpu_info.opt.converged, cuda_info.opt.converged); + + if (err <= tol) { + std::printf("[PASS] x agreement (err = %.3e <= %.0e)\n", err, tol); + } else { + std::printf("[FAIL] x agreement (err = %.3e > %.0e)\n", err, tol); + overall_ok = false; + } + + std::printf("\n%s overall\n", overall_ok ? "[PASS]" : "[FAIL]"); + return overall_ok ? 0 : 1; + } catch (const std::exception &e) { + std::fprintf(stderr, "[FAIL] exception: %s\n", e.what()); + return 1; + } +} diff --git a/native/modules/bricksim/physx/breakage.cppm b/native/modules/bricksim/physx/breakage.cppm index e202ce6..dc79c1d 100644 --- a/native/modules/bricksim/physx/breakage.cppm +++ b/native/modules/bricksim/physx/breakage.cppm @@ -55,9 +55,10 @@ using QpSolver = OsqpSolver; using QpSolverState = OsqpState; using QpSolverInfo = OsqpInfo; -Transformd fit_se3(const MatrixX4d &q0, const MatrixX3d &t0, - const MatrixX4d &qx, const MatrixX3d &tx, - const VectorXd &mass, double total_mass, double lambda_R) { +export Transformd fit_se3(const MatrixX4d &q0, const MatrixX3d &t0, + const MatrixX4d &qx, const MatrixX3d &tx, + const VectorXd &mass, double total_mass, + double lambda_R) { Index N = mass.size(); Vector3d t0_bar = t0.transpose() * mass / total_mass; Vector3d tx_bar = tx.transpose() * mass / total_mass; @@ -87,16 +88,17 @@ Transformd fit_se3(const MatrixX4d &q0, const MatrixX3d &t0, return {q, t}; } -struct TwistFitResult { +export struct TwistFitResult { Vector3d w0; Vector3d v0; MatrixX3d v_W; }; -TwistFitResult fit_twist(const MatrixX3d &w, const MatrixX3d &v, - const MatrixX3d &c_CC, const Transformd &T_W_CC, - const VectorXd &mass, double total_mass, - double lambda_w) { +export TwistFitResult fit_twist(const MatrixX3d &w, const MatrixX3d &v, + const MatrixX3d &c_CC, + const Transformd &T_W_CC, + 
const VectorXd &mass, double total_mass, + double lambda_w) { const auto &[q_W_CC, t_W_CC] = T_W_CC; MatrixX3d c_W = (c_CC * q_W_CC.toRotationMatrix().transpose()).rowwise() + t_W_CC.transpose(); @@ -120,7 +122,7 @@ TwistFitResult fit_twist(const MatrixX3d &w, const MatrixX3d &v, return result; } -Vector3d so3_log_from_unit_quat(Quaterniond q) { +export Vector3d so3_log_from_unit_quat(Quaterniond q) { // log-map branch threshold on s = ||q.vec|| = sin(theta/2) constexpr double kSmallS = 5e-5; q.normalize(); @@ -143,7 +145,7 @@ Vector3d so3_log_from_unit_quat(Quaterniond q) { // Inverse of the Jacobian corresponding to your report's convention: // J(φ) = I + (1-cosθ)/θ^2 [φ]x + (θ-sinθ)/θ^3 [φ]x^2 (this is the "right Jacobian" in many texts) -Matrix3d so3_jacobian_inv(const Vector3d &phi) { +export Matrix3d so3_jacobian_inv(const Vector3d &phi) { // J^{-1} branch threshold on theta = ||phi|| constexpr double kSmallTheta = 1e-4; Matrix3d Phi = phi.asSkewSymmetric(); @@ -163,13 +165,13 @@ Matrix3d so3_jacobian_inv(const Vector3d &phi) { return Matrix3d::Identity() - 0.5 * Phi + c * (Phi * Phi); } -Matrix3d compute_Pi(const Quaterniond &qm, const Quaterniond &qp) { +export Matrix3d compute_Pi(const Quaterniond &qm, const Quaterniond &qp) { // Relative rotation: R_rel = Rm^T Rp return so3_jacobian_inv(so3_log_from_unit_quat(qm.conjugate() * qp)) * qm.toRotationMatrix().transpose(); } -template +export template MatrixX3d compute_L(const MatrixBase &Iflat, const Quaterniond &q_W_CC, const Vector3d &w0) { Vector3d w_CC = q_W_CC.conjugate() * w0; @@ -205,7 +207,7 @@ void add_block_triplets(std::vector> &out, int row0, int col0, } } -Matrix2d inv_sqrt_spd(const Matrix2d &C) { +export Matrix2d inv_sqrt_spd(const Matrix2d &C) { SelfAdjointEigenSolver eig{C}; if (eig.info() != Eigen::Success) { throw std::invalid_argument("inv_sqrt_spd: eigen decomposition failed"); diff --git a/native/tests/TestBreakage.cpp b/native/tests/TestBreakage.cpp new file mode 100644 index 
0000000..d88a9f6 --- /dev/null +++ b/native/tests/TestBreakage.cpp @@ -0,0 +1,433 @@ +import std; +import bricksim.physx.breakage; +import bricksim.utils.transforms; +import bricksim.vendor; + +#include + +using namespace bricksim; + +using Eigen::AngleAxisd; +using Eigen::Dynamic; +using Eigen::Matrix; +using Eigen::Matrix2d; +using Eigen::Matrix3d; +using Eigen::Quaterniond; +using Eigen::Vector3d; +using Eigen::Vector4d; +using Eigen::VectorXd; +using MatrixX3d = Matrix; +using MatrixX4d = Matrix; +using MatrixX9d = Matrix; + +// --------------------- helpers --------------------- + +static constexpr double kTol = 1e-9; + +template +static bool approx_mat(const A &a, const B &b, double tol = kTol) { + if (a.rows() != b.rows() || a.cols() != b.cols()) + return false; + double scale = 1.0 + std::max(a.cwiseAbs().maxCoeff(), + b.cwiseAbs().maxCoeff()); + return (a - b).cwiseAbs().maxCoeff() <= tol * scale; +} + +// Eigen stores `coeffs()` as (x, y, z, w). Same convention is used by the +// breakage code (`fit_se3` reads rows of the input quat matrix as (x,y,z,w) +// and feeds them into Quaterniond's vector ctor). +static Vector4d quat_xyzw(const Quaterniond &q) { + return q.coeffs(); +} + +static Quaterniond axis_angle(const Vector3d &axis_unit, double angle) { + return Quaterniond{AngleAxisd{angle, axis_unit}}; +} + +// Force evaluation of a 3-component Eigen expression into a Vector3d. Avoids +// the brace-init ambiguity between `Vector3d(MatrixBase&)` and +// `Vector3d(Scalar, Scalar, Scalar)` that some compilers stumble on. +template static Vector3d v3(const Expr &e) { + Vector3d out = e; + return out; +} + +// Closed-form right Jacobian J(phi) = I + (1-cosθ)/θ² [φ]× + +// (θ-sinθ)/θ³ [φ]ײ +// (matches the convention so3_jacobian_inv inverts). 
+static Matrix3d so3_jacobian_closed_form(const Vector3d &phi) { + Matrix3d Phi = phi.asSkewSymmetric(); + double theta2 = phi.squaredNorm(); + if (theta2 < 1e-20) { + return Matrix3d::Identity() + 0.5 * Phi + + (1.0 / 6.0) * (Phi * Phi); + } + double theta = std::sqrt(theta2); + double a = (1.0 - std::cos(theta)) / theta2; + double b = (theta - std::sin(theta)) / (theta2 * theta); + return Matrix3d::Identity() + a * Phi + b * (Phi * Phi); +} + +// --------------------- so3_log_from_unit_quat --------------------- + +void test_so3_log_identity() { + Quaterniond q = Quaterniond::Identity(); + Vector3d v = so3_log_from_unit_quat(q); + assert(approx_mat(v, Vector3d::Zero())); +} + +void test_so3_log_axis_angle_round_trip() { + const Vector3d axis = Vector3d{0.3, -0.7, 0.4}.normalized(); + for (double angle : + {0.0, 1e-6, 1e-3, 0.1, 0.5, 1.0, 2.0, 3.0, 3.1, + std::numbers::pi - 1e-3}) { + Quaterniond q = axis_angle(axis, angle); + Vector3d phi = so3_log_from_unit_quat(q); + assert(approx_mat(phi, axis * angle, 1e-9)); + } +} + +void test_so3_log_negative_w_canonicalizes() { + const Vector3d axis = Vector3d{1.0, 0.0, 0.0}; + Quaterniond q = axis_angle(axis, 1.2); + Quaterniond q_neg(-q.w(), -q.x(), -q.y(), -q.z()); + Vector3d phi_pos = so3_log_from_unit_quat(q); + Vector3d phi_neg = so3_log_from_unit_quat(q_neg); + assert(approx_mat(phi_pos, phi_neg)); +} + +void test_so3_log_branch_continuity() { + // 5e-5 is the documented small-s threshold; values just above and below + // must agree to many digits. 
+ const Vector3d axis = Vector3d{0.0, 0.0, 1.0}; + for (double s : {4.0e-5, 5.0e-5, 6.0e-5, 1.0e-4}) { + double angle = 2.0 * std::asin(s); + Quaterniond q = axis_angle(axis, angle); + Vector3d phi = so3_log_from_unit_quat(q); + assert(approx_mat(phi, axis * angle, 1e-12)); + } +} + +// --------------------- so3_jacobian_inv --------------------- + +void test_so3_jacobian_inv_at_zero() { + Matrix3d J = so3_jacobian_inv(Vector3d::Zero()); + assert(approx_mat(J, Matrix3d::Identity())); +} + +void test_so3_jacobian_inv_inverts_jacobian() { + for (Vector3d phi : {Vector3d{1e-6, 0, 0}, + Vector3d{1e-3, 1e-3, 0}, + Vector3d{0.1, -0.2, 0.3}, + Vector3d{1.0, -0.5, 0.7}, + Vector3d{2.5, 0.3, -1.1}}) { + Matrix3d Jinv = so3_jacobian_inv(phi); + Matrix3d J = so3_jacobian_closed_form(phi); + Matrix3d prod = Jinv * J; + assert(approx_mat(prod, Matrix3d::Identity(), 1e-9)); + } +} + +void test_so3_jacobian_inv_branch_continuity() { + // 1e-4 is the documented small-theta threshold. + for (double th : + {0.5e-4, 1.0e-4, 1.5e-4, 2.0e-4, 5.0e-4}) { + Vector3d phi{th, 0.0, 0.0}; + Matrix3d Jinv = so3_jacobian_inv(phi); + Matrix3d J = so3_jacobian_closed_form(phi); + assert(approx_mat(Jinv * J, Matrix3d::Identity(), 1e-9)); + } +} + +// --------------------- compute_Pi --------------------- + +void test_compute_Pi_qm_equals_qp() { + // log(I) = 0 => J^-1(0) = I => Pi = R_m^T + for (Quaterniond q : {Quaterniond::Identity(), + axis_angle(Vector3d::UnitX(), 0.5), + axis_angle(Vector3d{1, 1, 1}.normalized(), 1.2)}) { + Matrix3d Pi = compute_Pi(q, q); + assert(approx_mat(Pi, q.toRotationMatrix().transpose())); + } +} + +void test_compute_Pi_small_relative_rotation() { + // For a small relative rotation R_rel, log(R_rel) is small, J^-1 ~ I, + // so Pi ~ R_m^T. 
+ Quaterniond qm = axis_angle(Vector3d{0.2, 0.3, -0.1}.normalized(), 0.7); + Quaterniond qp = qm * axis_angle(Vector3d::UnitZ(), 1e-7); + Matrix3d Pi = compute_Pi(qm, qp); + assert(approx_mat(Pi, qm.toRotationMatrix().transpose(), 1e-6)); +} + +// --------------------- inv_sqrt_spd --------------------- + +void test_inv_sqrt_spd_identity() { + Matrix2d M = inv_sqrt_spd(Matrix2d::Identity()); + assert(approx_mat(M, Matrix2d::Identity())); +} + +void test_inv_sqrt_spd_diagonal() { + Matrix2d C; + C << 4.0, 0.0, 0.0, 9.0; + Matrix2d M = inv_sqrt_spd(C); + Matrix2d expected; + expected << 0.5, 0.0, 0.0, 1.0 / 3.0; + assert(approx_mat(M, expected)); + assert(approx_mat(M * C * M.transpose(), Matrix2d::Identity())); +} + +void test_inv_sqrt_spd_general() { + Matrix2d C; + C << 5.0, 1.5, 1.5, 2.0; + Matrix2d M = inv_sqrt_spd(C); + Matrix2d should_be_I = M * C * M.transpose(); + assert(approx_mat(should_be_I, Matrix2d::Identity(), 1e-12)); + assert(approx_mat(M, M.transpose())); +} + +void test_inv_sqrt_spd_rejects_non_positive() { + Matrix2d C; + C << -1.0, 0.0, 0.0, 1.0; + bool threw = false; + try { + (void)inv_sqrt_spd(C); + } catch (const std::invalid_argument &) { + threw = true; + } + assert(threw && "inv_sqrt_spd must reject non-positive matrices"); +} + +// --------------------- fit_se3 --------------------- + +void test_fit_se3_identity() { + const int N = 4; + MatrixX4d q0(N, 4); + MatrixX3d t0(N, 3); + q0 << 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1; + t0 << 1.0, 0.0, 0.0, 0.0, 2.0, 0.0, 0.0, 0.0, 3.0, -1.0, -2.0, -3.0; + VectorXd mass(N); + mass << 1.0, 1.0, 1.0, 1.0; + double total = mass.sum(); + Transformd T = fit_se3(q0, t0, q0, t0, mass, total, 0.01); + const auto &[q, t] = T; + assert(approx_mat(q.toRotationMatrix(), Matrix3d::Identity())); + assert(approx_mat(t, Vector3d::Zero(), 1e-12)); +} + +void test_fit_se3_pure_translation() { + const int N = 3; + const Vector3d shift{0.5, -1.5, 2.0}; + MatrixX4d q0(N, 4); + MatrixX3d t0(N, 3); + q0 << 0, 0, 
0, 1, 0, 0, 0, 1, 0, 0, 0, 1; + t0 << 1.0, 0.0, 0.0, 0.0, 2.0, 0.0, 0.0, 0.0, 3.0; + MatrixX3d t1 = t0.rowwise() + shift.transpose(); + VectorXd mass(N); + mass << 1.0, 2.0, 3.0; + double total = mass.sum(); + Transformd T = fit_se3(q0, t0, q0, t1, mass, total, 0.0); + const auto &[q, t] = T; + assert(approx_mat(q.toRotationMatrix(), Matrix3d::Identity())); + assert(approx_mat(t, shift, 1e-12)); +} + +void test_fit_se3_pure_rotation() { + const int N = 5; + // Reference points spread in 3D so SVD is well-conditioned. + MatrixX3d t0(N, 3); + t0 << 1.0, 0.0, 0.0, + -1.0, 0.0, 0.0, + 0.0, 1.0, 0.0, + 0.0, -1.0, 0.0, + 0.0, 0.0, 1.0; + MatrixX4d q0(N, 4); + for (int i = 0; i < N; ++i) { + q0.row(i) = quat_xyzw(Quaterniond::Identity()).transpose(); + } + VectorXd mass = VectorXd::Ones(N); + double total = mass.sum(); + Quaterniond R = axis_angle(Vector3d{0.2, -0.5, 0.8}.normalized(), 0.9); + const Vector3d t_offset{2.0, -3.0, 1.0}; + MatrixX3d t1(N, 3); + for (int i = 0; i < N; ++i) { + t1.row(i) = + (R * v3(t0.row(i).transpose()) + t_offset).transpose(); + } + MatrixX4d q1(N, 4); + for (int i = 0; i < N; ++i) { + q1.row(i) = quat_xyzw(R).transpose(); // body rotates with the rigid frame + } + Transformd T = fit_se3(q0, t0, q1, t1, mass, total, 0.01); + const auto &[q_fit, t_fit] = T; + assert(approx_mat(q_fit.toRotationMatrix(), R.toRotationMatrix(), 1e-9)); + assert(approx_mat(t_fit, t_offset, 1e-9)); +} + +// --------------------- fit_twist --------------------- + +void test_fit_twist_zero() { + const int N = 3; + MatrixX3d w = MatrixX3d::Zero(N, 3); + MatrixX3d v = MatrixX3d::Zero(N, 3); + MatrixX3d c_CC(N, 3); + c_CC << 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0; + Transformd T_W_CC{Quaterniond::Identity(), Vector3d::Zero()}; + VectorXd mass = VectorXd::Ones(N); + double total = mass.sum(); + TwistFitResult r = fit_twist(w, v, c_CC, T_W_CC, mass, total, 0.0); + assert(approx_mat(r.w0, Vector3d::Zero())); + assert(approx_mat(r.v0, Vector3d::Zero())); + 
assert(approx_mat(r.v_W, MatrixX3d::Zero(N, 3))); +} + +void test_fit_twist_pure_translation() { + const int N = 4; + const Vector3d u{0.7, -0.4, 0.2}; + MatrixX3d c_CC(N, 3); + c_CC << 1.0, 0.0, 0.0, + -1.0, 0.0, 0.0, + 0.0, 2.0, 0.0, + 0.0, 0.0, -1.5; + MatrixX3d v(N, 3); + for (int i = 0; i < N; ++i) + v.row(i) = u.transpose(); + MatrixX3d w = MatrixX3d::Zero(N, 3); + Transformd T_W_CC{Quaterniond::Identity(), Vector3d{1.0, 2.0, 3.0}}; + VectorXd mass(N); + mass << 1.0, 2.0, 3.0, 4.0; + double total = mass.sum(); + TwistFitResult r = fit_twist(w, v, c_CC, T_W_CC, mass, total, 0.0); + assert(approx_mat(r.w0, Vector3d::Zero(), 1e-9)); + assert(approx_mat(r.v0, u, 1e-12)); + for (int i = 0; i < N; ++i) + assert(approx_mat(v3(r.v_W.row(i).transpose()), u, 1e-9)); +} + +void test_fit_twist_rigid_rotation_recovers_w() { + const int N = 5; + MatrixX3d c_CC(N, 3); + c_CC << 1.0, 0.0, 0.0, + -1.0, 0.0, 0.0, + 0.0, 1.0, 0.0, + 0.0, -1.0, 0.0, + 0.0, 0.0, 1.5; + VectorXd mass(N); + mass << 1.0, 1.0, 2.0, 2.0, 3.0; + double total = mass.sum(); + + // Place CC in world with a non-identity pose to make sure fit_twist + // projects everything via T_W_CC consistently. + Quaterniond q_W_CC = axis_angle(Vector3d{0.4, 0.5, 0.7}.normalized(), 0.6); + Vector3d t_W_CC{-0.3, 0.8, 1.2}; + Transformd T_W_CC{q_W_CC, t_W_CC}; + + // Synthesize a pure rigid rotation about CoM. 
+ MatrixX3d c_W(N, 3); + for (int i = 0; i < N; ++i) + c_W.row(i) = + (q_W_CC * v3(c_CC.row(i).transpose()) + t_W_CC).transpose(); + Vector3d r = c_W.transpose() * mass / total; + const Vector3d w_true{0.3, -0.5, 0.7}; + MatrixX3d v(N, 3); + for (int i = 0; i < N; ++i) { + Vector3d d = v3(c_W.row(i).transpose()) - r; + v.row(i) = w_true.cross(d).transpose(); + } + MatrixX3d w = MatrixX3d::Zero(N, 3); // body angular velocities ignored when lambda_w = 0 + + TwistFitResult res = fit_twist(w, v, c_CC, T_W_CC, mass, total, 0.0); + assert(approx_mat(res.w0, w_true, 1e-9)); + assert(approx_mat(res.v0, Vector3d::Zero(), 1e-9)); + for (int i = 0; i < N; ++i) { + Vector3d expected = v.row(i).transpose(); + assert(approx_mat(v3(res.v_W.row(i).transpose()), expected, 1e-9)); + } +} + +// --------------------- compute_L --------------------- + +// Identity inertia per part: L_i = R * I * R^T * w0 = w0 (independent of R). +void test_compute_L_identity_inertia() { + const int N = 3; + MatrixX9d Iflat(N, 9); + for (int i = 0; i < N; ++i) { + Iflat.row(i) << 1, 0, 0, 0, 1, 0, 0, 0, 1; + } + const Vector3d w0{0.4, -0.7, 1.1}; + for (Quaterniond q : + {Quaterniond::Identity(), + axis_angle(Vector3d::UnitZ(), 0.7), + axis_angle(Vector3d{1, -2, 3}.normalized(), 1.4)}) { + MatrixX3d L = compute_L(Iflat, q, w0); + for (int i = 0; i < N; ++i) { + assert(approx_mat(v3(L.row(i).transpose()), w0, 1e-12)); + } + } +} + +// Diagonal CC-frame inertia, identity orientation: L_i = I_i * w0. 
+void test_compute_L_diagonal_inertia_identity_pose() { + const int N = 2; + MatrixX9d Iflat(N, 9); + Iflat.row(0) << 2.0, 0.0, 0.0, 0.0, 3.0, 0.0, 0.0, 0.0, 5.0; + Iflat.row(1) << 7.0, 0.0, 0.0, 0.0, 11.0, 0.0, 0.0, 0.0, 13.0; + Quaterniond q = Quaterniond::Identity(); + Vector3d w0{1.0, 1.0, 1.0}; + MatrixX3d L = compute_L(Iflat, q, w0); + assert(approx_mat(v3(L.row(0).transpose()), Vector3d{2.0, 3.0, 5.0}, + 1e-12)); + assert(approx_mat(v3(L.row(1).transpose()), Vector3d{7.0, 11.0, 13.0}, + 1e-12)); +} + +// Rotated diagonal inertia: L_W = R * I_CC * R^T * w0. +void test_compute_L_rotated_inertia() { + const int N = 1; + MatrixX9d Iflat(N, 9); + Iflat.row(0) << 2.0, 0.0, 0.0, 0.0, 3.0, 0.0, 0.0, 0.0, 5.0; + Quaterniond q = axis_angle(Vector3d{0.3, -0.8, 0.5}.normalized(), 1.1); + Vector3d w0{0.7, -0.4, 1.3}; + MatrixX3d L = compute_L(Iflat, q, w0); + Matrix3d I_CC; + I_CC << 2, 0, 0, 0, 3, 0, 0, 0, 5; + Matrix3d R = q.toRotationMatrix(); + Vector3d expected = R * I_CC * R.transpose() * w0; + assert(approx_mat(v3(L.row(0).transpose()), expected, 1e-12)); +} + +// --------------------- main --------------------- + +int main() { + test_so3_log_identity(); + test_so3_log_axis_angle_round_trip(); + test_so3_log_negative_w_canonicalizes(); + test_so3_log_branch_continuity(); + + test_so3_jacobian_inv_at_zero(); + test_so3_jacobian_inv_inverts_jacobian(); + test_so3_jacobian_inv_branch_continuity(); + + test_compute_Pi_qm_equals_qp(); + test_compute_Pi_small_relative_rotation(); + + test_inv_sqrt_spd_identity(); + test_inv_sqrt_spd_diagonal(); + test_inv_sqrt_spd_general(); + test_inv_sqrt_spd_rejects_non_positive(); + + test_fit_se3_identity(); + test_fit_se3_pure_translation(); + test_fit_se3_pure_rotation(); + + test_fit_twist_zero(); + test_fit_twist_pure_translation(); + test_fit_twist_rigid_rotation_recovers_w(); + + test_compute_L_identity_inertia(); + test_compute_L_diagonal_inertia_identity_pose(); + test_compute_L_rotated_inertia(); + + std::cout << "All 
Breakage tests passed.\n"; + return 0; +}