diff --git a/Cargo.toml b/Cargo.toml
index c69696637..f539b0458 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -112,3 +112,7 @@ harness = false
 [[bench]]
 name = "serialize"
 harness = false
+
+[[bench]]
+name = "intern"
+harness = false
diff --git a/benches/intern.rs b/benches/intern.rs
new file mode 100644
index 000000000..cd4925882
--- /dev/null
+++ b/benches/intern.rs
@@ -0,0 +1,47 @@
+use clvmr::allocator::Allocator;
+use clvmr::serde::{intern, node_from_bytes, node_from_bytes_backrefs, node_to_bytes_limit};
+use criterion::{Criterion, criterion_group, criterion_main};
+use std::include_bytes;
+use std::time::Instant;
+
+fn intern_benchmark(c: &mut Criterion) {
+    let block0: &[u8] = include_bytes!("0.generator");
+    let block1: &[u8] = include_bytes!("1.generator");
+    let block2: &[u8] = include_bytes!("2.generator");
+    let block3: &[u8] = include_bytes!("3.generator");
+    let block4: &[u8] = include_bytes!("4.generator");
+
+    let mut group = c.benchmark_group("intern");
+
+    for (block, name) in [
+        (&block0, "0"),
+        (&block1, "1"),
+        (&block2, "2"),
+        (&block3, "3"),
+        (&block4, "4"),
+    ] {
+        let mut a = Allocator::new();
+        let node = node_from_bytes_backrefs(&mut a, block).expect("node_from_bytes_backrefs");
+
+        // if the inflated form takes too much space, just run the benchmark on the compact form
+        let node = if let Ok(inflated) = node_to_bytes_limit(&a, node, 2000000) {
+            a = Allocator::new();
+            node_from_bytes(&mut a, inflated.as_slice()).expect("node_from_bytes")
+        } else {
+            node
+        };
+
+        group.bench_function(format!("intern {name}"), |b| {
+            b.iter(|| {
+                let start = Instant::now();
+                let _tree = intern(&a, node).expect("intern");
+                start.elapsed()
+            })
+        });
+    }
+
+    group.finish();
+}
+
+criterion_group!(intern_bench, intern_benchmark);
+criterion_main!(intern_bench);
diff --git a/fuzz/Cargo.toml b/fuzz/Cargo.toml
index a82a517b8..196a6f230 100644
--- a/fuzz/Cargo.toml
+++ b/fuzz/Cargo.toml
@@ -121,3 +121,9 @@ name = "canonical-serialization-br"
 path = "fuzz_targets/canonical_serialization_br.rs"
 test = false
 doc = false
+
+[[bin]]
+name = "intern"
+path = "fuzz_targets/intern.rs"
+test = false
+doc = false
diff --git a/fuzz/fuzz_targets/intern.rs b/fuzz/fuzz_targets/intern.rs
new file mode 100644
index 000000000..f74a84cf7
--- /dev/null
+++ b/fuzz/fuzz_targets/intern.rs
@@ -0,0 +1,87 @@
+#![no_main]
+
+use clvm_fuzzing::make_tree_limits;
+use clvmr::allocator::Allocator;
+use clvmr::serde::{ObjectCache, intern, node_to_bytes, treehash};
+use libfuzzer_sys::fuzz_target;
+
+// Fuzzer for the interning functionality.
+// Build and run with allocator-debug enabled (default for this fuzz crate) so NodePtr
+// don't get mixed up between the source and interned allocators.
+// Verifies that:
+// 1. Interning succeeds on valid nodes
+// 2. The interned node serializes to the same bytes as the original
+// 3. The tree hash is preserved
+// 4. Interned nodes have fewer or equal unique atoms/pairs (deduplication works)
+fuzz_target!(|data: &[u8]| {
+    let mut unstructured = arbitrary::Unstructured::new(data);
+    let mut allocator = Allocator::new();
+    let (program, _) =
+        make_tree_limits(&mut allocator, &mut unstructured, 600_000, false).expect("out of memory");
+
+    // Serialize the original node
+    let Ok(original_serialized) = node_to_bytes(&allocator, program) else {
+        return;
+    };
+
+    // Compute original tree hash
+    let mut original_cache = ObjectCache::new(treehash);
+    let Some(original_tree_hash) = original_cache.get_or_calculate(&allocator, &program, None)
+    else {
+        return;
+    };
+    let original_tree_hash = *original_tree_hash;
+
+    // Count original atoms and pairs before interning
+    let original_atoms = allocator.atom_count();
+    let original_pairs = allocator.pair_count();
+    let original_allocated_atoms = allocator.allocated_atom_count();
+    let original_allocated_pairs = allocator.allocated_pair_count();
+
+    // Create interned version using new API
+    let Ok(tree) = intern(&allocator, program) else {
+        return;
+    };
+
+    // Serialize the interned node
+    let Ok(interned_serialized) = node_to_bytes(&tree.allocator, tree.root) else {
+        panic!("Interned node should serialize successfully");
+    };
+
+    // The serializations must match
+    assert_eq!(
+        original_serialized, interned_serialized,
+        "Serialized bytes differ after interning"
+    );
+
+    // Verify deduplication: interned unique counts should not exceed original
+    let interned_atoms = tree.atoms.len();
+    let interned_pairs = tree.pairs.len();
+    assert!(
+        interned_atoms <= original_atoms,
+        "Interning increased atoms: {original_atoms} -> {interned_atoms}"
+    );
+    assert!(
+        interned_pairs <= original_pairs,
+        "Interning increased pairs: {original_pairs} -> {interned_pairs}",
+    );
+
+    // Verify allocated counts (RAM usage) do not increase
+    let interned_allocated_atoms = tree.allocator.allocated_atom_count();
+    let interned_allocated_pairs = tree.allocator.allocated_pair_count();
+    assert!(
+        interned_allocated_atoms <= original_allocated_atoms,
+        "Interning increased allocated atoms: {original_allocated_atoms} -> {interned_allocated_atoms}",
+    );
+    assert!(
+        interned_allocated_pairs <= original_allocated_pairs,
+        "Interning increased allocated pairs: {original_allocated_pairs} -> {interned_allocated_pairs}",
+    );
+
+    // Verify tree hash is preserved
+    let interned_tree_hash = tree.tree_hash();
+    assert_eq!(
+        original_tree_hash, interned_tree_hash,
+        "Tree hash differs after interning"
+    );
+});
diff --git a/src/serde/intern.rs b/src/serde/intern.rs
new file mode 100644
index 000000000..26c87a209
--- /dev/null
+++ b/src/serde/intern.rs
@@ -0,0 +1,249 @@
+//! CLVM tree interning: deduplicate atoms and pairs in a single pass.
+
+use std::collections::HashMap;
+use std::collections::hash_map::Entry;
+
+use crate::allocator::{Allocator, Atom, NodePtr, SExp};
+use crate::error::Result;
+
+use super::bytes32::Bytes32;
+use super::object_cache::{ObjectCache, treehash};
+
+/// Result of interning a CLVM tree (deduplicated nodes, unique atoms/pairs).
+#[derive(Debug)]
+pub struct InternedTree {
+    /// Allocator containing only unique (deduplicated) nodes
+    pub allocator: Allocator,
+    /// Root node in the interned allocator
+    pub root: NodePtr,
+    /// All unique atoms, in insertion order
+    pub atoms: Vec<NodePtr>,
+    /// All unique pairs, in post-order (children before parents)
+    pub pairs: Vec<NodePtr>,
+}
+
+impl InternedTree {
+    /// SHA256 tree hash (each unique node hashed once via ObjectCache).
+    pub fn tree_hash(&self) -> [u8; 32] {
+        let mut cache: ObjectCache<Bytes32> = ObjectCache::new(treehash);
+        *cache
+            .get_or_calculate(&self.allocator, &self.root, None)
+            .expect("treehash should not fail on valid tree")
+    }
+}
+
+/// Intern a CLVM tree: deduplicate atoms and pairs in a single pass.
+///
+/// This function traverses the source tree once, building a new allocator
+/// with deduplicated nodes. It tracks:
+/// - Atoms by content (identical byte sequences share one node)
+/// - Pairs by their (left, right) tuple in the interned allocator
+///
+/// The resulting `InternedTree` contains:
+/// - A new allocator with only unique nodes
+/// - The root node in the new allocator
+/// - Lists of unique atoms and pairs for cost/serialization
+///
+/// # Algorithm
+///
+/// Uses an iterative post-order traversal with explicit stack:
+/// 1. Push root to stack
+/// 2. For each node:
+///    - If atom: deduplicate by content, add to atoms list if new
+///    - If pair: wait for children to be processed, then deduplicate by (left, right)
+/// 3. Pairs are naturally collected in post-order (children before parents)
+///
+/// # Errors
+///
+/// Returns an error if allocator limits are exceeded when creating new nodes.
+pub fn intern(allocator: &Allocator, node: NodePtr) -> Result<InternedTree> {
+    let mut new_allocator = Allocator::new();
+    let mut atoms: Vec<NodePtr> = Vec::new();
+    let mut pairs: Vec<NodePtr> = Vec::new();
+
+    // Maps from source allocator to interned allocator
+    let mut node_to_interned: HashMap<NodePtr, NodePtr> = HashMap::new();
+    // Maps atom content to interned NodePtr (for deduplication)
+    let mut atom_to_interned: HashMap<Atom, NodePtr> = HashMap::new();
+    // Maps (left_interned, right_interned) to interned pair NodePtr
+    let mut pair_to_interned: HashMap<(NodePtr, NodePtr), NodePtr> = HashMap::new();
+
+    let mut stack = vec![node];
+
+    while let Some(current) = stack.pop() {
+        // Skip if already processed
+        if node_to_interned.contains_key(&current) {
+            continue;
+        }
+
+        match allocator.sexp(current) {
+            SExp::Atom => {
+                let atom = allocator.atom(current);
+                let interned = match atom_to_interned.entry(atom) {
+                    Entry::Occupied(o) => *o.get(),
+                    Entry::Vacant(v) => {
+                        let new_node = new_allocator.new_atom(atom.as_ref())?;
+                        v.insert(new_node);
+                        atoms.push(new_node);
+                        new_node
+                    }
+                };
+                node_to_interned.insert(current, interned);
+            }
+            SExp::Pair(left, right) => {
+                // Check if children are processed
+                let left_interned = node_to_interned.get(&left);
+                let right_interned = node_to_interned.get(&right);
+
+                if let (Some(l), Some(r)) = (left_interned, right_interned) {
+                    // Both children processed, create or reuse pair
+                    let interned = match pair_to_interned.entry((*l, *r)) {
+                        Entry::Occupied(o) => *o.get(),
+                        Entry::Vacant(v) => {
+                            let new_node = new_allocator.new_pair(*l, *r)?;
+                            v.insert(new_node);
+                            pairs.push(new_node);
+                            new_node
+                        }
+                    };
+                    node_to_interned.insert(current, interned);
+                } else {
+                    // Need to process children first
+                    stack.push(current);
+                    if right_interned.is_none() {
+                        stack.push(right);
+                    }
+                    if left_interned.is_none() {
+                        stack.push(left);
+                    }
+                }
+            }
+        }
+    }
+
+    let root = node_to_interned[&node];
+    Ok(InternedTree {
+        allocator: new_allocator,
+        root,
+        atoms,
+        pairs,
+    })
+}
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_intern_single_atom() {
+        let mut allocator = Allocator::new();
+        let node = allocator.new_atom(&[1, 2, 3]).unwrap();
+
+        let tree = intern(&allocator, node).unwrap();
+
+        assert_eq!(tree.atoms.len(), 1);
+        assert_eq!(tree.pairs.len(), 0);
+        assert_eq!(tree.allocator.atom(tree.root).as_ref(), &[1, 2, 3]);
+    }
+
+    #[test]
+    fn test_intern_simple_pair() {
+        let mut allocator = Allocator::new();
+        let left = allocator.new_atom(&[1]).unwrap();
+        let right = allocator.new_atom(&[2]).unwrap();
+        let node = allocator.new_pair(left, right).unwrap();
+
+        let tree = intern(&allocator, node).unwrap();
+
+        assert_eq!(tree.atoms.len(), 2);
+        assert_eq!(tree.pairs.len(), 1);
+    }
+
+    #[test]
+    fn test_intern_deduplicates_atoms() {
+        // Create (A . A) where A has same content
+        let mut allocator = Allocator::new();
+        let a1 = allocator.new_atom(&[42]).unwrap();
+        let a2 = allocator.new_atom(&[42]).unwrap(); // Same content, different NodePtr
+        let node = allocator.new_pair(a1, a2).unwrap();
+
+        let tree = intern(&allocator, node).unwrap();
+
+        // Should have only 1 unique atom
+        assert_eq!(tree.atoms.len(), 1);
+        assert_eq!(tree.pairs.len(), 1);
+    }
+
+    #[test]
+    fn test_intern_deduplicates_pairs() {
+        // Create ((A . B) . (A . B))
+        let mut allocator = Allocator::new();
+        let a = allocator.new_atom(&[1]).unwrap();
+        let b = allocator.new_atom(&[2]).unwrap();
+        let p1 = allocator.new_pair(a, b).unwrap();
+        let p2 = allocator.new_pair(a, b).unwrap(); // Same structure, different NodePtr
+        let node = allocator.new_pair(p1, p2).unwrap();
+
+        let tree = intern(&allocator, node).unwrap();
+
+        // Should have 2 atoms, 2 pairs (inner pair deduplicated)
+        assert_eq!(tree.atoms.len(), 2);
+        assert_eq!(tree.pairs.len(), 2); // (A . B) and ((A.B) . (A.B))
+    }
+
+    #[test]
+    fn test_tree_hash_deterministic() {
+        let mut alloc1 = Allocator::new();
+        let a1 = alloc1.new_atom(&[1, 2, 3]).unwrap();
+        let b1 = alloc1.new_atom(&[4, 5, 6]).unwrap();
+        let node1 = alloc1.new_pair(a1, b1).unwrap();
+
+        let mut alloc2 = Allocator::new();
+        let a2 = alloc2.new_atom(&[1, 2, 3]).unwrap();
+        let b2 = alloc2.new_atom(&[4, 5, 6]).unwrap();
+        let node2 = alloc2.new_pair(a2, b2).unwrap();
+
+        let tree1 = intern(&alloc1, node1).unwrap();
+        let tree2 = intern(&alloc2, node2).unwrap();
+
+        assert_eq!(tree1.tree_hash(), tree2.tree_hash());
+    }
+
+    #[test]
+    fn test_pairs_in_post_order() {
+        // Create (A . (B . C))
+        let mut allocator = Allocator::new();
+        let a = allocator.new_atom(&[1]).unwrap();
+        let b = allocator.new_atom(&[2]).unwrap();
+        let c = allocator.new_atom(&[3]).unwrap();
+        let inner = allocator.new_pair(b, c).unwrap();
+        let outer = allocator.new_pair(a, inner).unwrap();
+
+        let tree = intern(&allocator, outer).unwrap();
+
+        // Post-order: inner pair before outer pair
+        assert_eq!(tree.pairs.len(), 2);
+        // The inner pair (B . C) should come before the outer pair (A . (B . C))
+        // because children must be processed before parents
+
+        // Verify the ordering: inner pair should be first, outer pair should be second
+        let inner_pair = tree.pairs[0];
+        let outer_pair = tree.pairs[1];
+
+        // Verify that inner_pair is actually the (B . C) pair
+        let SExp::Pair(left, right) = tree.allocator.sexp(inner_pair) else {
+            panic!("Expected inner_pair to be a pair");
+        };
+        assert_eq!(tree.allocator.atom(left).as_ref(), &[2]);
+        assert_eq!(tree.allocator.atom(right).as_ref(), &[3]);
+
+        // Verify that outer_pair is actually the (A . (B . C)) pair
+        let SExp::Pair(left, right) = tree.allocator.sexp(outer_pair) else {
+            panic!("Expected outer_pair to be a pair");
+        };
+        assert_eq!(tree.allocator.atom(left).as_ref(), &[1]);
+        assert_eq!(
+            right, inner_pair,
+            "Outer pair's right child should be the inner pair"
+        );
+    }
+}
diff --git a/src/serde/mod.rs b/src/serde/mod.rs
index 48368d5fd..889b1336d 100644
--- a/src/serde/mod.rs
+++ b/src/serde/mod.rs
@@ -5,6 +5,7 @@ mod de_br;
 mod de_tree;
 mod identity_hash;
 mod incremental;
+mod intern;
 mod object_cache;
 mod parse_atom;
 mod path_builder;
@@ -19,6 +20,8 @@ pub mod write_atom;
 
 #[cfg(test)]
 mod test;
+#[cfg(test)]
+mod test_intern;
 
 pub use bitset::BitSet;
 pub use de::{node_from_bytes, node_from_stream};
@@ -26,6 +29,7 @@ pub use de_br::{node_from_bytes_backrefs, node_from_bytes_backrefs_old};
 pub use de_tree::{ParsedTriple, parse_triples};
 pub use identity_hash::RandomState;
 pub use incremental::{Serializer, UndoState};
+pub use intern::{InternedTree, intern};
 pub use object_cache::{ObjectCache, serialized_length, treehash};
 pub use path_builder::{ChildPos, PathBuilder};
 pub use read_cache_lookup::ReadCacheLookup;
diff --git a/src/serde/test_intern.rs b/src/serde/test_intern.rs
new file mode 100644
index 000000000..698f0ceb1
--- /dev/null
+++ b/src/serde/test_intern.rs
@@ -0,0 +1,100 @@
+use crate::allocator::{Allocator, NodePtr};
+use crate::error::Result;
+use crate::serde::bytes32::Bytes32;
+use crate::serde::intern::intern;
+use crate::serde::node_from_bytes_backrefs;
+use crate::serde::node_to_bytes;
+use crate::serde::object_cache::{ObjectCache, treehash};
+use rstest::rstest;
+
+fn treehash_for_node(allocator: &Allocator, node: NodePtr) -> Bytes32 {
+    let mut object_cache = ObjectCache::new(treehash);
+    *object_cache
+        .get_or_calculate(allocator, &node, None)
+        .unwrap()
+}
+
+/// Helper to convert hex string directly to a node
+fn hex_to_node(allocator: &mut Allocator, hex: &str) -> Result<NodePtr> {
+    let bytes = hex::decode(hex.trim().replace([' ', '\n'], "")).expect("invalid hex");
+    node_from_bytes_backrefs(allocator, &bytes)
+}
+
+/// Helper to deserialize hex and create interned version, returning intern stats
+fn test_hex_interning(hex: &str, expected_atoms: usize, expected_pairs: usize) -> Result<()> {
+    let mut allocator = Allocator::new();
+
+    // Deserialize from hex
+    let node = hex_to_node(&mut allocator, hex)?;
+
+    // Create interned version using the new API
+    let tree = intern(&allocator, node)?;
+
+    // Ensure interned node serializes to same bytes
+    let original_serialized = node_to_bytes(&allocator, node)?;
+    let new_serialized = node_to_bytes(&tree.allocator, tree.root)?;
+    assert_eq!(
+        original_serialized, new_serialized,
+        "Serialized bytes do not match after interning."
+    );
+
+    // Ensure treehashes match
+    let original_treehash = treehash_for_node(&allocator, node);
+    let new_treehash = tree.tree_hash();
+    assert_eq!(
+        original_treehash, new_treehash,
+        "Treehashes do not match after interning."
+    );
+
+    // Verify unique atom and pair counts
+    assert_eq!(
+        tree.atoms.len(),
+        expected_atoms,
+        "Atom count doesn't match expected.\nGot: {:?}\nExpected: {:?}",
+        tree.atoms.len(),
+        expected_atoms
+    );
+    assert_eq!(
+        tree.pairs.len(),
+        expected_pairs,
+        "Pair count doesn't match expected.\nGot: {:?}\nExpected: {:?}",
+        tree.pairs.len(),
+        expected_pairs
+    );
+
+    Ok(())
+}
+
+// ========================================================
+// Hex-based test cases with intern statistics verification
+// ========================================================
+
+#[rstest]
+#[case("01", 1, 0)] // Simple atom 1: 1 atom, 0 pairs
+#[case("0a", 1, 0)] // Atom 10: 1 atom, 0 pairs
+#[case("ff0101", 1, 1)] // (1 . 1): 1 atom (deduplicated), 1 pair
+#[case("ff010a", 2, 1)] // (1 . 10): 2 atoms, 1 pair
+#[case("ff01ff0101", 1, 2)] // (1 . (1 . 1)): 1 atom (deduplicated), 2 pairs
+#[case("ffff2a2a2a", 1, 2)] // ((42 . 42) . 42): 1 atom (deduplicated), 2 pairs
+#[case("ff01ff02ff0301", 3, 3)] // (1 . (2 . (3 . 1))): 3 atoms, 3 pairs
+#[case("ff01ff02ff0300", 4, 3)] // (1 . (2 . (3 . nil))): 4 atoms (1,2,3,nil), 3 pairs
+#[case("ff01ff02ff0304", 4, 3)] // (1 . (2 . (3 . 4))): 4 atoms, 3 pairs
+#[case("ff01ff02ff0103", 3, 3)] // (1 . (2 . (1 . 3))): 3 atoms (1 repeated), 3 pairs
+#[case("ffff0102ff0102", 2, 2)] // ((1 . 2) . (1 . 2)): repeated pair, 2 atoms, 2 pairs
+#[case("ffff0102ffff0102ff0102", 2, 3)] // ((1 . 2) . ((1 . 2) . (1 . 2))): same pair 3×, 2 atoms, 3 pairs
+#[case("ffff0102ffff010200", 3, 3)] // ((1 . 2) . ((1 . 2) . nil)): list with repeated pair, 3 atoms, 3 pairs
+#[case("ffff010aff010a", 2, 2)] // ((1 . 10) . (1 . 10)): repeated pair (different atoms), 2 atoms, 2 pairs
+#[case("00", 1, 0)] // nil: single nil atom, 0 pairs
+#[case("ff0100", 2, 1)] // (1 . nil): minimal list of one element, 2 atoms, 1 pair
+#[case("ff0000", 1, 1)] // (nil . nil): pair of nils, 1 atom (nil), 1 pair
+#[case("ff01ff01ff0100", 2, 3)] // (1 . (1 . (1 . nil))): list [1,1,1], 2 atoms, 3 pairs
+#[case("ff01ff01ff0101", 1, 3)] // (1 . (1 . (1 . 1))): atom 1 repeated 3× in pairs, 1 atom, 3 pairs
+#[case("ffff01ff0203ff01ff0203", 3, 3)] // ((1.(2.3)) . (1.(2.3))): nested pair repeated, 3 atoms, 3 pairs
+#[case("ffff0102ffff0102ffff010200", 3, 4)] // ((1.2) . ((1.2) . ((1.2) . nil))): list of 3× (1.2); 3 atoms (1, 2, nil), 4 unique pairs
+fn test_interning(
+    #[case] hex: &str,
+    #[case] expected_atoms: usize,
+    #[case] expected_pairs: usize,
+) -> Result<()> {
+    test_hex_interning(hex, expected_atoms, expected_pairs)
+}