From 6708b7726e26a461c9b2e6a036baff2c14b67842 Mon Sep 17 00:00:00 2001 From: Marcus Pousette Date: Fri, 17 Apr 2026 09:23:15 +0200 Subject: [PATCH 01/13] feat: add immediate frontier catch-up experiment --- Cargo.lock | 9 + Cargo.toml | 1 + packages/treecrdt-core/src/lib.rs | 10 +- packages/treecrdt-core/src/materialization.rs | 354 ++++++++++---- packages/treecrdt-core/src/traits.rs | 40 ++ .../tests/materialization_helpers.rs | 206 +++++++- packages/treecrdt-postgres-rs/Cargo.toml | 2 +- packages/treecrdt-postgres-rs/src/profile.rs | 4 +- packages/treecrdt-postgres-rs/src/store.rs | 191 +++++++- .../tests/postgres_test.rs | 345 ++++++++------ packages/treecrdt-sqlite-ext/Cargo.toml | 1 + .../src/extension/functions/materialize.rs | 94 +++- .../src/extension/functions/node_store.rs | 79 ++++ .../src/extension/functions/op_index.rs | 21 + .../src/extension/functions/payload_store.rs | 40 +- .../src/extension/functions/schema.rs | 4 + .../tests/extension_roundtrip.rs | 439 ++++++++++-------- packages/treecrdt-test-support/Cargo.toml | 10 + packages/treecrdt-test-support/src/lib.rs | 233 ++++++++++ 19 files changed, 1604 insertions(+), 479 deletions(-) create mode 100644 packages/treecrdt-test-support/Cargo.toml create mode 100644 packages/treecrdt-test-support/src/lib.rs diff --git a/Cargo.lock b/Cargo.lock index f18f135a..126f90a6 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1439,6 +1439,7 @@ dependencies = [ "serde", "serde_json", "treecrdt-core", + "treecrdt-test-support", "uuid", ] @@ -1476,6 +1477,14 @@ dependencies = [ "sqlite3ext-sys", "tempfile", "treecrdt-core", + "treecrdt-test-support", +] + +[[package]] +name = "treecrdt-test-support" +version = "0.0.1" +dependencies = [ + "treecrdt-core", ] [[package]] diff --git a/Cargo.toml b/Cargo.toml index 6cbf6056..c8cd6518 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -5,6 +5,7 @@ members = [ "packages/treecrdt-postgres-rs", "packages/treecrdt-riblt-wasm", "packages/treecrdt-sqlite-ext", + 
"packages/treecrdt-test-support", "packages/treecrdt-wasm", ] resolver = "2" diff --git a/packages/treecrdt-core/src/lib.rs b/packages/treecrdt-core/src/lib.rs index 5a13aaf9..dd3f1f7c 100644 --- a/packages/treecrdt-core/src/lib.rs +++ b/packages/treecrdt-core/src/lib.rs @@ -17,13 +17,15 @@ pub use ids::{Lamport, NodeId, OperationId, ReplicaId}; pub use materialization::{ apply_incremental_ops_with_delta, apply_persisted_remote_ops_with_delta, catch_up_materialized_state, materialize_persisted_remote_ops_with_delta, - IncrementalApplyResult, MaterializationCursor, MaterializationFrontier, MaterializationHead, - MaterializationKey, MaterializationState, PersistedRemoteApplyResult, PersistedRemoteStores, + try_shortcut_out_of_order_payload_noops, CatchUpResult, IncrementalApplyResult, + MaterializationCursor, MaterializationFrontier, MaterializationHead, MaterializationKey, + MaterializationState, PayloadNoopShortcut, PersistedRemoteApplyResult, PersistedRemoteStores, }; pub use ops::{cmp_op_key, cmp_ops, Operation, OperationKind, OperationMetadata}; pub use traits::{ - Clock, IndexProvider, LamportClock, MemoryNodeStore, MemoryPayloadStore, MemoryStorage, - NodeStore, NoopParentOpIndex, NoopStorage, ParentOpIndex, PayloadStore, Storage, + Clock, ExactNodeStore, ExactPayloadStore, IndexProvider, LamportClock, MemoryNodeStore, + MemoryPayloadStore, MemoryStorage, NodeStore, NoopParentOpIndex, NoopStorage, ParentOpIndex, + PayloadStore, Storage, TruncatingParentOpIndex, }; pub use tree::{ ApplyDelta, LocalFinalizePlan, LocalPlacement, NodeExport, NodeSnapshotExport, TreeCrdt, diff --git a/packages/treecrdt-core/src/materialization.rs b/packages/treecrdt-core/src/materialization.rs index 35059826..19d1ceb7 100644 --- a/packages/treecrdt-core/src/materialization.rs +++ b/packages/treecrdt-core/src/materialization.rs @@ -1,9 +1,10 @@ use std::cmp::Ordering; +use std::collections::{HashMap, HashSet}; use crate::ops::{cmp_op_key, cmp_ops, Operation}; use crate::traits::{ - 
Clock, LamportClock, MemoryNodeStore, MemoryPayloadStore, NodeStore, NoopStorage, - ParentOpIndex, PayloadStore, Storage, + Clock, ExactNodeStore, ExactPayloadStore, LamportClock, MemoryNodeStore, MemoryPayloadStore, + NodeStore, NoopStorage, ParentOpIndex, PayloadStore, Storage, TruncatingParentOpIndex, }; use crate::tree::TreeCrdt; use crate::{Error, Lamport, NodeId, OperationId, ReplicaId, Result}; @@ -79,18 +80,31 @@ pub struct IncrementalApplyResult { pub affected_nodes: Vec, } +#[derive(Clone, Debug, Eq, PartialEq)] +pub struct CatchUpResult { + pub head: Option, + pub affected_nodes: Vec, +} + #[derive(Clone, Debug, Eq, PartialEq)] pub struct PersistedRemoteApplyResult { /// Number of ops from the input batch that were actually inserted by adapter-side dedupe. pub inserted_count: u64, /// Nodes changed by core materialization when incremental replay succeeded. /// - /// This is empty when nothing was inserted or when the helper had to defer catch-up by - /// recording a replay frontier instead of trusting incremental materialization. + /// This is empty when nothing was inserted or when the helper could not advance + /// materialization immediately and had to hand catch-up work back to the caller. + pub affected_nodes: Vec, + /// True when the helper recorded/kept a replay frontier and expects the caller to perform + /// catch-up. + pub catch_up_needed: bool, +} + +#[derive(Clone, Debug, Eq, PartialEq)] +pub struct PayloadNoopShortcut { + pub resumed_head: MaterializationHead, + pub remaining_ops: Vec, pub affected_nodes: Vec, - /// True when the helper recorded a replay frontier instead of advancing materialization head - /// immediately. - pub frontier_recorded: bool, } /// Backend-owned stores used to replay already-persisted remote ops through core semantics. 
@@ -123,6 +137,13 @@ impl ParentOpIndex for RecordingIndex { } } +impl TruncatingParentOpIndex for RecordingIndex { + fn truncate_from(&mut self, seq: u64) -> Result<()> { + self.records.retain(|(_, _, existing_seq)| *existing_seq < seq); + Ok(()) + } +} + struct PrefixSnapshot { crdt: TreeCrdt, index: RecordingIndex, @@ -138,6 +159,14 @@ fn frontier_from_op(op: &Operation) -> MaterializationFrontier { } } +fn frontier_from_writer(lamport: Lamport, id: &OperationId) -> MaterializationFrontier { + MaterializationFrontier { + lamport, + replica: id.replica.as_bytes().to_vec(), + counter: id.counter, + } +} + fn owned_frontier>(frontier: &MaterializationKey) -> MaterializationFrontier { MaterializationFrontier { lamport: frontier.lamport, @@ -201,6 +230,122 @@ fn next_replay_frontier( } } +/// Try to skip out-of-order payload ops that are already dominated by a later payload winner. +/// +/// This allows adapters to avoid recording a replay frontier for a narrow but common case: +/// older payload ops that do not change materialized payload state even after being inserted +/// earlier in the canonical log order. +pub fn try_shortcut_out_of_order_payload_noops( + meta: &M, + inserted_ops: Vec, + mut load_last_writer: LoadWriter, +) -> std::result::Result, E> +where + M: MaterializationCursor, + LoadWriter: FnMut(NodeId) -> std::result::Result, E>, +{ + let state = meta.state(); + if state.replay_from.is_some() || inserted_ops.is_empty() { + return Ok(None); + } + + let Some(head) = state.head.as_ref() else { + return Ok(None); + }; + + let mut ops = inserted_ops; + ops.sort_by(cmp_ops); + + let mut candidate_nodes = HashSet::new(); + let mut has_out_of_order = false; + for op in &ops { + if cmp_frontiers(&frontier_from_op(op), &head.at) != Ordering::Less { + continue; + } + has_out_of_order = true; + match &op.kind { + crate::ops::OperationKind::Payload { node, .. 
} => { + candidate_nodes.insert(*node); + } + _ => return Ok(None), + } + } + + if !has_out_of_order { + return Ok(None); + } + + let mut final_writers: HashMap = HashMap::new(); + for node in &candidate_nodes { + if let Some((lamport, id)) = load_last_writer(*node)? { + final_writers.insert(*node, frontier_from_writer(lamport, &id)); + } + } + + for op in &ops { + let crate::ops::OperationKind::Payload { node, .. } = &op.kind else { + continue; + }; + if !candidate_nodes.contains(node) { + continue; + } + let op_frontier = frontier_from_op(op); + match final_writers.get(node) { + Some(existing) if cmp_frontiers(&op_frontier, existing) != Ordering::Greater => {} + _ => { + final_writers.insert(*node, op_frontier); + } + } + } + + let mut skipped = 0u64; + let mut affected = HashSet::new(); + let mut remaining_ops = Vec::new(); + + for op in ops { + let op_frontier = frontier_from_op(&op); + if cmp_frontiers(&op_frontier, &head.at) != Ordering::Less { + remaining_ops.push(op); + continue; + } + + let node = match &op.kind { + crate::ops::OperationKind::Payload { node, .. } => *node, + _ => return Ok(None), + }; + + let Some(final_writer) = final_writers.get(&node) else { + return Ok(None); + }; + if cmp_frontiers(&op_frontier, final_writer) != Ordering::Less { + return Ok(None); + } + + skipped = skipped.saturating_add(1); + affected.insert(node); + } + + if skipped == 0 { + return Ok(None); + } + + let mut affected_nodes: Vec = affected.into_iter().collect(); + affected_nodes.sort(); + + Ok(Some(PayloadNoopShortcut { + resumed_head: MaterializationHead { + at: MaterializationKey { + lamport: head.at.lamport, + replica: head.at.replica.to_vec(), + counter: head.at.counter, + }, + seq: head.seq.saturating_add(skipped), + }, + remaining_ops, + affected_nodes, + })) +} + /// Apply an incremental batch and return both head metadata and full affected-node delta. /// /// `affected_nodes` is deduplicated and sorted (`NodeId` ascending) for stable consumers. 
@@ -328,11 +473,11 @@ where Ok(result) } -fn build_prefix_snapshot( +fn replay_frontier_in_memory( storage: &S, frontier: &MaterializationFrontier, replica_id: &ReplicaId, -) -> Result { +) -> Result<(PrefixSnapshot, u64, Vec)> { let mut crdt = TreeCrdt::with_stores( replica_id.clone(), NoopStorage, @@ -343,75 +488,90 @@ fn build_prefix_snapshot( let mut index = RecordingIndex::default(); let mut seq = 0u64; let mut head: Option = None; + let mut affected = HashSet::new(); + let mut prefix_seq = None; storage.scan_since(0, &mut |op| { - if cmp_frontiers(&frontier_from_op(&op), frontier) != Ordering::Less { - return Ok(()); + let in_suffix = cmp_frontiers(&frontier_from_op(&op), frontier) != Ordering::Less; + if in_suffix && prefix_seq.is_none() { + prefix_seq = Some(seq); } match crdt.apply_remote_with_materialization_seq(op.clone(), &mut index, &mut seq)? { - Some(_) => { + Some(delta) => { head = Some(op); + if in_suffix { + affected.extend(delta.affected_nodes); + } Ok(()) } None => Err(Error::Storage( - "prefix replay unexpectedly required nested catch-up".into(), + "frontier replay unexpectedly required nested catch-up".into(), )), } })?; - Ok(PrefixSnapshot { - crdt, - index, - head, - seq, - }) + let mut affected_nodes: Vec = affected.into_iter().collect(); + affected_nodes.sort(); + Ok(( + PrefixSnapshot { + crdt, + index, + head, + seq, + }, + prefix_seq.unwrap_or(seq), + affected_nodes, + )) } -fn restore_prefix_snapshot( +fn patch_final_state_in_place( prefix: &mut PrefixSnapshot, + prefix_seq: u64, + affected_nodes: &[NodeId], nodes: &mut N, payloads: &mut P, index: &mut I, -) -> Result<()> { - let mut all_nodes = prefix.crdt.node_store_mut().all_nodes()?; - all_nodes.sort(); +) -> Result<()> +where + N: ExactNodeStore, + P: ExactPayloadStore, + I: TruncatingParentOpIndex, +{ + let truncate_from = prefix_seq.saturating_add(1); + index.truncate_from(truncate_from)?; - for node in &all_nodes { + for node in affected_nodes { nodes.ensure_node(*node)?; - } 
+ nodes.detach(*node)?; - for node in &all_nodes { - if *node == NodeId::ROOT { - continue; - } - let parent = prefix.crdt.node_store_mut().parent(*node)?; - let order_key = prefix.crdt.node_store_mut().order_key(*node)?; - if let Some(parent) = parent { - nodes.attach(*node, parent, order_key.unwrap_or_default())?; - } else { - nodes.detach(*node)?; + if let Some(parent) = prefix.crdt.node_store_mut().parent(*node)? { + let order_key = prefix.crdt.node_store_mut().order_key(*node)?.unwrap_or_default(); + nodes.attach(*node, parent, order_key)?; } - } - for node in &all_nodes { nodes.set_tombstone(*node, prefix.crdt.node_store_mut().tombstone(*node)?)?; let last_change = prefix.crdt.node_store_mut().last_change(*node)?; - if !last_change.is_empty() { - nodes.merge_last_change(*node, &last_change)?; - } + nodes.set_last_change_exact(*node, &last_change)?; - if let Some(deleted_at) = prefix.crdt.node_store_mut().deleted_at(*node)? { - nodes.merge_deleted_at(*node, &deleted_at)?; - } + let deleted_at = prefix.crdt.node_store_mut().deleted_at(*node)?; + nodes.set_deleted_at_exact(*node, deleted_at.as_ref())?; if let Some(writer) = prefix.crdt.payload_last_writer(*node)? { payloads.set_payload(*node, prefix.crdt.payload(*node)?, writer)?; + } else { + payloads.clear_payload(*node)?; } } - let mut records = prefix.index.records.clone(); + let mut records: Vec<_> = prefix + .index + .records + .iter() + .filter(|(_, _, seq)| *seq >= truncate_from) + .cloned() + .collect(); records.sort_by(|a, b| a.2.cmp(&b.2).then_with(|| a.0.cmp(&b.0)).then_with(|| a.1.cmp(&b.1))); for (parent, op_id, seq) in records { index.record(parent, &op_id, seq)?; @@ -420,21 +580,21 @@ fn restore_prefix_snapshot( Ok(()) } -/// Catch backend materialized state up to the persisted op log using the replay frontier when -/// available. +/// Catch backend materialized state up from a replay frontier by patching affected backend rows +/// and suffix index entries in place. 
pub fn catch_up_materialized_state( storage: S, stores: PersistedRemoteStores, meta: &M, mut flush_nodes: FlushNodes, mut flush_index: FlushIndex, -) -> Result> +) -> Result where S: Storage, C: Clock, - N: NodeStore, - P: PayloadStore, - I: ParentOpIndex, + N: ExactNodeStore, + P: ExactPayloadStore, + I: TruncatingParentOpIndex, M: MaterializationCursor, FlushNodes: FnMut(&mut N) -> Result<()>, FlushIndex: FnMut(&mut I) -> Result<()>, @@ -444,65 +604,61 @@ where state.replay_from.as_ref().map(owned_frontier) }; + let Some(frontier) = replay_frontier.as_ref() else { + return Ok(CatchUpResult { + head: meta.state().head.as_ref().map(|head| MaterializationHead { + at: MaterializationKey { + lamport: head.at.lamport, + replica: head.at.replica.to_vec(), + counter: head.at.counter, + }, + seq: head.seq, + }), + affected_nodes: Vec::new(), + }); + }; + let PersistedRemoteStores { replica_id, - clock, + clock: _clock, mut nodes, mut payloads, mut index, } = stores; - nodes.reset()?; - payloads.reset()?; - index.reset()?; - - let mut head: Option = None; - let mut seq = 0u64; - - if let Some(frontier) = replay_frontier.as_ref() { - let mut prefix = build_prefix_snapshot(&storage, frontier, &replica_id)?; - restore_prefix_snapshot(&mut prefix, &mut nodes, &mut payloads, &mut index)?; - head = prefix.head; - seq = prefix.seq; - } - - let mut crdt = TreeCrdt::with_stores(replica_id, NoopStorage, clock, nodes, payloads)?; - storage.scan_since(0, &mut |op| { - if let Some(frontier) = replay_frontier.as_ref() { - if cmp_frontiers(&frontier_from_op(&op), frontier) == Ordering::Less { - return Ok(()); - } - } - - match crdt.apply_remote_with_materialization_seq(op.clone(), &mut index, &mut seq)? 
{ - Some(_) => { - head = Some(op); - Ok(()) - } - None => Err(Error::Storage( - "catch-up replay unexpectedly required nested catch-up".into(), - )), - } - })?; + let (mut prefix, prefix_seq, affected_nodes) = + replay_frontier_in_memory(&storage, frontier, &replica_id)?; + patch_final_state_in_place( + &mut prefix, + prefix_seq, + &affected_nodes, + &mut nodes, + &mut payloads, + &mut index, + )?; - flush_nodes(crdt.node_store_mut())?; + flush_nodes(&mut nodes)?; flush_index(&mut index)?; - Ok(head.map(|head| MaterializationHead { - at: MaterializationKey { - lamport: head.meta.lamport, - replica: head.meta.id.replica.as_bytes().to_vec(), - counter: head.meta.id.counter, - }, - seq, - })) + Ok(CatchUpResult { + head: prefix.head.map(|head| MaterializationHead { + at: MaterializationKey { + lamport: head.meta.lamport, + replica: head.meta.id.replica.as_bytes().to_vec(), + counter: head.meta.id.counter, + }, + seq: prefix.seq, + }), + affected_nodes, + }) } /// Apply already-persisted inserted remote ops and commit adapter-owned metadata writes. /// /// Adapters own persistence + dedupe and pass only the inserted subset here. If the materialized /// doc is already behind a replay frontier, or if incremental materialization / metadata updates -/// fail, this records a replay frontier so catch-up can repair materialized state later. +/// fail, this records a replay frontier and returns control to the caller. Callers can then either +/// catch up immediately in the same append flow or defer catch-up to a later read/recovery path. 
pub fn apply_persisted_remote_ops_with_delta( meta: &M, inserted_ops: Vec, @@ -519,7 +675,7 @@ where return Ok(PersistedRemoteApplyResult { inserted_count: 0, affected_nodes: Vec::new(), - frontier_recorded: false, + catch_up_needed: false, }); } @@ -528,7 +684,7 @@ where return Ok(PersistedRemoteApplyResult { inserted_count, affected_nodes: Vec::new(), - frontier_recorded: true, + catch_up_needed: true, }); } @@ -539,7 +695,7 @@ where return Ok(PersistedRemoteApplyResult { inserted_count, affected_nodes: Vec::new(), - frontier_recorded: true, + catch_up_needed: true, }); }; @@ -547,14 +703,14 @@ where Ok(PersistedRemoteApplyResult { inserted_count, affected_nodes: result.affected_nodes, - frontier_recorded: false, + catch_up_needed: false, }) } else { schedule_replay(&start_replay_frontier())?; Ok(PersistedRemoteApplyResult { inserted_count, affected_nodes: Vec::new(), - frontier_recorded: true, + catch_up_needed: true, }) } } @@ -563,7 +719,7 @@ where Ok(PersistedRemoteApplyResult { inserted_count, affected_nodes: Vec::new(), - frontier_recorded: true, + catch_up_needed: true, }) } } diff --git a/packages/treecrdt-core/src/traits.rs b/packages/treecrdt-core/src/traits.rs index 3b3caa61..f1fcb710 100644 --- a/packages/treecrdt-core/src/traits.rs +++ b/packages/treecrdt-core/src/traits.rs @@ -159,6 +159,10 @@ pub trait PayloadStore { ) -> Result<()>; } +pub trait ExactPayloadStore: PayloadStore { + fn clear_payload(&mut self, node: NodeId) -> Result<()>; +} + /// Persistent index of operations relevant to a `children(parent)` filter. /// /// This is used by adapters (e.g. 
SQLite) to support partial sync without re-implementing which @@ -168,6 +172,10 @@ pub trait ParentOpIndex { fn record(&mut self, parent: NodeId, op_id: &OperationId, seq: u64) -> Result<()>; } +pub trait TruncatingParentOpIndex: ParentOpIndex { + fn truncate_from(&mut self, seq: u64) -> Result<()>; +} + #[derive(Default)] pub struct NoopParentOpIndex; @@ -181,6 +189,17 @@ impl ParentOpIndex for NoopParentOpIndex { } } +impl TruncatingParentOpIndex for NoopParentOpIndex { + fn truncate_from(&mut self, _seq: u64) -> Result<()> { + Ok(()) + } +} + +pub trait ExactNodeStore: NodeStore { + fn set_last_change_exact(&mut self, node: NodeId, vv: &VersionVector) -> Result<()>; + fn set_deleted_at_exact(&mut self, node: NodeId, vv: Option<&VersionVector>) -> Result<()>; +} + /// Basic Lamport clock implementation useful for tests and default flows. #[derive(Clone, Debug, Default)] pub struct LamportClock { @@ -308,6 +327,13 @@ impl PayloadStore for MemoryPayloadStore { } } +impl ExactPayloadStore for MemoryPayloadStore { + fn clear_payload(&mut self, node: NodeId) -> Result<()> { + self.entries.remove(&node); + Ok(()) + } +} + #[derive(Clone, Debug)] struct MemoryNodeState { parent: Option, @@ -483,3 +509,17 @@ impl NodeStore for MemoryNodeStore { Ok(self.nodes.keys().copied().collect()) } } + +impl ExactNodeStore for MemoryNodeStore { + fn set_last_change_exact(&mut self, node: NodeId, vv: &VersionVector) -> Result<()> { + self.ensure_node(node)?; + self.get_state_mut(node)?.last_change = vv.clone(); + Ok(()) + } + + fn set_deleted_at_exact(&mut self, node: NodeId, vv: Option<&VersionVector>) -> Result<()> { + self.ensure_node(node)?; + self.get_state_mut(node)?.deleted_at = vv.cloned(); + Ok(()) + } +} diff --git a/packages/treecrdt-core/tests/materialization_helpers.rs b/packages/treecrdt-core/tests/materialization_helpers.rs index 6ce1bac6..b0f421b4 100644 --- a/packages/treecrdt-core/tests/materialization_helpers.rs +++ 
b/packages/treecrdt-core/tests/materialization_helpers.rs @@ -1,9 +1,12 @@ +use std::cell::Cell; +use std::rc::Rc; use treecrdt_core::{ apply_incremental_ops_with_delta, apply_persisted_remote_ops_with_delta, - materialize_persisted_remote_ops_with_delta, LamportClock, MaterializationCursor, + catch_up_materialized_state, materialize_persisted_remote_ops_with_delta, + try_shortcut_out_of_order_payload_noops, Lamport, LamportClock, MaterializationCursor, MaterializationHead, MaterializationKey, MaterializationState, MemoryNodeStore, MemoryPayloadStore, MemoryStorage, NodeId, NoopParentOpIndex, Operation, OperationId, - ParentOpIndex, PersistedRemoteStores, ReplicaId, TreeCrdt, + ParentOpIndex, PersistedRemoteStores, ReplicaId, Storage, TreeCrdt, }; #[derive(Default)] @@ -74,6 +77,38 @@ impl ParentOpIndex for RecordingIndex { } } +struct CountingStorage { + inner: MemoryStorage, + scan_count: Rc>, +} + +impl Storage for CountingStorage { + fn apply(&mut self, op: Operation) -> treecrdt_core::Result { + self.inner.apply(op) + } + + fn load_since(&self, lamport: Lamport) -> treecrdt_core::Result> { + self.inner.load_since(lamport) + } + + fn latest_lamport(&self) -> Lamport { + self.inner.latest_lamport() + } + + fn latest_counter(&self, replica: &ReplicaId) -> treecrdt_core::Result { + self.inner.latest_counter(replica) + } + + fn scan_since( + &self, + lamport: Lamport, + visit: &mut dyn FnMut(Operation) -> treecrdt_core::Result<()>, + ) -> treecrdt_core::Result<()> { + self.scan_count.set(self.scan_count.get() + 1); + self.inner.scan_since(lamport, visit) + } +} + #[test] fn finalize_local_materialization_records_unique_hints_and_extras() { let mut crdt = TreeCrdt::new( @@ -269,7 +304,7 @@ fn apply_persisted_remote_ops_materializes_only_inserted_entries() { assert_eq!(seen_counters, vec![2, 3]); assert_eq!(result.inserted_count, 2); assert_eq!(result.affected_nodes, vec![NodeId(2)]); - assert!(!result.frontier_recorded); + assert!(!result.catch_up_needed); 
assert_eq!( updated_head, Some(MaterializationHead { @@ -313,7 +348,7 @@ fn apply_persisted_remote_ops_schedules_replay_from_start_when_head_is_missing() assert_eq!(scheduled_replay, 1); assert_eq!(result.inserted_count, 1); assert_eq!(result.affected_nodes, Vec::::new()); - assert!(result.frontier_recorded); + assert!(result.catch_up_needed); } #[test] @@ -350,7 +385,7 @@ fn apply_persisted_remote_ops_schedules_full_replay_when_update_head_fails() { assert_eq!(scheduled_replay, 1); assert_eq!(result.inserted_count, 1); assert!(result.affected_nodes.is_empty()); - assert!(result.frontier_recorded); + assert!(result.catch_up_needed); } #[test] @@ -389,7 +424,7 @@ fn apply_persisted_remote_ops_schedules_replay_frontier_for_out_of_order_ops() { assert_eq!(materialize_runs, 0); assert_eq!(result.inserted_count, 2); assert!(result.affected_nodes.is_empty()); - assert!(result.frontier_recorded); + assert!(result.catch_up_needed); assert_eq!( replay_frontier, Some(treecrdt_core::MaterializationFrontier { @@ -429,7 +464,7 @@ fn apply_persisted_remote_ops_keeps_earliest_existing_replay_frontier() { assert_eq!(result.inserted_count, 1); assert!(result.affected_nodes.is_empty()); - assert!(result.frontier_recorded); + assert!(result.catch_up_needed); assert_eq!( replay_frontier, Some(treecrdt_core::MaterializationFrontier { @@ -486,3 +521,160 @@ fn materialize_persisted_remote_ops_with_delta_runs_prepare_and_flush_hooks() { vec![NodeId::ROOT, NodeId(10), NodeId(11)] ); } + +#[test] +fn payload_noop_shortcut_skips_out_of_order_payload_dominated_by_current_winner() { + let cursor = Cursor { + head_lamport: 10, + head_replica: b"r".to_vec(), + head_counter: 10, + head_seq: 5, + ..Cursor::default() + }; + let replica = ReplicaId::new(b"r"); + let node = NodeId(7); + let op = Operation::set_payload(&replica, 4, 4, node, vec![1]); + + let shortcut = try_shortcut_out_of_order_payload_noops(&cursor, vec![op.clone()], |lookup| { + assert_eq!(lookup, node); + Ok::<_, ()>(Some(( + 9, + 
OperationId { + replica: replica.clone(), + counter: 9, + }, + ))) + }) + .unwrap() + .expect("expected payload noop shortcut"); + + assert_eq!(shortcut.resumed_head.at.counter, 10); + assert_eq!(shortcut.resumed_head.seq, 6); + assert!(shortcut.remaining_ops.is_empty()); + assert_eq!(shortcut.affected_nodes, vec![node]); +} + +#[test] +fn payload_noop_shortcut_keeps_later_in_order_payload_for_incremental_materialization() { + let cursor = Cursor { + head_lamport: 10, + head_replica: b"r".to_vec(), + head_counter: 10, + head_seq: 5, + ..Cursor::default() + }; + let replica = ReplicaId::new(b"r"); + let node = NodeId(8); + let older = Operation::set_payload(&replica, 4, 4, node, vec![1]); + let newer = Operation::set_payload(&replica, 12, 12, node, vec![2]); + + let shortcut = try_shortcut_out_of_order_payload_noops( + &cursor, + vec![newer.clone(), older.clone()], + |_| Ok::<_, ()>(None), + ) + .unwrap() + .expect("expected payload noop shortcut"); + + assert_eq!(shortcut.resumed_head.seq, 6); + assert_eq!(shortcut.remaining_ops, vec![newer]); + assert_eq!(shortcut.affected_nodes, vec![node]); +} + +#[test] +fn payload_noop_shortcut_rejects_out_of_order_payload_that_becomes_final_winner() { + let cursor = Cursor { + head_lamport: 10, + head_replica: b"r".to_vec(), + head_counter: 10, + head_seq: 5, + ..Cursor::default() + }; + let replica = ReplicaId::new(b"r"); + let node = NodeId(9); + let op = Operation::set_payload(&replica, 4, 4, node, vec![1]); + + let shortcut = try_shortcut_out_of_order_payload_noops(&cursor, vec![op], |_| { + Ok::<_, ()>(Some(( + 2, + OperationId { + replica: ReplicaId::new(b"old"), + counter: 2, + }, + ))) + }) + .unwrap(); + + assert!(shortcut.is_none()); +} + +#[test] +fn payload_noop_shortcut_rejects_out_of_order_move() { + let cursor = Cursor { + head_lamport: 10, + head_replica: b"r".to_vec(), + head_counter: 10, + head_seq: 5, + ..Cursor::default() + }; + let replica = ReplicaId::new(b"r"); + let move_op = 
Operation::move_node(&replica, 4, 4, NodeId(3), NodeId::ROOT, vec![0x10]); + let called = Cell::new(false); + + let shortcut = try_shortcut_out_of_order_payload_noops( + &cursor, + vec![move_op], + |_| -> Result, ()> { + called.set(true); + Ok(None) + }, + ) + .unwrap(); + + assert!(shortcut.is_none()); + assert!(!called.get()); +} + +#[test] +fn catch_up_materialized_state_scans_storage_once() { + let replica = ReplicaId::new(b"scan-once"); + let first = Operation::insert(&replica, 1, 1, NodeId::ROOT, NodeId(1), vec![0x10]); + let second = Operation::insert(&replica, 2, 2, NodeId::ROOT, NodeId(2), vec![0x20]); + + let mut inner = MemoryStorage::default(); + inner.apply(first.clone()).unwrap(); + inner.apply(second.clone()).unwrap(); + let scan_count = Rc::new(Cell::new(0)); + let storage = CountingStorage { + inner, + scan_count: scan_count.clone(), + }; + let meta = Cursor { + replay_lamport: Some(first.meta.lamport), + replay_replica: Some(first.meta.id.replica.as_bytes().to_vec()), + replay_counter: Some(first.meta.id.counter), + ..Cursor::default() + }; + + let result = catch_up_materialized_state( + storage, + PersistedRemoteStores { + replica_id: ReplicaId::new(b"adapter"), + clock: LamportClock::default(), + nodes: MemoryNodeStore::default(), + payloads: MemoryPayloadStore::default(), + index: NoopParentOpIndex, + }, + &meta, + |_| Ok(()), + |_| Ok(()), + ) + .unwrap(); + + assert_eq!(scan_count.get(), 1); + assert_eq!(result.head.expect("expected head").seq, 2); + assert_eq!( + result.affected_nodes, + vec![NodeId::ROOT, NodeId(1), NodeId(2)] + ); +} diff --git a/packages/treecrdt-postgres-rs/Cargo.toml b/packages/treecrdt-postgres-rs/Cargo.toml index 2c05a76a..6f8239e2 100644 --- a/packages/treecrdt-postgres-rs/Cargo.toml +++ b/packages/treecrdt-postgres-rs/Cargo.toml @@ -13,5 +13,5 @@ serde_json = "1.0" treecrdt-core = { path = "../treecrdt-core", features = ["serde"] } [dev-dependencies] +treecrdt-test-support = { path = "../treecrdt-test-support" } 
uuid = { version = "1.8", features = ["v4"] } - diff --git a/packages/treecrdt-postgres-rs/src/profile.rs b/packages/treecrdt-postgres-rs/src/profile.rs index 1f06412f..b4ba7006 100644 --- a/packages/treecrdt-postgres-rs/src/profile.rs +++ b/packages/treecrdt-postgres-rs/src/profile.rs @@ -22,7 +22,7 @@ pub(crate) struct PgAppendProfile { pub(crate) dedupe_filter_ms: f64, pub(crate) materialize_ms: f64, pub(crate) update_head_ms: f64, - pub(crate) frontier_recorded: bool, + pub(crate) catch_up_needed: bool, pub(crate) node_load_count: u64, pub(crate) node_load_ms: f64, pub(crate) node_ensure_count: u64, @@ -74,7 +74,7 @@ impl PgAppendProfile { "dedupeFilterMs": self.dedupe_filter_ms, "materializeMs": self.materialize_ms, "updateHeadMs": self.update_head_ms, - "frontierRecorded": self.frontier_recorded, + "catchUpNeeded": self.catch_up_needed, "nodeLoadCount": self.node_load_count, "nodeLoadMs": self.node_load_ms, "nodeEnsureCount": self.node_ensure_count, diff --git a/packages/treecrdt-postgres-rs/src/store.rs b/packages/treecrdt-postgres-rs/src/store.rs index b5d830a3..ecd3debc 100644 --- a/packages/treecrdt-postgres-rs/src/store.rs +++ b/packages/treecrdt-postgres-rs/src/store.rs @@ -7,10 +7,11 @@ use postgres::{Client, Row, Statement}; use treecrdt_core::{ apply_persisted_remote_ops_with_delta, catch_up_materialized_state, - materialize_persisted_remote_ops_with_delta, Error, Lamport, LamportClock, - MaterializationCursor, MaterializationFrontier, MaterializationHead, MaterializationKey, - MaterializationState, NodeId, Operation, OperationId, OperationKind, PersistedRemoteStores, - ReplicaId, Result, Storage, VersionVector, + materialize_persisted_remote_ops_with_delta, try_shortcut_out_of_order_payload_noops, Error, + ExactNodeStore, ExactPayloadStore, Lamport, LamportClock, MaterializationCursor, + MaterializationFrontier, MaterializationHead, MaterializationKey, MaterializationState, NodeId, + NodeStore, Operation, OperationId, OperationKind, PayloadStore, 
PersistedRemoteStores, + ReplicaId, Result, Storage, TruncatingParentOpIndex, VersionVector, }; use crate::opref::{derive_op_ref_v0, OPREF_V0_WIDTH}; @@ -841,6 +842,56 @@ impl treecrdt_core::NodeStore for PgNodeStore { } } +impl ExactNodeStore for PgNodeStore { + fn set_last_change_exact(&mut self, node: NodeId, vv: &VersionVector) -> Result<()> { + self.ensure_node(node)?; + let node_bytes = node_to_bytes(node); + let bytes = if vv.is_empty() { + None + } else { + Some(vv_to_bytes(vv)?) + }; + let mut c = self.ctx.client.borrow_mut(); + let stmt = self.ctx.stmt( + &mut c, + "UPDATE treecrdt_nodes \ + SET last_change = $3 \ + WHERE doc_id = $1 AND node = $2", + )?; + c.execute(&stmt, &[&self.ctx.doc_id, &node_bytes.as_slice(), &bytes]) + .map_err(storage_debug)?; + + if let Some(Some(row)) = self.cache.borrow_mut().get_mut(&node) { + row.last_change = bytes; + } + self.pending_last_change.borrow_mut().remove(&node); + Ok(()) + } + + fn set_deleted_at_exact(&mut self, node: NodeId, vv: Option<&VersionVector>) -> Result<()> { + self.ensure_node(node)?; + let node_bytes = node_to_bytes(node); + let bytes = match vv { + Some(vv) if !vv.is_empty() => Some(vv_to_bytes(vv)?), + _ => None, + }; + let mut c = self.ctx.client.borrow_mut(); + let stmt = self.ctx.stmt( + &mut c, + "UPDATE treecrdt_nodes \ + SET deleted_at = $3 \ + WHERE doc_id = $1 AND node = $2", + )?; + c.execute(&stmt, &[&self.ctx.doc_id, &node_bytes.as_slice(), &bytes]) + .map_err(storage_debug)?; + + if let Some(Some(row)) = self.cache.borrow_mut().get_mut(&node) { + row.deleted_at = bytes; + } + Ok(()) + } +} + pub(crate) struct PgPayloadStore { ctx: PgCtx, cache: RefCell>>, @@ -976,6 +1027,21 @@ impl treecrdt_core::PayloadStore for PgPayloadStore { } } +impl ExactPayloadStore for PgPayloadStore { + fn clear_payload(&mut self, node: NodeId) -> Result<()> { + let node_bytes = node_to_bytes(node); + let mut c = self.ctx.client.borrow_mut(); + let stmt = self.ctx.stmt( + &mut c, + "DELETE FROM 
treecrdt_payload WHERE doc_id = $1 AND node = $2", + )?; + c.execute(&stmt, &[&self.ctx.doc_id, &node_bytes.as_slice()]) + .map_err(storage_debug)?; + self.cache.borrow_mut().insert(node, None); + Ok(()) + } +} + pub(crate) struct PgParentOpIndex { ctx: PgCtx, pending: Vec, @@ -1059,6 +1125,19 @@ impl treecrdt_core::ParentOpIndex for PgParentOpIndex { } } +impl TruncatingParentOpIndex for PgParentOpIndex { + fn truncate_from(&mut self, seq: u64) -> Result<()> { + self.pending.clear(); + let mut c = self.ctx.client.borrow_mut(); + let stmt = self.ctx.stmt( + &mut c, + "DELETE FROM treecrdt_oprefs_children WHERE doc_id = $1 AND seq >= $2", + )?; + c.execute(&stmt, &[&self.ctx.doc_id, &(seq as i64)]).map_err(storage_debug)?; + Ok(()) + } +} + const PARENT_OP_INDEX_FLUSH_SIZE: usize = 4096; struct PendingParentOpRefRow { @@ -1539,6 +1618,13 @@ fn materialize_inserted_ops( ) } +fn merge_affected_nodes(mut left: Vec, right: Vec) -> Vec { + left.extend(right); + left.sort(); + left.dedup(); + left +} + pub fn append_ops(client: &Rc>, doc_id: &str, ops: &[Operation]) -> Result { { let mut c = client.borrow_mut(); @@ -1634,18 +1720,83 @@ fn append_ops_in_tx( let materialize_started_at = Instant::now(); let mut update_head_ms = 0.0; - let apply_result = apply_persisted_remote_ops_with_delta( - &meta, - inserted_ops, - |inserted| materialize_inserted_ops(ctx.clone(), &meta, inserted), - |head| { - let started_at = Instant::now(); - let result = update_tree_meta_head(&ctx.client, &ctx.doc_id, Some(head)); - update_head_ms += started_at.elapsed().as_secs_f64() * 1000.0; - result - }, - |frontier| set_tree_meta_replay_frontier(client, doc_id, frontier), - )?; + let mut update_head = |head: &MaterializationHead| { + let started_at = Instant::now(); + let result = update_tree_meta_head(&ctx.client, &ctx.doc_id, Some(head)); + update_head_ms += started_at.elapsed().as_secs_f64() * 1000.0; + result + }; + let apply_result = if let Some(shortcut) = { + let payloads = 
PgPayloadStore::new(ctx.clone()); + try_shortcut_out_of_order_payload_noops(&meta, inserted_ops.clone(), |node| { + payloads.last_writer(node) + })? + } { + if shortcut.remaining_ops.is_empty() { + update_head(&shortcut.resumed_head)?; + treecrdt_core::PersistedRemoteApplyResult { + inserted_count: inserted_ops.len().min(u64::MAX as usize) as u64, + affected_nodes: shortcut.affected_nodes, + catch_up_needed: false, + } + } else { + let shortcut_meta = TreeMeta(MaterializationState { + head: Some(shortcut.resumed_head.clone()), + replay_from: None, + }); + let result = + materialize_inserted_ops(ctx.clone(), &shortcut_meta, shortcut.remaining_ops)?; + let head = result.head.ok_or_else(|| { + Error::Storage("expected head after payload noop shortcut".into()) + })?; + update_head(&head)?; + treecrdt_core::PersistedRemoteApplyResult { + inserted_count: inserted_ops.len().min(u64::MAX as usize) as u64, + affected_nodes: merge_affected_nodes( + shortcut.affected_nodes, + result.affected_nodes, + ), + catch_up_needed: false, + } + } + } else { + apply_persisted_remote_ops_with_delta( + &meta, + inserted_ops, + |inserted| materialize_inserted_ops(ctx.clone(), &meta, inserted), + &mut update_head, + |frontier| set_tree_meta_replay_frontier(client, doc_id, frontier), + )? 
+ }; + let apply_result = if apply_result.catch_up_needed { + let refreshed_meta = load_tree_meta_for_update(client, doc_id)?; + let catch_up = catch_up_materialized_state( + PgOpStorage::new(ctx.clone()), + PersistedRemoteStores { + replica_id: ReplicaId::new(b"postgres"), + clock: LamportClock::default(), + nodes: PgNodeStore::new(ctx.clone()), + payloads: PgPayloadStore::new(ctx.clone()), + index: PgParentOpIndex::new(ctx.clone()), + }, + &refreshed_meta, + |nodes| nodes.flush_last_change(), + |index| index.flush(), + )?; + update_head( + catch_up + .head + .as_ref() + .ok_or_else(|| Error::Storage("expected head after immediate catch-up".into()))?, + )?; + treecrdt_core::PersistedRemoteApplyResult { + inserted_count: apply_result.inserted_count, + affected_nodes: catch_up.affected_nodes, + catch_up_needed: false, + } + } else { + apply_result + }; if let Some(profile) = &append_profile { profile.borrow_mut().materialize_ms += materialize_started_at.elapsed().as_secs_f64() * 1000.0; @@ -1653,8 +1804,8 @@ fn append_ops_in_tx( if let Some(profile) = &append_profile { profile.borrow_mut().update_head_ms += update_head_ms; - if apply_result.frontier_recorded { - profile.borrow_mut().frontier_recorded = true; + if apply_result.catch_up_needed { + profile.borrow_mut().catch_up_needed = true; } profile.borrow().log(doc_id, apply_result.inserted_count as usize); } @@ -1701,7 +1852,7 @@ pub(crate) fn ensure_materialized_in_tx(client: &Rc>, doc_id: &s let ctx = PgCtx::new(client.clone(), doc_id)?; let storage = PgOpStorage::new(ctx.clone()); - let head = catch_up_materialized_state( + let catch_up = catch_up_materialized_state( storage, PersistedRemoteStores { replica_id: ReplicaId::new(b"postgres"), @@ -1715,7 +1866,7 @@ pub(crate) fn ensure_materialized_in_tx(client: &Rc>, doc_id: &s |index| index.flush(), )?; - update_tree_meta_head(client, doc_id, head.as_ref())?; + update_tree_meta_head(client, doc_id, catch_up.head.as_ref())?; Ok(()) } diff --git 
a/packages/treecrdt-postgres-rs/tests/postgres_test.rs b/packages/treecrdt-postgres-rs/tests/postgres_test.rs index bc3990e2..fb1ea431 100644 --- a/packages/treecrdt-postgres-rs/tests/postgres_test.rs +++ b/packages/treecrdt-postgres-rs/tests/postgres_test.rs @@ -12,34 +12,10 @@ use treecrdt_postgres::{ local_move, local_payload, max_lamport, replica_max_counter, reset_doc_for_tests, tree_children, tree_payload, }; - -fn order_key_from_position(position: u16) -> Vec { - let n = position.wrapping_add(1); - n.to_be_bytes().to_vec() -} - -fn node(n: u128) -> NodeId { - NodeId(n) -} - -fn representative_remote_batch(replica: &ReplicaId) -> (NodeId, NodeId, NodeId, Vec) { - let p1 = node(1); - let p2 = node(2); - let child = node(3); - ( - p1, - p2, - child, - vec![ - Operation::insert(replica, 1, 1, NodeId::ROOT, p1, order_key_from_position(0)), - Operation::insert(replica, 2, 2, NodeId::ROOT, p2, order_key_from_position(1)), - Operation::insert(replica, 3, 3, p1, child, order_key_from_position(0)), - Operation::set_payload(replica, 4, 4, child, vec![7]), - Operation::move_node(replica, 5, 5, child, p2, order_key_from_position(0)), - Operation::set_payload(replica, 6, 6, child, vec![8]), - ], - ) -} +use treecrdt_test_support::{ + self as materialization_conformance, node, order_key_from_position, + representative_remote_batch, MaterializationConformanceHarness, +}; fn connect() -> Option>> { let url = std::env::var("TREECRDT_POSTGRES_URL").ok()?; @@ -55,6 +31,97 @@ fn ensure_schema_once(client: &Rc>) { }); } +struct PgConformanceHarness { + client: Rc>, + doc_id: String, +} + +impl MaterializationConformanceHarness for PgConformanceHarness { + fn append_ops(&self, ops: &[Operation]) { + append_ops(&self.client, &self.doc_id, ops).unwrap(); + } + + fn append_ops_with_affected_nodes(&self, ops: &[Operation]) -> Vec { + append_ops_with_affected_nodes(&self.client, &self.doc_id, ops).unwrap() + } + + fn visible_children(&self, parent: NodeId) -> Vec { + 
tree_children(&self.client, &self.doc_id, parent).unwrap() + } + + fn payload(&self, node: NodeId) -> Option> { + tree_payload(&self.client, &self.doc_id, node).unwrap() + } + + fn replay_frontier(&self) -> Option { + let mut c = self.client.borrow_mut(); + let row = c + .query_one( + "SELECT replay_lamport, replay_replica, replay_counter \ + FROM treecrdt_meta WHERE doc_id = $1", + &[&self.doc_id], + ) + .unwrap(); + match ( + row.get::<_, Option>(0).map(|v| v.max(0) as u64), + row.get::<_, Option>>(1), + row.get::<_, Option>(2).map(|v| v.max(0) as u64), + ) { + (Some(lamport), Some(replica), Some(counter)) => { + Some(treecrdt_core::MaterializationFrontier { + lamport, + replica, + counter, + }) + } + _ => None, + } + } + + fn head_seq(&self) -> u64 { + let mut c = self.client.borrow_mut(); + let row = c + .query_one( + "SELECT head_seq FROM treecrdt_meta WHERE doc_id = $1", + &[&self.doc_id], + ) + .unwrap(); + row.get::<_, i64>(0).max(0) as u64 + } + + fn force_replay_from_start(&self) { + let mut c = self.client.borrow_mut(); + c.execute( + "UPDATE treecrdt_meta \ + SET replay_lamport = 0, replay_replica = ''::bytea, replay_counter = 0 \ + WHERE doc_id = $1", + &[&self.doc_id], + ) + .unwrap(); + } + + fn ensure_materialized(&self) { + ensure_materialized(&self.client, &self.doc_id).unwrap(); + } + + fn op_ref_counters_for_parent(&self, parent: NodeId) -> Vec { + let refs = list_op_refs_children(&self.client, &self.doc_id, parent).unwrap(); + let ops = get_ops_by_op_refs(&self.client, &self.doc_id, &refs).unwrap(); + ops.iter().map(|op| op.meta.id.counter).collect() + } +} + +fn setup_conformance_harness() -> Option { + let client = connect()?; + ensure_schema_once(&client); + let doc_id = format!("test-{}", Uuid::new_v4()); + { + let mut c = client.borrow_mut(); + reset_doc_for_tests(&mut c, &doc_id).unwrap(); + } + Some(PgConformanceHarness { client, doc_id }) +} + #[test] fn postgres_backend_apply_is_idempotent_and_max_lamport_monotonic() { let Some(client) 
= connect() else { @@ -165,93 +232,61 @@ fn postgres_backend_append_with_affected_nodes_matches_representative_remote_bat } #[test] -fn postgres_backend_out_of_order_append_uses_replay_frontier() { - let Some(client) = connect() else { +fn postgres_backend_out_of_order_append_catches_up_immediately_from_frontier() { + let Some(harness) = setup_conformance_harness() else { return; }; - ensure_schema_once(&client); - - let doc_id = format!("test-{}", Uuid::new_v4()); - { - let mut c = client.borrow_mut(); - reset_doc_for_tests(&mut c, &doc_id).unwrap(); - } - - let replica = ReplicaId::new(b"ooo"); - let second = Operation::insert( - &replica, - 2, - 2, - NodeId::ROOT, - node(2), - order_key_from_position(1), - ); - let first = Operation::insert( - &replica, - 1, - 1, - NodeId::ROOT, - node(1), - order_key_from_position(0), - ); + materialization_conformance::out_of_order_append_catches_up_immediately_from_frontier(&harness); +} - append_ops(&client, &doc_id, &[second]).unwrap(); - let affected = - append_ops_with_affected_nodes(&client, &doc_id, std::slice::from_ref(&first)).unwrap(); - assert!(affected.is_empty()); +#[test] +fn postgres_backend_out_of_order_losing_payload_skips_replay_frontier() { + let Some(harness) = setup_conformance_harness() else { + return; + }; + materialization_conformance::out_of_order_losing_payload_skips_replay_frontier(&harness); +} - let (replay_lamport, replay_replica, replay_counter, head_seq_before) = { - let mut c = client.borrow_mut(); - let row = c - .query_one( - "SELECT replay_lamport, replay_replica, replay_counter, head_seq \ - FROM treecrdt_meta WHERE doc_id = $1", - &[&doc_id], - ) - .unwrap(); - ( - row.get::<_, Option>(0).map(|v| v.max(0) as u64), - row.get::<_, Option>>(1), - row.get::<_, Option>(2).map(|v| v.max(0) as u64), - row.get::<_, i64>(3).max(0) as u64, - ) +#[test] +fn postgres_backend_out_of_order_move_with_later_payload_catches_up_immediately() { + let Some(harness) = setup_conformance_harness() else { + 
return; }; - assert_eq!(replay_lamport, Some(first.meta.lamport)); - assert_eq!( - replay_replica, - Some(first.meta.id.replica.as_bytes().to_vec()) + materialization_conformance::out_of_order_move_with_later_payload_catches_up_immediately( + &harness, ); - assert_eq!(replay_counter, Some(first.meta.id.counter)); - assert_eq!(head_seq_before, 1); +} - assert_eq!( - tree_children(&client, &doc_id, NodeId::ROOT).unwrap(), - vec![node(1), node(2)] +#[test] +fn postgres_backend_out_of_order_insert_and_move_before_head_catches_up_immediately() { + let Some(harness) = setup_conformance_harness() else { + return; + }; + materialization_conformance::out_of_order_insert_and_move_before_head_catches_up_immediately( + &harness, ); +} - let replay_after_read = { - let mut c = client.borrow_mut(); - let row = c - .query_one( - "SELECT replay_lamport, head_seq FROM treecrdt_meta WHERE doc_id = $1", - &[&doc_id], - ) - .unwrap(); - assert_eq!(row.get::<_, i64>(1).max(0) as u64, 2); - row.get::<_, Option>(0) +#[test] +fn postgres_backend_replay_from_start_frontier_catches_up_immediately() { + let Some(harness) = setup_conformance_harness() else { + return; }; - assert_eq!(replay_after_read, None); + materialization_conformance::replay_from_start_frontier_catches_up_immediately(&harness); +} - let refs = list_op_refs_children(&client, &doc_id, NodeId::ROOT).unwrap(); - let ops = get_ops_by_op_refs(&client, &doc_id, &refs).unwrap(); - assert_eq!( - ops.iter().map(|op| op.meta.id.counter).collect::>(), - vec![1, 2] +#[test] +fn postgres_backend_deferred_recovery_from_replay_frontier_catches_up_on_ensure() { + let Some(harness) = setup_conformance_harness() else { + return; + }; + materialization_conformance::deferred_recovery_from_replay_frontier_catches_up_on_ensure( + &harness, ); } #[test] -fn postgres_backend_replay_from_start_frontier_recovers_materialized_state() { +fn postgres_backend_failed_immediate_catch_up_rolls_back_inserted_ops_and_meta() { let Some(client) = connect() 
else { return; }; @@ -263,15 +298,7 @@ fn postgres_backend_replay_from_start_frontier_recovers_materialized_state() { reset_doc_for_tests(&mut c, &doc_id).unwrap(); } - let replica = ReplicaId::new(b"restart"); - let first = Operation::insert( - &replica, - 1, - 1, - NodeId::ROOT, - node(1), - order_key_from_position(0), - ); + let replica = ReplicaId::new(b"rollback"); let second = Operation::insert( &replica, 2, @@ -280,58 +307,92 @@ fn postgres_backend_replay_from_start_frontier_recovers_materialized_state() { node(2), order_key_from_position(1), ); + let first = Operation::insert( + &replica, + 1, + 1, + NodeId::ROOT, + node(1), + order_key_from_position(0), + ); - append_ops(&client, &doc_id, &[first]).unwrap(); + append_ops(&client, &doc_id, &[second]).unwrap(); + + let trigger_name = format!("fail_treecrdt_nodes_trigger_{}", Uuid::new_v4().simple()); + let function_name = format!("fail_treecrdt_nodes_fn_{}", Uuid::new_v4().simple()); { let mut c = client.borrow_mut(); - c.execute( - "UPDATE treecrdt_meta \ - SET replay_lamport = 0, replay_replica = ''::bytea, replay_counter = 0 \ - WHERE doc_id = $1", - &[&doc_id], - ) + c.batch_execute(&format!( + "CREATE FUNCTION {function_name}() RETURNS trigger LANGUAGE plpgsql AS $$ \ + BEGIN \ + RAISE EXCEPTION 'forced catch-up failure'; \ + END; \ + $$; \ + CREATE TRIGGER {trigger_name} \ + BEFORE INSERT OR UPDATE ON treecrdt_nodes \ + FOR EACH ROW \ + WHEN (NEW.doc_id = '{doc_id}') \ + EXECUTE FUNCTION {function_name}();" + )) .unwrap(); } - let affected = append_ops_with_affected_nodes(&client, &doc_id, &[second]).unwrap(); - assert!(affected.is_empty()); + let append_err = + append_ops_with_affected_nodes(&client, &doc_id, std::slice::from_ref(&first)).unwrap_err(); + assert!(append_err.to_string().contains("forced catch-up failure")); - let (replay_lamport, replay_replica, replay_counter) = { + let (op_count, replay_lamport, replay_replica, replay_counter, head_seq, children) = { + let root_bytes = 
NodeId::ROOT.0.to_be_bytes(); let mut c = client.borrow_mut(); - let row = c + let meta_row = c .query_one( - "SELECT replay_lamport, replay_replica, replay_counter \ + "SELECT \ + (SELECT COUNT(*) FROM treecrdt_ops WHERE doc_id = $1), \ + replay_lamport, replay_replica, replay_counter, head_seq \ FROM treecrdt_meta WHERE doc_id = $1", &[&doc_id], ) .unwrap(); + let child_rows = c + .query( + "SELECT node FROM treecrdt_nodes \ + WHERE doc_id = $1 AND parent = $2 AND tombstone = FALSE \ + ORDER BY order_key, node", + &[&doc_id, &root_bytes.as_slice()], + ) + .unwrap(); + let children = child_rows + .iter() + .map(|row| { + let bytes: Vec = row.get(0); + NodeId(u128::from_be_bytes(bytes.try_into().unwrap())) + }) + .collect::>(); ( - row.get::<_, Option>(0).map(|v| v.max(0) as u64), - row.get::<_, Option>>(1), - row.get::<_, Option>(2).map(|v| v.max(0) as u64), + meta_row.get::<_, i64>(0).max(0) as u64, + meta_row.get::<_, Option>(1).map(|v| v.max(0) as u64), + meta_row.get::<_, Option>>(2), + meta_row.get::<_, Option>(3).map(|v| v.max(0) as u64), + meta_row.get::<_, i64>(4).max(0) as u64, + children, ) }; - assert_eq!(replay_lamport, Some(0)); - assert_eq!(replay_replica, Some(Vec::new())); - assert_eq!(replay_counter, Some(0)); - assert_eq!( - tree_children(&client, &doc_id, NodeId::ROOT).unwrap(), - vec![node(1), node(2)] - ); + assert_eq!(op_count, 1); + assert_eq!(replay_lamport, None); + assert_eq!(replay_replica, None); + assert_eq!(replay_counter, None); + assert_eq!(head_seq, 1); + assert_eq!(children, vec![node(2)]); - let replay_after_read = { + { let mut c = client.borrow_mut(); - let row = c - .query_one( - "SELECT replay_lamport, head_seq FROM treecrdt_meta WHERE doc_id = $1", - &[&doc_id], - ) - .unwrap(); - assert_eq!(row.get::<_, i64>(1).max(0) as u64, 2); - row.get::<_, Option>(0) - }; - assert_eq!(replay_after_read, None); + c.batch_execute(&format!( + "DROP TRIGGER IF EXISTS {trigger_name} ON treecrdt_nodes; \ + DROP FUNCTION IF EXISTS 
{function_name}();" + )) + .unwrap(); + } } #[test] diff --git a/packages/treecrdt-sqlite-ext/Cargo.toml b/packages/treecrdt-sqlite-ext/Cargo.toml index 997831f2..a7cd112f 100644 --- a/packages/treecrdt-sqlite-ext/Cargo.toml +++ b/packages/treecrdt-sqlite-ext/Cargo.toml @@ -27,3 +27,4 @@ wasm-ext = ["static-link"] [dev-dependencies] tempfile = "3.10" +treecrdt-test-support = { path = "../treecrdt-test-support" } diff --git a/packages/treecrdt-sqlite-ext/src/extension/functions/materialize.rs b/packages/treecrdt-sqlite-ext/src/extension/functions/materialize.rs index c4879674..062638da 100644 --- a/packages/treecrdt-sqlite-ext/src/extension/functions/materialize.rs +++ b/packages/treecrdt-sqlite-ext/src/extension/functions/materialize.rs @@ -2,11 +2,19 @@ use super::append::JsonAppendOp; use super::node_store::SqliteNodeStore; use super::op_index::SqliteParentOpIndex; use super::payload_store::SqlitePayloadStore; -use super::schema::set_tree_meta_replay_frontier; +use super::schema::{set_tree_meta_replay_frontier, tree_meta_from_state}; use super::util::sqlite_err_from_core; use super::*; -use treecrdt_core::MaterializationCursor; +use treecrdt_core::PayloadStore; use treecrdt_core::Storage; +use treecrdt_core::{LamportClock, MaterializationCursor, ReplicaId}; + +fn merge_affected_nodes(mut left: Vec, right: Vec) -> Vec { + left.extend(right); + left.sort(); + left.dedup(); + left +} fn parse_node_id(bytes: &[u8]) -> Result { if bytes.len() != 16 { @@ -183,7 +191,7 @@ fn catch_up_materialized_from_frontier(db: *mut sqlite3) -> Result<(), c_int> { return Err(SQLITE_ERROR as c_int); } }; - let head = match catch_up_materialized_state( + let catch_up = match catch_up_materialized_state( storage, treecrdt_core::PersistedRemoteStores { replica_id: ReplicaId::new(b"sqlite-ext"), @@ -203,7 +211,7 @@ fn catch_up_materialized_from_frontier(db: *mut sqlite3) -> Result<(), c_int> { } }; - let head_rc = update_tree_meta_head(db, head.as_ref()); + let head_rc = 
update_tree_meta_head(db, catch_up.head.as_ref()); if head_rc.is_err() { sqlite_exec(db, rollback.as_ptr(), None, null_mut(), null_mut()); return head_rc; @@ -264,13 +272,77 @@ pub(super) fn append_ops_impl( } } - let apply_result = treecrdt_core::apply_persisted_remote_ops_with_delta( - &meta, - inserted_ops, - |inserted| materialize_inserted_ops(db, doc_id, &meta, &inserted), - |head| update_tree_meta_head(db, Some(head)), - |frontier| set_tree_meta_replay_frontier(db, frontier), - )?; + let apply_result = if let Some(shortcut) = { + let payloads = SqlitePayloadStore::prepare(db).map_err(|_| SQLITE_ERROR as c_int)?; + treecrdt_core::try_shortcut_out_of_order_payload_noops( + &meta, + inserted_ops.clone(), + |node| payloads.last_writer(node).map_err(sqlite_err_from_core), + )? + } { + if shortcut.remaining_ops.is_empty() { + update_tree_meta_head(db, Some(&shortcut.resumed_head))?; + treecrdt_core::PersistedRemoteApplyResult { + inserted_count: inserted_ops.len().min(u64::MAX as usize) as u64, + affected_nodes: shortcut.affected_nodes, + catch_up_needed: false, + } + } else { + let shortcut_meta = tree_meta_from_state(treecrdt_core::MaterializationState { + head: Some(shortcut.resumed_head.clone()), + replay_from: None, + }); + let result = + materialize_inserted_ops(db, doc_id, &shortcut_meta, &shortcut.remaining_ops)?; + let head = result.head.ok_or(SQLITE_ERROR as c_int)?; + update_tree_meta_head(db, Some(&head))?; + treecrdt_core::PersistedRemoteApplyResult { + inserted_count: inserted_ops.len().min(u64::MAX as usize) as u64, + affected_nodes: merge_affected_nodes( + shortcut.affected_nodes, + result.affected_nodes, + ), + catch_up_needed: false, + } + } + } else { + treecrdt_core::apply_persisted_remote_ops_with_delta( + &meta, + inserted_ops, + |inserted| materialize_inserted_ops(db, doc_id, &meta, &inserted), + |head| update_tree_meta_head(db, Some(head)), + |frontier| set_tree_meta_replay_frontier(db, frontier), + )? 
+ }; + let apply_result = if apply_result.catch_up_needed { + let refreshed_meta = load_tree_meta(db)?; + let catch_up = treecrdt_core::catch_up_materialized_state( + super::op_storage::SqliteOpStorage::with_doc_id(db, doc_id.to_vec()), + treecrdt_core::PersistedRemoteStores { + replica_id: ReplicaId::new(b"sqlite-ext"), + clock: LamportClock::default(), + nodes: SqliteNodeStore::prepare(db).map_err(|_| SQLITE_ERROR as c_int)?, + payloads: SqlitePayloadStore::prepare(db).map_err(|_| SQLITE_ERROR as c_int)?, + index: SqliteParentOpIndex::prepare(db, doc_id.to_vec()) + .map_err(|_| SQLITE_ERROR as c_int)?, + }, + &refreshed_meta, + |_| Ok(()), + |_| Ok(()), + ) + .map_err(|_| SQLITE_ERROR as c_int)?; + update_tree_meta_head( + db, + Some(catch_up.head.as_ref().ok_or(SQLITE_ERROR as c_int)?), + )?; + treecrdt_core::PersistedRemoteApplyResult { + inserted_count: apply_result.inserted_count, + affected_nodes: catch_up.affected_nodes, + catch_up_needed: false, + } + } else { + apply_result + }; let commit_rc = sqlite_exec(db, commit.as_ptr(), None, null_mut(), null_mut()); if commit_rc != SQLITE_OK as c_int { diff --git a/packages/treecrdt-sqlite-ext/src/extension/functions/node_store.rs b/packages/treecrdt-sqlite-ext/src/extension/functions/node_store.rs index 00c16ef5..3c594c70 100644 --- a/packages/treecrdt-sqlite-ext/src/extension/functions/node_store.rs +++ b/packages/treecrdt-sqlite-ext/src/extension/functions/node_store.rs @@ -1,5 +1,6 @@ use super::*; use std::slice; +use treecrdt_core::NodeStore; fn sqlite_node_id_bytes(node: NodeId) -> [u8; 16] { node.0.to_be_bytes() @@ -773,3 +774,81 @@ impl treecrdt_core::NodeStore for SqliteNodeStore { Ok(out) } } + +impl treecrdt_core::ExactNodeStore for SqliteNodeStore { + fn set_last_change_exact( + &mut self, + node: NodeId, + vv: &VersionVector, + ) -> treecrdt_core::Result<()> { + self.ensure_node(node)?; + let node_bytes = sqlite_node_id_bytes(node); + unsafe { + sqlite_clear_bindings(self.update_last_change); + 
sqlite_reset(self.update_last_change); + sqlite_bind_blob( + self.update_last_change, + 1, + node_bytes.as_ptr() as *const c_void, + node_bytes.len() as c_int, + None, + ); + if vv.is_empty() { + sqlite_bind_null(self.update_last_change, 2); + } else { + let bytes = vv_to_bytes(vv)?; + sqlite_bind_blob( + self.update_last_change, + 2, + bytes.as_ptr() as *const c_void, + bytes.len() as c_int, + None, + ); + } + let step_rc = sqlite_step(self.update_last_change); + sqlite_reset(self.update_last_change); + if step_rc != SQLITE_DONE as c_int { + return Err(sqlite_rc_error(step_rc, "set exact last_change failed")); + } + } + Ok(()) + } + + fn set_deleted_at_exact( + &mut self, + node: NodeId, + vv: Option<&VersionVector>, + ) -> treecrdt_core::Result<()> { + self.ensure_node(node)?; + let node_bytes = sqlite_node_id_bytes(node); + unsafe { + sqlite_clear_bindings(self.update_deleted_at); + sqlite_reset(self.update_deleted_at); + sqlite_bind_blob( + self.update_deleted_at, + 1, + node_bytes.as_ptr() as *const c_void, + node_bytes.len() as c_int, + None, + ); + if let Some(vv) = vv.filter(|vv| !vv.is_empty()) { + let bytes = vv_to_bytes(vv)?; + sqlite_bind_blob( + self.update_deleted_at, + 2, + bytes.as_ptr() as *const c_void, + bytes.len() as c_int, + None, + ); + } else { + sqlite_bind_null(self.update_deleted_at, 2); + } + let step_rc = sqlite_step(self.update_deleted_at); + sqlite_reset(self.update_deleted_at); + if step_rc != SQLITE_DONE as c_int { + return Err(sqlite_rc_error(step_rc, "set exact deleted_at failed")); + } + } + Ok(()) + } +} diff --git a/packages/treecrdt-sqlite-ext/src/extension/functions/op_index.rs b/packages/treecrdt-sqlite-ext/src/extension/functions/op_index.rs index 997565ec..54963e5d 100644 --- a/packages/treecrdt-sqlite-ext/src/extension/functions/op_index.rs +++ b/packages/treecrdt-sqlite-ext/src/extension/functions/op_index.rs @@ -102,3 +102,24 @@ impl treecrdt_core::ParentOpIndex for SqliteParentOpIndex { Ok(()) } } + +impl 
treecrdt_core::TruncatingParentOpIndex for SqliteParentOpIndex { + fn truncate_from(&mut self, seq: u64) -> treecrdt_core::Result<()> { + let sql = + CString::new("DELETE FROM oprefs_children WHERE seq >= ?1").expect("truncate oprefs"); + let mut stmt: *mut sqlite3_stmt = null_mut(); + let prep_rc = sqlite_prepare_v2(self.db, sql.as_ptr(), -1, &mut stmt, null_mut()); + if prep_rc != SQLITE_OK as c_int { + return Err(sqlite_rc_error(prep_rc, "prepare truncate oprefs failed")); + } + unsafe { + sqlite_bind_int64(stmt, 1, seq.min(i64::MAX as u64) as i64); + let step_rc = sqlite_step(stmt); + sqlite_finalize(stmt); + if step_rc != SQLITE_DONE as c_int { + return Err(sqlite_rc_error(step_rc, "truncate oprefs step failed")); + } + } + Ok(()) + } +} diff --git a/packages/treecrdt-sqlite-ext/src/extension/functions/payload_store.rs b/packages/treecrdt-sqlite-ext/src/extension/functions/payload_store.rs index bdd0d111..1051bc44 100644 --- a/packages/treecrdt-sqlite-ext/src/extension/functions/payload_store.rs +++ b/packages/treecrdt-sqlite-ext/src/extension/functions/payload_store.rs @@ -12,6 +12,7 @@ pub(super) struct SqlitePayloadStore { db: *mut sqlite3, select: *mut sqlite3_stmt, upsert: *mut sqlite3_stmt, + delete: *mut sqlite3_stmt, } impl SqlitePayloadStore { @@ -31,9 +32,12 @@ impl SqlitePayloadStore { last_counter = excluded.last_counter", ) .expect("upsert payload sql"); + let delete_sql = + CString::new("DELETE FROM tree_payload WHERE node = ?1").expect("delete payload sql"); let mut select: *mut sqlite3_stmt = null_mut(); let mut upsert: *mut sqlite3_stmt = null_mut(); + let mut delete: *mut sqlite3_stmt = null_mut(); let prep = |sql: &CString, stmt: &mut *mut sqlite3_stmt| -> treecrdt_core::Result<()> { let rc = sqlite_prepare_v2(db, sql.as_ptr(), -1, stmt, null_mut()); @@ -44,8 +48,14 @@ impl SqlitePayloadStore { }; prep(&select_sql, &mut select)?; prep(&upsert_sql, &mut upsert)?; + prep(&delete_sql, &mut delete)?; - Ok(Self { db, select, upsert }) + Ok(Self { 
+ db, + select, + upsert, + delete, + }) } } @@ -54,6 +64,7 @@ impl Drop for SqlitePayloadStore { unsafe { sqlite_finalize(self.select); sqlite_finalize(self.upsert); + sqlite_finalize(self.delete); } } } @@ -223,3 +234,30 @@ impl treecrdt_core::PayloadStore for SqlitePayloadStore { Ok(()) } } + +impl treecrdt_core::ExactPayloadStore for SqlitePayloadStore { + fn clear_payload(&mut self, node: NodeId) -> treecrdt_core::Result<()> { + let node_bytes = sqlite_node_id_bytes(node); + unsafe { + sqlite_clear_bindings(self.delete); + sqlite_reset(self.delete); + let bind_rc = sqlite_bind_blob( + self.delete, + 1, + node_bytes.as_ptr() as *const c_void, + node_bytes.len() as c_int, + None, + ); + if bind_rc != SQLITE_OK as c_int { + sqlite_reset(self.delete); + return Err(sqlite_rc_error(bind_rc, "bind delete payload failed")); + } + let step_rc = sqlite_step(self.delete); + sqlite_reset(self.delete); + if step_rc != SQLITE_DONE as c_int { + return Err(sqlite_rc_error(step_rc, "delete payload step failed")); + } + } + Ok(()) + } +} diff --git a/packages/treecrdt-sqlite-ext/src/extension/functions/schema.rs b/packages/treecrdt-sqlite-ext/src/extension/functions/schema.rs index d7ea904f..ec3130c3 100644 --- a/packages/treecrdt-sqlite-ext/src/extension/functions/schema.rs +++ b/packages/treecrdt-sqlite-ext/src/extension/functions/schema.rs @@ -20,6 +20,10 @@ impl MaterializationCursor for TreeMeta { } } +pub(super) fn tree_meta_from_state(state: MaterializationState) -> TreeMeta { + TreeMeta(state) +} + pub(super) fn load_doc_id(db: *mut sqlite3) -> Result>, c_int> { let sql = CString::new("SELECT value FROM meta WHERE key = 'doc_id' LIMIT 1").expect("doc id sql"); diff --git a/packages/treecrdt-sqlite-ext/tests/extension_roundtrip.rs b/packages/treecrdt-sqlite-ext/tests/extension_roundtrip.rs index c9594b27..cab60588 100644 --- a/packages/treecrdt-sqlite-ext/tests/extension_roundtrip.rs +++ b/packages/treecrdt-sqlite-ext/tests/extension_roundtrip.rs @@ -4,7 +4,13 @@ use 
std::path::PathBuf; use rusqlite::Connection; use serde::{Deserialize, Serialize}; -use treecrdt_core::{order_key::allocate_between, ReplicaId, VersionVector}; +use treecrdt_core::{ + order_key::allocate_between, NodeId, Operation, OperationKind, ReplicaId, VersionVector, +}; +use treecrdt_test_support::{ + self as materialization_conformance, representative_remote_batch, + MaterializationConformanceHarness, +}; #[derive(Clone, Deserialize, Serialize)] struct JsonOp { @@ -20,6 +26,77 @@ struct JsonOp { payload: Option>, } +fn node_bytes_from_id(node: NodeId) -> Vec { + node.0.to_be_bytes().to_vec() +} + +fn bytes_to_node_id(bytes: &[u8]) -> NodeId { + NodeId(u128::from_be_bytes(bytes.try_into().unwrap())) +} + +fn vv_to_bytes(vv: &VersionVector) -> Vec { + serde_json::to_vec(vv).unwrap() +} + +fn json_op(op: &Operation) -> JsonOp { + let (kind, parent, node, new_parent, order_key, payload) = match &op.kind { + OperationKind::Insert { + parent, + node, + order_key, + payload, + } => ( + "insert", + Some(parent.0.to_be_bytes()), + node.0.to_be_bytes(), + None, + Some(order_key.clone()), + payload.clone(), + ), + OperationKind::Move { + node, + new_parent, + order_key, + } => ( + "move", + None, + node.0.to_be_bytes(), + Some(new_parent.0.to_be_bytes()), + Some(order_key.clone()), + None, + ), + OperationKind::Delete { node } => ("delete", None, node.0.to_be_bytes(), None, None, None), + OperationKind::Tombstone { node } => { + ("tombstone", None, node.0.to_be_bytes(), None, None, None) + } + OperationKind::Payload { node, payload } => ( + "payload", + None, + node.0.to_be_bytes(), + None, + None, + payload.clone(), + ), + }; + + JsonOp { + replica: op.meta.id.replica.as_bytes().to_vec(), + counter: op.meta.id.counter, + lamport: op.meta.lamport, + kind: kind.into(), + parent, + node, + new_parent, + order_key, + known_state: op.meta.known_state.as_ref().map(vv_to_bytes), + payload, + } +} + +fn json_ops(ops: &[Operation]) -> Vec { + ops.iter().map(json_op).collect() 
+} + fn read_tree_meta(conn: &Connection) -> (i64, Vec, i64, i64) { conn.query_row( "SELECT head_lamport, head_replica, head_counter, head_seq FROM tree_meta WHERE id = 1", @@ -103,90 +180,82 @@ fn ops_by_oprefs(conn: &Connection, refs: &[Vec]) -> Vec { serde_json::from_str(&ops_json).unwrap() } -fn representative_remote_batch(replica: &[u8]) -> (Vec, Vec, Vec, Vec) { - let root = node_bytes(0); - let p1 = node_bytes(1); - let p2 = node_bytes(2); - let child = node_bytes(3); - ( - p1.clone(), - p2.clone(), - child.clone(), - vec![ - JsonOp { - replica: replica.to_vec(), - counter: 1, - lamport: 1, - kind: "insert".into(), - parent: Some(<[u8; 16]>::try_from(root.as_slice()).unwrap()), - node: <[u8; 16]>::try_from(p1.as_slice()).unwrap(), - new_parent: None, - order_key: Some((1u16).to_be_bytes().to_vec()), - known_state: None, - payload: None, - }, - JsonOp { - replica: replica.to_vec(), - counter: 2, - lamport: 2, - kind: "insert".into(), - parent: Some(<[u8; 16]>::try_from(root.as_slice()).unwrap()), - node: <[u8; 16]>::try_from(p2.as_slice()).unwrap(), - new_parent: None, - order_key: Some((2u16).to_be_bytes().to_vec()), - known_state: None, - payload: None, - }, - JsonOp { - replica: replica.to_vec(), - counter: 3, - lamport: 3, - kind: "insert".into(), - parent: Some(<[u8; 16]>::try_from(p1.as_slice()).unwrap()), - node: <[u8; 16]>::try_from(child.as_slice()).unwrap(), - new_parent: None, - order_key: Some((1u16).to_be_bytes().to_vec()), - known_state: None, - payload: None, - }, - JsonOp { - replica: replica.to_vec(), - counter: 4, - lamport: 4, - kind: "payload".into(), - parent: None, - node: <[u8; 16]>::try_from(child.as_slice()).unwrap(), - new_parent: None, - order_key: None, - known_state: None, - payload: Some(vec![7]), - }, - JsonOp { - replica: replica.to_vec(), - counter: 5, - lamport: 5, - kind: "move".into(), - parent: None, - node: <[u8; 16]>::try_from(child.as_slice()).unwrap(), - new_parent: Some(<[u8; 16]>::try_from(p2.as_slice()).unwrap()), - 
order_key: Some((1u16).to_be_bytes().to_vec()), - known_state: None, - payload: None, - }, - JsonOp { - replica: replica.to_vec(), - counter: 6, - lamport: 6, - kind: "payload".into(), - parent: None, - node: <[u8; 16]>::try_from(child.as_slice()).unwrap(), - new_parent: None, - order_key: None, - known_state: None, - payload: Some(vec![8]), - }, - ], - ) +struct SqliteConformanceHarness { + conn: Connection, +} + +impl MaterializationConformanceHarness for SqliteConformanceHarness { + fn append_ops(&self, ops: &[Operation]) { + append_ops_json(&self.conn, &json_ops(ops)); + } + + fn append_ops_with_affected_nodes(&self, ops: &[Operation]) -> Vec { + let (affected, _) = append_ops_json(&self.conn, &json_ops(ops)); + affected.iter().map(|bytes| bytes_to_node_id(bytes)).collect() + } + + fn visible_children(&self, parent: NodeId) -> Vec { + visible_children(&self.conn, &node_bytes_from_id(parent)) + .iter() + .map(|bytes| bytes_to_node_id(bytes)) + .collect() + } + + fn payload(&self, node: NodeId) -> Option> { + payload_bytes(&self.conn, &node_bytes_from_id(node)) + } + + fn replay_frontier(&self) -> Option { + match read_replay_frontier(&self.conn) { + (Some(lamport), Some(replica), Some(counter)) => { + Some(treecrdt_core::MaterializationFrontier { + lamport: lamport.max(0) as u64, + replica, + counter: counter.max(0) as u64, + }) + } + _ => None, + } + } + + fn head_seq(&self) -> u64 { + let (_, _, _, head_seq) = read_tree_meta(&self.conn); + head_seq.max(0) as u64 + } + + fn force_replay_from_start(&self) { + self.conn + .execute( + "UPDATE tree_meta \ + SET replay_lamport = 0, replay_replica = X'', replay_counter = 0 \ + WHERE id = 1", + [], + ) + .unwrap(); + } + + fn ensure_materialized(&self) { + let _: i64 = self + .conn + .query_row("SELECT treecrdt_ensure_materialized()", [], |row| { + row.get(0) + }) + .unwrap(); + } + + fn op_ref_counters_for_parent(&self, parent: NodeId) -> Vec { + ops_by_oprefs( + &self.conn, + &oprefs_children(&self.conn, 
&node_bytes_from_id(parent)), + ) + .iter() + .map(|op| op.counter) + .collect() + } +} + +fn setup_conformance_harness() -> SqliteConformanceHarness { + SqliteConformanceHarness { conn: setup_conn() } } #[test] @@ -357,143 +426,129 @@ fn remote_append_representative_batch_matches_postgres_shape() { let conn = setup_conn(); let root = node_bytes(0); - let (p1, p2, child, ops) = representative_remote_batch(b"rep"); - let (affected, _) = append_ops_json(&conn, &ops); + let replica = ReplicaId::new(b"rep"); + let (p1, p2, child, ops) = representative_remote_batch(&replica); + let (affected, _) = append_ops_json(&conn, &json_ops(&ops)); assert_eq!( affected, - vec![root.clone(), p1.clone(), p2.clone(), child.clone()] + vec![ + root.clone(), + node_bytes_from_id(p1), + node_bytes_from_id(p2), + node_bytes_from_id(child) + ] + ); + assert_eq!( + visible_children(&conn, &root), + vec![node_bytes_from_id(p1), node_bytes_from_id(p2)] + ); + assert_eq!( + visible_children(&conn, &node_bytes_from_id(p2)), + vec![node_bytes_from_id(child)] + ); + assert_eq!( + payload_bytes(&conn, &node_bytes_from_id(child)), + Some(vec![8]) ); - assert_eq!(visible_children(&conn, &root), vec![p1.clone(), p2.clone()]); - assert_eq!(visible_children(&conn, &p2), vec![child.clone()]); - assert_eq!(payload_bytes(&conn, &child), Some(vec![8])); - let ops_p2 = ops_by_oprefs(&conn, &oprefs_children(&conn, &p2)); + let ops_p2 = ops_by_oprefs(&conn, &oprefs_children(&conn, &node_bytes_from_id(p2))); assert!(ops_p2.iter().any(|op| op.kind == "move")); assert!(ops_p2.iter().any(|op| op.kind == "payload")); } #[test] -fn remote_append_out_of_order_uses_replay_frontier() { - let conn = setup_conn(); - - let root = node_bytes(0); - let second = JsonOp { - replica: b"ooo".to_vec(), - counter: 2, - lamport: 2, - kind: "insert".into(), - parent: Some(<[u8; 16]>::try_from(root.as_slice()).unwrap()), - node: <[u8; 16]>::try_from(node_bytes(2).as_slice()).unwrap(), - new_parent: None, - order_key: 
Some((2u16).to_be_bytes().to_vec()), - known_state: None, - payload: None, - }; - let first = JsonOp { - replica: b"ooo".to_vec(), - counter: 1, - lamport: 1, - kind: "insert".into(), - parent: Some(<[u8; 16]>::try_from(root.as_slice()).unwrap()), - node: <[u8; 16]>::try_from(node_bytes(1).as_slice()).unwrap(), - new_parent: None, - order_key: Some((1u16).to_be_bytes().to_vec()), - known_state: None, - payload: None, - }; - - append_ops_json(&conn, &[second]); - let (affected, _) = append_ops_json(&conn, &[first.clone()]); - assert!(affected.is_empty()); +fn remote_append_out_of_order_catches_up_immediately_from_frontier() { + let harness = setup_conformance_harness(); + materialization_conformance::out_of_order_append_catches_up_immediately_from_frontier(&harness); +} - let (_, _, _, head_seq_before) = read_tree_meta(&conn); - let (replay_lamport, replay_replica, replay_counter) = read_replay_frontier(&conn); - assert_eq!(head_seq_before, 1); - assert_eq!(replay_lamport, Some(first.lamport as i64)); - assert_eq!(replay_replica, Some(first.replica.clone())); - assert_eq!(replay_counter, Some(first.counter as i64)); +#[test] +fn remote_append_out_of_order_losing_payload_skips_replay_frontier() { + let harness = setup_conformance_harness(); + materialization_conformance::out_of_order_losing_payload_skips_replay_frontier(&harness); +} - let _: i64 = conn - .query_row("SELECT treecrdt_ensure_materialized()", [], |row| { - row.get(0) - }) - .unwrap(); +#[test] +fn remote_append_out_of_order_move_with_later_payload_catches_up_immediately() { + let harness = setup_conformance_harness(); + materialization_conformance::out_of_order_move_with_later_payload_catches_up_immediately( + &harness, + ); +} - assert_eq!( - visible_children(&conn, &root), - vec![node_bytes(1), node_bytes(2)] +#[test] +fn remote_append_out_of_order_insert_and_move_before_head_catches_up_immediately() { + let harness = setup_conformance_harness(); + 
materialization_conformance::out_of_order_insert_and_move_before_head_catches_up_immediately( + &harness, ); - let (_, _, _, head_seq_after) = read_tree_meta(&conn); - assert_eq!(head_seq_after, 2); - assert_eq!(read_replay_frontier(&conn), (None, None, None)); +} - let ops = ops_by_oprefs(&conn, &oprefs_children(&conn, &root)); - assert_eq!( - ops.iter().map(|op| op.counter).collect::>(), - vec![1, 2] +#[test] +fn remote_append_replay_from_start_frontier_catches_up_immediately() { + let harness = setup_conformance_harness(); + materialization_conformance::replay_from_start_frontier_catches_up_immediately(&harness); +} + +#[test] +fn remote_deferred_recovery_from_replay_frontier_catches_up_on_ensure() { + let harness = setup_conformance_harness(); + materialization_conformance::deferred_recovery_from_replay_frontier_catches_up_on_ensure( + &harness, ); } #[test] -fn remote_append_replay_from_start_frontier_recovers_materialized_state() { +fn remote_failed_immediate_catch_up_rolls_back_inserted_ops_and_meta() { let conn = setup_conn(); - let root = node_bytes(0); - let first = JsonOp { - replica: b"restart".to_vec(), - counter: 1, - lamport: 1, - kind: "insert".into(), - parent: Some(<[u8; 16]>::try_from(root.as_slice()).unwrap()), - node: <[u8; 16]>::try_from(node_bytes(1).as_slice()).unwrap(), - new_parent: None, - order_key: Some((1u16).to_be_bytes().to_vec()), - known_state: None, - payload: None, - }; - let second = JsonOp { - replica: b"restart".to_vec(), - counter: 2, - lamport: 2, - kind: "insert".into(), - parent: Some(<[u8; 16]>::try_from(root.as_slice()).unwrap()), - node: <[u8; 16]>::try_from(node_bytes(2).as_slice()).unwrap(), - new_parent: None, - order_key: Some((2u16).to_be_bytes().to_vec()), - known_state: None, - payload: None, - }; + let replica = ReplicaId::new(b"rollback"); + let second = Operation::insert( + &replica, + 2, + 2, + NodeId::ROOT, + materialization_conformance::node(2), + materialization_conformance::order_key_from_position(1), + 
); + let first = Operation::insert( + &replica, + 1, + 1, + NodeId::ROOT, + materialization_conformance::node(1), + materialization_conformance::order_key_from_position(0), + ); - append_ops_json(&conn, &[first]); - conn.execute( - "UPDATE tree_meta \ - SET replay_lamport = 0, replay_replica = X'', replay_counter = 0 \ - WHERE id = 1", - [], + append_ops_json(&conn, &json_ops(&[second])); + conn.execute_batch( + "CREATE TRIGGER fail_tree_nodes_insert \ + BEFORE INSERT ON tree_nodes \ + BEGIN \ + SELECT RAISE(ROLLBACK, 'forced catch-up failure'); \ + END;", ) .unwrap(); - let (affected, _) = append_ops_json(&conn, &[second]); - assert!(affected.is_empty()); - assert_eq!( - read_replay_frontier(&conn), - (Some(0), Some(Vec::new()), Some(0)) + let append_json = serde_json::to_string(&json_ops(&[first])).unwrap(); + let append_result: rusqlite::Result = conn.query_row( + "SELECT treecrdt_append_ops(?1)", + rusqlite::params![append_json], + |row| row.get(0), ); + assert!(append_result.is_err()); - let _: i64 = conn - .query_row("SELECT treecrdt_ensure_materialized()", [], |row| { - row.get(0) - }) - .unwrap(); + let (_, _, _, head_seq) = read_tree_meta(&conn); + let op_count: i64 = conn.query_row("SELECT COUNT(*) FROM ops", [], |row| row.get(0)).unwrap(); + assert_eq!(op_count, 1); + assert_eq!(read_replay_frontier(&conn), (None, None, None)); + assert_eq!(head_seq, 1); assert_eq!( - visible_children(&conn, &root), - vec![node_bytes(1), node_bytes(2)] + visible_children(&conn, &node_bytes(0)), + vec![node_bytes_from_id(materialization_conformance::node(2))] ); - let (_, _, _, head_seq) = read_tree_meta(&conn); - assert_eq!(head_seq, 2); - assert_eq!(read_replay_frontier(&conn), (None, None, None)); } #[test] diff --git a/packages/treecrdt-test-support/Cargo.toml b/packages/treecrdt-test-support/Cargo.toml new file mode 100644 index 00000000..b63bd89a --- /dev/null +++ b/packages/treecrdt-test-support/Cargo.toml @@ -0,0 +1,10 @@ +[package] +name = "treecrdt-test-support" 
+version = "0.0.1" +edition = "2021" +license = "MIT" +publish = false +description = "Shared test support for TreeCRDT backend conformance suites." + +[dependencies] +treecrdt-core = { path = "../treecrdt-core" } diff --git a/packages/treecrdt-test-support/src/lib.rs b/packages/treecrdt-test-support/src/lib.rs new file mode 100644 index 00000000..22834a18 --- /dev/null +++ b/packages/treecrdt-test-support/src/lib.rs @@ -0,0 +1,233 @@ +use std::slice; + +use treecrdt_core::{MaterializationFrontier, NodeId, Operation, ReplicaId}; + +pub trait MaterializationConformanceHarness { + fn append_ops(&self, ops: &[Operation]); + fn append_ops_with_affected_nodes(&self, ops: &[Operation]) -> Vec; + fn visible_children(&self, parent: NodeId) -> Vec; + fn payload(&self, node: NodeId) -> Option>; + fn replay_frontier(&self) -> Option; + fn head_seq(&self) -> u64; + fn force_replay_from_start(&self); + fn ensure_materialized(&self); + fn op_ref_counters_for_parent(&self, parent: NodeId) -> Vec; +} + +pub fn order_key_from_position(position: u16) -> Vec { + let n = position.wrapping_add(1); + n.to_be_bytes().to_vec() +} + +pub fn node(n: u128) -> NodeId { + NodeId(n) +} + +pub fn representative_remote_batch( + replica: &ReplicaId, +) -> (NodeId, NodeId, NodeId, Vec) { + let p1 = node(1); + let p2 = node(2); + let child = node(3); + ( + p1, + p2, + child, + vec![ + Operation::insert(replica, 1, 1, NodeId::ROOT, p1, order_key_from_position(0)), + Operation::insert(replica, 2, 2, NodeId::ROOT, p2, order_key_from_position(1)), + Operation::insert(replica, 3, 3, p1, child, order_key_from_position(0)), + Operation::set_payload(replica, 4, 4, child, vec![7]), + Operation::move_node(replica, 5, 5, child, p2, order_key_from_position(0)), + Operation::set_payload(replica, 6, 6, child, vec![8]), + ], + ) +} + +fn assert_replay_cleared(harness: &H) { + assert_eq!(harness.replay_frontier(), None); +} + +pub fn out_of_order_append_catches_up_immediately_from_frontier< + H: 
MaterializationConformanceHarness, +>( + harness: &H, +) { + let replica = ReplicaId::new(b"ooo"); + let second = Operation::insert( + &replica, + 2, + 2, + NodeId::ROOT, + node(2), + order_key_from_position(1), + ); + let first = Operation::insert( + &replica, + 1, + 1, + NodeId::ROOT, + node(1), + order_key_from_position(0), + ); + + harness.append_ops(&[second]); + let affected = harness.append_ops_with_affected_nodes(slice::from_ref(&first)); + assert_eq!(affected, vec![NodeId::ROOT, node(1), node(2)]); + assert_replay_cleared(harness); + assert_eq!(harness.head_seq(), 2); + assert_eq!( + harness.visible_children(NodeId::ROOT), + vec![node(1), node(2)] + ); + assert_eq!(harness.op_ref_counters_for_parent(NodeId::ROOT), vec![1, 2]); +} + +pub fn out_of_order_losing_payload_skips_replay_frontier( + harness: &H, +) { + let replica = ReplicaId::new(b"payload-shortcut"); + let payload_node = node(7); + let insert = Operation::insert( + &replica, + 1, + 1, + NodeId::ROOT, + payload_node, + order_key_from_position(0), + ); + let winning_payload = Operation::set_payload(&replica, 3, 3, payload_node, vec![9]); + let losing_payload = Operation::set_payload(&replica, 2, 2, payload_node, vec![4]); + + harness.append_ops(&[insert, winning_payload]); + let affected = harness.append_ops_with_affected_nodes(slice::from_ref(&losing_payload)); + assert_eq!(affected, vec![payload_node]); + assert_replay_cleared(harness); + assert_eq!(harness.head_seq(), 3); + assert_eq!(harness.payload(payload_node), Some(vec![9])); +} + +pub fn out_of_order_move_with_later_payload_catches_up_immediately< + H: MaterializationConformanceHarness, +>( + harness: &H, +) { + let replica = ReplicaId::new(b"mixed-move"); + let p1 = node(1); + let p2 = node(2); + let child = node(3); + let insert_p1 = Operation::insert(&replica, 1, 1, NodeId::ROOT, p1, order_key_from_position(0)); + let insert_p2 = Operation::insert(&replica, 2, 2, NodeId::ROOT, p2, order_key_from_position(1)); + let insert_child = 
Operation::insert(&replica, 3, 3, p1, child, order_key_from_position(0)); + let earlier_payload = Operation::set_payload(&replica, 5, 5, child, vec![7]); + let out_of_order_move = + Operation::move_node(&replica, 4, 4, child, p2, order_key_from_position(0)); + let later_payload = Operation::set_payload(&replica, 6, 6, child, vec![9]); + + harness.append_ops(&[insert_p1, insert_p2, insert_child, earlier_payload]); + let affected = harness.append_ops_with_affected_nodes(&[later_payload, out_of_order_move]); + assert_eq!(affected, vec![p1, p2, child]); + assert_replay_cleared(harness); + assert_eq!(harness.head_seq(), 6); + assert_eq!(harness.visible_children(p1), Vec::::new()); + assert_eq!(harness.visible_children(p2), vec![child]); + assert_eq!(harness.payload(child), Some(vec![9])); +} + +pub fn out_of_order_insert_and_move_before_head_catches_up_immediately< + H: MaterializationConformanceHarness, +>( + harness: &H, +) { + let replica = ReplicaId::new(b"mixed-insert"); + let p1 = node(1); + let p2 = node(2); + let child = node(3); + let insert_p1 = Operation::insert(&replica, 1, 1, NodeId::ROOT, p1, order_key_from_position(0)); + let insert_p2 = Operation::insert(&replica, 2, 2, NodeId::ROOT, p2, order_key_from_position(1)); + let unrelated_head = Operation::set_payload(&replica, 5, 5, p2, vec![4]); + let out_of_order_insert = + Operation::insert(&replica, 3, 3, p1, child, order_key_from_position(0)); + let out_of_order_move = + Operation::move_node(&replica, 4, 4, child, p2, order_key_from_position(0)); + + harness.append_ops(&[insert_p1, insert_p2, unrelated_head]); + let affected = + harness.append_ops_with_affected_nodes(&[out_of_order_move, out_of_order_insert]); + assert_eq!(affected, vec![p1, p2, child]); + assert_replay_cleared(harness); + assert_eq!(harness.head_seq(), 5); + assert_eq!(harness.visible_children(p1), Vec::::new()); + assert_eq!(harness.visible_children(p2), vec![child]); + assert_eq!(harness.payload(p2), Some(vec![4])); +} + +pub fn 
replay_from_start_frontier_catches_up_immediately( + harness: &H, +) { + let replica = ReplicaId::new(b"restart"); + let first = Operation::insert( + &replica, + 1, + 1, + NodeId::ROOT, + node(1), + order_key_from_position(0), + ); + let second = Operation::insert( + &replica, + 2, + 2, + NodeId::ROOT, + node(2), + order_key_from_position(1), + ); + + harness.append_ops(&[first]); + harness.force_replay_from_start(); + + let affected = harness.append_ops_with_affected_nodes(&[second]); + assert_eq!(affected, vec![NodeId::ROOT, node(1), node(2)]); + assert_replay_cleared(harness); + assert_eq!( + harness.visible_children(NodeId::ROOT), + vec![node(1), node(2)] + ); + assert_eq!(harness.head_seq(), 2); +} + +pub fn deferred_recovery_from_replay_frontier_catches_up_on_ensure< + H: MaterializationConformanceHarness, +>( + harness: &H, +) { + let replica = ReplicaId::new(b"ensure"); + let first = Operation::insert( + &replica, + 1, + 1, + NodeId::ROOT, + node(1), + order_key_from_position(0), + ); + let second = Operation::insert( + &replica, + 2, + 2, + NodeId::ROOT, + node(2), + order_key_from_position(1), + ); + + harness.append_ops(&[first, second]); + harness.force_replay_from_start(); + harness.ensure_materialized(); + + assert_replay_cleared(harness); + assert_eq!( + harness.visible_children(NodeId::ROOT), + vec![node(1), node(2)] + ); + assert_eq!(harness.head_seq(), 2); + assert_eq!(harness.op_ref_counters_for_parent(NodeId::ROOT), vec![1, 2]); +} From c2f8a686bae03c12839befa2ffae17e67480576c Mon Sep 17 00:00:00 2001 From: Marcus Pousette Date: Fri, 17 Apr 2026 10:00:59 +0200 Subject: [PATCH 02/13] feat: add partial rewind materialization experiment --- packages/treecrdt-core/src/lib.rs | 8 +- packages/treecrdt-core/src/materialization.rs | 351 ++++++++++- packages/treecrdt-core/src/tree.rs | 66 +++ packages/treecrdt-postgres-rs/src/schema.rs | 3 + packages/treecrdt-postgres-rs/src/store.rs | 189 +++++- .../src/extension/functions/materialize.rs | 75 ++- 
.../src/extension/functions/op_storage.rs | 561 +++++++++++------- .../src/extension/functions/schema.rs | 1 + 8 files changed, 995 insertions(+), 259 deletions(-) diff --git a/packages/treecrdt-core/src/lib.rs b/packages/treecrdt-core/src/lib.rs index dd3f1f7c..726c4b53 100644 --- a/packages/treecrdt-core/src/lib.rs +++ b/packages/treecrdt-core/src/lib.rs @@ -17,9 +17,11 @@ pub use ids::{Lamport, NodeId, OperationId, ReplicaId}; pub use materialization::{ apply_incremental_ops_with_delta, apply_persisted_remote_ops_with_delta, catch_up_materialized_state, materialize_persisted_remote_ops_with_delta, - try_shortcut_out_of_order_payload_noops, CatchUpResult, IncrementalApplyResult, - MaterializationCursor, MaterializationFrontier, MaterializationHead, MaterializationKey, - MaterializationState, PayloadNoopShortcut, PersistedRemoteApplyResult, PersistedRemoteStores, + try_partial_rewind_catch_up_materialized_state, try_shortcut_out_of_order_payload_noops, + CatchUpResult, FrontierRewindStorage, IncrementalApplyResult, MaterializationCursor, + MaterializationFrontier, MaterializationFrontierRef, MaterializationHead, + MaterializationHeadRef, MaterializationKey, MaterializationState, MaterializationStateRef, + PayloadNoopShortcut, PersistedRemoteApplyResult, PersistedRemoteStores, }; pub use ops::{cmp_op_key, cmp_ops, Operation, OperationKind, OperationMetadata}; pub use traits::{ diff --git a/packages/treecrdt-core/src/materialization.rs b/packages/treecrdt-core/src/materialization.rs index 19d1ceb7..0b259be8 100644 --- a/packages/treecrdt-core/src/materialization.rs +++ b/packages/treecrdt-core/src/materialization.rs @@ -4,7 +4,8 @@ use std::collections::{HashMap, HashSet}; use crate::ops::{cmp_op_key, cmp_ops, Operation}; use crate::traits::{ Clock, ExactNodeStore, ExactPayloadStore, LamportClock, MemoryNodeStore, MemoryPayloadStore, - NodeStore, NoopStorage, ParentOpIndex, PayloadStore, Storage, TruncatingParentOpIndex, + MemoryStorage, NodeStore, NoopStorage, 
ParentOpIndex, PayloadStore, Storage, + TruncatingParentOpIndex, }; use crate::tree::TreeCrdt; use crate::{Error, Lamport, NodeId, OperationId, ReplicaId, Result}; @@ -74,6 +75,86 @@ pub trait MaterializationCursor { fn state(&self) -> MaterializationStateRef<'_>; } +/// Optional storage hooks for partial rewind/replay of a frontier-invalidated suffix. +/// +/// The default implementations are intentionally naive and scan the full op log in memory. Real +/// storage backends should override these with ordered SQL lookups so the partial rewind fast path +/// only touches the invalidated suffix plus node-scoped predecessor queries. +pub trait FrontierRewindStorage: Storage { + fn scan_frontier_range( + &self, + start: &MaterializationFrontierRef<'_>, + end: Option<&MaterializationKey<&[u8]>>, + visit: &mut dyn FnMut(Operation) -> Result<()>, + ) -> Result<()> { + let mut ops = self.load_since(0)?; + ops.sort_by(cmp_ops); + for op in ops { + let frontier = frontier_from_op(&op); + if cmp_frontiers(&frontier, start) == Ordering::Less { + continue; + } + if let Some(end) = end { + if cmp_frontiers(&frontier, end) == Ordering::Greater { + continue; + } + } + visit(op)?; + } + Ok(()) + } + + fn latest_structural_before( + &self, + node: NodeId, + before: &MaterializationFrontierRef<'_>, + ) -> Result> { + let mut ops = self.load_since(0)?; + ops.sort_by(cmp_ops); + Ok(ops + .into_iter() + .filter(|op| { + let frontier = frontier_from_op(op); + cmp_frontiers(&frontier, before) == Ordering::Less + && matches!( + op.kind, + crate::ops::OperationKind::Insert { node: n, .. } + | crate::ops::OperationKind::Move { node: n, .. 
} + if n == node + ) + }) + .next_back()) + } + + fn latest_payload_before( + &self, + node: NodeId, + before: &MaterializationFrontierRef<'_>, + ) -> Result> { + let mut ops = self.load_since(0)?; + ops.sort_by(cmp_ops); + Ok(ops + .into_iter() + .filter(|op| { + let frontier = frontier_from_op(op); + cmp_frontiers(&frontier, before) == Ordering::Less + && match &op.kind { + crate::ops::OperationKind::Insert { + node: n, + payload, + .. + } => *n == node && payload.is_some(), + crate::ops::OperationKind::Payload { node: n, .. } => *n == node, + _ => false, + } + }) + .next_back()) + } +} + +impl FrontierRewindStorage for MemoryStorage {} +impl FrontierRewindStorage for NoopStorage {} + #[derive(Clone, Debug, Eq, PartialEq)] pub struct IncrementalApplyResult { pub head: Option, @@ -208,6 +289,152 @@ fn start_replay_frontier() -> MaterializationFrontier { } } +fn op_requires_full_replay(op: &Operation) -> bool { + matches!( + op.kind, + crate::ops::OperationKind::Delete { .. } | crate::ops::OperationKind::Tombstone { .. } + ) +} + +fn op_sets_payload(op: &Operation) -> bool { + match &op.kind { + crate::ops::OperationKind::Insert { payload, .. } => payload.is_some(), + crate::ops::OperationKind::Payload { .. } => true, + _ => false, + } +} + +fn payload_from_op(op: &Operation) -> Option>> { + match &op.kind { + crate::ops::OperationKind::Insert { payload, .. } => payload.clone().map(Some), + crate::ops::OperationKind::Payload { payload, .. } => Some(payload.clone()), + _ => None, + } +} + +fn clone_materialized_state( + nodes: &N, + payloads: &P, + replica_id: &ReplicaId, +) -> Result> { + let mut memory_nodes = MemoryNodeStore::default(); + let mut memory_payloads = MemoryPayloadStore::default(); + let mut attachments: Vec<(NodeId, NodeId, Vec)> = Vec::new(); + + for node in nodes.all_nodes()? { + memory_nodes.ensure_node(node)?; + if let Some(parent) = nodes.parent(node)? 
{ + let order_key = nodes.order_key(node)?.unwrap_or_default(); + attachments.push((node, parent, order_key)); + } + + memory_nodes.set_tombstone(node, nodes.tombstone(node)?)?; + let last_change = nodes.last_change(node)?; + memory_nodes.set_last_change_exact(node, &last_change)?; + let deleted_at = nodes.deleted_at(node)?; + memory_nodes.set_deleted_at_exact(node, deleted_at.as_ref())?; + + if let Some(writer) = payloads.last_writer(node)? { + memory_payloads.set_payload(node, payloads.payload(node)?, writer)?; + } + } + + for (node, parent, order_key) in attachments { + if node == NodeId::ROOT { + continue; + } + memory_nodes.attach(node, parent, order_key)?; + } + + TreeCrdt::with_stores( + replica_id.clone(), + NoopStorage, + LamportClock::default(), + memory_nodes, + memory_payloads, + ) +} + +fn rewind_structure_op( + scratch: &mut TreeCrdt, + storage: &S, + op: &Operation, +) -> Result<()> { + let node = op.kind.node(); + let previous = storage.latest_structural_before(node, &frontier_from_op(op).as_borrowed())?; + + let nodes = scratch.node_store_mut(); + nodes.ensure_node(node)?; + nodes.detach(node)?; + + match previous.as_ref().map(|prev| &prev.kind) { + Some(crate::ops::OperationKind::Insert { + parent, + order_key, + .. + }) => nodes.attach(node, *parent, order_key.clone())?, + Some(crate::ops::OperationKind::Move { + new_parent, + order_key, + .. 
+ }) => nodes.attach(node, *new_parent, order_key.clone())?, + Some(_) | None => {} + } + + Ok(()) +} + +fn rewind_payload_op( + scratch: &mut TreeCrdt, + storage: &S, + op: &Operation, +) -> Result<()> { + let node = op.kind.node(); + let previous = storage.latest_payload_before(node, &frontier_from_op(op).as_borrowed())?; + let payloads = scratch.payload_store_mut(); + + if let Some(previous) = previous { + let payload = payload_from_op(&previous) + .ok_or_else(|| Error::Storage("payload rewind expected payload-bearing op".into()))?; + payloads.set_payload( + node, + payload, + (previous.meta.lamport, previous.meta.id.clone()), + )?; + } else { + payloads.clear_payload(node)?; + } + + Ok(()) +} + +fn rewind_existing_suffix( + scratch: &mut TreeCrdt, + storage: &S, + existing_suffix_ops: &[Operation], +) -> Result<()> { + for op in existing_suffix_ops.iter().rev() { + match &op.kind { + crate::ops::OperationKind::Insert { .. } => { + if op_sets_payload(op) { + rewind_payload_op(scratch, storage, op)?; + } + rewind_structure_op(scratch, storage, op)?; + } + crate::ops::OperationKind::Move { .. } => rewind_structure_op(scratch, storage, op)?, + crate::ops::OperationKind::Payload { .. } => rewind_payload_op(scratch, storage, op)?, + crate::ops::OperationKind::Delete { .. } + | crate::ops::OperationKind::Tombstone { .. } => { + return Err(Error::Storage( + "delete/tombstone ops are not supported by partial rewind".into(), + )); + } + } + } + + Ok(()) +} + fn next_replay_frontier( meta: &M, inserted_ops: &[Operation], @@ -525,6 +752,128 @@ fn replay_frontier_in_memory( )) } +/// Try a partial rewind/replay catch-up for append-time out-of-order suffixes. +/// +/// This starts from the current materialized state, rewinds only the already-materialized suffix, +/// then replays the invalidated suffix in canonical order. It deliberately bails out for +/// delete/tombstone suffixes and for broader recovery cases. 
+pub fn try_partial_rewind_catch_up_materialized_state( + storage: &S, + inserted_op_ids: &HashSet, + stores: PersistedRemoteStores, + meta: &M, + mut flush_nodes: FlushNodes, + mut flush_index: FlushIndex, +) -> Result> +where + S: FrontierRewindStorage, + C: Clock, + N: ExactNodeStore, + P: ExactPayloadStore, + I: TruncatingParentOpIndex, + M: MaterializationCursor, + FlushNodes: FnMut(&mut N) -> Result<()>, + FlushIndex: FnMut(&mut I) -> Result<()>, +{ + let state = meta.state(); + let Some(head) = state.head.as_ref() else { + return Ok(None); + }; + let Some(frontier) = state.replay_from.as_ref() else { + return Ok(None); + }; + + if frontier.lamport == 0 && frontier.replica.is_empty() && frontier.counter == 0 { + return Ok(None); + } + if cmp_frontiers(frontier, &head.at) != Ordering::Less { + return Ok(None); + } + + let mut existing_suffix_ops = Vec::new(); + storage.scan_frontier_range(frontier, Some(&head.at), &mut |op| { + if !inserted_op_ids.contains(&op.meta.id) { + existing_suffix_ops.push(op); + } + Ok(()) + })?; + if existing_suffix_ops.is_empty() { + return Ok(None); + } + + let mut full_suffix_ops = Vec::new(); + storage.scan_frontier_range(frontier, None, &mut |op| { + full_suffix_ops.push(op); + Ok(()) + })?; + if full_suffix_ops.is_empty() || full_suffix_ops.iter().any(op_requires_full_replay) { + return Ok(None); + } + + let prefix_seq = head + .seq + .saturating_sub(existing_suffix_ops.len().min(u64::MAX as usize) as u64); + + let PersistedRemoteStores { + replica_id, + clock: _clock, + mut nodes, + mut payloads, + mut index, + } = stores; + + let mut scratch = PrefixSnapshot { + crdt: clone_materialized_state(&nodes, &payloads, &replica_id)?, + index: RecordingIndex::default(), + head: None, + seq: prefix_seq, + }; + + rewind_existing_suffix(&mut scratch.crdt, storage, &existing_suffix_ops)?; + + let mut affected = HashSet::new(); + let mut seq = prefix_seq; + let mut replay_head: Option = None; + for op in full_suffix_ops { + seq = 
seq.saturating_add(1); + let delta = scratch + .crdt + .apply_sorted_remote_with_materialization(op.clone(), &mut scratch.index, seq)?; + affected.extend(delta.affected_nodes); + replay_head = Some(op); + } + + scratch.head = replay_head; + scratch.seq = seq; + + let mut affected_nodes: Vec = affected.into_iter().collect(); + affected_nodes.sort(); + + patch_final_state_in_place( + &mut scratch, + prefix_seq, + &affected_nodes, + &mut nodes, + &mut payloads, + &mut index, + )?; + + flush_nodes(&mut nodes)?; + flush_index(&mut index)?; + + Ok(Some(CatchUpResult { + head: scratch.head.map(|head| MaterializationHead { + at: MaterializationKey { + lamport: head.meta.lamport, + replica: head.meta.id.replica.as_bytes().to_vec(), + counter: head.meta.id.counter, + }, + seq: scratch.seq, + }), + affected_nodes, + })) +} + fn patch_final_state_in_place( prefix: &mut PrefixSnapshot, prefix_seq: u64, diff --git a/packages/treecrdt-core/src/tree.rs b/packages/treecrdt-core/src/tree.rs index e3a67f2f..ecdf8380 100644 --- a/packages/treecrdt-core/src/tree.rs +++ b/packages/treecrdt-core/src/tree.rs @@ -563,6 +563,68 @@ where Ok(applied) } + /// Apply a canonically sorted remote op directly against the current materialized state. + /// + /// This skips storage persistence and out-of-order detection, and is intended for callers + /// that already reconstructed/rewound state to the correct prefix and now need to replay a + /// suffix in canonical op-key order. + pub fn apply_sorted_remote_with_materialization( + &mut self, + op: Operation, + index: &mut I, + seq: u64, + ) -> Result { + self.clock.observe(op.meta.lamport); + self.version_vector.observe(&op.meta.id.replica, op.meta.id.counter); + if op.meta.id.replica == self.replica_id { + self.counter = self.counter.max(op.meta.id.counter); + } + + let op_node = op.kind.node(); + let parent_after = match &op.kind { + OperationKind::Insert { parent, .. } => Some(*parent), + OperationKind::Move { new_parent, .. 
} => Some(*new_parent), + _ => None, + }; + let op_id = op.meta.id.clone(); + let op_kind = op.kind.clone(); + + let snapshot = Self::apply_forward(&mut self.nodes, &mut self.payloads, &op)?; + self.op_count = seq; + self.head = Some(op.clone()); + + let parents = affected_parents(snapshot.parent, &op_kind); + for parent in &parents { + if *parent == NodeId::TRASH { + continue; + } + index.record(*parent, &op_id, seq)?; + } + + if let Some(parent_after) = parent_after { + if parent_after != NodeId::TRASH && snapshot.parent != Some(parent_after) { + if let Some((_lamport, payload_id)) = self.payload_last_writer(op_node)? { + index.record(parent_after, &payload_id, seq)?; + } + } + } + + let mut affected_nodes = direct_affected_nodes(snapshot.parent, &op_kind); + let mut starts = parents; + starts.push(op_node); + let tombstone_changed = self.refresh_tombstones_upward_with_delta(starts)?; + affected_nodes.extend(tombstone_changed.into_iter().filter(|node| *node != NodeId::TRASH)); + affected_nodes = sorted_node_ids(affected_nodes); + + Ok(ApplyDelta { + snapshot: NodeSnapshotExport { + parent: snapshot.parent, + order_key: snapshot.order_key, + }, + affected_nodes, + }) + } + /// Finalize adapter-owned local ops by refreshing tombstones and recording parent-op index rows. /// /// This is intended for adapters that execute local operations directly against core and then @@ -905,6 +967,10 @@ where &mut self.nodes } + pub(crate) fn payload_store_mut(&mut self) -> &mut P { + &mut self.payloads + } + pub fn validate_invariants(&self) -> Result<()> { for pid in self.nodes.all_nodes()? 
{ let pchildren = self.nodes.children(pid)?; diff --git a/packages/treecrdt-postgres-rs/src/schema.rs b/packages/treecrdt-postgres-rs/src/schema.rs index 0f80e605..4eda95ca 100644 --- a/packages/treecrdt-postgres-rs/src/schema.rs +++ b/packages/treecrdt-postgres-rs/src/schema.rs @@ -24,6 +24,9 @@ CREATE TABLE IF NOT EXISTS treecrdt_ops ( CREATE INDEX IF NOT EXISTS idx_treecrdt_ops_doc_order ON treecrdt_ops (doc_id, lamport, replica, counter); +CREATE INDEX IF NOT EXISTS idx_treecrdt_ops_doc_node_kind_order + ON treecrdt_ops (doc_id, node, kind, lamport, replica, counter); + CREATE TABLE IF NOT EXISTS treecrdt_meta ( doc_id TEXT PRIMARY KEY, head_lamport BIGINT NOT NULL DEFAULT 0, diff --git a/packages/treecrdt-postgres-rs/src/store.rs b/packages/treecrdt-postgres-rs/src/store.rs index ecd3debc..91b03bb9 100644 --- a/packages/treecrdt-postgres-rs/src/store.rs +++ b/packages/treecrdt-postgres-rs/src/store.rs @@ -7,8 +7,9 @@ use postgres::{Client, Row, Statement}; use treecrdt_core::{ apply_persisted_remote_ops_with_delta, catch_up_materialized_state, - materialize_persisted_remote_ops_with_delta, try_shortcut_out_of_order_payload_noops, Error, - ExactNodeStore, ExactPayloadStore, Lamport, LamportClock, MaterializationCursor, + materialize_persisted_remote_ops_with_delta, try_partial_rewind_catch_up_materialized_state, + try_shortcut_out_of_order_payload_noops, Error, ExactNodeStore, ExactPayloadStore, + FrontierRewindStorage, Lamport, LamportClock, MaterializationCursor, MaterializationFrontier, MaterializationHead, MaterializationKey, MaterializationState, NodeId, NodeStore, Operation, OperationId, OperationKind, PayloadStore, PersistedRemoteStores, ReplicaId, Result, Storage, TruncatingParentOpIndex, VersionVector, @@ -1215,6 +1216,130 @@ impl Storage for PgOpStorage { } } +impl FrontierRewindStorage for PgOpStorage { + fn scan_frontier_range( + &self, + start: &treecrdt_core::MaterializationFrontierRef<'_>, + end: Option<&treecrdt_core::MaterializationKey<&[u8]>>, 
+ visit: &mut dyn FnMut(Operation) -> Result<()>, + ) -> Result<()> { + let mut c = self.ctx.client.borrow_mut(); + let rows = if let Some(end) = end { + let stmt = self.ctx.stmt( + &mut c, + "SELECT lamport, replica, counter, kind, parent, node, new_parent, order_key, payload, known_state \ + FROM treecrdt_ops \ + WHERE doc_id = $1 \ + AND (lamport > $2 OR (lamport = $2 AND (replica > $3 OR (replica = $3 AND counter >= $4)))) \ + AND (lamport < $5 OR (lamport = $5 AND (replica < $6 OR (replica = $6 AND counter <= $7)))) \ + ORDER BY lamport, replica, counter", + )?; + c.query( + &stmt, + &[ + &self.ctx.doc_id, + &(start.lamport as i64), + &start.replica, + &(start.counter as i64), + &(end.lamport as i64), + &end.replica, + &(end.counter as i64), + ], + ) + .map_err(storage_debug)? + } else { + let stmt = self.ctx.stmt( + &mut c, + "SELECT lamport, replica, counter, kind, parent, node, new_parent, order_key, payload, known_state \ + FROM treecrdt_ops \ + WHERE doc_id = $1 \ + AND (lamport > $2 OR (lamport = $2 AND (replica > $3 OR (replica = $3 AND counter >= $4)))) \ + ORDER BY lamport, replica, counter", + )?; + c.query( + &stmt, + &[ + &self.ctx.doc_id, + &(start.lamport as i64), + &start.replica, + &(start.counter as i64), + ], + ) + .map_err(storage_debug)? 
+ }; + + drop(c); + for row in rows { + visit(row_to_op(row)?)?; + } + Ok(()) + } + + fn latest_structural_before( + &self, + node: NodeId, + before: &treecrdt_core::MaterializationFrontierRef<'_>, + ) -> Result> { + let mut c = self.ctx.client.borrow_mut(); + let stmt = self.ctx.stmt( + &mut c, + "SELECT lamport, replica, counter, kind, parent, node, new_parent, order_key, payload, known_state \ + FROM treecrdt_ops \ + WHERE doc_id = $1 \ + AND node = $2 \ + AND kind IN ('insert', 'move') \ + AND (lamport < $3 OR (lamport = $3 AND (replica < $4 OR (replica = $4 AND counter < $5)))) \ + ORDER BY lamport DESC, replica DESC, counter DESC \ + LIMIT 1", + )?; + let rows = c + .query( + &stmt, + &[ + &self.ctx.doc_id, + &node_to_bytes(node).to_vec(), + &(before.lamport as i64), + &before.replica, + &(before.counter as i64), + ], + ) + .map_err(storage_debug)?; + rows.first().cloned().map(row_to_op).transpose() + } + + fn latest_payload_before( + &self, + node: NodeId, + before: &treecrdt_core::MaterializationFrontierRef<'_>, + ) -> Result> { + let mut c = self.ctx.client.borrow_mut(); + let stmt = self.ctx.stmt( + &mut c, + "SELECT lamport, replica, counter, kind, parent, node, new_parent, order_key, payload, known_state \ + FROM treecrdt_ops \ + WHERE doc_id = $1 \ + AND node = $2 \ + AND (kind = 'payload' OR (kind = 'insert' AND payload IS NOT NULL)) \ + AND (lamport < $3 OR (lamport = $3 AND (replica < $4 OR (replica = $4 AND counter < $5)))) \ + ORDER BY lamport DESC, replica DESC, counter DESC \ + LIMIT 1", + )?; + let rows = c + .query( + &stmt, + &[ + &self.ctx.doc_id, + &node_to_bytes(node).to_vec(), + &(before.lamport as i64), + &before.replica, + &(before.counter as i64), + ], + ) + .map_err(storage_debug)?; + rows.first().cloned().map(row_to_op).transpose() + } +} + pub(crate) fn row_to_op(row: Row) -> Result { let lamport = row.get::<_, i64>(0).max(0) as Lamport; let replica: Vec = row.get(1); @@ -1713,6 +1838,8 @@ fn append_ops_in_tx( // Only materialize 
the ops Postgres actually inserted. This keeps duplicate opRefs in the // input batch from being replayed twice through core materialization. let inserted_ops = select_inserted_ops(&ctx, ops, inserted_op_refs); + let inserted_op_ids: HashSet = + inserted_ops.iter().map(|op| op.meta.id.clone()).collect(); if let Some(profile) = &append_profile { profile.borrow_mut().dedupe_filter_ms += dedupe_filter_started_at.elapsed().as_secs_f64() * 1000.0; @@ -1770,19 +1897,51 @@ fn append_ops_in_tx( }; let apply_result = if apply_result.catch_up_needed { let refreshed_meta = load_tree_meta_for_update(client, doc_id)?; - let catch_up = catch_up_materialized_state( - PgOpStorage::new(ctx.clone()), - PersistedRemoteStores { - replica_id: ReplicaId::new(b"postgres"), - clock: LamportClock::default(), - nodes: PgNodeStore::new(ctx.clone()), - payloads: PgPayloadStore::new(ctx.clone()), - index: PgParentOpIndex::new(ctx.clone()), - }, - &refreshed_meta, - |nodes| nodes.flush_last_change(), - |index| index.flush(), - )?; + let catch_up = if meta.state().replay_from.is_none() { + try_partial_rewind_catch_up_materialized_state( + &PgOpStorage::new(ctx.clone()), + &inserted_op_ids, + PersistedRemoteStores { + replica_id: ReplicaId::new(b"postgres"), + clock: LamportClock::default(), + nodes: PgNodeStore::new(ctx.clone()), + payloads: PgPayloadStore::new(ctx.clone()), + index: PgParentOpIndex::new(ctx.clone()), + }, + &refreshed_meta, + |nodes| nodes.flush_last_change(), + |index| index.flush(), + )? 
+ .unwrap_or( + catch_up_materialized_state( + PgOpStorage::new(ctx.clone()), + PersistedRemoteStores { + replica_id: ReplicaId::new(b"postgres"), + clock: LamportClock::default(), + nodes: PgNodeStore::new(ctx.clone()), + payloads: PgPayloadStore::new(ctx.clone()), + index: PgParentOpIndex::new(ctx.clone()), + }, + &refreshed_meta, + |nodes| nodes.flush_last_change(), + |index| index.flush(), + )?, + ) + } else { + catch_up_materialized_state( + PgOpStorage::new(ctx.clone()), + PersistedRemoteStores { + replica_id: ReplicaId::new(b"postgres"), + clock: LamportClock::default(), + nodes: PgNodeStore::new(ctx.clone()), + payloads: PgPayloadStore::new(ctx.clone()), + index: PgParentOpIndex::new(ctx.clone()), + }, + &refreshed_meta, + |nodes| nodes.flush_last_change(), + |index| index.flush(), + )? + }; update_head( catch_up .head diff --git a/packages/treecrdt-sqlite-ext/src/extension/functions/materialize.rs b/packages/treecrdt-sqlite-ext/src/extension/functions/materialize.rs index 062638da..ceffdc23 100644 --- a/packages/treecrdt-sqlite-ext/src/extension/functions/materialize.rs +++ b/packages/treecrdt-sqlite-ext/src/extension/functions/materialize.rs @@ -5,9 +5,12 @@ use super::payload_store::SqlitePayloadStore; use super::schema::{set_tree_meta_replay_frontier, tree_meta_from_state}; use super::util::sqlite_err_from_core; use super::*; +use std::collections::HashSet; use treecrdt_core::PayloadStore; use treecrdt_core::Storage; -use treecrdt_core::{LamportClock, MaterializationCursor, ReplicaId}; +use treecrdt_core::{ + try_partial_rewind_catch_up_materialized_state, LamportClock, MaterializationCursor, ReplicaId, +}; fn merge_affected_nodes(mut left: Vec, right: Vec) -> Vec { left.extend(right); @@ -271,6 +274,8 @@ pub(super) fn append_ops_impl( inserted_ops.push(operation); } } + let inserted_op_ids: HashSet = + inserted_ops.iter().map(|op| op.meta.id.clone()).collect(); let apply_result = if let Some(shortcut) = { let payloads = 
SqlitePayloadStore::prepare(db).map_err(|_| SQLITE_ERROR as c_int)?; @@ -316,21 +321,59 @@ pub(super) fn append_ops_impl( }; let apply_result = if apply_result.catch_up_needed { let refreshed_meta = load_tree_meta(db)?; - let catch_up = treecrdt_core::catch_up_materialized_state( - super::op_storage::SqliteOpStorage::with_doc_id(db, doc_id.to_vec()), - treecrdt_core::PersistedRemoteStores { - replica_id: ReplicaId::new(b"sqlite-ext"), - clock: LamportClock::default(), - nodes: SqliteNodeStore::prepare(db).map_err(|_| SQLITE_ERROR as c_int)?, - payloads: SqlitePayloadStore::prepare(db).map_err(|_| SQLITE_ERROR as c_int)?, - index: SqliteParentOpIndex::prepare(db, doc_id.to_vec()) - .map_err(|_| SQLITE_ERROR as c_int)?, - }, - &refreshed_meta, - |_| Ok(()), - |_| Ok(()), - ) - .map_err(|_| SQLITE_ERROR as c_int)?; + let catch_up = if meta.state().replay_from.is_none() { + try_partial_rewind_catch_up_materialized_state( + &super::op_storage::SqliteOpStorage::with_doc_id(db, doc_id.to_vec()), + &inserted_op_ids, + treecrdt_core::PersistedRemoteStores { + replica_id: ReplicaId::new(b"sqlite-ext"), + clock: LamportClock::default(), + nodes: SqliteNodeStore::prepare(db).map_err(|_| SQLITE_ERROR as c_int)?, + payloads: SqlitePayloadStore::prepare(db).map_err(|_| SQLITE_ERROR as c_int)?, + index: SqliteParentOpIndex::prepare(db, doc_id.to_vec()) + .map_err(|_| SQLITE_ERROR as c_int)?, + }, + &refreshed_meta, + |_| Ok(()), + |_| Ok(()), + ) + .map_err(|_| SQLITE_ERROR as c_int)? 
+ .unwrap_or( + treecrdt_core::catch_up_materialized_state( + super::op_storage::SqliteOpStorage::with_doc_id(db, doc_id.to_vec()), + treecrdt_core::PersistedRemoteStores { + replica_id: ReplicaId::new(b"sqlite-ext"), + clock: LamportClock::default(), + nodes: SqliteNodeStore::prepare(db) + .map_err(|_| SQLITE_ERROR as c_int)?, + payloads: SqlitePayloadStore::prepare(db) + .map_err(|_| SQLITE_ERROR as c_int)?, + index: SqliteParentOpIndex::prepare(db, doc_id.to_vec()) + .map_err(|_| SQLITE_ERROR as c_int)?, + }, + &refreshed_meta, + |_| Ok(()), + |_| Ok(()), + ) + .map_err(|_| SQLITE_ERROR as c_int)?, + ) + } else { + treecrdt_core::catch_up_materialized_state( + super::op_storage::SqliteOpStorage::with_doc_id(db, doc_id.to_vec()), + treecrdt_core::PersistedRemoteStores { + replica_id: ReplicaId::new(b"sqlite-ext"), + clock: LamportClock::default(), + nodes: SqliteNodeStore::prepare(db).map_err(|_| SQLITE_ERROR as c_int)?, + payloads: SqlitePayloadStore::prepare(db).map_err(|_| SQLITE_ERROR as c_int)?, + index: SqliteParentOpIndex::prepare(db, doc_id.to_vec()) + .map_err(|_| SQLITE_ERROR as c_int)?, + }, + &refreshed_meta, + |_| Ok(()), + |_| Ok(()), + ) + .map_err(|_| SQLITE_ERROR as c_int)? 
+ }; update_tree_meta_head( db, Some(catch_up.head.as_ref().ok_or(SQLITE_ERROR as c_int)?), diff --git a/packages/treecrdt-sqlite-ext/src/extension/functions/op_storage.rs b/packages/treecrdt-sqlite-ext/src/extension/functions/op_storage.rs index 5aa9fc29..99384bea 100644 --- a/packages/treecrdt-sqlite-ext/src/extension/functions/op_storage.rs +++ b/packages/treecrdt-sqlite-ext/src/extension/functions/op_storage.rs @@ -20,6 +20,122 @@ fn sqlite_node_id_bytes(node: NodeId) -> [u8; 16] { node.0.to_be_bytes() } +fn read_operation_row(stmt: *mut sqlite3_stmt) -> treecrdt_core::Result { + let replica_ptr = unsafe { sqlite_column_blob(stmt, 0) } as *const u8; + let replica_len = unsafe { sqlite_column_bytes(stmt, 0) } as usize; + if replica_ptr.is_null() { + return Err(sqlite_rc_error( + SQLITE_ERROR as c_int, + "replica missing from op row", + )); + } + let replica = unsafe { slice::from_raw_parts(replica_ptr, replica_len) }.to_vec(); + let counter = unsafe { sqlite_column_int64(stmt, 1).max(0) as u64 }; + let lamport_val = unsafe { sqlite_column_int64(stmt, 2).max(0) as Lamport }; + + let kind_ptr = unsafe { sqlite_column_text(stmt, 3) } as *const u8; + let kind_len = unsafe { sqlite_column_bytes(stmt, 3) } as usize; + let kind = if kind_ptr.is_null() { + "" + } else { + std::str::from_utf8(unsafe { slice::from_raw_parts(kind_ptr, kind_len) }).unwrap_or("") + }; + + let parent = + unsafe { column_blob16(stmt, 4) }.map_err(|rc| sqlite_rc_error(rc, "read parent failed"))?; + let node = unsafe { column_blob16(stmt, 5) } + .map_err(|rc| sqlite_rc_error(rc, "read node failed"))? 
+ .ok_or_else(|| sqlite_rc_error(SQLITE_ERROR as c_int, "node missing"))?; + let new_parent = unsafe { column_blob16(stmt, 6) } + .map_err(|rc| sqlite_rc_error(rc, "read new_parent failed"))?; + let order_key = if unsafe { sqlite_column_type(stmt, 7) } == SQLITE_NULL as c_int { + Vec::new() + } else { + let ptr = unsafe { sqlite_column_blob(stmt, 7) } as *const u8; + let len = unsafe { sqlite_column_bytes(stmt, 7) } as usize; + if ptr.is_null() { + Vec::new() + } else { + unsafe { slice::from_raw_parts(ptr, len) }.to_vec() + } + }; + + let known_state = if unsafe { sqlite_column_type(stmt, 8) } == SQLITE_NULL as c_int { + None + } else { + let ptr = unsafe { sqlite_column_blob(stmt, 8) } as *const u8; + let len = unsafe { sqlite_column_bytes(stmt, 8) } as usize; + if ptr.is_null() || len == 0 { + None + } else { + Some(vv_from_bytes(unsafe { slice::from_raw_parts(ptr, len) })?) + } + }; + + let payload = if unsafe { sqlite_column_type(stmt, 9) } == SQLITE_NULL as c_int { + None + } else { + let ptr = unsafe { sqlite_column_blob(stmt, 9) } as *const u8; + let len = unsafe { sqlite_column_bytes(stmt, 9) } as usize; + if ptr.is_null() { + None + } else { + Some(unsafe { slice::from_raw_parts(ptr, len) }.to_vec()) + } + }; + + let op_kind = match kind { + "insert" => { + let parent = parent + .ok_or_else(|| sqlite_rc_error(SQLITE_ERROR as c_int, "insert missing parent"))?; + treecrdt_core::OperationKind::Insert { + parent: sqlite_bytes_to_node_id(parent), + node: sqlite_bytes_to_node_id(node), + order_key, + payload, + } + } + "move" => { + let new_parent = new_parent.ok_or_else(|| { + sqlite_rc_error(SQLITE_ERROR as c_int, "move missing new_parent") + })?; + treecrdt_core::OperationKind::Move { + node: sqlite_bytes_to_node_id(node), + new_parent: sqlite_bytes_to_node_id(new_parent), + order_key, + } + } + "delete" => treecrdt_core::OperationKind::Delete { + node: sqlite_bytes_to_node_id(node), + }, + "tombstone" => treecrdt_core::OperationKind::Tombstone { + node: 
sqlite_bytes_to_node_id(node), + }, + "payload" => treecrdt_core::OperationKind::Payload { + node: sqlite_bytes_to_node_id(node), + payload, + }, + _ => { + return Err(sqlite_rc_error( + SQLITE_ERROR as c_int, + "unknown op kind in row", + )); + } + }; + + Ok(treecrdt_core::Operation { + meta: treecrdt_core::OperationMetadata { + id: treecrdt_core::OperationId { + replica: treecrdt_core::ReplicaId(replica), + counter, + }, + lamport: lamport_val, + known_state, + }, + kind: op_kind, + }) +} + pub(super) struct SqliteOpStorage { db: *mut sqlite3, doc_id: Option>, @@ -256,117 +372,7 @@ impl treecrdt_core::Storage for SqliteOpStorage { loop { let step_rc = unsafe { sqlite_step(stmt) }; if step_rc == SQLITE_ROW as c_int { - let replica_ptr = unsafe { sqlite_column_blob(stmt, 0) } as *const u8; - let replica_len = unsafe { sqlite_column_bytes(stmt, 0) } as usize; - if replica_ptr.is_null() { - continue; - } - let replica = unsafe { slice::from_raw_parts(replica_ptr, replica_len) }.to_vec(); - let counter = unsafe { sqlite_column_int64(stmt, 1).max(0) as u64 }; - let lamport_val = unsafe { sqlite_column_int64(stmt, 2).max(0) as Lamport }; - - let kind_ptr = unsafe { sqlite_column_text(stmt, 3) } as *const u8; - let kind_len = unsafe { sqlite_column_bytes(stmt, 3) } as usize; - let kind = if kind_ptr.is_null() { - "" - } else { - std::str::from_utf8(unsafe { slice::from_raw_parts(kind_ptr, kind_len) }) - .unwrap_or("") - }; - - let parent = unsafe { column_blob16(stmt, 4) } - .map_err(|rc| sqlite_rc_error(rc, "read parent failed"))?; - let node = unsafe { column_blob16(stmt, 5) } - .map_err(|rc| sqlite_rc_error(rc, "read node failed"))? 
- .ok_or_else(|| sqlite_rc_error(SQLITE_ERROR as c_int, "node missing"))?; - let new_parent = unsafe { column_blob16(stmt, 6) } - .map_err(|rc| sqlite_rc_error(rc, "read new_parent failed"))?; - let order_key = if unsafe { sqlite_column_type(stmt, 7) } == SQLITE_NULL as c_int { - Vec::new() - } else { - let ptr = unsafe { sqlite_column_blob(stmt, 7) } as *const u8; - let len = unsafe { sqlite_column_bytes(stmt, 7) } as usize; - if ptr.is_null() { - Vec::new() - } else { - unsafe { slice::from_raw_parts(ptr, len) }.to_vec() - } - }; - - let known_state = if unsafe { sqlite_column_type(stmt, 8) } == SQLITE_NULL as c_int - { - None - } else { - let ptr = unsafe { sqlite_column_blob(stmt, 8) } as *const u8; - let len = unsafe { sqlite_column_bytes(stmt, 8) } as usize; - if ptr.is_null() || len == 0 { - None - } else { - Some(vv_from_bytes(unsafe { slice::from_raw_parts(ptr, len) })?) - } - }; - - let payload = if unsafe { sqlite_column_type(stmt, 9) } == SQLITE_NULL as c_int { - None - } else { - let ptr = unsafe { sqlite_column_blob(stmt, 9) } as *const u8; - let len = unsafe { sqlite_column_bytes(stmt, 9) } as usize; - if ptr.is_null() { - None - } else { - Some(unsafe { slice::from_raw_parts(ptr, len) }.to_vec()) - } - }; - - let op_kind = match kind { - "insert" => { - let parent = parent.ok_or_else(|| { - sqlite_rc_error(SQLITE_ERROR as c_int, "insert missing parent") - })?; - treecrdt_core::OperationKind::Insert { - parent: sqlite_bytes_to_node_id(parent), - node: sqlite_bytes_to_node_id(node), - order_key, - payload, - } - } - "move" => { - let new_parent = new_parent.ok_or_else(|| { - sqlite_rc_error(SQLITE_ERROR as c_int, "move missing new_parent") - })?; - treecrdt_core::OperationKind::Move { - node: sqlite_bytes_to_node_id(node), - new_parent: sqlite_bytes_to_node_id(new_parent), - order_key, - } - } - "delete" => treecrdt_core::OperationKind::Delete { - node: sqlite_bytes_to_node_id(node), - }, - "tombstone" => treecrdt_core::OperationKind::Tombstone { - 
node: sqlite_bytes_to_node_id(node), - }, - "payload" => treecrdt_core::OperationKind::Payload { - node: sqlite_bytes_to_node_id(node), - payload, - }, - _ => { - unsafe { sqlite_finalize(stmt) }; - return Err(sqlite_rc_error(SQLITE_ERROR as c_int, "unknown op kind")); - } - }; - - out.push(treecrdt_core::Operation { - meta: treecrdt_core::OperationMetadata { - id: treecrdt_core::OperationId { - replica: treecrdt_core::ReplicaId(replica), - counter, - }, - lamport: lamport_val, - known_state, - }, - kind: op_kind, - }); + out.push(read_operation_row(stmt)?); } else if step_rc == SQLITE_DONE as c_int { break; } else { @@ -408,119 +414,7 @@ impl treecrdt_core::Storage for SqliteOpStorage { loop { let step_rc = unsafe { sqlite_step(stmt) }; if step_rc == SQLITE_ROW as c_int { - let replica_ptr = unsafe { sqlite_column_blob(stmt, 0) } as *const u8; - let replica_len = unsafe { sqlite_column_bytes(stmt, 0) } as usize; - if replica_ptr.is_null() { - continue; - } - let replica = unsafe { slice::from_raw_parts(replica_ptr, replica_len) }.to_vec(); - let counter = unsafe { sqlite_column_int64(stmt, 1).max(0) as u64 }; - let lamport_val = unsafe { sqlite_column_int64(stmt, 2).max(0) as Lamport }; - - let kind_ptr = unsafe { sqlite_column_text(stmt, 3) } as *const u8; - let kind_len = unsafe { sqlite_column_bytes(stmt, 3) } as usize; - let kind = if kind_ptr.is_null() { - "" - } else { - std::str::from_utf8(unsafe { slice::from_raw_parts(kind_ptr, kind_len) }) - .unwrap_or("") - }; - - let parent = unsafe { column_blob16(stmt, 4) } - .map_err(|rc| sqlite_rc_error(rc, "read parent failed"))?; - let node = unsafe { column_blob16(stmt, 5) } - .map_err(|rc| sqlite_rc_error(rc, "read node failed"))? 
- .ok_or_else(|| sqlite_rc_error(SQLITE_ERROR as c_int, "node missing"))?; - let new_parent = unsafe { column_blob16(stmt, 6) } - .map_err(|rc| sqlite_rc_error(rc, "read new_parent failed"))?; - let order_key = if unsafe { sqlite_column_type(stmt, 7) } == SQLITE_NULL as c_int { - Vec::new() - } else { - let ptr = unsafe { sqlite_column_blob(stmt, 7) } as *const u8; - let len = unsafe { sqlite_column_bytes(stmt, 7) } as usize; - if ptr.is_null() { - Vec::new() - } else { - unsafe { slice::from_raw_parts(ptr, len) }.to_vec() - } - }; - - let known_state = if unsafe { sqlite_column_type(stmt, 8) } == SQLITE_NULL as c_int - { - None - } else { - let ptr = unsafe { sqlite_column_blob(stmt, 8) } as *const u8; - let len = unsafe { sqlite_column_bytes(stmt, 8) } as usize; - if ptr.is_null() || len == 0 { - None - } else { - Some(vv_from_bytes(unsafe { slice::from_raw_parts(ptr, len) })?) - } - }; - - let payload = if unsafe { sqlite_column_type(stmt, 9) } == SQLITE_NULL as c_int { - None - } else { - let ptr = unsafe { sqlite_column_blob(stmt, 9) } as *const u8; - let len = unsafe { sqlite_column_bytes(stmt, 9) } as usize; - if ptr.is_null() { - None - } else { - Some(unsafe { slice::from_raw_parts(ptr, len) }.to_vec()) - } - }; - - let op_kind = match kind { - "insert" => { - let parent = parent.ok_or_else(|| { - sqlite_rc_error(SQLITE_ERROR as c_int, "insert missing parent") - })?; - treecrdt_core::OperationKind::Insert { - parent: sqlite_bytes_to_node_id(parent), - node: sqlite_bytes_to_node_id(node), - order_key, - payload, - } - } - "move" => { - let new_parent = new_parent.ok_or_else(|| { - sqlite_rc_error(SQLITE_ERROR as c_int, "move missing new_parent") - })?; - treecrdt_core::OperationKind::Move { - node: sqlite_bytes_to_node_id(node), - new_parent: sqlite_bytes_to_node_id(new_parent), - order_key, - } - } - "delete" => treecrdt_core::OperationKind::Delete { - node: sqlite_bytes_to_node_id(node), - }, - "tombstone" => treecrdt_core::OperationKind::Tombstone { - 
node: sqlite_bytes_to_node_id(node), - }, - "payload" => treecrdt_core::OperationKind::Payload { - node: sqlite_bytes_to_node_id(node), - payload, - }, - _ => { - unsafe { sqlite_finalize(stmt) }; - return Err(sqlite_rc_error(SQLITE_ERROR as c_int, "unknown op kind")); - } - }; - - let op = treecrdt_core::Operation { - meta: treecrdt_core::OperationMetadata { - id: treecrdt_core::OperationId { - replica: treecrdt_core::ReplicaId(replica), - counter, - }, - lamport: lamport_val, - known_state, - }, - kind: op_kind, - }; - - if let Err(err) = visit(op) { + if let Err(err) = visit(read_operation_row(stmt)?) { unsafe { sqlite_finalize(stmt) }; return Err(err); } @@ -595,3 +489,222 @@ impl treecrdt_core::Storage for SqliteOpStorage { Ok(val) } } + +impl treecrdt_core::FrontierRewindStorage for SqliteOpStorage { + fn scan_frontier_range( + &self, + start: &treecrdt_core::MaterializationFrontierRef<'_>, + end: Option<&treecrdt_core::MaterializationKey<&[u8]>>, + visit: &mut dyn FnMut(treecrdt_core::Operation) -> treecrdt_core::Result<()>, + ) -> treecrdt_core::Result<()> { + let sql = if end.is_some() { + CString::new( + "SELECT replica,counter,lamport,kind,parent,node,new_parent,order_key,known_state,payload \ + FROM ops \ + WHERE (lamport > ?1 OR (lamport = ?1 AND (replica > ?2 OR (replica = ?2 AND counter >= ?3)))) \ + AND (lamport < ?4 OR (lamport = ?4 AND (replica < ?5 OR (replica = ?5 AND counter <= ?6)))) \ + ORDER BY lamport, replica, counter", + ) + .expect("scan frontier range bounded sql") + } else { + CString::new( + "SELECT replica,counter,lamport,kind,parent,node,new_parent,order_key,known_state,payload \ + FROM ops \ + WHERE (lamport > ?1 OR (lamport = ?1 AND (replica > ?2 OR (replica = ?2 AND counter >= ?3)))) \ + ORDER BY lamport, replica, counter", + ) + .expect("scan frontier range sql") + }; + let mut stmt: *mut sqlite3_stmt = null_mut(); + let rc = sqlite_prepare_v2(self.db, sql.as_ptr(), -1, &mut stmt, null_mut()); + if rc != SQLITE_OK as c_int { + 
return Err(sqlite_rc_error(rc, "prepare frontier range failed")); + } + + let mut bind_err = false; + unsafe { + bind_err |= sqlite_bind_int64(stmt, 1, start.lamport as i64) != SQLITE_OK as c_int; + bind_err |= sqlite_bind_blob( + stmt, + 2, + start.replica.as_ptr() as *const c_void, + start.replica.len() as c_int, + None, + ) != SQLITE_OK as c_int; + bind_err |= sqlite_bind_int64(stmt, 3, start.counter as i64) != SQLITE_OK as c_int; + } + if let Some(end) = end { + unsafe { + bind_err |= sqlite_bind_int64(stmt, 4, end.lamport as i64) != SQLITE_OK as c_int; + bind_err |= sqlite_bind_blob( + stmt, + 5, + end.replica.as_ptr() as *const c_void, + end.replica.len() as c_int, + None, + ) != SQLITE_OK as c_int; + bind_err |= sqlite_bind_int64(stmt, 6, end.counter as i64) != SQLITE_OK as c_int; + } + } + if bind_err { + unsafe { sqlite_finalize(stmt) }; + return Err(sqlite_rc_error(SQLITE_ERROR as c_int, "bind frontier range failed")); + } + + loop { + let step_rc = unsafe { sqlite_step(stmt) }; + if step_rc == SQLITE_ROW as c_int { + if let Err(err) = visit(read_operation_row(stmt)?) 
{ + unsafe { sqlite_finalize(stmt) }; + return Err(err); + } + } else if step_rc == SQLITE_DONE as c_int { + break; + } else { + unsafe { sqlite_finalize(stmt) }; + return Err(sqlite_rc_error(step_rc, "frontier range step failed")); + } + } + + let finalize_rc = unsafe { sqlite_finalize(stmt) }; + if finalize_rc != SQLITE_OK as c_int { + return Err(sqlite_rc_error(finalize_rc, "finalize frontier range failed")); + } + Ok(()) + } + + fn latest_structural_before( + &self, + node: NodeId, + before: &treecrdt_core::MaterializationFrontierRef<'_>, + ) -> treecrdt_core::Result> { + let sql = CString::new( + "SELECT replica,counter,lamport,kind,parent,node,new_parent,order_key,known_state,payload \ + FROM ops \ + WHERE node = ?1 \ + AND kind IN ('insert', 'move') \ + AND (lamport < ?2 OR (lamport = ?2 AND (replica < ?3 OR (replica = ?3 AND counter < ?4)))) \ + ORDER BY lamport DESC, replica DESC, counter DESC \ + LIMIT 1", + ) + .expect("latest structural before sql"); + let mut stmt: *mut sqlite3_stmt = null_mut(); + let rc = sqlite_prepare_v2(self.db, sql.as_ptr(), -1, &mut stmt, null_mut()); + if rc != SQLITE_OK as c_int { + return Err(sqlite_rc_error(rc, "prepare latest structural failed")); + } + + let node_bytes = sqlite_node_id_bytes(node); + let mut bind_err = false; + unsafe { + bind_err |= sqlite_bind_blob( + stmt, + 1, + node_bytes.as_ptr() as *const c_void, + node_bytes.len() as c_int, + None, + ) != SQLITE_OK as c_int; + bind_err |= sqlite_bind_int64(stmt, 2, before.lamport as i64) != SQLITE_OK as c_int; + bind_err |= sqlite_bind_blob( + stmt, + 3, + before.replica.as_ptr() as *const c_void, + before.replica.len() as c_int, + None, + ) != SQLITE_OK as c_int; + bind_err |= sqlite_bind_int64(stmt, 4, before.counter as i64) != SQLITE_OK as c_int; + } + if bind_err { + unsafe { sqlite_finalize(stmt) }; + return Err(sqlite_rc_error( + SQLITE_ERROR as c_int, + "bind latest structural failed", + )); + } + + let step_rc = unsafe { sqlite_step(stmt) }; + let op = if 
step_rc == SQLITE_ROW as c_int { + Some(read_operation_row(stmt)?) + } else if step_rc == SQLITE_DONE as c_int { + None + } else { + unsafe { sqlite_finalize(stmt) }; + return Err(sqlite_rc_error(step_rc, "latest structural step failed")); + }; + + let finalize_rc = unsafe { sqlite_finalize(stmt) }; + if finalize_rc != SQLITE_OK as c_int { + return Err(sqlite_rc_error( + finalize_rc, + "finalize latest structural failed", + )); + } + Ok(op) + } + + fn latest_payload_before( + &self, + node: NodeId, + before: &treecrdt_core::MaterializationFrontierRef<'_>, + ) -> treecrdt_core::Result> { + let sql = CString::new( + "SELECT replica,counter,lamport,kind,parent,node,new_parent,order_key,known_state,payload \ + FROM ops \ + WHERE node = ?1 \ + AND (kind = 'payload' OR (kind = 'insert' AND payload IS NOT NULL)) \ + AND (lamport < ?2 OR (lamport = ?2 AND (replica < ?3 OR (replica = ?3 AND counter < ?4)))) \ + ORDER BY lamport DESC, replica DESC, counter DESC \ + LIMIT 1", + ) + .expect("latest payload before sql"); + let mut stmt: *mut sqlite3_stmt = null_mut(); + let rc = sqlite_prepare_v2(self.db, sql.as_ptr(), -1, &mut stmt, null_mut()); + if rc != SQLITE_OK as c_int { + return Err(sqlite_rc_error(rc, "prepare latest payload failed")); + } + + let node_bytes = sqlite_node_id_bytes(node); + let mut bind_err = false; + unsafe { + bind_err |= sqlite_bind_blob( + stmt, + 1, + node_bytes.as_ptr() as *const c_void, + node_bytes.len() as c_int, + None, + ) != SQLITE_OK as c_int; + bind_err |= sqlite_bind_int64(stmt, 2, before.lamport as i64) != SQLITE_OK as c_int; + bind_err |= sqlite_bind_blob( + stmt, + 3, + before.replica.as_ptr() as *const c_void, + before.replica.len() as c_int, + None, + ) != SQLITE_OK as c_int; + bind_err |= sqlite_bind_int64(stmt, 4, before.counter as i64) != SQLITE_OK as c_int; + } + if bind_err { + unsafe { sqlite_finalize(stmt) }; + return Err(sqlite_rc_error( + SQLITE_ERROR as c_int, + "bind latest payload failed", + )); + } + + let step_rc = 
unsafe { sqlite_step(stmt) }; + let op = if step_rc == SQLITE_ROW as c_int { + Some(read_operation_row(stmt)?) + } else if step_rc == SQLITE_DONE as c_int { + None + } else { + unsafe { sqlite_finalize(stmt) }; + return Err(sqlite_rc_error(step_rc, "latest payload step failed")); + }; + + let finalize_rc = unsafe { sqlite_finalize(stmt) }; + if finalize_rc != SQLITE_OK as c_int { + return Err(sqlite_rc_error(finalize_rc, "finalize latest payload failed")); + } + Ok(op) + } +} diff --git a/packages/treecrdt-sqlite-ext/src/extension/functions/schema.rs b/packages/treecrdt-sqlite-ext/src/extension/functions/schema.rs index ec3130c3..c9690cb8 100644 --- a/packages/treecrdt-sqlite-ext/src/extension/functions/schema.rs +++ b/packages/treecrdt-sqlite-ext/src/extension/functions/schema.rs @@ -359,6 +359,7 @@ CREATE TABLE IF NOT EXISTS tree_payload ( const INDEXES: &str = r#" CREATE INDEX IF NOT EXISTS idx_ops_lamport ON ops(lamport, replica, counter); CREATE INDEX IF NOT EXISTS idx_ops_op_ref ON ops(op_ref); +CREATE INDEX IF NOT EXISTS idx_ops_node_kind_order ON ops(node, kind, lamport, replica, counter); CREATE INDEX IF NOT EXISTS idx_tree_nodes_parent_order_key_node ON tree_nodes(parent, order_key, node); CREATE INDEX IF NOT EXISTS idx_tree_nodes_parent_tombstone_order_key_node ON tree_nodes(parent, tombstone, order_key, node); CREATE INDEX IF NOT EXISTS idx_oprefs_children_parent_seq ON oprefs_children(parent, seq); From b4c414f407d3ca8548d729c54788131e3964d719 Mon Sep 17 00:00:00 2001 From: Marcus Pousette Date: Fri, 17 Apr 2026 10:14:05 +0200 Subject: [PATCH 03/13] feat: add direct rewind materialization fast path --- packages/treecrdt-core/src/lib.rs | 2 +- packages/treecrdt-core/src/materialization.rs | 129 ++++++------------ packages/treecrdt-core/src/tree.rs | 4 - packages/treecrdt-postgres-rs/src/store.rs | 4 +- .../tests/postgres_test.rs | 10 ++ .../src/extension/functions/materialize.rs | 4 +- .../tests/extension_roundtrip.rs | 8 ++ 
packages/treecrdt-test-support/src/lib.rs | 26 ++++ 8 files changed, 87 insertions(+), 100 deletions(-) diff --git a/packages/treecrdt-core/src/lib.rs b/packages/treecrdt-core/src/lib.rs index 726c4b53..fe7846d2 100644 --- a/packages/treecrdt-core/src/lib.rs +++ b/packages/treecrdt-core/src/lib.rs @@ -17,7 +17,7 @@ pub use ids::{Lamport, NodeId, OperationId, ReplicaId}; pub use materialization::{ apply_incremental_ops_with_delta, apply_persisted_remote_ops_with_delta, catch_up_materialized_state, materialize_persisted_remote_ops_with_delta, - try_partial_rewind_catch_up_materialized_state, try_shortcut_out_of_order_payload_noops, + try_direct_rewind_catch_up_materialized_state, try_shortcut_out_of_order_payload_noops, CatchUpResult, FrontierRewindStorage, IncrementalApplyResult, MaterializationCursor, MaterializationFrontier, MaterializationFrontierRef, MaterializationHead, MaterializationHeadRef, MaterializationKey, MaterializationState, MaterializationStateRef, diff --git a/packages/treecrdt-core/src/materialization.rs b/packages/treecrdt-core/src/materialization.rs index 0b259be8..d72b1209 100644 --- a/packages/treecrdt-core/src/materialization.rs +++ b/packages/treecrdt-core/src/materialization.rs @@ -75,10 +75,10 @@ pub trait MaterializationCursor { fn state(&self) -> MaterializationStateRef<'_>; } -/// Optional storage hooks for partial rewind/replay of a frontier-invalidated suffix. +/// Optional storage hooks for direct rewind/replay of a frontier-invalidated suffix. /// /// The default implementations are intentionally naive and scan the full op log in memory. Real -/// storage backends should override these with ordered SQL lookups so the partial rewind fast path +/// storage backends should override these with ordered SQL lookups so the direct rewind fast path /// only touches the invalidated suffix plus node-scoped predecessor queries. 
pub trait FrontierRewindStorage: Storage { fn scan_frontier_range( @@ -312,58 +312,13 @@ fn payload_from_op(op: &Operation) -> Option>> { } } -fn clone_materialized_state( - nodes: &N, - payloads: &P, - replica_id: &ReplicaId, -) -> Result> { - let mut memory_nodes = MemoryNodeStore::default(); - let mut memory_payloads = MemoryPayloadStore::default(); - let mut attachments: Vec<(NodeId, NodeId, Vec)> = Vec::new(); - - for node in nodes.all_nodes()? { - memory_nodes.ensure_node(node)?; - if let Some(parent) = nodes.parent(node)? { - let order_key = nodes.order_key(node)?.unwrap_or_default(); - attachments.push((node, parent, order_key)); - } - - memory_nodes.set_tombstone(node, nodes.tombstone(node)?)?; - let last_change = nodes.last_change(node)?; - memory_nodes.set_last_change_exact(node, &last_change)?; - let deleted_at = nodes.deleted_at(node)?; - memory_nodes.set_deleted_at_exact(node, deleted_at.as_ref())?; - - if let Some(writer) = payloads.last_writer(node)? { - memory_payloads.set_payload(node, payloads.payload(node)?, writer)?; - } - } - - for (node, parent, order_key) in attachments { - if node == NodeId::ROOT { - continue; - } - memory_nodes.attach(node, parent, order_key)?; - } - - TreeCrdt::with_stores( - replica_id.clone(), - NoopStorage, - LamportClock::default(), - memory_nodes, - memory_payloads, - ) -} - -fn rewind_structure_op( - scratch: &mut TreeCrdt, +fn rewind_structure_op_in_place( + nodes: &mut N, storage: &S, op: &Operation, ) -> Result<()> { let node = op.kind.node(); let previous = storage.latest_structural_before(node, &frontier_from_op(op).as_borrowed())?; - - let nodes = scratch.node_store_mut(); nodes.ensure_node(node)?; nodes.detach(node)?; @@ -384,14 +339,13 @@ fn rewind_structure_op( Ok(()) } -fn rewind_payload_op( - scratch: &mut TreeCrdt, +fn rewind_payload_op_in_place( + payloads: &mut P, storage: &S, op: &Operation, ) -> Result<()> { let node = op.kind.node(); let previous = storage.latest_payload_before(node, 
&frontier_from_op(op).as_borrowed())?; - let payloads = scratch.payload_store_mut(); if let Some(previous) = previous { let payload = payload_from_op(&previous) @@ -408,25 +362,35 @@ fn rewind_payload_op( Ok(()) } -fn rewind_existing_suffix( - scratch: &mut TreeCrdt, +fn rewind_existing_suffix_in_place( + nodes: &mut N, + payloads: &mut P, storage: &S, existing_suffix_ops: &[Operation], -) -> Result<()> { +) -> Result<()> +where + S: FrontierRewindStorage, + N: NodeStore, + P: ExactPayloadStore, +{ for op in existing_suffix_ops.iter().rev() { match &op.kind { crate::ops::OperationKind::Insert { .. } => { if op_sets_payload(op) { - rewind_payload_op(scratch, storage, op)?; + rewind_payload_op_in_place(payloads, storage, op)?; } - rewind_structure_op(scratch, storage, op)?; + rewind_structure_op_in_place(nodes, storage, op)?; + } + crate::ops::OperationKind::Move { .. } => { + rewind_structure_op_in_place(nodes, storage, op)? + } + crate::ops::OperationKind::Payload { .. } => { + rewind_payload_op_in_place(payloads, storage, op)? } - crate::ops::OperationKind::Move { .. } => rewind_structure_op(scratch, storage, op)?, - crate::ops::OperationKind::Payload { .. } => rewind_payload_op(scratch, storage, op)?, crate::ops::OperationKind::Delete { .. } | crate::ops::OperationKind::Tombstone { .. } => { return Err(Error::Storage( - "delete/tombstone ops are not supported by partial rewind".into(), + "delete/tombstone ops are not supported by direct rewind".into(), )); } } @@ -752,12 +716,12 @@ fn replay_frontier_in_memory( )) } -/// Try a partial rewind/replay catch-up for append-time out-of-order suffixes. +/// Try a direct rewind/replay catch-up for append-time out-of-order suffixes. /// -/// This starts from the current materialized state, rewinds only the already-materialized suffix, -/// then replays the invalidated suffix in canonical order. It deliberately bails out for -/// delete/tombstone suffixes and for broader recovery cases. 
-pub fn try_partial_rewind_catch_up_materialized_state( +/// This rewinds the already-materialized suffix directly on the backend stores, truncates suffix +/// oprefs, and then replays the full invalidated suffix in canonical order. It deliberately bails +/// out for delete/tombstone suffixes and for broader recovery cases. +pub fn try_direct_rewind_catch_up_materialized_state( storage: &S, inserted_op_ids: &HashSet, stores: PersistedRemoteStores, @@ -813,62 +777,45 @@ where let prefix_seq = head .seq .saturating_sub(existing_suffix_ops.len().min(u64::MAX as usize) as u64); + let truncate_from = prefix_seq.saturating_add(1); let PersistedRemoteStores { replica_id, - clock: _clock, + clock, mut nodes, mut payloads, mut index, } = stores; - let mut scratch = PrefixSnapshot { - crdt: clone_materialized_state(&nodes, &payloads, &replica_id)?, - index: RecordingIndex::default(), - head: None, - seq: prefix_seq, - }; + index.truncate_from(truncate_from)?; + rewind_existing_suffix_in_place(&mut nodes, &mut payloads, storage, &existing_suffix_ops)?; - rewind_existing_suffix(&mut scratch.crdt, storage, &existing_suffix_ops)?; + let mut crdt = TreeCrdt::with_stores(replica_id, NoopStorage, clock, nodes, payloads)?; let mut affected = HashSet::new(); let mut seq = prefix_seq; let mut replay_head: Option = None; for op in full_suffix_ops { seq = seq.saturating_add(1); - let delta = scratch - .crdt - .apply_sorted_remote_with_materialization(op.clone(), &mut scratch.index, seq)?; + let delta = crdt.apply_sorted_remote_with_materialization(op.clone(), &mut index, seq)?; affected.extend(delta.affected_nodes); replay_head = Some(op); } - scratch.head = replay_head; - scratch.seq = seq; - let mut affected_nodes: Vec = affected.into_iter().collect(); affected_nodes.sort(); - patch_final_state_in_place( - &mut scratch, - prefix_seq, - &affected_nodes, - &mut nodes, - &mut payloads, - &mut index, - )?; - - flush_nodes(&mut nodes)?; + flush_nodes(crdt.node_store_mut())?; 
flush_index(&mut index)?; Ok(Some(CatchUpResult { - head: scratch.head.map(|head| MaterializationHead { + head: replay_head.map(|head| MaterializationHead { at: MaterializationKey { lamport: head.meta.lamport, replica: head.meta.id.replica.as_bytes().to_vec(), counter: head.meta.id.counter, }, - seq: scratch.seq, + seq, }), affected_nodes, })) diff --git a/packages/treecrdt-core/src/tree.rs b/packages/treecrdt-core/src/tree.rs index ecdf8380..680c20b2 100644 --- a/packages/treecrdt-core/src/tree.rs +++ b/packages/treecrdt-core/src/tree.rs @@ -967,10 +967,6 @@ where &mut self.nodes } - pub(crate) fn payload_store_mut(&mut self) -> &mut P { - &mut self.payloads - } - pub fn validate_invariants(&self) -> Result<()> { for pid in self.nodes.all_nodes()? { let pchildren = self.nodes.children(pid)?; diff --git a/packages/treecrdt-postgres-rs/src/store.rs b/packages/treecrdt-postgres-rs/src/store.rs index 91b03bb9..b384122f 100644 --- a/packages/treecrdt-postgres-rs/src/store.rs +++ b/packages/treecrdt-postgres-rs/src/store.rs @@ -7,7 +7,7 @@ use postgres::{Client, Row, Statement}; use treecrdt_core::{ apply_persisted_remote_ops_with_delta, catch_up_materialized_state, - materialize_persisted_remote_ops_with_delta, try_partial_rewind_catch_up_materialized_state, + materialize_persisted_remote_ops_with_delta, try_direct_rewind_catch_up_materialized_state, try_shortcut_out_of_order_payload_noops, Error, ExactNodeStore, ExactPayloadStore, FrontierRewindStorage, Lamport, LamportClock, MaterializationCursor, MaterializationFrontier, MaterializationHead, MaterializationKey, MaterializationState, NodeId, @@ -1898,7 +1898,7 @@ fn append_ops_in_tx( let apply_result = if apply_result.catch_up_needed { let refreshed_meta = load_tree_meta_for_update(client, doc_id)?; let catch_up = if meta.state().replay_from.is_none() { - try_partial_rewind_catch_up_materialized_state( + try_direct_rewind_catch_up_materialized_state( &PgOpStorage::new(ctx.clone()), &inserted_op_ids, 
PersistedRemoteStores { diff --git a/packages/treecrdt-postgres-rs/tests/postgres_test.rs b/packages/treecrdt-postgres-rs/tests/postgres_test.rs index fb1ea431..c98d78f1 100644 --- a/packages/treecrdt-postgres-rs/tests/postgres_test.rs +++ b/packages/treecrdt-postgres-rs/tests/postgres_test.rs @@ -285,6 +285,16 @@ fn postgres_backend_deferred_recovery_from_replay_frontier_catches_up_on_ensure( ); } +#[test] +fn postgres_backend_out_of_order_delete_suffix_falls_back_and_restores_parent() { + let Some(harness) = setup_conformance_harness() else { + return; + }; + materialization_conformance::out_of_order_delete_suffix_falls_back_and_restores_parent( + &harness, + ); +} + #[test] fn postgres_backend_failed_immediate_catch_up_rolls_back_inserted_ops_and_meta() { let Some(client) = connect() else { diff --git a/packages/treecrdt-sqlite-ext/src/extension/functions/materialize.rs b/packages/treecrdt-sqlite-ext/src/extension/functions/materialize.rs index ceffdc23..73194372 100644 --- a/packages/treecrdt-sqlite-ext/src/extension/functions/materialize.rs +++ b/packages/treecrdt-sqlite-ext/src/extension/functions/materialize.rs @@ -9,7 +9,7 @@ use std::collections::HashSet; use treecrdt_core::PayloadStore; use treecrdt_core::Storage; use treecrdt_core::{ - try_partial_rewind_catch_up_materialized_state, LamportClock, MaterializationCursor, ReplicaId, + try_direct_rewind_catch_up_materialized_state, LamportClock, MaterializationCursor, ReplicaId, }; fn merge_affected_nodes(mut left: Vec, right: Vec) -> Vec { @@ -322,7 +322,7 @@ pub(super) fn append_ops_impl( let apply_result = if apply_result.catch_up_needed { let refreshed_meta = load_tree_meta(db)?; let catch_up = if meta.state().replay_from.is_none() { - try_partial_rewind_catch_up_materialized_state( + try_direct_rewind_catch_up_materialized_state( &super::op_storage::SqliteOpStorage::with_doc_id(db, doc_id.to_vec()), &inserted_op_ids, treecrdt_core::PersistedRemoteStores { diff --git 
a/packages/treecrdt-sqlite-ext/tests/extension_roundtrip.rs b/packages/treecrdt-sqlite-ext/tests/extension_roundtrip.rs index cab60588..16003c92 100644 --- a/packages/treecrdt-sqlite-ext/tests/extension_roundtrip.rs +++ b/packages/treecrdt-sqlite-ext/tests/extension_roundtrip.rs @@ -499,6 +499,14 @@ fn remote_deferred_recovery_from_replay_frontier_catches_up_on_ensure() { ); } +#[test] +fn remote_append_out_of_order_delete_suffix_falls_back_and_restores_parent() { + let harness = setup_conformance_harness(); + materialization_conformance::out_of_order_delete_suffix_falls_back_and_restores_parent( + &harness, + ); +} + #[test] fn remote_failed_immediate_catch_up_rolls_back_inserted_ops_and_meta() { let conn = setup_conn(); diff --git a/packages/treecrdt-test-support/src/lib.rs b/packages/treecrdt-test-support/src/lib.rs index 22834a18..e7714282 100644 --- a/packages/treecrdt-test-support/src/lib.rs +++ b/packages/treecrdt-test-support/src/lib.rs @@ -231,3 +231,29 @@ pub fn deferred_recovery_from_replay_frontier_catches_up_on_ensure< assert_eq!(harness.head_seq(), 2); assert_eq!(harness.op_ref_counters_for_parent(NodeId::ROOT), vec![1, 2]); } + +pub fn out_of_order_delete_suffix_falls_back_and_restores_parent< + H: MaterializationConformanceHarness, +>( + harness: &H, +) { + let replica = ReplicaId::new(b"delete-fallback"); + let parent = node(1); + let child = node(2); + + let insert_parent = + Operation::insert(&replica, 1, 1, NodeId::ROOT, parent, order_key_from_position(0)); + let insert_child = Operation::insert(&replica, 2, 2, parent, child, order_key_from_position(0)); + + let mut vv = treecrdt_core::VersionVector::new(); + vv.observe(&replica, 1); + let delete_parent = Operation::delete(&replica, 3, 3, parent, Some(vv)); + + harness.append_ops(&[insert_parent, delete_parent]); + let _ = harness.append_ops_with_affected_nodes(&[insert_child]); + + assert_replay_cleared(harness); + assert_eq!(harness.head_seq(), 3); + 
assert_eq!(harness.visible_children(NodeId::ROOT), vec![parent]); + assert_eq!(harness.visible_children(parent), vec![child]); +} From 3ec2ea5999eff32ff329b895826d21163210453c Mon Sep 17 00:00:00 2001 From: Marcus Pousette Date: Fri, 17 Apr 2026 10:24:51 +0200 Subject: [PATCH 04/13] refactor(test): share adapter conformance cases --- .../tests/postgres_test.rs | 97 ++++++------------- .../tests/extension_roundtrip.rs | 93 +++++------------- packages/treecrdt-test-support/src/lib.rs | 35 +++++++ 3 files changed, 88 insertions(+), 137 deletions(-) diff --git a/packages/treecrdt-postgres-rs/tests/postgres_test.rs b/packages/treecrdt-postgres-rs/tests/postgres_test.rs index c98d78f1..99f1943b 100644 --- a/packages/treecrdt-postgres-rs/tests/postgres_test.rs +++ b/packages/treecrdt-postgres-rs/tests/postgres_test.rs @@ -14,7 +14,7 @@ use treecrdt_postgres::{ }; use treecrdt_test_support::{ self as materialization_conformance, node, order_key_from_position, - representative_remote_batch, MaterializationConformanceHarness, + MaterializationConformanceHarness, }; fn connect() -> Option>> { @@ -53,6 +53,17 @@ impl MaterializationConformanceHarness for PgConformanceHarness { tree_payload(&self.client, &self.doc_id, node).unwrap() } + fn op_count(&self) -> u64 { + let mut c = self.client.borrow_mut(); + let row = c + .query_one( + "SELECT COUNT(*) FROM treecrdt_ops WHERE doc_id = $1", + &[&self.doc_id], + ) + .unwrap(); + row.get::<_, i64>(0).max(0) as u64 + } + fn replay_frontier(&self) -> Option { let mut c = self.client.borrow_mut(); let row = c @@ -109,6 +120,21 @@ impl MaterializationConformanceHarness for PgConformanceHarness { let ops = get_ops_by_op_refs(&self.client, &self.doc_id, &refs).unwrap(); ops.iter().map(|op| op.meta.id.counter).collect() } + + fn op_kinds_for_parent(&self, parent: NodeId) -> Vec { + let refs = list_op_refs_children(&self.client, &self.doc_id, parent).unwrap(); + let ops = get_ops_by_op_refs(&self.client, &self.doc_id, &refs).unwrap(); + 
ops.iter() + .map(|op| match op.kind { + treecrdt_core::OperationKind::Insert { .. } => "insert", + treecrdt_core::OperationKind::Move { .. } => "move", + treecrdt_core::OperationKind::Delete { .. } => "delete", + treecrdt_core::OperationKind::Tombstone { .. } => "tombstone", + treecrdt_core::OperationKind::Payload { .. } => "payload", + }) + .map(str::to_string) + .collect() + } } fn setup_conformance_harness() -> Option { @@ -156,79 +182,18 @@ fn postgres_backend_apply_is_idempotent_and_max_lamport_monotonic() { #[test] fn postgres_backend_append_batch_materializes_only_inserted_ops() { - let Some(client) = connect() else { + let Some(harness) = setup_conformance_harness() else { return; }; - ensure_schema_once(&client); - - let doc_id = format!("test-{}", Uuid::new_v4()); - { - let mut c = client.borrow_mut(); - reset_doc_for_tests(&mut c, &doc_id).unwrap(); - } - - let replica = ReplicaId::new(b"dup"); - let n1 = node(1); - let op1 = Operation::insert(&replica, 1, 1, NodeId::ROOT, n1, order_key_from_position(0)); - let op2 = Operation::set_payload(&replica, 2, 2, n1, vec![9]); - - let inserted = append_ops(&client, &doc_id, &[op1.clone(), op1.clone(), op2.clone()]).unwrap(); - assert_eq!(inserted, 2); - assert_eq!(list_op_refs_all(&client, &doc_id).unwrap().len(), 2); - assert_eq!( - tree_children(&client, &doc_id, NodeId::ROOT).unwrap(), - vec![n1] - ); - - let head_seq = { - let mut c = client.borrow_mut(); - let row = c - .query_one( - "SELECT head_seq FROM treecrdt_meta WHERE doc_id = $1", - &[&doc_id], - ) - .unwrap(); - row.get::<_, i64>(0).max(0) as u64 - }; - assert_eq!(head_seq, 2); + materialization_conformance::append_batch_materializes_only_inserted_ops(&harness); } #[test] fn postgres_backend_append_with_affected_nodes_matches_representative_remote_batch() { - let Some(client) = connect() else { + let Some(harness) = setup_conformance_harness() else { return; }; - ensure_schema_once(&client); - - let doc_id = format!("test-{}", Uuid::new_v4()); - { 
- let mut c = client.borrow_mut(); - reset_doc_for_tests(&mut c, &doc_id).unwrap(); - } - - let replica = ReplicaId::new(b"rep"); - let (p1, p2, child, ops) = representative_remote_batch(&replica); - - let affected = append_ops_with_affected_nodes(&client, &doc_id, &ops).unwrap(); - assert_eq!(affected, vec![NodeId::ROOT, p1, p2, child]); - assert_eq!( - tree_children(&client, &doc_id, NodeId::ROOT).unwrap(), - vec![p1, p2] - ); - assert_eq!(tree_children(&client, &doc_id, p2).unwrap(), vec![child]); - assert_eq!( - tree_payload(&client, &doc_id, child).unwrap(), - Some(vec![8]) - ); - - let refs_p2 = list_op_refs_children(&client, &doc_id, p2).unwrap(); - let ops_p2 = get_ops_by_op_refs(&client, &doc_id, &refs_p2).unwrap(); - assert!(ops_p2 - .iter() - .any(|op| matches!(op.kind, treecrdt_core::OperationKind::Move { .. }))); - assert!(ops_p2 - .iter() - .any(|op| matches!(op.kind, treecrdt_core::OperationKind::Payload { .. }))); + materialization_conformance::representative_remote_batch_matches_shape(&harness); } #[test] diff --git a/packages/treecrdt-sqlite-ext/tests/extension_roundtrip.rs b/packages/treecrdt-sqlite-ext/tests/extension_roundtrip.rs index 16003c92..66d1da75 100644 --- a/packages/treecrdt-sqlite-ext/tests/extension_roundtrip.rs +++ b/packages/treecrdt-sqlite-ext/tests/extension_roundtrip.rs @@ -8,8 +8,7 @@ use treecrdt_core::{ order_key::allocate_between, NodeId, Operation, OperationKind, ReplicaId, VersionVector, }; use treecrdt_test_support::{ - self as materialization_conformance, representative_remote_batch, - MaterializationConformanceHarness, + self as materialization_conformance, MaterializationConformanceHarness, }; #[derive(Clone, Deserialize, Serialize)] @@ -205,6 +204,13 @@ impl MaterializationConformanceHarness for SqliteConformanceHarness { payload_bytes(&self.conn, &node_bytes_from_id(node)) } + fn op_count(&self) -> u64 { + self.conn + .query_row("SELECT COUNT(*) FROM ops", [], |row| row.get::<_, i64>(0)) + .unwrap() + .max(0) as u64 
+ } + fn replay_frontier(&self) -> Option { match read_replay_frontier(&self.conn) { (Some(lamport), Some(replica), Some(counter)) => { @@ -252,6 +258,16 @@ impl MaterializationConformanceHarness for SqliteConformanceHarness { .map(|op| op.counter) .collect() } + + fn op_kinds_for_parent(&self, parent: NodeId) -> Vec { + ops_by_oprefs( + &self.conn, + &oprefs_children(&self.conn, &node_bytes_from_id(parent)), + ) + .iter() + .map(|op| op.kind.clone()) + .collect() + } } fn setup_conformance_harness() -> SqliteConformanceHarness { @@ -382,79 +398,14 @@ fn local_insert_returns_appended_insert_op() { #[test] fn remote_append_materializes_only_inserted_ops() { - let conn = setup_conn(); - - let replica = b"dup".to_vec(); - let root = node_bytes(0); - let node = node_bytes(1); - let insert = JsonOp { - replica: replica.clone(), - counter: 1, - lamport: 1, - kind: "insert".into(), - parent: Some(<[u8; 16]>::try_from(root.as_slice()).unwrap()), - node: <[u8; 16]>::try_from(node.as_slice()).unwrap(), - new_parent: None, - order_key: Some((1u16).to_be_bytes().to_vec()), - known_state: None, - payload: None, - }; - let payload = JsonOp { - replica, - counter: 2, - lamport: 2, - kind: "payload".into(), - parent: None, - node: <[u8; 16]>::try_from(node.as_slice()).unwrap(), - new_parent: None, - order_key: None, - known_state: None, - payload: Some(vec![9]), - }; - - let (affected, op_count) = append_ops_json(&conn, &[insert.clone(), insert, payload]); - assert_eq!(affected, vec![root.clone(), node.clone()]); - assert_eq!(op_count, 2); - assert_eq!(visible_children(&conn, &root), vec![node]); - - let (_, _, _, head_seq) = read_tree_meta(&conn); - assert_eq!(head_seq, 2); + let harness = setup_conformance_harness(); + materialization_conformance::append_batch_materializes_only_inserted_ops(&harness); } #[test] fn remote_append_representative_batch_matches_postgres_shape() { - let conn = setup_conn(); - - let root = node_bytes(0); - let replica = ReplicaId::new(b"rep"); - let 
(p1, p2, child, ops) = representative_remote_batch(&replica); - let (affected, _) = append_ops_json(&conn, &json_ops(&ops)); - - assert_eq!( - affected, - vec![ - root.clone(), - node_bytes_from_id(p1), - node_bytes_from_id(p2), - node_bytes_from_id(child) - ] - ); - assert_eq!( - visible_children(&conn, &root), - vec![node_bytes_from_id(p1), node_bytes_from_id(p2)] - ); - assert_eq!( - visible_children(&conn, &node_bytes_from_id(p2)), - vec![node_bytes_from_id(child)] - ); - assert_eq!( - payload_bytes(&conn, &node_bytes_from_id(child)), - Some(vec![8]) - ); - - let ops_p2 = ops_by_oprefs(&conn, &oprefs_children(&conn, &node_bytes_from_id(p2))); - assert!(ops_p2.iter().any(|op| op.kind == "move")); - assert!(ops_p2.iter().any(|op| op.kind == "payload")); + let harness = setup_conformance_harness(); + materialization_conformance::representative_remote_batch_matches_shape(&harness); } #[test] diff --git a/packages/treecrdt-test-support/src/lib.rs b/packages/treecrdt-test-support/src/lib.rs index e7714282..7c143a15 100644 --- a/packages/treecrdt-test-support/src/lib.rs +++ b/packages/treecrdt-test-support/src/lib.rs @@ -7,11 +7,13 @@ pub trait MaterializationConformanceHarness { fn append_ops_with_affected_nodes(&self, ops: &[Operation]) -> Vec; fn visible_children(&self, parent: NodeId) -> Vec; fn payload(&self, node: NodeId) -> Option>; + fn op_count(&self) -> u64; fn replay_frontier(&self) -> Option; fn head_seq(&self) -> u64; fn force_replay_from_start(&self); fn ensure_materialized(&self); fn op_ref_counters_for_parent(&self, parent: NodeId) -> Vec; + fn op_kinds_for_parent(&self, parent: NodeId) -> Vec; } pub fn order_key_from_position(position: u16) -> Vec { @@ -44,6 +46,39 @@ pub fn representative_remote_batch( ) } +pub fn append_batch_materializes_only_inserted_ops( + harness: &H, +) { + let replica = ReplicaId::new(b"dup"); + let node = node(1); + let insert = Operation::insert(&replica, 1, 1, NodeId::ROOT, node, order_key_from_position(0)); + let payload = 
Operation::set_payload(&replica, 2, 2, node, vec![9]); + + let before = harness.op_count(); + harness.append_ops(&[insert.clone(), insert, payload]); + + assert_eq!(harness.op_count().saturating_sub(before), 2); + assert_eq!(harness.visible_children(NodeId::ROOT), vec![node]); + assert_eq!(harness.head_seq(), 2); +} + +pub fn representative_remote_batch_matches_shape( + harness: &H, +) { + let replica = ReplicaId::new(b"rep"); + let (p1, p2, child, ops) = representative_remote_batch(&replica); + + let affected = harness.append_ops_with_affected_nodes(&ops); + assert_eq!(affected, vec![NodeId::ROOT, p1, p2, child]); + assert_eq!(harness.visible_children(NodeId::ROOT), vec![p1, p2]); + assert_eq!(harness.visible_children(p2), vec![child]); + assert_eq!(harness.payload(child), Some(vec![8])); + + let kinds = harness.op_kinds_for_parent(p2); + assert!(kinds.iter().any(|kind| kind == "move")); + assert!(kinds.iter().any(|kind| kind == "payload")); +} + fn assert_replay_cleared(harness: &H) { assert_eq!(harness.replay_frontier(), None); } From 535a2f4fa62d34ede5a5026135601f1ec91c139b Mon Sep 17 00:00:00 2001 From: Marcus Pousette Date: Fri, 17 Apr 2026 10:35:35 +0200 Subject: [PATCH 05/13] style: format direct rewind changes --- packages/treecrdt-core/src/materialization.rs | 13 +++---- packages/treecrdt-postgres-rs/src/store.rs | 36 +++++++++---------- .../src/extension/functions/materialize.rs | 3 +- .../src/extension/functions/op_storage.rs | 24 ++++++++----- packages/treecrdt-test-support/src/lib.rs | 19 ++++++++-- 5 files changed, 54 insertions(+), 41 deletions(-) diff --git a/packages/treecrdt-core/src/materialization.rs b/packages/treecrdt-core/src/materialization.rs index d72b1209..63c68dc6 100644 --- a/packages/treecrdt-core/src/materialization.rs +++ b/packages/treecrdt-core/src/materialization.rs @@ -140,9 +140,7 @@ pub trait FrontierRewindStorage: Storage { cmp_frontiers(&frontier, before) == Ordering::Less && match &op.kind { 
crate::ops::OperationKind::Insert { - node: n, - payload, - .. + node: n, payload, .. } => *n == node && payload.is_some(), crate::ops::OperationKind::Payload { node: n, .. } => *n == node, _ => false, @@ -324,9 +322,7 @@ fn rewind_structure_op_in_place( match previous.as_ref().map(|prev| &prev.kind) { Some(crate::ops::OperationKind::Insert { - parent, - order_key, - .. + parent, order_key, .. }) => nodes.attach(node, *parent, order_key.clone())?, Some(crate::ops::OperationKind::Move { new_parent, @@ -774,9 +770,8 @@ where return Ok(None); } - let prefix_seq = head - .seq - .saturating_sub(existing_suffix_ops.len().min(u64::MAX as usize) as u64); + let prefix_seq = + head.seq.saturating_sub(existing_suffix_ops.len().min(u64::MAX as usize) as u64); let truncate_from = prefix_seq.saturating_add(1); let PersistedRemoteStores { diff --git a/packages/treecrdt-postgres-rs/src/store.rs b/packages/treecrdt-postgres-rs/src/store.rs index b384122f..0b46528f 100644 --- a/packages/treecrdt-postgres-rs/src/store.rs +++ b/packages/treecrdt-postgres-rs/src/store.rs @@ -9,10 +9,10 @@ use treecrdt_core::{ apply_persisted_remote_ops_with_delta, catch_up_materialized_state, materialize_persisted_remote_ops_with_delta, try_direct_rewind_catch_up_materialized_state, try_shortcut_out_of_order_payload_noops, Error, ExactNodeStore, ExactPayloadStore, - FrontierRewindStorage, Lamport, LamportClock, MaterializationCursor, - MaterializationFrontier, MaterializationHead, MaterializationKey, MaterializationState, NodeId, - NodeStore, Operation, OperationId, OperationKind, PayloadStore, PersistedRemoteStores, - ReplicaId, Result, Storage, TruncatingParentOpIndex, VersionVector, + FrontierRewindStorage, Lamport, LamportClock, MaterializationCursor, MaterializationFrontier, + MaterializationHead, MaterializationKey, MaterializationState, NodeId, NodeStore, Operation, + OperationId, OperationKind, PayloadStore, PersistedRemoteStores, ReplicaId, Result, Storage, + TruncatingParentOpIndex, 
VersionVector, }; use crate::opref::{derive_op_ref_v0, OPREF_V0_WIDTH}; @@ -1912,21 +1912,19 @@ fn append_ops_in_tx( |nodes| nodes.flush_last_change(), |index| index.flush(), )? - .unwrap_or( - catch_up_materialized_state( - PgOpStorage::new(ctx.clone()), - PersistedRemoteStores { - replica_id: ReplicaId::new(b"postgres"), - clock: LamportClock::default(), - nodes: PgNodeStore::new(ctx.clone()), - payloads: PgPayloadStore::new(ctx.clone()), - index: PgParentOpIndex::new(ctx.clone()), - }, - &refreshed_meta, - |nodes| nodes.flush_last_change(), - |index| index.flush(), - )?, - ) + .unwrap_or(catch_up_materialized_state( + PgOpStorage::new(ctx.clone()), + PersistedRemoteStores { + replica_id: ReplicaId::new(b"postgres"), + clock: LamportClock::default(), + nodes: PgNodeStore::new(ctx.clone()), + payloads: PgPayloadStore::new(ctx.clone()), + index: PgParentOpIndex::new(ctx.clone()), + }, + &refreshed_meta, + |nodes| nodes.flush_last_change(), + |index| index.flush(), + )?) } else { catch_up_materialized_state( PgOpStorage::new(ctx.clone()), diff --git a/packages/treecrdt-sqlite-ext/src/extension/functions/materialize.rs b/packages/treecrdt-sqlite-ext/src/extension/functions/materialize.rs index 73194372..c0c7d32f 100644 --- a/packages/treecrdt-sqlite-ext/src/extension/functions/materialize.rs +++ b/packages/treecrdt-sqlite-ext/src/extension/functions/materialize.rs @@ -344,8 +344,7 @@ pub(super) fn append_ops_impl( treecrdt_core::PersistedRemoteStores { replica_id: ReplicaId::new(b"sqlite-ext"), clock: LamportClock::default(), - nodes: SqliteNodeStore::prepare(db) - .map_err(|_| SQLITE_ERROR as c_int)?, + nodes: SqliteNodeStore::prepare(db).map_err(|_| SQLITE_ERROR as c_int)?, payloads: SqlitePayloadStore::prepare(db) .map_err(|_| SQLITE_ERROR as c_int)?, index: SqliteParentOpIndex::prepare(db, doc_id.to_vec()) diff --git a/packages/treecrdt-sqlite-ext/src/extension/functions/op_storage.rs b/packages/treecrdt-sqlite-ext/src/extension/functions/op_storage.rs index 
99384bea..e83126b5 100644 --- a/packages/treecrdt-sqlite-ext/src/extension/functions/op_storage.rs +++ b/packages/treecrdt-sqlite-ext/src/extension/functions/op_storage.rs @@ -41,8 +41,8 @@ fn read_operation_row(stmt: *mut sqlite3_stmt) -> treecrdt_core::Result treecrdt_core::Result { - let new_parent = new_parent.ok_or_else(|| { - sqlite_rc_error(SQLITE_ERROR as c_int, "move missing new_parent") - })?; + let new_parent = new_parent + .ok_or_else(|| sqlite_rc_error(SQLITE_ERROR as c_int, "move missing new_parent"))?; treecrdt_core::OperationKind::Move { node: sqlite_bytes_to_node_id(node), new_parent: sqlite_bytes_to_node_id(new_parent), @@ -548,7 +547,10 @@ impl treecrdt_core::FrontierRewindStorage for SqliteOpStorage { } if bind_err { unsafe { sqlite_finalize(stmt) }; - return Err(sqlite_rc_error(SQLITE_ERROR as c_int, "bind frontier range failed")); + return Err(sqlite_rc_error( + SQLITE_ERROR as c_int, + "bind frontier range failed", + )); } loop { @@ -568,7 +570,10 @@ impl treecrdt_core::FrontierRewindStorage for SqliteOpStorage { let finalize_rc = unsafe { sqlite_finalize(stmt) }; if finalize_rc != SQLITE_OK as c_int { - return Err(sqlite_rc_error(finalize_rc, "finalize frontier range failed")); + return Err(sqlite_rc_error( + finalize_rc, + "finalize frontier range failed", + )); } Ok(()) } @@ -703,7 +708,10 @@ impl treecrdt_core::FrontierRewindStorage for SqliteOpStorage { let finalize_rc = unsafe { sqlite_finalize(stmt) }; if finalize_rc != SQLITE_OK as c_int { - return Err(sqlite_rc_error(finalize_rc, "finalize latest payload failed")); + return Err(sqlite_rc_error( + finalize_rc, + "finalize latest payload failed", + )); } Ok(op) } diff --git a/packages/treecrdt-test-support/src/lib.rs b/packages/treecrdt-test-support/src/lib.rs index 7c143a15..ed8f3651 100644 --- a/packages/treecrdt-test-support/src/lib.rs +++ b/packages/treecrdt-test-support/src/lib.rs @@ -51,7 +51,14 @@ pub fn append_batch_materializes_only_inserted_ops Date: Fri, 17 Apr 2026 10:45:12 
+0200 Subject: [PATCH 06/13] refactor: trim direct rewind bloat --- packages/treecrdt-core/src/lib.rs | 6 +-- packages/treecrdt-core/src/materialization.rs | 30 ++++------- packages/treecrdt-postgres-rs/src/profile.rs | 4 +- packages/treecrdt-postgres-rs/src/store.rs | 54 ++++++------------- .../src/extension/functions/op_storage.rs | 39 +++----------- 5 files changed, 37 insertions(+), 96 deletions(-) diff --git a/packages/treecrdt-core/src/lib.rs b/packages/treecrdt-core/src/lib.rs index fe7846d2..ecbcb777 100644 --- a/packages/treecrdt-core/src/lib.rs +++ b/packages/treecrdt-core/src/lib.rs @@ -19,9 +19,9 @@ pub use materialization::{ catch_up_materialized_state, materialize_persisted_remote_ops_with_delta, try_direct_rewind_catch_up_materialized_state, try_shortcut_out_of_order_payload_noops, CatchUpResult, FrontierRewindStorage, IncrementalApplyResult, MaterializationCursor, - MaterializationFrontier, MaterializationFrontierRef, MaterializationHead, - MaterializationHeadRef, MaterializationKey, MaterializationState, MaterializationStateRef, - PayloadNoopShortcut, PersistedRemoteApplyResult, PersistedRemoteStores, + MaterializationFrontier, MaterializationFrontierRef, MaterializationHead, MaterializationKey, + MaterializationState, MaterializationStateRef, PayloadNoopShortcut, PersistedRemoteApplyResult, + PersistedRemoteStores, }; pub use ops::{cmp_op_key, cmp_ops, Operation, OperationKind, OperationMetadata}; pub use traits::{ diff --git a/packages/treecrdt-core/src/materialization.rs b/packages/treecrdt-core/src/materialization.rs index 63c68dc6..daf8304c 100644 --- a/packages/treecrdt-core/src/materialization.rs +++ b/packages/treecrdt-core/src/materialization.rs @@ -45,8 +45,6 @@ impl> MaterializationHead { } } -pub type MaterializationHeadRef<'a> = MaterializationHead<&'a [u8]>; - #[derive(Clone, Debug, Eq, PartialEq)] pub struct MaterializationState> { pub head: Option>, @@ -84,7 +82,6 @@ pub trait FrontierRewindStorage: Storage { fn scan_frontier_range( 
&self, start: &MaterializationFrontierRef<'_>, - end: Option<&MaterializationKey<&[u8]>>, visit: &mut dyn FnMut(Operation) -> Result<()>, ) -> Result<()> { let mut ops = self.load_since(0)?; @@ -94,11 +91,6 @@ pub trait FrontierRewindStorage: Storage { if cmp_frontiers(&frontier, start) == Ordering::Less { continue; } - if let Some(end) = end { - if cmp_frontiers(&frontier, end) == Ordering::Greater { - continue; - } - } visit(op)?; } Ok(()) @@ -750,23 +742,21 @@ where return Ok(None); } + let mut full_suffix_ops = Vec::new(); let mut existing_suffix_ops = Vec::new(); - storage.scan_frontier_range(frontier, Some(&head.at), &mut |op| { - if !inserted_op_ids.contains(&op.meta.id) { - existing_suffix_ops.push(op); + let mut requires_full_replay = false; + storage.scan_frontier_range(frontier, &mut |op| { + let op_frontier = frontier_from_op(&op); + if cmp_frontiers(&op_frontier, &head.at) != Ordering::Greater + && !inserted_op_ids.contains(&op.meta.id) + { + existing_suffix_ops.push(op.clone()); } - Ok(()) - })?; - if existing_suffix_ops.is_empty() { - return Ok(None); - } - - let mut full_suffix_ops = Vec::new(); - storage.scan_frontier_range(frontier, None, &mut |op| { + requires_full_replay |= op_requires_full_replay(&op); full_suffix_ops.push(op); Ok(()) })?; - if full_suffix_ops.is_empty() || full_suffix_ops.iter().any(op_requires_full_replay) { + if full_suffix_ops.is_empty() || existing_suffix_ops.is_empty() || requires_full_replay { return Ok(None); } diff --git a/packages/treecrdt-postgres-rs/src/profile.rs b/packages/treecrdt-postgres-rs/src/profile.rs index b4ba7006..bc9a4de0 100644 --- a/packages/treecrdt-postgres-rs/src/profile.rs +++ b/packages/treecrdt-postgres-rs/src/profile.rs @@ -22,7 +22,7 @@ pub(crate) struct PgAppendProfile { pub(crate) dedupe_filter_ms: f64, pub(crate) materialize_ms: f64, pub(crate) update_head_ms: f64, - pub(crate) catch_up_needed: bool, + pub(crate) catch_up_performed: bool, pub(crate) node_load_count: u64, pub(crate) 
node_load_ms: f64, pub(crate) node_ensure_count: u64, @@ -74,7 +74,7 @@ impl PgAppendProfile { "dedupeFilterMs": self.dedupe_filter_ms, "materializeMs": self.materialize_ms, "updateHeadMs": self.update_head_ms, - "catchUpNeeded": self.catch_up_needed, + "catchUpPerformed": self.catch_up_performed, "nodeLoadCount": self.node_load_count, "nodeLoadMs": self.node_load_ms, "nodeEnsureCount": self.node_ensure_count, diff --git a/packages/treecrdt-postgres-rs/src/store.rs b/packages/treecrdt-postgres-rs/src/store.rs index 0b46528f..dd8b8c5e 100644 --- a/packages/treecrdt-postgres-rs/src/store.rs +++ b/packages/treecrdt-postgres-rs/src/store.rs @@ -1220,43 +1220,19 @@ impl FrontierRewindStorage for PgOpStorage { fn scan_frontier_range( &self, start: &treecrdt_core::MaterializationFrontierRef<'_>, - end: Option<&treecrdt_core::MaterializationKey<&[u8]>>, visit: &mut dyn FnMut(Operation) -> Result<()>, ) -> Result<()> { let mut c = self.ctx.client.borrow_mut(); - let rows = if let Some(end) = end { - let stmt = self.ctx.stmt( - &mut c, - "SELECT lamport, replica, counter, kind, parent, node, new_parent, order_key, payload, known_state \ - FROM treecrdt_ops \ - WHERE doc_id = $1 \ - AND (lamport > $2 OR (lamport = $2 AND (replica > $3 OR (replica = $3 AND counter >= $4)))) \ - AND (lamport < $5 OR (lamport = $5 AND (replica < $6 OR (replica = $6 AND counter <= $7)))) \ - ORDER BY lamport, replica, counter", - )?; - c.query( - &stmt, - &[ - &self.ctx.doc_id, - &(start.lamport as i64), - &start.replica, - &(start.counter as i64), - &(end.lamport as i64), - &end.replica, - &(end.counter as i64), - ], - ) - .map_err(storage_debug)? 
- } else { - let stmt = self.ctx.stmt( - &mut c, - "SELECT lamport, replica, counter, kind, parent, node, new_parent, order_key, payload, known_state \ - FROM treecrdt_ops \ - WHERE doc_id = $1 \ - AND (lamport > $2 OR (lamport = $2 AND (replica > $3 OR (replica = $3 AND counter >= $4)))) \ - ORDER BY lamport, replica, counter", - )?; - c.query( + let stmt = self.ctx.stmt( + &mut c, + "SELECT lamport, replica, counter, kind, parent, node, new_parent, order_key, payload, known_state \ + FROM treecrdt_ops \ + WHERE doc_id = $1 \ + AND (lamport > $2 OR (lamport = $2 AND (replica > $3 OR (replica = $3 AND counter >= $4)))) \ + ORDER BY lamport, replica, counter", + )?; + let rows = c + .query( &stmt, &[ &self.ctx.doc_id, @@ -1265,8 +1241,7 @@ impl FrontierRewindStorage for PgOpStorage { &(start.counter as i64), ], ) - .map_err(storage_debug)? - }; + .map_err(storage_debug)?; drop(c); for row in rows { @@ -1895,7 +1870,8 @@ fn append_ops_in_tx( |frontier| set_tree_meta_replay_frontier(client, doc_id, frontier), )? 
}; - let apply_result = if apply_result.catch_up_needed { + let catch_up_performed = apply_result.catch_up_needed; + let apply_result = if catch_up_performed { let refreshed_meta = load_tree_meta_for_update(client, doc_id)?; let catch_up = if meta.state().replay_from.is_none() { try_direct_rewind_catch_up_materialized_state( @@ -1961,8 +1937,8 @@ fn append_ops_in_tx( if let Some(profile) = &append_profile { profile.borrow_mut().update_head_ms += update_head_ms; - if apply_result.catch_up_needed { - profile.borrow_mut().catch_up_needed = true; + if catch_up_performed { + profile.borrow_mut().catch_up_performed = true; } profile.borrow().log(doc_id, apply_result.inserted_count as usize); } diff --git a/packages/treecrdt-sqlite-ext/src/extension/functions/op_storage.rs b/packages/treecrdt-sqlite-ext/src/extension/functions/op_storage.rs index e83126b5..4096c0b4 100644 --- a/packages/treecrdt-sqlite-ext/src/extension/functions/op_storage.rs +++ b/packages/treecrdt-sqlite-ext/src/extension/functions/op_storage.rs @@ -493,27 +493,15 @@ impl treecrdt_core::FrontierRewindStorage for SqliteOpStorage { fn scan_frontier_range( &self, start: &treecrdt_core::MaterializationFrontierRef<'_>, - end: Option<&treecrdt_core::MaterializationKey<&[u8]>>, visit: &mut dyn FnMut(treecrdt_core::Operation) -> treecrdt_core::Result<()>, ) -> treecrdt_core::Result<()> { - let sql = if end.is_some() { - CString::new( - "SELECT replica,counter,lamport,kind,parent,node,new_parent,order_key,known_state,payload \ - FROM ops \ - WHERE (lamport > ?1 OR (lamport = ?1 AND (replica > ?2 OR (replica = ?2 AND counter >= ?3)))) \ - AND (lamport < ?4 OR (lamport = ?4 AND (replica < ?5 OR (replica = ?5 AND counter <= ?6)))) \ - ORDER BY lamport, replica, counter", - ) - .expect("scan frontier range bounded sql") - } else { - CString::new( - "SELECT replica,counter,lamport,kind,parent,node,new_parent,order_key,known_state,payload \ - FROM ops \ - WHERE (lamport > ?1 OR (lamport = ?1 AND (replica > ?2 OR 
(replica = ?2 AND counter >= ?3)))) \ - ORDER BY lamport, replica, counter", - ) - .expect("scan frontier range sql") - }; + let sql = CString::new( + "SELECT replica,counter,lamport,kind,parent,node,new_parent,order_key,known_state,payload \ + FROM ops \ + WHERE (lamport > ?1 OR (lamport = ?1 AND (replica > ?2 OR (replica = ?2 AND counter >= ?3)))) \ + ORDER BY lamport, replica, counter", + ) + .expect("scan frontier range sql"); let mut stmt: *mut sqlite3_stmt = null_mut(); let rc = sqlite_prepare_v2(self.db, sql.as_ptr(), -1, &mut stmt, null_mut()); if rc != SQLITE_OK as c_int { @@ -532,19 +520,6 @@ impl treecrdt_core::FrontierRewindStorage for SqliteOpStorage { ) != SQLITE_OK as c_int; bind_err |= sqlite_bind_int64(stmt, 3, start.counter as i64) != SQLITE_OK as c_int; } - if let Some(end) = end { - unsafe { - bind_err |= sqlite_bind_int64(stmt, 4, end.lamport as i64) != SQLITE_OK as c_int; - bind_err |= sqlite_bind_blob( - stmt, - 5, - end.replica.as_ptr() as *const c_void, - end.replica.len() as c_int, - None, - ) != SQLITE_OK as c_int; - bind_err |= sqlite_bind_int64(stmt, 6, end.counter as i64) != SQLITE_OK as c_int; - } - } if bind_err { unsafe { sqlite_finalize(stmt) }; return Err(sqlite_rc_error( From 64ad9b3ddf0ff166dba73f783f60f556440c68d8 Mon Sep 17 00:00:00 2001 From: Marcus Pousette Date: Fri, 17 Apr 2026 10:55:19 +0200 Subject: [PATCH 07/13] refactor(playground): trim sync hook bloat --- examples/playground/src/App.tsx | 14 ++------ .../src/playground/hooks/usePlaygroundSync.ts | 34 +++++++++---------- 2 files changed, 19 insertions(+), 29 deletions(-) diff --git a/examples/playground/src/App.tsx b/examples/playground/src/App.tsx index 24a70488..46ef2ff3 100644 --- a/examples/playground/src/App.tsx +++ b/examples/playground/src/App.tsx @@ -162,7 +162,6 @@ export default function App() { persistSyncSettings(syncServerUrl, syncTransportMode); }, [syncServerUrl, syncTransportMode]); - const counterRef = useRef(0); const lamportRef = useRef(0); const 
initEpochRef = useRef(0); const disposedRef = useRef(false); @@ -483,18 +482,14 @@ export default function App() { const active = nextClient ?? clientRef.current ?? client; if (!active) return; try { - const [lamport, counter] = await Promise.all([ - active.meta.headLamport(), - replica ? active.meta.replicaMaxCounter(replica) : Promise.resolve(0), - ]); + const lamport = await active.meta.headLamport(); lamportRef.current = Math.max(lamportRef.current, lamport); setHeadLamport(lamportRef.current); - counterRef.current = Math.max(counterRef.current, counter); } catch (err) { console.error("Failed to refresh meta", err); } }, - [client, replica] + [client] ); const refreshParentsScheduledRef = useRef(false); @@ -831,7 +826,6 @@ export default function App() { setPayloadVersion((v) => v + 1); knownOpsRef.current = new Set(); setCollapse({ defaultCollapsed: true, overrides: new Set([ROOT_ID]) }); - counterRef.current = 0; lamportRef.current = 0; setHeadLamport(0); setTotalNodes(null); @@ -887,7 +881,6 @@ export default function App() { await verifyLocalOps([op]); lamportRef.current = Math.max(lamportRef.current, op.meta.lamport); - counterRef.current = Math.max(counterRef.current, op.meta.id.counter); setHeadLamport(lamportRef.current); notifyLocalUpdate([op]); @@ -916,7 +909,6 @@ export default function App() { scheduleRefreshParents(parentsAffectedByOps(stateBefore, [op])); scheduleRefreshNodeCount(); lamportRef.current = Math.max(lamportRef.current, op.meta.lamport); - counterRef.current = Math.max(counterRef.current, op.meta.id.counter); setHeadLamport(lamportRef.current); } catch (err) { console.error("Failed to append move op", err); @@ -1014,7 +1006,6 @@ export default function App() { for (const op of ops) { lamportRef.current = Math.max(lamportRef.current, op.meta.lamport); - counterRef.current = Math.max(counterRef.current, op.meta.id.counter); } setHeadLamport(lamportRef.current); @@ -1067,7 +1058,6 @@ export default function App() { await 
ensureChildrenLoaded(parentId, { force: true }); } lamportRef.current = Math.max(lamportRef.current, op.meta.lamport); - counterRef.current = Math.max(counterRef.current, op.meta.id.counter); setHeadLamport(lamportRef.current); setCollapse((prev) => { const overrides = new Set(prev.overrides); diff --git a/examples/playground/src/playground/hooks/usePlaygroundSync.ts b/examples/playground/src/playground/hooks/usePlaygroundSync.ts index ca68d30d..eaa6a966 100644 --- a/examples/playground/src/playground/hooks/usePlaygroundSync.ts +++ b/examples/playground/src/playground/hooks/usePlaygroundSync.ts @@ -1,4 +1,4 @@ -import { useEffect, useRef, useState } from 'react'; +import { useCallback, useEffect, useRef, useState } from 'react'; import type { Operation } from '@treecrdt/interface'; import { bytesToHex } from '@treecrdt/interface/ids'; import { type TreecrdtIdentityChainV1 } from '@treecrdt/auth'; @@ -285,6 +285,17 @@ export function usePlaygroundSync(opts: UsePlaygroundSyncOptions): PlaygroundSyn setPeers(merged); }; + const refreshLoadedSyncState = useCallback( + async (extraParentIds: Iterable = []) => { + await refreshMeta(); + const parentIds = new Set(Object.keys(treeStateRef.current.childrenByParent)); + for (const parentId of extraParentIds) parentIds.add(parentId); + await refreshParents(Array.from(parentIds)); + await refreshNodeCount(); + }, + [refreshMeta, refreshNodeCount, refreshParents, treeStateRef] + ); + const isRemotePeerId = (peerId: string) => peerId.startsWith('remote:'); const syncOnceOptionsForPeer = (peerId: string, localCodewordsPerMessage: number) => ({ maxCodewords: PLAYGROUND_SYNC_MAX_CODEWORDS, @@ -629,9 +640,7 @@ export function usePlaygroundSync(opts: UsePlaygroundSyncOptions): PlaygroundSyn if (lastErr) throw lastErr; throw new Error('No peers responded to sync.'); } - await refreshMeta(); - await refreshParents(Object.keys(treeStateRef.current.childrenByParent)); - await refreshNodeCount(); + await refreshLoadedSyncState(); } catch 
(err) { console.error('Sync failed', err); setSyncError(formatSyncError(err)); @@ -702,9 +711,7 @@ export function usePlaygroundSync(opts: UsePlaygroundSyncOptions): PlaygroundSyn if (lastErr) throw lastErr; throw new Error('No peers responded to sync.'); } - await refreshMeta(); - await refreshParents(Object.keys(treeStateRef.current.childrenByParent)); - await refreshNodeCount(); + await refreshLoadedSyncState(); } catch (err) { console.error('Scoped sync failed', err); setSyncError(formatSyncError(err)); @@ -787,11 +794,7 @@ export function usePlaygroundSync(opts: UsePlaygroundSyncOptions): PlaygroundSyn ); } - await refreshMeta(); - const parentIds = new Set(Object.keys(treeStateRef.current.childrenByParent)); - parentIds.add(viewRootId); - await refreshParents(Array.from(parentIds)); - await refreshNodeCount(); + await refreshLoadedSyncState([viewRootId]); autoSyncDoneRef.current = true; if (typeof window !== 'undefined') { @@ -819,9 +822,7 @@ export function usePlaygroundSync(opts: UsePlaygroundSyncOptions): PlaygroundSyn authMaterial.localTokensB64.length, autoSyncJoinTick, joinMode, - refreshMeta, - refreshNodeCount, - refreshParents, + refreshLoadedSyncState, syncBusy, viewRootId, ]); @@ -981,8 +982,7 @@ export function usePlaygroundSync(opts: UsePlaygroundSyncOptions): PlaygroundSyn if (debugSync && ops.length > 0) { console.debug(`[sync:${selfPeerId}] applyOps(${ops.length})`); } - const affected = - ops.length > 0 ? ((await client.ops.appendMany(ops)) as unknown as string[]) : []; + const affected = ops.length > 0 ? 
await client.ops.appendMany(ops) : []; await onRemoteOpsApplied(ops, affected); }, }; From 06475b44144d0b3f9f69b2fb6f9c578ef3c27517 Mon Sep 17 00:00:00 2001 From: Marcus Pousette Date: Fri, 17 Apr 2026 14:59:55 +0200 Subject: [PATCH 08/13] style: fix clippy warnings --- packages/treecrdt-core/src/materialization.rs | 14 ++++---------- packages/treecrdt-core/src/version_vector.rs | 5 +---- 2 files changed, 5 insertions(+), 14 deletions(-) diff --git a/packages/treecrdt-core/src/materialization.rs b/packages/treecrdt-core/src/materialization.rs index daf8304c..48af3563 100644 --- a/packages/treecrdt-core/src/materialization.rs +++ b/packages/treecrdt-core/src/materialization.rs @@ -103,9 +103,7 @@ pub trait FrontierRewindStorage: Storage { ) -> Result> { let mut ops = self.load_since(0)?; ops.sort_by(cmp_ops); - Ok(ops - .into_iter() - .filter(|op| { + Ok(ops.into_iter().rfind(|op| { let frontier = frontier_from_op(op); cmp_frontiers(&frontier, before) == Ordering::Less && matches!( @@ -114,8 +112,7 @@ pub trait FrontierRewindStorage: Storage { | crate::ops::OperationKind::Move { node: n, .. } if n == node ) - }) - .next_back()) + })) } fn latest_payload_before( @@ -125,9 +122,7 @@ pub trait FrontierRewindStorage: Storage { ) -> Result> { let mut ops = self.load_since(0)?; ops.sort_by(cmp_ops); - Ok(ops - .into_iter() - .filter(|op| { + Ok(ops.into_iter().rfind(|op| { let frontier = frontier_from_op(op); cmp_frontiers(&frontier, before) == Ordering::Less && match &op.kind { @@ -137,8 +132,7 @@ pub trait FrontierRewindStorage: Storage { crate::ops::OperationKind::Payload { node: n, .. 
} => *n == node, _ => false, } - }) - .next_back()) + })) } } diff --git a/packages/treecrdt-core/src/version_vector.rs b/packages/treecrdt-core/src/version_vector.rs index c470f3b0..7b3cba6f 100644 --- a/packages/treecrdt-core/src/version_vector.rs +++ b/packages/treecrdt-core/src/version_vector.rs @@ -91,10 +91,7 @@ impl ReplicaVersion { } fn absorb_frontier_ranges(&mut self) { - loop { - let Some(&(start, end)) = self.ranges.first() else { - break; - }; + while let Some(&(start, end)) = self.ranges.first() { if start == self.frontier + 1 { self.frontier = end; self.ranges.remove(0); From 0f59afbc5269e5745310338a8f359a41471e0acb Mon Sep 17 00:00:00 2001 From: Marcus Pousette Date: Fri, 17 Apr 2026 15:12:41 +0200 Subject: [PATCH 09/13] style: format materialization helpers --- packages/treecrdt-core/src/materialization.rs | 38 +++++++++---------- 1 file changed, 19 insertions(+), 19 deletions(-) diff --git a/packages/treecrdt-core/src/materialization.rs b/packages/treecrdt-core/src/materialization.rs index 48af3563..66d98346 100644 --- a/packages/treecrdt-core/src/materialization.rs +++ b/packages/treecrdt-core/src/materialization.rs @@ -104,15 +104,15 @@ pub trait FrontierRewindStorage: Storage { let mut ops = self.load_since(0)?; ops.sort_by(cmp_ops); Ok(ops.into_iter().rfind(|op| { - let frontier = frontier_from_op(op); - cmp_frontiers(&frontier, before) == Ordering::Less - && matches!( - op.kind, - crate::ops::OperationKind::Insert { node: n, .. } - | crate::ops::OperationKind::Move { node: n, .. } - if n == node - ) - })) + let frontier = frontier_from_op(op); + cmp_frontiers(&frontier, before) == Ordering::Less + && matches!( + op.kind, + crate::ops::OperationKind::Insert { node: n, .. } + | crate::ops::OperationKind::Move { node: n, .. 
} + if n == node + ) + })) } fn latest_payload_before( @@ -123,16 +123,16 @@ pub trait FrontierRewindStorage: Storage { let mut ops = self.load_since(0)?; ops.sort_by(cmp_ops); Ok(ops.into_iter().rfind(|op| { - let frontier = frontier_from_op(op); - cmp_frontiers(&frontier, before) == Ordering::Less - && match &op.kind { - crate::ops::OperationKind::Insert { - node: n, payload, .. - } => *n == node && payload.is_some(), - crate::ops::OperationKind::Payload { node: n, .. } => *n == node, - _ => false, - } - })) + let frontier = frontier_from_op(op); + cmp_frontiers(&frontier, before) == Ordering::Less + && match &op.kind { + crate::ops::OperationKind::Insert { + node: n, payload, .. + } => *n == node && payload.is_some(), + crate::ops::OperationKind::Payload { node: n, .. } => *n == node, + _ => false, + } + })) } } From 792b3d8eb88eea7d9813b18bf540552e1697f837 Mon Sep 17 00:00:00 2001 From: Marcus Pousette Date: Fri, 17 Apr 2026 16:53:11 +0200 Subject: [PATCH 10/13] docs: clarify direct rewind semantics --- packages/treecrdt-core/src/materialization.rs | 19 +++++++++++++++++++ packages/treecrdt-core/src/traits.rs | 15 +++++++++++++++ packages/treecrdt-postgres-rs/src/store.rs | 3 +++ .../src/extension/functions/materialize.rs | 3 +++ 4 files changed, 40 insertions(+) diff --git a/packages/treecrdt-core/src/materialization.rs b/packages/treecrdt-core/src/materialization.rs index 66d98346..96df8bab 100644 --- a/packages/treecrdt-core/src/materialization.rs +++ b/packages/treecrdt-core/src/materialization.rs @@ -302,6 +302,10 @@ fn rewind_structure_op_in_place( op: &Operation, ) -> Result<()> { let node = op.kind.node(); + // Direct rewind is deliberately local: ask storage for the previous structural winner for this + // node, clear the currently materialized attachment, then restore that predecessor if one + // exists. Anything more complicated (delete/tombstone/revival) stays on the conservative + // replay-from-frontier path. 
let previous = storage.latest_structural_before(node, &frontier_from_op(op).as_borrowed())?; nodes.ensure_node(node)?; nodes.detach(node)?; @@ -327,6 +331,8 @@ fn rewind_payload_op_in_place( op: &Operation, ) -> Result<()> { let node = op.kind.node(); + // Payload rewind needs the previous winning payload-bearing op, not just the current bytes. + // If no predecessor exists, direct rewind must clear the payload row entirely. let previous = storage.latest_payload_before(node, &frontier_from_op(op).as_borrowed())?; if let Some(previous) = previous { @@ -740,6 +746,10 @@ where let mut existing_suffix_ops = Vec::new(); let mut requires_full_replay = false; storage.scan_frontier_range(frontier, &mut |op| { + // One pass does double duty: + // - `full_suffix_ops` is the corrected suffix we will replay forward. + // - `existing_suffix_ops` is the subset that was already materialized before this append, + // which is exactly what must be unwound first. let op_frontier = frontier_from_op(&op); if cmp_frontiers(&op_frontier, &head.at) != Ordering::Greater && !inserted_op_ids.contains(&op.meta.id) @@ -754,6 +764,8 @@ where return Ok(None); } + // `head.seq` reflects the fully materialized suffix. Removing the already-materialized suffix + // yields the trusted prefix length and the first seq that must be rewritten in the index. let prefix_seq = head.seq.saturating_sub(existing_suffix_ops.len().min(u64::MAX as usize) as u64); let truncate_from = prefix_seq.saturating_add(1); @@ -769,6 +781,8 @@ where index.truncate_from(truncate_from)?; rewind_existing_suffix_in_place(&mut nodes, &mut payloads, storage, &existing_suffix_ops)?; + // After rewinding, the backend stores now represent the prefix immediately before the + // invalidated suffix. Replaying `full_suffix_ops` forward rebuilds only the corrected suffix. 
let mut crdt = TreeCrdt::with_stores(replica_id, NoopStorage, clock, nodes, payloads)?; let mut affected = HashSet::new(); @@ -814,6 +828,8 @@ where I: TruncatingParentOpIndex, { let truncate_from = prefix_seq.saturating_add(1); + // The in-memory fallback rebuild computes a fresh suffix index. Drop the stale persisted + // suffix first, then repopulate only the rebuilt suffix records below. index.truncate_from(truncate_from)?; for node in affected_nodes { @@ -827,6 +843,9 @@ where nodes.set_tombstone(*node, prefix.crdt.node_store_mut().tombstone(*node)?)?; + // These are exact setters on purpose: the backend may already contain newer-looking merged + // values from the stale suffix, so fallback catch-up must overwrite them with the rebuilt + // post-replay state rather than merge again. let last_change = prefix.crdt.node_store_mut().last_change(*node)?; nodes.set_last_change_exact(*node, &last_change)?; diff --git a/packages/treecrdt-core/src/traits.rs b/packages/treecrdt-core/src/traits.rs index f1fcb710..f41752ed 100644 --- a/packages/treecrdt-core/src/traits.rs +++ b/packages/treecrdt-core/src/traits.rs @@ -160,6 +160,11 @@ pub trait PayloadStore { } pub trait ExactPayloadStore: PayloadStore { + /// Remove any current payload winner for `node`. + /// + /// `PayloadStore::set_payload(...)` is a forward-materialization API: callers provide the new + /// winner tuple explicitly. Direct rewind/catch-up also needs an exact "no winner exists" + /// operation when rolling a payload-bearing suffix back to the pre-suffix state. fn clear_payload(&mut self, node: NodeId) -> Result<()>; } @@ -173,6 +178,10 @@ pub trait ParentOpIndex { } pub trait TruncatingParentOpIndex: ParentOpIndex { + /// Delete derived index rows for the invalidated materialized suffix starting at `seq`. + /// + /// This does not truncate the op log. It only removes stale `children(parent)` index entries + /// so the corrected suffix can be re-recorded in canonical order. 
fn truncate_from(&mut self, seq: u64) -> Result<()>; } @@ -196,7 +205,13 @@ impl TruncatingParentOpIndex for NoopParentOpIndex { } pub trait ExactNodeStore: NodeStore { + /// Overwrite `last_change` with the exact value from a rebuilt/rewound materialized state. + /// + /// `NodeStore::merge_last_change(...)` is sufficient for forward incremental application, but + /// rewind/catch-up needs to restore the precise post-replay value rather than merge with the + /// stale value currently persisted in the backend. fn set_last_change_exact(&mut self, node: NodeId, vv: &VersionVector) -> Result<()>; + /// Overwrite `deleted_at` with the exact rebuilt value, including clearing it to `None`. fn set_deleted_at_exact(&mut self, node: NodeId, vv: Option<&VersionVector>) -> Result<()>; } diff --git a/packages/treecrdt-postgres-rs/src/store.rs b/packages/treecrdt-postgres-rs/src/store.rs index dd8b8c5e..563aaefa 100644 --- a/packages/treecrdt-postgres-rs/src/store.rs +++ b/packages/treecrdt-postgres-rs/src/store.rs @@ -1874,6 +1874,9 @@ fn append_ops_in_tx( let apply_result = if catch_up_performed { let refreshed_meta = load_tree_meta_for_update(client, doc_id)?; let catch_up = if meta.state().replay_from.is_none() { + // No replay frontier existed before this append, so if one was just scheduled it came + // from the current batch. That is the narrow case where the direct rewind fast path is + // safe to try before falling back to conservative replay-from-frontier catch-up. 
try_direct_rewind_catch_up_materialized_state( &PgOpStorage::new(ctx.clone()), &inserted_op_ids, diff --git a/packages/treecrdt-sqlite-ext/src/extension/functions/materialize.rs b/packages/treecrdt-sqlite-ext/src/extension/functions/materialize.rs index c0c7d32f..84c1ac95 100644 --- a/packages/treecrdt-sqlite-ext/src/extension/functions/materialize.rs +++ b/packages/treecrdt-sqlite-ext/src/extension/functions/materialize.rs @@ -322,6 +322,9 @@ pub(super) fn append_ops_impl( let apply_result = if apply_result.catch_up_needed { let refreshed_meta = load_tree_meta(db)?; let catch_up = if meta.state().replay_from.is_none() { + // Mirror the Postgres path: only try direct rewind when this append introduced the + // frontier. If the document was already behind before the append, stay on the + // conservative replay-from-frontier catch-up path. try_direct_rewind_catch_up_materialized_state( &super::op_storage::SqliteOpStorage::with_doc_id(db, doc_id.to_vec()), &inserted_op_ids, From 4fff2893cacada637f3e1b73bdabdd93a57e1a0a Mon Sep 17 00:00:00 2001 From: Marcus Pousette Date: Mon, 20 Apr 2026 15:16:22 +0200 Subject: [PATCH 11/13] refactor(core): share materialized apply tail --- packages/treecrdt-core/src/tree.rs | 140 +++++++++++++---------------- 1 file changed, 63 insertions(+), 77 deletions(-) diff --git a/packages/treecrdt-core/src/tree.rs b/packages/treecrdt-core/src/tree.rs index 680c20b2..b2912e53 100644 --- a/packages/treecrdt-core/src/tree.rs +++ b/packages/treecrdt-core/src/tree.rs @@ -504,46 +504,21 @@ where index: &mut I, seq: u64, ) -> Result> { - let op_node = op.kind.node(); - let parent_after = match &op.kind { - OperationKind::Insert { parent, .. } => Some(*parent), - OperationKind::Move { new_parent, .. } => Some(*new_parent), - _ => None, - }; - let op_id = op.meta.id.clone(); - let op_kind = op.kind.clone(); - - let Some(mut delta) = self.apply_remote_with_delta(op)? 
else { + let snapshot = self.apply_remote_with_delta(op.clone())?.map(|delta| NodeSnapshot { + parent: delta.snapshot.parent, + order_key: delta.snapshot.order_key, + }); + let Some(snapshot) = snapshot else { return Ok(None); }; - - let parents = affected_parents(delta.snapshot.parent, &op_kind); - for parent in &parents { - if *parent == NodeId::TRASH { - continue; - } - index.record(*parent, &op_id, seq)?; - } - - // Ensure the latest payload op for `op_node` is discoverable under its current parent. - // This supports partial sync subscribers that only track `children(parent)` opRefs. - if let Some(parent_after) = parent_after { - if parent_after != NodeId::TRASH && delta.snapshot.parent != Some(parent_after) { - if let Some((_lamport, payload_id)) = self.payload_last_writer(op_node)? { - index.record(parent_after, &payload_id, seq)?; - } - } - } - - let mut starts = parents; - starts.push(op_node); - let tombstone_changed = self.refresh_tombstones_upward_with_delta(starts)?; - delta - .affected_nodes - .extend(tombstone_changed.into_iter().filter(|node| *node != NodeId::TRASH)); - delta.affected_nodes = sorted_node_ids(delta.affected_nodes); - - Ok(Some(delta)) + let affected_nodes = direct_affected_nodes(snapshot.parent, &op.kind); + Ok(Some(self.finalize_materialized_apply( + snapshot, + &op, + index, + seq, + affected_nodes, + )?)) } /// Apply a remote op and advance materialization sequence only when it is accepted. @@ -580,49 +555,12 @@ where self.counter = self.counter.max(op.meta.id.counter); } - let op_node = op.kind.node(); - let parent_after = match &op.kind { - OperationKind::Insert { parent, .. } => Some(*parent), - OperationKind::Move { new_parent, .. 
} => Some(*new_parent), - _ => None, - }; - let op_id = op.meta.id.clone(); - let op_kind = op.kind.clone(); - let snapshot = Self::apply_forward(&mut self.nodes, &mut self.payloads, &op)?; self.op_count = seq; self.head = Some(op.clone()); - let parents = affected_parents(snapshot.parent, &op_kind); - for parent in &parents { - if *parent == NodeId::TRASH { - continue; - } - index.record(*parent, &op_id, seq)?; - } - - if let Some(parent_after) = parent_after { - if parent_after != NodeId::TRASH && snapshot.parent != Some(parent_after) { - if let Some((_lamport, payload_id)) = self.payload_last_writer(op_node)? { - index.record(parent_after, &payload_id, seq)?; - } - } - } - - let mut affected_nodes = direct_affected_nodes(snapshot.parent, &op_kind); - let mut starts = parents; - starts.push(op_node); - let tombstone_changed = self.refresh_tombstones_upward_with_delta(starts)?; - affected_nodes.extend(tombstone_changed.into_iter().filter(|node| *node != NodeId::TRASH)); - affected_nodes = sorted_node_ids(affected_nodes); - - Ok(ApplyDelta { - snapshot: NodeSnapshotExport { - parent: snapshot.parent, - order_key: snapshot.order_key, - }, - affected_nodes, - }) + let affected_nodes = direct_affected_nodes(snapshot.parent, &op.kind); + self.finalize_materialized_apply(snapshot, &op, index, seq, affected_nodes) } /// Finalize adapter-owned local ops by refreshing tombstones and recording parent-op index rows. @@ -659,6 +597,54 @@ where Ok(()) } + fn finalize_materialized_apply( + &mut self, + snapshot: NodeSnapshot, + op: &Operation, + index: &mut I, + seq: u64, + mut affected_nodes: Vec, + ) -> Result { + let op_node = op.kind.node(); + let parent_after = match &op.kind { + OperationKind::Insert { parent, .. } => Some(*parent), + OperationKind::Move { new_parent, .. 
} => Some(*new_parent), + _ => None, + }; + let parents = affected_parents(snapshot.parent, &op.kind); + + for parent in &parents { + if *parent == NodeId::TRASH { + continue; + } + index.record(*parent, &op.meta.id, seq)?; + } + + // Ensure the latest payload op for `op_node` is discoverable under its current parent. + // This supports partial sync subscribers that only track `children(parent)` opRefs. + if let Some(parent_after) = parent_after { + if parent_after != NodeId::TRASH && snapshot.parent != Some(parent_after) { + if let Some((_lamport, payload_id)) = self.payload_last_writer(op_node)? { + index.record(parent_after, &payload_id, seq)?; + } + } + } + + let mut starts = parents; + starts.push(op_node); + let tombstone_changed = self.refresh_tombstones_upward_with_delta(starts)?; + affected_nodes.extend(tombstone_changed.into_iter().filter(|node| *node != NodeId::TRASH)); + affected_nodes = sorted_node_ids(affected_nodes); + + Ok(ApplyDelta { + snapshot: NodeSnapshotExport { + parent: snapshot.parent, + order_key: snapshot.order_key, + }, + affected_nodes, + }) + } + pub fn finalize_local_with_plan( &mut self, op: &Operation, From d434fb154b2b8a0491dacd0bff82f18b72ec1bb2 Mon Sep 17 00:00:00 2001 From: Marcus Pousette Date: Mon, 20 Apr 2026 15:31:47 +0200 Subject: [PATCH 12/13] refactor(postgres): split store orchestration modules --- packages/treecrdt-postgres-rs/src/store.rs | 526 +----------------- .../treecrdt-postgres-rs/src/store/append.rs | 338 +++++++++++ .../treecrdt-postgres-rs/src/store/meta.rs | 200 +++++++ 3 files changed, 551 insertions(+), 513 deletions(-) create mode 100644 packages/treecrdt-postgres-rs/src/store/append.rs create mode 100644 packages/treecrdt-postgres-rs/src/store/meta.rs diff --git a/packages/treecrdt-postgres-rs/src/store.rs b/packages/treecrdt-postgres-rs/src/store.rs index 563aaefa..c269bf9d 100644 --- a/packages/treecrdt-postgres-rs/src/store.rs +++ b/packages/treecrdt-postgres-rs/src/store.rs @@ -1,22 +1,27 @@ +mod 
append; +mod meta; + use std::cell::RefCell; use std::collections::{HashMap, HashSet}; use std::rc::Rc; use std::time::Instant; -use postgres::{Client, Row, Statement}; +use postgres::{Client, Row}; use treecrdt_core::{ - apply_persisted_remote_ops_with_delta, catch_up_materialized_state, - materialize_persisted_remote_ops_with_delta, try_direct_rewind_catch_up_materialized_state, - try_shortcut_out_of_order_payload_noops, Error, ExactNodeStore, ExactPayloadStore, - FrontierRewindStorage, Lamport, LamportClock, MaterializationCursor, MaterializationFrontier, - MaterializationHead, MaterializationKey, MaterializationState, NodeId, NodeStore, Operation, - OperationId, OperationKind, PayloadStore, PersistedRemoteStores, ReplicaId, Result, Storage, + Error, ExactNodeStore, ExactPayloadStore, FrontierRewindStorage, Lamport, NodeId, NodeStore, + Operation, OperationId, OperationKind, PayloadStore, ReplicaId, Result, Storage, TruncatingParentOpIndex, VersionVector, }; use crate::opref::{derive_op_ref_v0, OPREF_V0_WIDTH}; -use crate::profile::{append_profile_enabled, PgAppendProfile}; + +pub(crate) use self::append::ensure_materialized_in_tx; +pub use self::append::{append_ops, append_ops_with_affected_nodes, ensure_materialized}; +pub(crate) use self::meta::{ + ensure_doc_meta, load_tree_meta_for_update, set_tree_meta_replay_frontier, + update_tree_meta_head, PgCtx, TreeMeta, +}; pub(crate) fn storage_debug(e: E) -> Error { Error::Storage(format!("{e:?}")) @@ -52,192 +57,6 @@ pub(crate) fn vv_from_bytes(bytes: &[u8]) -> Result { serde_json::from_slice(bytes).map_err(|e| Error::Storage(e.to_string())) } -#[derive(Clone, Debug)] -pub(crate) struct TreeMeta(MaterializationState); - -impl MaterializationCursor for TreeMeta { - fn state(&self) -> MaterializationState<&[u8]> { - self.0.as_borrowed() - } -} - -pub(crate) fn ensure_doc_meta(client: &Rc>, doc_id: &str) -> Result<()> { - let mut c = client.borrow_mut(); - c.execute( - "INSERT INTO treecrdt_meta(doc_id) VALUES ($1) 
ON CONFLICT (doc_id) DO NOTHING", - &[&doc_id], - ) - .map_err(storage_debug)?; - Ok(()) -} - -fn load_tree_meta_row( - client: &Rc>, - doc_id: &str, - for_update: bool, -) -> Result { - let ctx = PgCtx::new(client.clone(), doc_id)?; - let mut c = client.borrow_mut(); - let stmt = if for_update { - ctx.stmt( - &mut c, - "SELECT head_lamport, head_replica, head_counter, head_seq, \ - replay_lamport, replay_replica, replay_counter \ - FROM treecrdt_meta WHERE doc_id = $1 FOR UPDATE", - )? - } else { - ctx.stmt( - &mut c, - "SELECT head_lamport, head_replica, head_counter, head_seq, \ - replay_lamport, replay_replica, replay_counter \ - FROM treecrdt_meta WHERE doc_id = $1 LIMIT 1", - )? - }; - let rows = c.query(&stmt, &[&doc_id]).map_err(storage_debug)?; - - let row = rows.first().ok_or_else(|| Error::Storage("missing treecrdt_meta row".into()))?; - - let head_lamport = row.get::<_, i64>(0).max(0) as Lamport; - let head_replica = row.get::<_, Vec>(1); - let head_counter = row.get::<_, i64>(2).max(0) as u64; - let head_seq = row.get::<_, i64>(3).max(0) as u64; - let replay_lamport = row.get::<_, Option>(4).map(|v| v.max(0) as Lamport); - let replay_replica = row.get::<_, Option>>(5); - let replay_counter = row.get::<_, Option>(6).map(|v| v.max(0) as u64); - - let head = if head_seq == 0 && head_lamport == 0 && head_replica.is_empty() && head_counter == 0 - { - None - } else { - Some(MaterializationHead { - at: MaterializationKey { - lamport: head_lamport, - replica: head_replica, - counter: head_counter, - }, - seq: head_seq, - }) - }; - let replay_from = match (replay_lamport, replay_replica, replay_counter) { - (Some(lamport), Some(replica), Some(counter)) => Some(MaterializationKey { - lamport, - replica, - counter, - }), - _ => None, - }; - - Ok(TreeMeta(MaterializationState { head, replay_from })) -} - -fn load_tree_meta(client: &Rc>, doc_id: &str) -> Result { - load_tree_meta_row(client, doc_id, false) -} - -pub(crate) fn load_tree_meta_for_update( - client: 
&Rc>, - doc_id: &str, -) -> Result { - load_tree_meta_row(client, doc_id, true) -} - -pub(crate) fn set_tree_meta_replay_frontier( - client: &Rc>, - doc_id: &str, - frontier: &MaterializationFrontier, -) -> Result<()> { - ensure_doc_meta(client, doc_id)?; - let mut c = client.borrow_mut(); - c.execute( - "UPDATE treecrdt_meta \ - SET replay_lamport = $2, replay_replica = $3, replay_counter = $4 \ - WHERE doc_id = $1", - &[ - &doc_id, - &(frontier.lamport as i64), - &frontier.replica, - &(frontier.counter as i64), - ], - ) - .map_err(|e| Error::Storage(e.to_string()))?; - Ok(()) -} - -pub(crate) fn update_tree_meta_head>( - client: &Rc>, - doc_id: &str, - head: Option<&MaterializationHead>, -) -> Result<()> { - ensure_doc_meta(client, doc_id)?; - let (lamport, replica, counter, seq): (Lamport, &[u8], u64, u64) = match head { - Some(head) => ( - head.at.lamport, - head.at.replica.as_ref(), - head.at.counter, - head.seq, - ), - None => (0, &[], 0, 0), - }; - let mut c = client.borrow_mut(); - c.execute( - "UPDATE treecrdt_meta \ - SET head_lamport = $2, \ - head_replica = $3, \ - head_counter = $4, \ - head_seq = $5, \ - replay_lamport = NULL, \ - replay_replica = NULL, \ - replay_counter = NULL \ - WHERE doc_id = $1", - &[ - &doc_id, - &(lamport as i64), - &replica, - &(counter as i64), - &(seq as i64), - ], - ) - .map_err(|e| Error::Storage(e.to_string()))?; - Ok(()) -} - -#[derive(Clone)] -pub(crate) struct PgCtx { - pub(crate) doc_id: String, - pub(crate) client: Rc>, - stmts: Rc>>, - append_profile: Option>>, -} - -impl PgCtx { - pub(crate) fn new(client: Rc>, doc_id: &str) -> Result { - Self::new_with_profile(client, doc_id, None) - } - - fn new_with_profile( - client: Rc>, - doc_id: &str, - append_profile: Option>>, - ) -> Result { - ensure_doc_meta(&client, doc_id)?; - Ok(Self { - doc_id: doc_id.to_string(), - client, - stmts: Rc::new(RefCell::new(HashMap::new())), - append_profile, - }) - } - - pub(crate) fn stmt(&self, c: &mut Client, sql: &'static str) -> 
Result { - if let Some(stmt) = self.stmts.borrow().get(sql) { - return Ok(stmt.clone()); - } - let stmt = c.prepare(sql).map_err(storage_debug)?; - self.stmts.borrow_mut().insert(sql, stmt.clone()); - Ok(stmt) - } -} - #[derive(Clone, Debug)] struct CachedNodeRow { parent: Option, @@ -1687,322 +1506,3 @@ fn select_inserted_ops( inserted_ops } - -fn materialize_inserted_ops( - ctx: PgCtx, - meta: &TreeMeta, - ops: Vec, -) -> Result { - // At this point treecrdt_ops already contains the inserted operations. This temporary - // TreeCrdt exists only to replay those ops through core semantics and update derived tables. - materialize_persisted_remote_ops_with_delta( - PersistedRemoteStores { - // Scratch identity for the temporary TreeCrdt; replayed ops keep their own ids. - replica_id: ReplicaId::new(b"postgres"), - clock: LamportClock::default(), - nodes: PgNodeStore::new(ctx.clone()), - payloads: PgPayloadStore::new(ctx.clone()), - index: PgParentOpIndex::new(ctx.clone()), - }, - meta, - ops, - |nodes, ops| { - if ops.iter().any(|op| matches!(op.kind, OperationKind::Payload { .. })) { - // Payload ops can depend on the current node row, so front-load the reads here. 
- nodes.preload_for_ops(ops)?; - } - Ok(()) - }, - |nodes| nodes.flush_last_change(), - |index| index.flush(), - ) -} - -fn merge_affected_nodes(mut left: Vec, right: Vec) -> Vec { - left.extend(right); - left.sort(); - left.dedup(); - left -} - -pub fn append_ops(client: &Rc>, doc_id: &str, ops: &[Operation]) -> Result { - { - let mut c = client.borrow_mut(); - c.batch_execute("BEGIN").map_err(|e| Error::Storage(e.to_string()))?; - } - - let res = append_ops_in_tx(client, doc_id, ops); - - match res { - Ok(v) => { - let mut c = client.borrow_mut(); - c.batch_execute("COMMIT").map_err(|e| Error::Storage(e.to_string()))?; - Ok(v.inserted_count) - } - Err(e) => { - let mut c = client.borrow_mut(); - let _ = c.batch_execute("ROLLBACK"); - Err(e) - } - } -} - -pub fn append_ops_with_affected_nodes( - client: &Rc>, - doc_id: &str, - ops: &[Operation], -) -> Result> { - { - let mut c = client.borrow_mut(); - c.batch_execute("BEGIN").map_err(|e| Error::Storage(e.to_string()))?; - } - - let res = append_ops_in_tx(client, doc_id, ops); - - match res { - Ok(v) => { - let mut c = client.borrow_mut(); - c.batch_execute("COMMIT").map_err(|e| Error::Storage(e.to_string()))?; - Ok(v.affected_nodes) - } - Err(e) => { - let mut c = client.borrow_mut(); - let _ = c.batch_execute("ROLLBACK"); - Err(e) - } - } -} - -#[derive(Default)] -struct AppendOpsResult { - inserted_count: u64, - affected_nodes: Vec, -} - -fn append_ops_in_tx( - client: &Rc>, - doc_id: &str, - ops: &[Operation], -) -> Result { - // Serialize per-doc writers across all server instances (incremental materialization updates - // derived tables + head_seq and is not safe to run concurrently for the same doc_id). - let meta = load_tree_meta_for_update(client, doc_id)?; - // This profiler is only for large-upload benchmark/debug runs. The normal - // append path keeps the hook disabled and pays only the `OnceLock` check. 
- let append_profile = append_profile_enabled().then(|| { - Rc::new(RefCell::new(PgAppendProfile::new( - ops.len(), - meta.state().replay_from.is_some(), - meta.state().head_seq(), - ))) - }); - let ctx = PgCtx::new_with_profile(client.clone(), doc_id, append_profile.clone())?; - - let bulk_insert_started_at = Instant::now(); - let inserted_op_refs = { - let mut c = client.borrow_mut(); - bulk_insert_ops_in_tx(&ctx, &mut c, ops)? - }; - if let Some(profile) = &append_profile { - let mut profile = profile.borrow_mut(); - profile.bulk_insert_ms += bulk_insert_started_at.elapsed().as_secs_f64() * 1000.0; - profile.bulk_inserted_ops += inserted_op_refs.len(); - } - - let dedupe_filter_started_at = Instant::now(); - // Only materialize the ops Postgres actually inserted. This keeps duplicate opRefs in the - // input batch from being replayed twice through core materialization. - let inserted_ops = select_inserted_ops(&ctx, ops, inserted_op_refs); - let inserted_op_ids: HashSet = - inserted_ops.iter().map(|op| op.meta.id.clone()).collect(); - if let Some(profile) = &append_profile { - profile.borrow_mut().dedupe_filter_ms += - dedupe_filter_started_at.elapsed().as_secs_f64() * 1000.0; - } - - let materialize_started_at = Instant::now(); - let mut update_head_ms = 0.0; - let mut update_head = |head: &MaterializationHead| { - let started_at = Instant::now(); - let result = update_tree_meta_head(&ctx.client, &ctx.doc_id, Some(head)); - update_head_ms += started_at.elapsed().as_secs_f64() * 1000.0; - result - }; - let apply_result = if let Some(shortcut) = { - let payloads = PgPayloadStore::new(ctx.clone()); - try_shortcut_out_of_order_payload_noops(&meta, inserted_ops.clone(), |node| { - payloads.last_writer(node) - })? 
- } { - if shortcut.remaining_ops.is_empty() { - update_head(&shortcut.resumed_head)?; - treecrdt_core::PersistedRemoteApplyResult { - inserted_count: inserted_ops.len().min(u64::MAX as usize) as u64, - affected_nodes: shortcut.affected_nodes, - catch_up_needed: false, - } - } else { - let shortcut_meta = TreeMeta(MaterializationState { - head: Some(shortcut.resumed_head.clone()), - replay_from: None, - }); - let result = - materialize_inserted_ops(ctx.clone(), &shortcut_meta, shortcut.remaining_ops)?; - let head = result.head.ok_or_else(|| { - Error::Storage("expected head after payload noop shortcut".into()) - })?; - update_head(&head)?; - treecrdt_core::PersistedRemoteApplyResult { - inserted_count: inserted_ops.len().min(u64::MAX as usize) as u64, - affected_nodes: merge_affected_nodes( - shortcut.affected_nodes, - result.affected_nodes, - ), - catch_up_needed: false, - } - } - } else { - apply_persisted_remote_ops_with_delta( - &meta, - inserted_ops, - |inserted| materialize_inserted_ops(ctx.clone(), &meta, inserted), - &mut update_head, - |frontier| set_tree_meta_replay_frontier(client, doc_id, frontier), - )? - }; - let catch_up_performed = apply_result.catch_up_needed; - let apply_result = if catch_up_performed { - let refreshed_meta = load_tree_meta_for_update(client, doc_id)?; - let catch_up = if meta.state().replay_from.is_none() { - // No replay frontier existed before this append, so if one was just scheduled it came - // from the current batch. That is the narrow case where the direct rewind fast path is - // safe to try before falling back to conservative replay-from-frontier catch-up. 
- try_direct_rewind_catch_up_materialized_state( - &PgOpStorage::new(ctx.clone()), - &inserted_op_ids, - PersistedRemoteStores { - replica_id: ReplicaId::new(b"postgres"), - clock: LamportClock::default(), - nodes: PgNodeStore::new(ctx.clone()), - payloads: PgPayloadStore::new(ctx.clone()), - index: PgParentOpIndex::new(ctx.clone()), - }, - &refreshed_meta, - |nodes| nodes.flush_last_change(), - |index| index.flush(), - )? - .unwrap_or(catch_up_materialized_state( - PgOpStorage::new(ctx.clone()), - PersistedRemoteStores { - replica_id: ReplicaId::new(b"postgres"), - clock: LamportClock::default(), - nodes: PgNodeStore::new(ctx.clone()), - payloads: PgPayloadStore::new(ctx.clone()), - index: PgParentOpIndex::new(ctx.clone()), - }, - &refreshed_meta, - |nodes| nodes.flush_last_change(), - |index| index.flush(), - )?) - } else { - catch_up_materialized_state( - PgOpStorage::new(ctx.clone()), - PersistedRemoteStores { - replica_id: ReplicaId::new(b"postgres"), - clock: LamportClock::default(), - nodes: PgNodeStore::new(ctx.clone()), - payloads: PgPayloadStore::new(ctx.clone()), - index: PgParentOpIndex::new(ctx.clone()), - }, - &refreshed_meta, - |nodes| nodes.flush_last_change(), - |index| index.flush(), - )? 
- }; - update_head( - catch_up - .head - .as_ref() - .ok_or_else(|| Error::Storage("expected head after immediate catch-up".into()))?, - )?; - treecrdt_core::PersistedRemoteApplyResult { - inserted_count: apply_result.inserted_count, - affected_nodes: catch_up.affected_nodes, - catch_up_needed: false, - } - } else { - apply_result - }; - if let Some(profile) = &append_profile { - profile.borrow_mut().materialize_ms += - materialize_started_at.elapsed().as_secs_f64() * 1000.0; - } - - if let Some(profile) = &append_profile { - profile.borrow_mut().update_head_ms += update_head_ms; - if catch_up_performed { - profile.borrow_mut().catch_up_performed = true; - } - profile.borrow().log(doc_id, apply_result.inserted_count as usize); - } - - Ok(AppendOpsResult { - inserted_count: apply_result.inserted_count, - affected_nodes: apply_result.affected_nodes, - }) -} - -pub fn ensure_materialized(client: &Rc>, doc_id: &str) -> Result<()> { - { - let mut c = client.borrow_mut(); - c.batch_execute("BEGIN").map_err(|e| Error::Storage(e.to_string()))?; - } - - let res = ensure_materialized_in_tx(client, doc_id); - - match res { - Ok(()) => { - let mut c = client.borrow_mut(); - c.batch_execute("COMMIT").map_err(|e| Error::Storage(e.to_string()))?; - Ok(()) - } - Err(e) => { - let mut c = client.borrow_mut(); - let _ = c.batch_execute("ROLLBACK"); - Err(e) - } - } -} - -pub(crate) fn ensure_materialized_in_tx(client: &Rc>, doc_id: &str) -> Result<()> { - let meta = load_tree_meta(client, doc_id)?; - if meta.state().replay_from.is_none() { - return Ok(()); - } - - // Take a per-doc lock so catch-up can't race with concurrent append/materialization. 
- let meta = load_tree_meta_for_update(client, doc_id)?; - if meta.state().replay_from.is_none() { - return Ok(()); - } - - let ctx = PgCtx::new(client.clone(), doc_id)?; - let storage = PgOpStorage::new(ctx.clone()); - let catch_up = catch_up_materialized_state( - storage, - PersistedRemoteStores { - replica_id: ReplicaId::new(b"postgres"), - clock: LamportClock::default(), - nodes: PgNodeStore::new(ctx.clone()), - payloads: PgPayloadStore::new(ctx.clone()), - index: PgParentOpIndex::new(ctx.clone()), - }, - &meta, - |nodes| nodes.flush_last_change(), - |index| index.flush(), - )?; - - update_tree_meta_head(client, doc_id, catch_up.head.as_ref())?; - - Ok(()) -} diff --git a/packages/treecrdt-postgres-rs/src/store/append.rs b/packages/treecrdt-postgres-rs/src/store/append.rs new file mode 100644 index 00000000..6c6ac7d0 --- /dev/null +++ b/packages/treecrdt-postgres-rs/src/store/append.rs @@ -0,0 +1,338 @@ +use std::cell::RefCell; +use std::collections::HashSet; +use std::rc::Rc; +use std::time::Instant; + +use postgres::Client; + +use treecrdt_core::{ + apply_persisted_remote_ops_with_delta, catch_up_materialized_state, + materialize_persisted_remote_ops_with_delta, try_direct_rewind_catch_up_materialized_state, + try_shortcut_out_of_order_payload_noops, Error, LamportClock, MaterializationCursor, + MaterializationHead, MaterializationState, NodeId, Operation, OperationId, OperationKind, + PersistedRemoteStores, ReplicaId, Result, +}; + +use crate::profile::{append_profile_enabled, PgAppendProfile}; + +use super::*; +use super::meta::load_tree_meta; + +fn materialize_inserted_ops( + ctx: PgCtx, + meta: &TreeMeta, + ops: Vec, +) -> Result { + // At this point treecrdt_ops already contains the inserted operations. This temporary + // TreeCrdt exists only to replay those ops through core semantics and update derived tables. 
+ materialize_persisted_remote_ops_with_delta( + PersistedRemoteStores { + // Scratch identity for the temporary TreeCrdt; replayed ops keep their own ids. + replica_id: ReplicaId::new(b"postgres"), + clock: LamportClock::default(), + nodes: PgNodeStore::new(ctx.clone()), + payloads: PgPayloadStore::new(ctx.clone()), + index: PgParentOpIndex::new(ctx.clone()), + }, + meta, + ops, + |nodes, ops| { + if ops.iter().any(|op| matches!(op.kind, OperationKind::Payload { .. })) { + // Payload ops can depend on the current node row, so front-load the reads here. + nodes.preload_for_ops(ops)?; + } + Ok(()) + }, + |nodes| nodes.flush_last_change(), + |index| index.flush(), + ) +} + +fn merge_affected_nodes(mut left: Vec, right: Vec) -> Vec { + left.extend(right); + left.sort(); + left.dedup(); + left +} + +pub fn append_ops(client: &Rc>, doc_id: &str, ops: &[Operation]) -> Result { + { + let mut c = client.borrow_mut(); + c.batch_execute("BEGIN").map_err(|e| Error::Storage(e.to_string()))?; + } + + let res = append_ops_in_tx(client, doc_id, ops); + + match res { + Ok(v) => { + let mut c = client.borrow_mut(); + c.batch_execute("COMMIT").map_err(|e| Error::Storage(e.to_string()))?; + Ok(v.inserted_count) + } + Err(e) => { + let mut c = client.borrow_mut(); + let _ = c.batch_execute("ROLLBACK"); + Err(e) + } + } +} + +pub fn append_ops_with_affected_nodes( + client: &Rc>, + doc_id: &str, + ops: &[Operation], +) -> Result> { + { + let mut c = client.borrow_mut(); + c.batch_execute("BEGIN").map_err(|e| Error::Storage(e.to_string()))?; + } + + let res = append_ops_in_tx(client, doc_id, ops); + + match res { + Ok(v) => { + let mut c = client.borrow_mut(); + c.batch_execute("COMMIT").map_err(|e| Error::Storage(e.to_string()))?; + Ok(v.affected_nodes) + } + Err(e) => { + let mut c = client.borrow_mut(); + let _ = c.batch_execute("ROLLBACK"); + Err(e) + } + } +} + +#[derive(Default)] +struct AppendOpsResult { + inserted_count: u64, + affected_nodes: Vec, +} + +fn append_ops_in_tx( + 
client: &Rc>, + doc_id: &str, + ops: &[Operation], +) -> Result { + // Serialize per-doc writers across all server instances (incremental materialization updates + // derived tables + head_seq and is not safe to run concurrently for the same doc_id). + let meta = load_tree_meta_for_update(client, doc_id)?; + // This profiler is only for large-upload benchmark/debug runs. The normal + // append path keeps the hook disabled and pays only the `OnceLock` check. + let append_profile = append_profile_enabled().then(|| { + Rc::new(RefCell::new(PgAppendProfile::new( + ops.len(), + meta.state().replay_from.is_some(), + meta.state().head_seq(), + ))) + }); + let ctx = PgCtx::new_with_profile(client.clone(), doc_id, append_profile.clone())?; + + let bulk_insert_started_at = Instant::now(); + let inserted_op_refs = { + let mut c = client.borrow_mut(); + bulk_insert_ops_in_tx(&ctx, &mut c, ops)? + }; + if let Some(profile) = &append_profile { + let mut profile = profile.borrow_mut(); + profile.bulk_insert_ms += bulk_insert_started_at.elapsed().as_secs_f64() * 1000.0; + profile.bulk_inserted_ops += inserted_op_refs.len(); + } + + let dedupe_filter_started_at = Instant::now(); + // Only materialize the ops Postgres actually inserted. This keeps duplicate opRefs in the + // input batch from being replayed twice through core materialization. 
+ let inserted_ops = select_inserted_ops(&ctx, ops, inserted_op_refs); + let inserted_op_ids: HashSet = + inserted_ops.iter().map(|op| op.meta.id.clone()).collect(); + if let Some(profile) = &append_profile { + profile.borrow_mut().dedupe_filter_ms += + dedupe_filter_started_at.elapsed().as_secs_f64() * 1000.0; + } + + let materialize_started_at = Instant::now(); + let mut update_head_ms = 0.0; + let mut update_head = |head: &MaterializationHead| { + let started_at = Instant::now(); + let result = update_tree_meta_head(&ctx.client, &ctx.doc_id, Some(head)); + update_head_ms += started_at.elapsed().as_secs_f64() * 1000.0; + result + }; + let apply_result = if let Some(shortcut) = { + let payloads = PgPayloadStore::new(ctx.clone()); + try_shortcut_out_of_order_payload_noops(&meta, inserted_ops.clone(), |node| { + payloads.last_writer(node) + })? + } { + if shortcut.remaining_ops.is_empty() { + update_head(&shortcut.resumed_head)?; + treecrdt_core::PersistedRemoteApplyResult { + inserted_count: inserted_ops.len().min(u64::MAX as usize) as u64, + affected_nodes: shortcut.affected_nodes, + catch_up_needed: false, + } + } else { + let shortcut_meta = TreeMeta(MaterializationState { + head: Some(shortcut.resumed_head.clone()), + replay_from: None, + }); + let result = + materialize_inserted_ops(ctx.clone(), &shortcut_meta, shortcut.remaining_ops)?; + let head = result.head.ok_or_else(|| { + Error::Storage("expected head after payload noop shortcut".into()) + })?; + update_head(&head)?; + treecrdt_core::PersistedRemoteApplyResult { + inserted_count: inserted_ops.len().min(u64::MAX as usize) as u64, + affected_nodes: merge_affected_nodes( + shortcut.affected_nodes, + result.affected_nodes, + ), + catch_up_needed: false, + } + } + } else { + apply_persisted_remote_ops_with_delta( + &meta, + inserted_ops, + |inserted| materialize_inserted_ops(ctx.clone(), &meta, inserted), + &mut update_head, + |frontier| set_tree_meta_replay_frontier(client, doc_id, frontier), + )? 
+ }; + let catch_up_performed = apply_result.catch_up_needed; + let apply_result = if catch_up_performed { + let refreshed_meta = load_tree_meta_for_update(client, doc_id)?; + let catch_up = if meta.state().replay_from.is_none() { + // No replay frontier existed before this append, so if one was just scheduled it came + // from the current batch. That is the narrow case where the direct rewind fast path is + // safe to try before falling back to conservative replay-from-frontier catch-up. + try_direct_rewind_catch_up_materialized_state( + &PgOpStorage::new(ctx.clone()), + &inserted_op_ids, + PersistedRemoteStores { + replica_id: ReplicaId::new(b"postgres"), + clock: LamportClock::default(), + nodes: PgNodeStore::new(ctx.clone()), + payloads: PgPayloadStore::new(ctx.clone()), + index: PgParentOpIndex::new(ctx.clone()), + }, + &refreshed_meta, + |nodes| nodes.flush_last_change(), + |index| index.flush(), + )? + .unwrap_or(catch_up_materialized_state( + PgOpStorage::new(ctx.clone()), + PersistedRemoteStores { + replica_id: ReplicaId::new(b"postgres"), + clock: LamportClock::default(), + nodes: PgNodeStore::new(ctx.clone()), + payloads: PgPayloadStore::new(ctx.clone()), + index: PgParentOpIndex::new(ctx.clone()), + }, + &refreshed_meta, + |nodes| nodes.flush_last_change(), + |index| index.flush(), + )?) + } else { + catch_up_materialized_state( + PgOpStorage::new(ctx.clone()), + PersistedRemoteStores { + replica_id: ReplicaId::new(b"postgres"), + clock: LamportClock::default(), + nodes: PgNodeStore::new(ctx.clone()), + payloads: PgPayloadStore::new(ctx.clone()), + index: PgParentOpIndex::new(ctx.clone()), + }, + &refreshed_meta, + |nodes| nodes.flush_last_change(), + |index| index.flush(), + )? 
+ }; + update_head( + catch_up + .head + .as_ref() + .ok_or_else(|| Error::Storage("expected head after immediate catch-up".into()))?, + )?; + treecrdt_core::PersistedRemoteApplyResult { + inserted_count: apply_result.inserted_count, + affected_nodes: catch_up.affected_nodes, + catch_up_needed: false, + } + } else { + apply_result + }; + if let Some(profile) = &append_profile { + profile.borrow_mut().materialize_ms += + materialize_started_at.elapsed().as_secs_f64() * 1000.0; + } + + if let Some(profile) = &append_profile { + profile.borrow_mut().update_head_ms += update_head_ms; + if catch_up_performed { + profile.borrow_mut().catch_up_performed = true; + } + profile.borrow().log(doc_id, apply_result.inserted_count as usize); + } + + Ok(AppendOpsResult { + inserted_count: apply_result.inserted_count, + affected_nodes: apply_result.affected_nodes, + }) +} + +pub fn ensure_materialized(client: &Rc>, doc_id: &str) -> Result<()> { + { + let mut c = client.borrow_mut(); + c.batch_execute("BEGIN").map_err(|e| Error::Storage(e.to_string()))?; + } + + let res = ensure_materialized_in_tx(client, doc_id); + + match res { + Ok(()) => { + let mut c = client.borrow_mut(); + c.batch_execute("COMMIT").map_err(|e| Error::Storage(e.to_string()))?; + Ok(()) + } + Err(e) => { + let mut c = client.borrow_mut(); + let _ = c.batch_execute("ROLLBACK"); + Err(e) + } + } +} + +pub(crate) fn ensure_materialized_in_tx(client: &Rc>, doc_id: &str) -> Result<()> { + let meta = load_tree_meta(client, doc_id)?; + if meta.state().replay_from.is_none() { + return Ok(()); + } + + // Take a per-doc lock so catch-up can't race with concurrent append/materialization. 
+ let meta = load_tree_meta_for_update(client, doc_id)?; + if meta.state().replay_from.is_none() { + return Ok(()); + } + + let ctx = PgCtx::new(client.clone(), doc_id)?; + let storage = PgOpStorage::new(ctx.clone()); + let catch_up = catch_up_materialized_state( + storage, + PersistedRemoteStores { + replica_id: ReplicaId::new(b"postgres"), + clock: LamportClock::default(), + nodes: PgNodeStore::new(ctx.clone()), + payloads: PgPayloadStore::new(ctx.clone()), + index: PgParentOpIndex::new(ctx.clone()), + }, + &meta, + |nodes| nodes.flush_last_change(), + |index| index.flush(), + )?; + + update_tree_meta_head(client, doc_id, catch_up.head.as_ref())?; + + Ok(()) +} diff --git a/packages/treecrdt-postgres-rs/src/store/meta.rs b/packages/treecrdt-postgres-rs/src/store/meta.rs new file mode 100644 index 00000000..1a7e4f5c --- /dev/null +++ b/packages/treecrdt-postgres-rs/src/store/meta.rs @@ -0,0 +1,200 @@ +use std::cell::RefCell; +use std::collections::HashMap; +use std::rc::Rc; + +use postgres::{Client, Statement}; + +use treecrdt_core::{ + Error, Lamport, MaterializationCursor, MaterializationFrontier, MaterializationHead, + MaterializationKey, MaterializationState, Result, +}; + +use crate::profile::PgAppendProfile; + +use super::storage_debug; + +#[derive(Clone, Debug)] +pub(crate) struct TreeMeta(pub(crate) MaterializationState); + +impl MaterializationCursor for TreeMeta { + fn state(&self) -> MaterializationState<&[u8]> { + self.0.as_borrowed() + } +} + +pub(crate) fn ensure_doc_meta(client: &Rc>, doc_id: &str) -> Result<()> { + let mut c = client.borrow_mut(); + c.execute( + "INSERT INTO treecrdt_meta(doc_id) VALUES ($1) ON CONFLICT (doc_id) DO NOTHING", + &[&doc_id], + ) + .map_err(storage_debug)?; + Ok(()) +} + +fn load_tree_meta_row( + client: &Rc>, + doc_id: &str, + for_update: bool, +) -> Result { + let ctx = PgCtx::new(client.clone(), doc_id)?; + let mut c = client.borrow_mut(); + let stmt = if for_update { + ctx.stmt( + &mut c, + "SELECT head_lamport, 
head_replica, head_counter, head_seq, \ + replay_lamport, replay_replica, replay_counter \ + FROM treecrdt_meta WHERE doc_id = $1 FOR UPDATE", + )? + } else { + ctx.stmt( + &mut c, + "SELECT head_lamport, head_replica, head_counter, head_seq, \ + replay_lamport, replay_replica, replay_counter \ + FROM treecrdt_meta WHERE doc_id = $1 LIMIT 1", + )? + }; + let rows = c.query(&stmt, &[&doc_id]).map_err(storage_debug)?; + + let row = rows.first().ok_or_else(|| Error::Storage("missing treecrdt_meta row".into()))?; + + let head_lamport = row.get::<_, i64>(0).max(0) as Lamport; + let head_replica = row.get::<_, Vec>(1); + let head_counter = row.get::<_, i64>(2).max(0) as u64; + let head_seq = row.get::<_, i64>(3).max(0) as u64; + let replay_lamport = row.get::<_, Option>(4).map(|v| v.max(0) as Lamport); + let replay_replica = row.get::<_, Option>>(5); + let replay_counter = row.get::<_, Option>(6).map(|v| v.max(0) as u64); + + let head = if head_seq == 0 && head_lamport == 0 && head_replica.is_empty() && head_counter == 0 + { + None + } else { + Some(MaterializationHead { + at: MaterializationKey { + lamport: head_lamport, + replica: head_replica, + counter: head_counter, + }, + seq: head_seq, + }) + }; + let replay_from = match (replay_lamport, replay_replica, replay_counter) { + (Some(lamport), Some(replica), Some(counter)) => Some(MaterializationKey { + lamport, + replica, + counter, + }), + _ => None, + }; + + Ok(TreeMeta(MaterializationState { head, replay_from })) +} + +pub(super) fn load_tree_meta(client: &Rc>, doc_id: &str) -> Result { + load_tree_meta_row(client, doc_id, false) +} + +pub(crate) fn load_tree_meta_for_update( + client: &Rc>, + doc_id: &str, +) -> Result { + load_tree_meta_row(client, doc_id, true) +} + +pub(crate) fn set_tree_meta_replay_frontier( + client: &Rc>, + doc_id: &str, + frontier: &MaterializationFrontier, +) -> Result<()> { + ensure_doc_meta(client, doc_id)?; + let mut c = client.borrow_mut(); + c.execute( + "UPDATE treecrdt_meta \ + SET 
replay_lamport = $2, replay_replica = $3, replay_counter = $4 \ + WHERE doc_id = $1", + &[ + &doc_id, + &(frontier.lamport as i64), + &frontier.replica, + &(frontier.counter as i64), + ], + ) + .map_err(|e| Error::Storage(e.to_string()))?; + Ok(()) +} + +pub(crate) fn update_tree_meta_head>( + client: &Rc>, + doc_id: &str, + head: Option<&MaterializationHead>, +) -> Result<()> { + ensure_doc_meta(client, doc_id)?; + let (lamport, replica, counter, seq): (Lamport, &[u8], u64, u64) = match head { + Some(head) => ( + head.at.lamport, + head.at.replica.as_ref(), + head.at.counter, + head.seq, + ), + None => (0, &[], 0, 0), + }; + let mut c = client.borrow_mut(); + c.execute( + "UPDATE treecrdt_meta \ + SET head_lamport = $2, \ + head_replica = $3, \ + head_counter = $4, \ + head_seq = $5, \ + replay_lamport = NULL, \ + replay_replica = NULL, \ + replay_counter = NULL \ + WHERE doc_id = $1", + &[ + &doc_id, + &(lamport as i64), + &replica, + &(counter as i64), + &(seq as i64), + ], + ) + .map_err(|e| Error::Storage(e.to_string()))?; + Ok(()) +} + +#[derive(Clone)] +pub(crate) struct PgCtx { + pub(crate) doc_id: String, + pub(crate) client: Rc>, + stmts: Rc>>, + pub(super) append_profile: Option>>, +} + +impl PgCtx { + pub(crate) fn new(client: Rc>, doc_id: &str) -> Result { + Self::new_with_profile(client, doc_id, None) + } + + pub(super) fn new_with_profile( + client: Rc>, + doc_id: &str, + append_profile: Option>>, + ) -> Result { + ensure_doc_meta(&client, doc_id)?; + Ok(Self { + doc_id: doc_id.to_string(), + client, + stmts: Rc::new(RefCell::new(HashMap::new())), + append_profile, + }) + } + + pub(crate) fn stmt(&self, c: &mut Client, sql: &'static str) -> Result { + if let Some(stmt) = self.stmts.borrow().get(sql) { + return Ok(stmt.clone()); + } + let stmt = c.prepare(sql).map_err(storage_debug)?; + self.stmts.borrow_mut().insert(sql, stmt.clone()); + Ok(stmt) + } +} From e6e641bf15c6c93fe595989dfe398dbe6baccb81 Mon Sep 17 00:00:00 2001 From: Marcus Pousette Date: 
Mon, 20 Apr 2026 15:50:11 +0200 Subject: [PATCH 13/13] refactor(core): share adapter append orchestration --- packages/treecrdt-core/src/lib.rs | 9 +- packages/treecrdt-core/src/materialization.rs | 147 ++++++++++++++++++ .../treecrdt-postgres-rs/src/store/append.rs | 129 ++++----------- .../src/extension/functions/materialize.rs | 131 ++++------------ .../src/extension/functions/schema.rs | 6 +- 5 files changed, 211 insertions(+), 211 deletions(-) diff --git a/packages/treecrdt-core/src/lib.rs b/packages/treecrdt-core/src/lib.rs index ecbcb777..8fa89423 100644 --- a/packages/treecrdt-core/src/lib.rs +++ b/packages/treecrdt-core/src/lib.rs @@ -17,10 +17,11 @@ pub use ids::{Lamport, NodeId, OperationId, ReplicaId}; pub use materialization::{ apply_incremental_ops_with_delta, apply_persisted_remote_ops_with_delta, catch_up_materialized_state, materialize_persisted_remote_ops_with_delta, - try_direct_rewind_catch_up_materialized_state, try_shortcut_out_of_order_payload_noops, - CatchUpResult, FrontierRewindStorage, IncrementalApplyResult, MaterializationCursor, - MaterializationFrontier, MaterializationFrontierRef, MaterializationHead, MaterializationKey, - MaterializationState, MaterializationStateRef, PayloadNoopShortcut, PersistedRemoteApplyResult, + orchestrate_persisted_remote_append, try_direct_rewind_catch_up_materialized_state, + try_shortcut_out_of_order_payload_noops, CatchUpResult, FrontierRewindStorage, + IncrementalApplyResult, MaterializationCursor, MaterializationFrontier, + MaterializationFrontierRef, MaterializationHead, MaterializationKey, MaterializationState, + MaterializationStateRef, PayloadNoopShortcut, PersistedRemoteApplyResult, PersistedRemoteStores, }; pub use ops::{cmp_op_key, cmp_ops, Operation, OperationKind, OperationMetadata}; diff --git a/packages/treecrdt-core/src/materialization.rs b/packages/treecrdt-core/src/materialization.rs index 96df8bab..e72e604e 100644 --- a/packages/treecrdt-core/src/materialization.rs +++ 
b/packages/treecrdt-core/src/materialization.rs @@ -73,6 +73,18 @@ pub trait MaterializationCursor { fn state(&self) -> MaterializationStateRef<'_>; } +impl> MaterializationCursor for MaterializationState { + fn state(&self) -> MaterializationStateRef<'_> { + self.as_borrowed() + } +} + +impl MaterializationCursor for &T { + fn state(&self) -> MaterializationStateRef<'_> { + (**self).state() + } +} + /// Optional storage hooks for direct rewind/replay of a frontier-invalidated suffix. /// /// The default implementations are intentionally naive and scan the full op log in memory. Real @@ -1018,3 +1030,143 @@ where } } } + +/// Shared adapter append orchestration for already-persisted remote ops. +/// +/// Adapters still own transactions, dedupe, and concrete backend stores. This helper just +/// centralizes the repeated control flow around: +/// - payload noop shortcut +/// - incremental materialization vs replay frontier scheduling +/// - direct rewind fast path when the current batch introduced the frontier +/// - conservative catch-up fallback +#[allow(clippy::too_many_arguments)] +pub fn orchestrate_persisted_remote_append< + M, + LoadWriter, + MaterializeInserted, + UpdateHead, + ScheduleReplay, + LoadCatchUpMeta, + TryDirectRewind, + CatchUp, + MissingHead, + E, +>( + meta: &M, + inserted_ops: Vec, + mut load_last_writer: LoadWriter, + mut materialize_inserted: MaterializeInserted, + mut update_head: UpdateHead, + mut schedule_replay: ScheduleReplay, + mut load_catch_up_meta: LoadCatchUpMeta, + mut try_direct_rewind: TryDirectRewind, + mut catch_up: CatchUp, + mut missing_head_error: MissingHead, +) -> std::result::Result +where + M: MaterializationCursor, + LoadWriter: FnMut(NodeId) -> std::result::Result, E>, + MaterializeInserted: FnMut( + &dyn MaterializationCursor, + Vec, + ) -> std::result::Result, + UpdateHead: FnMut(&MaterializationHead) -> std::result::Result<(), E>, + ScheduleReplay: FnMut(&MaterializationFrontier) -> std::result::Result<(), E>, +
LoadCatchUpMeta: FnMut() -> std::result::Result, + TryDirectRewind: FnMut( + &dyn MaterializationCursor, + &HashSet, + ) -> std::result::Result, E>, + CatchUp: FnMut(&dyn MaterializationCursor) -> std::result::Result, + MissingHead: FnMut(&'static str) -> E, +{ + let inserted_count = inserted_ops.len().min(u64::MAX as usize) as u64; + if inserted_count == 0 { + return Ok(PersistedRemoteApplyResult { + inserted_count: 0, + affected_nodes: Vec::new(), + catch_up_needed: false, + }); + } + + let inserted_op_ids: HashSet = + inserted_ops.iter().map(|op| op.meta.id.clone()).collect(); + let had_pending_frontier = meta.state().replay_from.is_some(); + + let apply_result = if let Some(shortcut) = { + try_shortcut_out_of_order_payload_noops(meta, inserted_ops.clone(), |node| { + load_last_writer(node) + })? + } { + if shortcut.remaining_ops.is_empty() { + update_head(&shortcut.resumed_head)?; + PersistedRemoteApplyResult { + inserted_count, + affected_nodes: shortcut.affected_nodes, + catch_up_needed: false, + } + } else { + let shortcut_meta = MaterializationState { + head: Some(shortcut.resumed_head.clone()), + replay_from: None, + }; + let result = materialize_inserted(&shortcut_meta, shortcut.remaining_ops)?; + let Some(head) = result.head else { + schedule_replay(&start_replay_frontier())?; + return Ok(PersistedRemoteApplyResult { + inserted_count, + affected_nodes: Vec::new(), + catch_up_needed: true, + }); + }; + update_head(&head)?; + + let mut affected_nodes = shortcut.affected_nodes; + affected_nodes.extend(result.affected_nodes); + affected_nodes.sort(); + affected_nodes.dedup(); + + PersistedRemoteApplyResult { + inserted_count, + affected_nodes, + catch_up_needed: false, + } + } + } else { + apply_persisted_remote_ops_with_delta( + meta, + inserted_ops, + |inserted| materialize_inserted(meta, inserted), + &mut update_head, + &mut schedule_replay, + )?
+ }; + + if !apply_result.catch_up_needed { + return Ok(apply_result); + } + + let refreshed_meta = load_catch_up_meta()?; + let catch_up_result = if !had_pending_frontier { + // Fall back lazily: `unwrap_or(catch_up(..)?)` would evaluate its argument eagerly and + // re-run the conservative catch-up even after a successful direct rewind. + match try_direct_rewind(&refreshed_meta, &inserted_op_ids)? { + Some(result) => result, + None => catch_up(&refreshed_meta)?, + } + } else { + catch_up(&refreshed_meta)? + }; + + let head = catch_up_result + .head + .as_ref() + .ok_or_else(|| missing_head_error("expected head after immediate catch-up"))?; + update_head(head)?; + + Ok(PersistedRemoteApplyResult { + inserted_count: apply_result.inserted_count, + affected_nodes: catch_up_result.affected_nodes, + catch_up_needed: false, + }) +} diff --git a/packages/treecrdt-postgres-rs/src/store/append.rs b/packages/treecrdt-postgres-rs/src/store/append.rs index 6c6ac7d0..c97128d3 100644 --- a/packages/treecrdt-postgres-rs/src/store/append.rs +++ b/packages/treecrdt-postgres-rs/src/store/append.rs @@ -1,26 +1,24 @@ use std::cell::RefCell; -use std::collections::HashSet; use std::rc::Rc; use std::time::Instant; use postgres::Client; use treecrdt_core::{ - apply_persisted_remote_ops_with_delta, catch_up_materialized_state, - materialize_persisted_remote_ops_with_delta, try_direct_rewind_catch_up_materialized_state, - try_shortcut_out_of_order_payload_noops, Error, LamportClock, MaterializationCursor, - MaterializationHead, MaterializationState, NodeId, Operation, OperationId, OperationKind, + catch_up_materialized_state, materialize_persisted_remote_ops_with_delta, + orchestrate_persisted_remote_append, try_direct_rewind_catch_up_materialized_state, Error, + LamportClock, MaterializationCursor, MaterializationHead, NodeId, Operation, OperationKind, PersistedRemoteStores, ReplicaId, Result, }; use crate::profile::{append_profile_enabled, PgAppendProfile}; -use super::*; use super::meta::load_tree_meta; +use super::*; fn materialize_inserted_ops( ctx: PgCtx, - meta: &TreeMeta, + meta: &dyn MaterializationCursor, ops: Vec, ) -> Result { // At this point treecrdt_ops already contains the
inserted operations. This temporary @@ -34,7 +32,7 @@ fn materialize_inserted_ops( payloads: PgPayloadStore::new(ctx.clone()), index: PgParentOpIndex::new(ctx.clone()), }, - meta, + &meta, ops, |nodes, ops| { if ops.iter().any(|op| matches!(op.kind, OperationKind::Payload { .. })) { @@ -48,13 +46,6 @@ fn materialize_inserted_ops( ) } -fn merge_affected_nodes(mut left: Vec, right: Vec) -> Vec { - left.extend(right); - left.sort(); - left.dedup(); - left -} - pub fn append_ops(client: &Rc>, doc_id: &str, ops: &[Operation]) -> Result { { let mut c = client.borrow_mut(); @@ -143,8 +134,6 @@ fn append_ops_in_tx( // Only materialize the ops Postgres actually inserted. This keeps duplicate opRefs in the // input batch from being replayed twice through core materialization. let inserted_ops = select_inserted_ops(&ctx, ops, inserted_op_refs); - let inserted_op_ids: HashSet = - inserted_ops.iter().map(|op| op.meta.id.clone()).collect(); if let Some(profile) = &append_profile { profile.borrow_mut().dedupe_filter_ms += dedupe_filter_started_at.elapsed().as_secs_f64() * 1000.0; @@ -158,71 +147,21 @@ fn append_ops_in_tx( update_head_ms += started_at.elapsed().as_secs_f64() * 1000.0; result }; - let apply_result = if let Some(shortcut) = { - let payloads = PgPayloadStore::new(ctx.clone()); - try_shortcut_out_of_order_payload_noops(&meta, inserted_ops.clone(), |node| { - payloads.last_writer(node) - })? 
- } { - if shortcut.remaining_ops.is_empty() { - update_head(&shortcut.resumed_head)?; - treecrdt_core::PersistedRemoteApplyResult { - inserted_count: inserted_ops.len().min(u64::MAX as usize) as u64, - affected_nodes: shortcut.affected_nodes, - catch_up_needed: false, - } - } else { - let shortcut_meta = TreeMeta(MaterializationState { - head: Some(shortcut.resumed_head.clone()), - replay_from: None, - }); - let result = - materialize_inserted_ops(ctx.clone(), &shortcut_meta, shortcut.remaining_ops)?; - let head = result.head.ok_or_else(|| { - Error::Storage("expected head after payload noop shortcut".into()) - })?; - update_head(&head)?; - treecrdt_core::PersistedRemoteApplyResult { - inserted_count: inserted_ops.len().min(u64::MAX as usize) as u64, - affected_nodes: merge_affected_nodes( - shortcut.affected_nodes, - result.affected_nodes, - ), - catch_up_needed: false, - } - } - } else { - apply_persisted_remote_ops_with_delta( - &meta, - inserted_ops, - |inserted| materialize_inserted_ops(ctx.clone(), &meta, inserted), - &mut update_head, - |frontier| set_tree_meta_replay_frontier(client, doc_id, frontier), - )? - }; - let catch_up_performed = apply_result.catch_up_needed; - let apply_result = if catch_up_performed { - let refreshed_meta = load_tree_meta_for_update(client, doc_id)?; - let catch_up = if meta.state().replay_from.is_none() { - // No replay frontier existed before this append, so if one was just scheduled it came - // from the current batch. That is the narrow case where the direct rewind fast path is - // safe to try before falling back to conservative replay-from-frontier catch-up. 
+ let apply_result = orchestrate_persisted_remote_append( + &meta, + inserted_ops, + { + let payloads = PgPayloadStore::new(ctx.clone()); + move |node| payloads.last_writer(node) + }, + |meta, inserted| materialize_inserted_ops(ctx.clone(), meta, inserted), + &mut update_head, + |frontier| set_tree_meta_replay_frontier(client, doc_id, frontier), + || Ok(load_tree_meta_for_update(client, doc_id)?.0), + |meta, inserted_op_ids| { try_direct_rewind_catch_up_materialized_state( &PgOpStorage::new(ctx.clone()), - &inserted_op_ids, - PersistedRemoteStores { - replica_id: ReplicaId::new(b"postgres"), - clock: LamportClock::default(), - nodes: PgNodeStore::new(ctx.clone()), - payloads: PgPayloadStore::new(ctx.clone()), - index: PgParentOpIndex::new(ctx.clone()), - }, - &refreshed_meta, - |nodes| nodes.flush_last_change(), - |index| index.flush(), - )? - .unwrap_or(catch_up_materialized_state( - PgOpStorage::new(ctx.clone()), + inserted_op_ids, PersistedRemoteStores { replica_id: ReplicaId::new(b"postgres"), clock: LamportClock::default(), @@ -230,11 +169,12 @@ fn append_ops_in_tx( payloads: PgPayloadStore::new(ctx.clone()), index: PgParentOpIndex::new(ctx.clone()), }, - &refreshed_meta, + &meta, |nodes| nodes.flush_last_change(), |index| index.flush(), - )?) - } else { + ) + }, + |meta| { catch_up_materialized_state( PgOpStorage::new(ctx.clone()), PersistedRemoteStores { @@ -244,25 +184,14 @@ fn append_ops_in_tx( payloads: PgPayloadStore::new(ctx.clone()), index: PgParentOpIndex::new(ctx.clone()), }, - &refreshed_meta, + &meta, |nodes| nodes.flush_last_change(), |index| index.flush(), - )? 
- }; - update_head( - catch_up - .head - .as_ref() - .ok_or_else(|| Error::Storage("expected head after immediate catch-up".into()))?, - )?; - treecrdt_core::PersistedRemoteApplyResult { - inserted_count: apply_result.inserted_count, - affected_nodes: catch_up.affected_nodes, - catch_up_needed: false, - } - } else { - apply_result - }; + ) + }, + |message| Error::Storage(message.into()), + )?; + let catch_up_performed = apply_result.catch_up_needed; if let Some(profile) = &append_profile { profile.borrow_mut().materialize_ms += materialize_started_at.elapsed().as_secs_f64() * 1000.0; diff --git a/packages/treecrdt-sqlite-ext/src/extension/functions/materialize.rs b/packages/treecrdt-sqlite-ext/src/extension/functions/materialize.rs index 84c1ac95..e0e25d23 100644 --- a/packages/treecrdt-sqlite-ext/src/extension/functions/materialize.rs +++ b/packages/treecrdt-sqlite-ext/src/extension/functions/materialize.rs @@ -2,23 +2,16 @@ use super::append::JsonAppendOp; use super::node_store::SqliteNodeStore; use super::op_index::SqliteParentOpIndex; use super::payload_store::SqlitePayloadStore; -use super::schema::{set_tree_meta_replay_frontier, tree_meta_from_state}; +use super::schema::set_tree_meta_replay_frontier; use super::util::sqlite_err_from_core; use super::*; -use std::collections::HashSet; use treecrdt_core::PayloadStore; use treecrdt_core::Storage; use treecrdt_core::{ - try_direct_rewind_catch_up_materialized_state, LamportClock, MaterializationCursor, ReplicaId, + orchestrate_persisted_remote_append, try_direct_rewind_catch_up_materialized_state, + LamportClock, MaterializationCursor, ReplicaId, }; -fn merge_affected_nodes(mut left: Vec, right: Vec) -> Vec { - left.extend(right); - left.sort(); - left.dedup(); - left -} - fn parse_node_id(bytes: &[u8]) -> Result { if bytes.len() != 16 { return Err(SQLITE_ERROR as c_int); @@ -96,8 +89,8 @@ fn json_append_op_to_operation(op: &JsonAppendOp) -> Result, ) -> Result { use treecrdt_core::{ 
materialize_persisted_remote_ops_with_delta, LamportClock, PersistedRemoteStores, ReplicaId, @@ -113,8 +106,8 @@ fn materialize_inserted_ops( index: SqliteParentOpIndex::prepare(db, doc_id.to_vec()) .map_err(|_| SQLITE_ERROR as c_int)?, }, - meta, - ops.to_vec(), + &meta, + ops, |_, _| Ok(()), |_| Ok(()), |_| Ok(()), @@ -274,60 +267,21 @@ pub(super) fn append_ops_impl( inserted_ops.push(operation); } } - let inserted_op_ids: HashSet = - inserted_ops.iter().map(|op| op.meta.id.clone()).collect(); - - let apply_result = if let Some(shortcut) = { - let payloads = SqlitePayloadStore::prepare(db).map_err(|_| SQLITE_ERROR as c_int)?; - treecrdt_core::try_shortcut_out_of_order_payload_noops( - &meta, - inserted_ops.clone(), - |node| payloads.last_writer(node).map_err(sqlite_err_from_core), - )? - } { - if shortcut.remaining_ops.is_empty() { - update_tree_meta_head(db, Some(&shortcut.resumed_head))?; - treecrdt_core::PersistedRemoteApplyResult { - inserted_count: inserted_ops.len().min(u64::MAX as usize) as u64, - affected_nodes: shortcut.affected_nodes, - catch_up_needed: false, - } - } else { - let shortcut_meta = tree_meta_from_state(treecrdt_core::MaterializationState { - head: Some(shortcut.resumed_head.clone()), - replay_from: None, - }); - let result = - materialize_inserted_ops(db, doc_id, &shortcut_meta, &shortcut.remaining_ops)?; - let head = result.head.ok_or(SQLITE_ERROR as c_int)?; - update_tree_meta_head(db, Some(&head))?; - treecrdt_core::PersistedRemoteApplyResult { - inserted_count: inserted_ops.len().min(u64::MAX as usize) as u64, - affected_nodes: merge_affected_nodes( - shortcut.affected_nodes, - result.affected_nodes, - ), - catch_up_needed: false, - } - } - } else { - treecrdt_core::apply_persisted_remote_ops_with_delta( - &meta, - inserted_ops, - |inserted| materialize_inserted_ops(db, doc_id, &meta, &inserted), - |head| update_tree_meta_head(db, Some(head)), - |frontier| set_tree_meta_replay_frontier(db, frontier), - )? 
- }; - let apply_result = if apply_result.catch_up_needed { - let refreshed_meta = load_tree_meta(db)?; - let catch_up = if meta.state().replay_from.is_none() { - // Mirror the Postgres path: only try direct rewind when this append introduced the - // frontier. If the document was already behind before the append, stay on the - // conservative replay-from-frontier catch-up path. + let apply_result = orchestrate_persisted_remote_append( + &meta, + inserted_ops, + { + let payloads = SqlitePayloadStore::prepare(db).map_err(|_| SQLITE_ERROR as c_int)?; + move |node| payloads.last_writer(node).map_err(sqlite_err_from_core) + }, + |meta, inserted| materialize_inserted_ops(db, doc_id, meta, inserted), + |head| update_tree_meta_head(db, Some(head)), + |frontier| set_tree_meta_replay_frontier(db, frontier), + || Ok(load_tree_meta(db)?.0), + |meta, inserted_op_ids| { try_direct_rewind_catch_up_materialized_state( &super::op_storage::SqliteOpStorage::with_doc_id(db, doc_id.to_vec()), - &inserted_op_ids, + inserted_op_ids, treecrdt_core::PersistedRemoteStores { replica_id: ReplicaId::new(b"sqlite-ext"), clock: LamportClock::default(), @@ -336,30 +290,13 @@ pub(super) fn append_ops_impl( index: SqliteParentOpIndex::prepare(db, doc_id.to_vec()) .map_err(|_| SQLITE_ERROR as c_int)?, }, - &refreshed_meta, + &meta, |_| Ok(()), |_| Ok(()), ) - .map_err(|_| SQLITE_ERROR as c_int)? 
- .unwrap_or( - treecrdt_core::catch_up_materialized_state( - super::op_storage::SqliteOpStorage::with_doc_id(db, doc_id.to_vec()), - treecrdt_core::PersistedRemoteStores { - replica_id: ReplicaId::new(b"sqlite-ext"), - clock: LamportClock::default(), - nodes: SqliteNodeStore::prepare(db).map_err(|_| SQLITE_ERROR as c_int)?, - payloads: SqlitePayloadStore::prepare(db) - .map_err(|_| SQLITE_ERROR as c_int)?, - index: SqliteParentOpIndex::prepare(db, doc_id.to_vec()) - .map_err(|_| SQLITE_ERROR as c_int)?, - }, - &refreshed_meta, - |_| Ok(()), - |_| Ok(()), - ) - .map_err(|_| SQLITE_ERROR as c_int)?, - ) - } else { + .map_err(|_| SQLITE_ERROR as c_int) + }, + |meta| { treecrdt_core::catch_up_materialized_state( super::op_storage::SqliteOpStorage::with_doc_id(db, doc_id.to_vec()), treecrdt_core::PersistedRemoteStores { @@ -370,24 +307,14 @@ pub(super) fn append_ops_impl( index: SqliteParentOpIndex::prepare(db, doc_id.to_vec()) .map_err(|_| SQLITE_ERROR as c_int)?, }, - &refreshed_meta, + &meta, |_| Ok(()), |_| Ok(()), ) - .map_err(|_| SQLITE_ERROR as c_int)? 
- }; - update_tree_meta_head( - db, - Some(catch_up.head.as_ref().ok_or(SQLITE_ERROR as c_int)?), - )?; - treecrdt_core::PersistedRemoteApplyResult { - inserted_count: apply_result.inserted_count, - affected_nodes: catch_up.affected_nodes, - catch_up_needed: false, - } - } else { - apply_result - }; + .map_err(|_| SQLITE_ERROR as c_int) + }, + |_| SQLITE_ERROR as c_int, + )?; let commit_rc = sqlite_exec(db, commit.as_ptr(), None, null_mut(), null_mut()); if commit_rc != SQLITE_OK as c_int { diff --git a/packages/treecrdt-sqlite-ext/src/extension/functions/schema.rs b/packages/treecrdt-sqlite-ext/src/extension/functions/schema.rs index c9690cb8..6bf6bc5a 100644 --- a/packages/treecrdt-sqlite-ext/src/extension/functions/schema.rs +++ b/packages/treecrdt-sqlite-ext/src/extension/functions/schema.rs @@ -12,7 +12,7 @@ use treecrdt_core::{ pub(super) const ROOT_NODE_ID: [u8; 16] = [0u8; 16]; #[derive(Clone, Debug)] -pub(super) struct TreeMeta(MaterializationState); +pub(super) struct TreeMeta(pub(super) MaterializationState); impl MaterializationCursor for TreeMeta { fn state(&self) -> MaterializationState<&[u8]> { @@ -20,10 +20,6 @@ impl MaterializationCursor for TreeMeta { } } -pub(super) fn tree_meta_from_state(state: MaterializationState) -> TreeMeta { - TreeMeta(state) -} - pub(super) fn load_doc_id(db: *mut sqlite3) -> Result>, c_int> { let sql = CString::new("SELECT value FROM meta WHERE key = 'doc_id' LIMIT 1").expect("doc id sql");