From 76cfc8432f1dce7bdb43d5ede37ccc618fb01126 Mon Sep 17 00:00:00 2001 From: nadav-govari Date: Tue, 17 Feb 2026 11:45:55 -0500 Subject: [PATCH 1/9] Implement IngesterCapacityScore broadcast (#6152) --- quickwit/quickwit-common/src/lib.rs | 1 + quickwit/quickwit-common/src/ring_buffer.rs | 170 +++++++ quickwit/quickwit-common/src/shared_consts.rs | 3 + quickwit/quickwit-ingest/Cargo.toml | 2 +- .../broadcast/ingester_capacity_score.rs | 455 ++++++++++++++++++ .../local_shards.rs} | 88 +--- .../src/ingest_v2/broadcast/mod.rs | 76 +++ .../quickwit-ingest/src/ingest_v2/state.rs | 87 +++- 8 files changed, 809 insertions(+), 73 deletions(-) create mode 100644 quickwit/quickwit-common/src/ring_buffer.rs create mode 100644 quickwit/quickwit-ingest/src/ingest_v2/broadcast/ingester_capacity_score.rs rename quickwit/quickwit-ingest/src/ingest_v2/{broadcast.rs => broadcast/local_shards.rs} (91%) create mode 100644 quickwit/quickwit-ingest/src/ingest_v2/broadcast/mod.rs diff --git a/quickwit/quickwit-common/src/lib.rs b/quickwit/quickwit-common/src/lib.rs index 0f3af2bc5ba..11147f975f9 100644 --- a/quickwit/quickwit-common/src/lib.rs +++ b/quickwit/quickwit-common/src/lib.rs @@ -36,6 +36,7 @@ pub mod rate_limited_tracing; pub mod rate_limiter; pub mod rendezvous_hasher; pub mod retry; +pub mod ring_buffer; pub mod runtimes; pub mod shared_consts; pub mod sorted_iter; diff --git a/quickwit/quickwit-common/src/ring_buffer.rs b/quickwit/quickwit-common/src/ring_buffer.rs new file mode 100644 index 00000000000..5d884d8188f --- /dev/null +++ b/quickwit/quickwit-common/src/ring_buffer.rs @@ -0,0 +1,170 @@ +// Copyright 2021-Present Datadog, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use std::fmt::{Debug, Formatter}; + +/// Fixed-size buffer that keeps the last N elements pushed into it. +/// +/// `head` is the write cursor. It advances by one on each push and wraps +/// back to 0 when it reaches N, overwriting the oldest element. +/// +/// ```text +/// RingBuffer after pushing 1, 2, 3, 4, 5, 6: +/// +/// buffer = [5, 6, 3, 4] head = 2 len = 4 +/// ^ +/// next write goes here +/// +/// logical view (oldest → newest): [3, 4, 5, 6] +/// ``` +pub struct RingBuffer { + buffer: [T; N], + head: usize, + len: usize, +} + +impl Default for RingBuffer { + fn default() -> Self { + Self { + buffer: [T::default(); N], + head: 0, + len: 0, + } + } +} + +impl Debug for RingBuffer { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + f.debug_list().entries(self.iter()).finish() + } +} + +impl RingBuffer { + pub fn push_back(&mut self, value: T) { + self.buffer[self.head] = value; + self.head = (self.head + 1) % N; + if self.len < N { + self.len += 1; + } + } + + pub fn last(&self) -> Option { + if self.len == 0 { + return None; + } + Some(self.buffer[(self.head + N - 1) % N]) + } + + pub fn front(&self) -> Option { + if self.len == 0 { + return None; + } + Some(self.buffer[(self.head + N - self.len) % N]) + } + + pub fn len(&self) -> usize { + self.len + } + + pub fn is_empty(&self) -> bool { + self.len == 0 + } + + pub fn iter(&self) -> impl Iterator + '_ { + let start = (self.head + N - self.len) % N; + (0..self.len).map(move |i| &self.buffer[(start + i) % N]) + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + 
fn test_empty() { + let rb = RingBuffer::::default(); + assert!(rb.is_empty()); + assert_eq!(rb.len(), 0); + assert_eq!(rb.last(), None); + assert_eq!(rb.front(), None); + assert_eq!(rb.iter().count(), 0); + } + + #[test] + fn test_single_push() { + let mut rb = RingBuffer::::default(); + rb.push_back(10); + assert_eq!(rb.len(), 1); + assert!(!rb.is_empty()); + assert_eq!(rb.last(), Some(10)); + assert_eq!(rb.front(), Some(10)); + assert_eq!(rb.iter().copied().collect::>(), vec![10]); + } + + #[test] + fn test_partial_fill() { + let mut rb = RingBuffer::::default(); + rb.push_back(1); + rb.push_back(2); + rb.push_back(3); + assert_eq!(rb.len(), 3); + assert_eq!(rb.last(), Some(3)); + assert_eq!(rb.front(), Some(1)); + assert_eq!(rb.iter().copied().collect::>(), vec![1, 2, 3]); + } + + #[test] + fn test_exactly_full() { + let mut rb = RingBuffer::::default(); + for i in 1..=4 { + rb.push_back(i); + } + assert_eq!(rb.len(), 4); + assert_eq!(rb.last(), Some(4)); + assert_eq!(rb.front(), Some(1)); + assert_eq!(rb.iter().copied().collect::>(), vec![1, 2, 3, 4]); + } + + #[test] + fn test_wrap_around() { + let mut rb = RingBuffer::::default(); + for i in 1..=6 { + rb.push_back(i); + } + assert_eq!(rb.len(), 4); + assert_eq!(rb.last(), Some(6)); + assert_eq!(rb.front(), Some(3)); + assert_eq!(rb.iter().copied().collect::>(), vec![3, 4, 5, 6]); + } + + #[test] + fn test_many_wraps() { + let mut rb = RingBuffer::::default(); + for i in 1..=100 { + rb.push_back(i); + } + assert_eq!(rb.len(), 3); + assert_eq!(rb.last(), Some(100)); + assert_eq!(rb.front(), Some(98)); + assert_eq!(rb.iter().copied().collect::>(), vec![98, 99, 100]); + } + + #[test] + fn test_debug() { + let mut rb = RingBuffer::::default(); + rb.push_back(1); + rb.push_back(2); + assert_eq!(format!("{:?}", rb), "[1, 2]"); + } +} diff --git a/quickwit/quickwit-common/src/shared_consts.rs b/quickwit/quickwit-common/src/shared_consts.rs index 9923705f0b2..437058f28fb 100644 --- 
a/quickwit/quickwit-common/src/shared_consts.rs +++ b/quickwit/quickwit-common/src/shared_consts.rs @@ -64,6 +64,9 @@ pub const SCROLL_BATCH_LEN: usize = 1_000; /// Prefix used in chitchat to broadcast the list of primary shards hosted by a leader. pub const INGESTER_PRIMARY_SHARDS_PREFIX: &str = "ingester.primary_shards:"; +/// Prefix used in chitchat to broadcast per-source ingester capacity scores and open shard counts. +pub const INGESTER_CAPACITY_SCORE_PREFIX: &str = "ingester.capacity_score:"; + /// File name for the encoded list of fields in the split pub const SPLIT_FIELDS_FILE_NAME: &str = "split_fields"; diff --git a/quickwit/quickwit-ingest/Cargo.toml b/quickwit/quickwit-ingest/Cargo.toml index 3dfa0bf6c0c..3149f2aaaf3 100644 --- a/quickwit/quickwit-ingest/Cargo.toml +++ b/quickwit/quickwit-ingest/Cargo.toml @@ -18,6 +18,7 @@ bytesize = { workspace = true } fail = { workspace = true, optional = true } futures = { workspace = true } http = { workspace = true } +itertools = { workspace = true } mockall = { workspace = true, optional = true } mrecordlog = { workspace = true } once_cell = { workspace = true } @@ -43,7 +44,6 @@ quickwit-doc-mapper = { workspace = true, features = ["testsuite"] } quickwit-proto = { workspace = true } [dev-dependencies] -itertools = { workspace = true } mockall = { workspace = true } rand = { workspace = true } rand_distr = { workspace = true } diff --git a/quickwit/quickwit-ingest/src/ingest_v2/broadcast/ingester_capacity_score.rs b/quickwit/quickwit-ingest/src/ingest_v2/broadcast/ingester_capacity_score.rs new file mode 100644 index 00000000000..6f8abc66ef8 --- /dev/null +++ b/quickwit/quickwit-ingest/src/ingest_v2/broadcast/ingester_capacity_score.rs @@ -0,0 +1,455 @@ +// Copyright 2021-Present Datadog, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use std::collections::BTreeSet; + +use anyhow::{Context, Result}; +use bytesize::ByteSize; +use quickwit_cluster::{Cluster, ListenerHandle}; +use quickwit_common::pubsub::{Event, EventBroker}; +use quickwit_common::ring_buffer::RingBuffer; +use quickwit_common::shared_consts::INGESTER_CAPACITY_SCORE_PREFIX; +use quickwit_proto::ingest::ingester::IngesterStatus; +use quickwit_proto::types::{IndexUid, NodeId, SourceId, SourceUid}; +use serde::{Deserialize, Serialize}; +use tokio::task::JoinHandle; +use tracing::{info, warn}; + +use super::{BROADCAST_INTERVAL_PERIOD, make_key, parse_key}; +use crate::ingest_v2::state::WeakIngesterState; + +pub type OpenShardCounts = Vec<(IndexUid, SourceId, usize)>; + +/// The lookback window length is meant to capture readings far enough back in time to give +/// a rough rate of change estimate. At size 6, with broadcast interval of 5 seconds, this would be +/// 30 seconds of readings. +const WAL_CAPACITY_LOOKBACK_WINDOW_LEN: usize = 6; + +/// The ring buffer stores one extra element so that `delta()` can compare the newest reading +/// with the one that is exactly `WAL_CAPACITY_LOOKBACK_WINDOW_LEN` steps ago. Otherwise, that +/// reading would be discarded when the next reading is inserted. 
+const WAL_CAPACITY_READINGS_LEN: usize = WAL_CAPACITY_LOOKBACK_WINDOW_LEN + 1; + +struct WalMemoryCapacityTimeSeries { + readings: RingBuffer, +} + +impl WalMemoryCapacityTimeSeries { + fn new() -> Self { + Self { + readings: RingBuffer::default(), + } + } + + fn record(&mut self, memory_used: ByteSize, memory_allocated: ByteSize) { + let allocated = memory_allocated.as_u64(); + if allocated == 0 { + self.readings.push_back(1.0); + return; + } + let remaining = 1.0 - (memory_used.as_u64() as f64 / allocated as f64); + self.readings.push_back(remaining.clamp(0.0, 1.0)); + } + + fn current(&self) -> Option { + self.readings.last() + } + + /// How much remaining capacity changed between the oldest and newest readings. + /// Positive = improving, negative = draining. + fn delta(&self) -> Option { + let current = self.readings.last()?; + let oldest = self.readings.front()?; + Some(current - oldest) + } +} + +/// Computes a capacity score from 0 to 10 using a PD controller. +/// +/// The score has two components: +/// +/// - **P (proportional):** How much WAL capacity remains right now. An ingester with 100% free +/// capacity gets `PROPORTIONAL_WEIGHT` points; 50% gets half; and so on. If remaining capacity +/// drops to `MIN_PERMISSIBLE_CAPACITY` or below, the score is immediately 0. +/// +/// - **D (derivative):** Up to `DERIVATIVE_WEIGHT` bonus points based on how fast remaining +/// capacity is changing over the lookback window. A higher drain rate is worse, so we invert it: +/// `drain / MAX_DRAIN_RATE` normalizes the drain to a 0–1 penalty, and subtracting from 1 +/// converts it into a 0–1 bonus. Multiplied by `DERIVATIVE_WEIGHT`, a stable node gets the full +/// bonus and a node draining at `MAX_DRAIN_RATE` or faster gets nothing. +/// +/// Putting it together: a completely idle ingester scores 10 (8 + 2). +/// One that is nearly full but stable scores ~2. One that is draining rapidly scores less. 
+/// A score of 0 means the ingester is at or below minimum permissible capacity. +/// +/// Below this remaining capacity fraction, the score is immediately 0. +const MIN_PERMISSIBLE_CAPACITY: f64 = 0.05; +/// Weight of the proportional term (max points from P). +const PROPORTIONAL_WEIGHT: f64 = 8.0; +/// Weight of the derivative term (max points from D). +const DERIVATIVE_WEIGHT: f64 = 2.0; +/// The drain rate (as a fraction of total capacity over the lookback window) at which the +/// derivative penalty is fully applied. Drain rates beyond this are clamped. +const MAX_DRAIN_RATE: f64 = 0.10; + +fn compute_capacity_score(remaining_capacity: f64, capacity_delta: f64) -> usize { + if remaining_capacity <= MIN_PERMISSIBLE_CAPACITY { + return 0; + } + let p = PROPORTIONAL_WEIGHT * remaining_capacity; + let drain = (-capacity_delta).clamp(0.0, MAX_DRAIN_RATE); + let d = DERIVATIVE_WEIGHT * (1.0 - drain / MAX_DRAIN_RATE); + (p + d).clamp(0.0, 10.0) as usize +} + +#[derive(Debug, Clone, Default, Serialize, Deserialize)] +pub struct IngesterCapacityScore { + pub capacity_score: usize, + pub open_shard_count: usize, +} + +/// Periodically snapshots the ingester's WAL memory usage and open shard counts, computes +/// a capacity score, and broadcasts it to other nodes via Chitchat. +pub(crate) struct BroadcastIngesterCapacityScoreTask { + cluster: Cluster, + weak_state: WeakIngesterState, + wal_capacity_time_series: WalMemoryCapacityTimeSeries, +} + +impl BroadcastIngesterCapacityScoreTask { + pub fn spawn(cluster: Cluster, weak_state: WeakIngesterState) -> JoinHandle<()> { + let mut broadcaster = Self { + cluster, + weak_state, + wal_capacity_time_series: WalMemoryCapacityTimeSeries::new(), + }; + tokio::spawn(async move { broadcaster.run().await }) + } + + async fn snapshot(&self) -> Result> { + let state = self + .weak_state + .upgrade() + .context("ingester state has been dropped")?; + + // lock fully asserts that the ingester is ready. 
There's a likelihood that this task runs + // before the WAL is loaded, so we make sure that the ingester is ready just in case. + if *state.status_rx.borrow() != IngesterStatus::Ready { + return Ok(None); + } + + let guard = state + .lock_fully() + .await + .map_err(|_| anyhow::anyhow!("failed to acquire ingester state lock"))?; + let usage = guard.mrecordlog.resource_usage(); + let memory_used = ByteSize::b(usage.memory_used_bytes as u64); + let memory_allocated = ByteSize::b(usage.memory_allocated_bytes as u64); + let open_shard_counts = guard.get_open_shard_counts(); + + Ok(Some((memory_used, memory_allocated, open_shard_counts))) + } + + async fn run(&mut self) { + let mut interval = tokio::time::interval(BROADCAST_INTERVAL_PERIOD); + let mut previous_sources: BTreeSet = BTreeSet::new(); + + loop { + interval.tick().await; + + let (memory_used, memory_allocated, open_shard_counts) = match self.snapshot().await { + Ok(Some(snapshot)) => snapshot, + Ok(None) => continue, + Err(error) => { + info!("stopping ingester capacity broadcast: {error}"); + return; + } + }; + + self.wal_capacity_time_series + .record(memory_used, memory_allocated); + + let remaining_capacity = self.wal_capacity_time_series.current().unwrap_or(1.0); + let capacity_delta = self.wal_capacity_time_series.delta().unwrap_or(0.0); + let capacity_score = compute_capacity_score(remaining_capacity, capacity_delta); + + previous_sources = self + .broadcast_capacity(capacity_score, &open_shard_counts, &previous_sources) + .await; + } + } + + async fn broadcast_capacity( + &self, + capacity_score: usize, + open_shard_counts: &OpenShardCounts, + previous_sources: &BTreeSet, + ) -> BTreeSet { + let mut current_sources = BTreeSet::new(); + + for (index_uid, source_id, open_shard_count) in open_shard_counts { + let source_uid = SourceUid { + index_uid: index_uid.clone(), + source_id: source_id.clone(), + }; + let key = make_key(INGESTER_CAPACITY_SCORE_PREFIX, &source_uid); + let capacity = 
IngesterCapacityScore { + capacity_score, + open_shard_count: *open_shard_count, + }; + let value = serde_json::to_string(&capacity) + .expect("`IngesterCapacityScore` should be JSON serializable"); + self.cluster.set_self_key_value(key, value).await; + current_sources.insert(source_uid); + } + + for removed_source in previous_sources.difference(&current_sources) { + let key = make_key(INGESTER_CAPACITY_SCORE_PREFIX, removed_source); + self.cluster.remove_self_key(&key).await; + } + + current_sources + } +} + +#[derive(Debug, Clone)] +pub struct IngesterCapacityScoreUpdate { + pub node_id: NodeId, + pub source_uid: SourceUid, + pub capacity_score: usize, + pub open_shard_count: usize, +} + +impl Event for IngesterCapacityScoreUpdate {} + +pub async fn setup_ingester_capacity_update_listener( + cluster: Cluster, + event_broker: EventBroker, +) -> ListenerHandle { + cluster + .subscribe(INGESTER_CAPACITY_SCORE_PREFIX, move |event| { + let Some(source_uid) = parse_key(event.key) else { + warn!("failed to parse source UID from key `{}`", event.key); + return; + }; + let Ok(ingester_capacity) = serde_json::from_str::(event.value) + else { + warn!("failed to parse ingester capacity `{}`", event.value); + return; + }; + let node_id: NodeId = event.node.node_id.clone().into(); + event_broker.publish(IngesterCapacityScoreUpdate { + node_id, + source_uid, + capacity_score: ingester_capacity.capacity_score, + open_shard_count: ingester_capacity.open_shard_count, + }); + }) + .await +} + +#[cfg(test)] +mod tests { + use std::sync::Arc; + use std::sync::atomic::{AtomicUsize, Ordering}; + + use quickwit_cluster::{ChannelTransport, create_cluster_for_test}; + use quickwit_proto::types::ShardId; + + use super::*; + use crate::ingest_v2::models::IngesterShard; + use crate::ingest_v2::state::IngesterState; + + fn ts() -> WalMemoryCapacityTimeSeries { + WalMemoryCapacityTimeSeries::new() + } + + /// Helper: record a reading with `used` out of `allocated` bytes. 
+ fn record(series: &mut WalMemoryCapacityTimeSeries, used: u64, allocated: u64) { + series.record(ByteSize::b(used), ByteSize::b(allocated)); + } + + #[test] + fn test_wal_memory_capacity_current_after_record() { + let mut series = ts(); + // 192 of 256 used => 25% remaining + record(&mut series, 192, 256); + assert_eq!(series.current(), Some(0.25)); + + // 16 of 256 used => 93.75% remaining + record(&mut series, 16, 256); + assert_eq!(series.current(), Some(0.9375)); + } + + #[test] + fn test_wal_memory_capacity_record_saturates_at_zero() { + let mut series = ts(); + record(&mut series, 200, 100); + assert_eq!(series.current(), Some(0.0)); + } + + #[test] + fn test_wal_memory_capacity_delta_growing() { + let mut series = ts(); + // oldest: 60 of 100 used => 40% remaining + record(&mut series, 60, 100); + // current: 20 of 100 used => 80% remaining + record(&mut series, 20, 100); + // delta = 0.80 - 0.40 = 0.40 + assert_eq!(series.delta(), Some(0.40)); + } + + #[test] + fn test_wal_memory_capacity_delta_shrinking() { + let mut series = ts(); + // oldest: 20 of 100 used => 80% remaining + record(&mut series, 20, 100); + // current: 60 of 100 used => 40% remaining + record(&mut series, 60, 100); + // delta = 0.40 - 0.80 = -0.40 + assert_eq!(series.delta(), Some(-0.40)); + } + + #[test] + fn test_capacity_score_draining_vs_stable() { + // Node A: capacity draining — usage increases 10, 20, ..., 70 over 7 ticks. + let mut node_a = ts(); + for used in (10..=70).step_by(10) { + record(&mut node_a, used, 100); + } + let a_remaining = node_a.current().unwrap(); + let a_delta = node_a.delta().unwrap(); + let a_score = compute_capacity_score(a_remaining, a_delta); + + // Node B: steady at 50% usage over 7 ticks. 
+ let mut node_b = ts(); + for _ in 0..7 { + record(&mut node_b, 50, 100); + } + let b_remaining = node_b.current().unwrap(); + let b_delta = node_b.delta().unwrap(); + let b_score = compute_capacity_score(b_remaining, b_delta); + + // p=2.4, d=0 (max drain) => 2 + assert_eq!(a_score, 2); + // p=4, d=2 (stable) => 6 + assert_eq!(b_score, 6); + assert!(b_score > a_score); + } + + #[tokio::test] + async fn test_snapshot_state_dropped() { + let transport = ChannelTransport::default(); + let cluster = create_cluster_for_test(Vec::new(), &["indexer"], &transport, true) + .await + .unwrap(); + let (_temp_dir, state) = IngesterState::for_test().await; + let weak_state = state.weak(); + drop(state); + + let task = BroadcastIngesterCapacityScoreTask { + cluster, + weak_state, + wal_capacity_time_series: WalMemoryCapacityTimeSeries::new(), + }; + assert!(task.snapshot().await.is_err()); + } + + #[tokio::test] + async fn test_broadcast_ingester_capacity() { + let transport = ChannelTransport::default(); + let cluster = create_cluster_for_test(Vec::new(), &["indexer"], &transport, true) + .await + .unwrap(); + let event_broker = EventBroker::default(); + + let (_temp_dir, state) = IngesterState::for_test().await; + let index_uid = IndexUid::for_test("test-index", 0); + let mut state_guard = state.lock_partially().await.unwrap(); + let shard = IngesterShard::new_solo( + index_uid.clone(), + SourceId::from("test-source"), + ShardId::from(0), + ) + .advertisable() + .build(); + state_guard.shards.insert(shard.queue_id(), shard); + let open_shard_counts = state_guard.get_open_shard_counts(); + drop(state_guard); + + // Simulate 500 of 1000 bytes used => 50% remaining, 0 delta => score = 6 + let mut task = BroadcastIngesterCapacityScoreTask { + cluster: cluster.clone(), + weak_state: state.weak(), + wal_capacity_time_series: WalMemoryCapacityTimeSeries::new(), + }; + task.wal_capacity_time_series + .record(ByteSize::b(500), ByteSize::b(1000)); + + let remaining = 
task.wal_capacity_time_series.current().unwrap(); + let delta = task.wal_capacity_time_series.delta().unwrap(); + let capacity_score = compute_capacity_score(remaining, delta); + assert_eq!(capacity_score, 6); + + let update_counter = Arc::new(AtomicUsize::new(0)); + let update_counter_clone = update_counter.clone(); + let index_uid_clone = index_uid.clone(); + let _sub = event_broker.subscribe(move |event: IngesterCapacityScoreUpdate| { + update_counter_clone.fetch_add(1, Ordering::Release); + assert_eq!(event.source_uid.index_uid, index_uid_clone); + assert_eq!(event.source_uid.source_id, "test-source"); + assert_eq!(event.capacity_score, 6); + assert_eq!(event.open_shard_count, 1); + }); + + let _listener = + setup_ingester_capacity_update_listener(cluster.clone(), event_broker).await; + + let previous_sources = BTreeSet::new(); + task.broadcast_capacity(capacity_score, &open_shard_counts, &previous_sources) + .await; + tokio::time::sleep(BROADCAST_INTERVAL_PERIOD * 2).await; + + assert_eq!(update_counter.load(Ordering::Acquire), 1); + + let source_uid = SourceUid { + index_uid: index_uid.clone(), + source_id: SourceId::from("test-source"), + }; + let key = make_key(INGESTER_CAPACITY_SCORE_PREFIX, &source_uid); + let value = cluster.get_self_key_value(&key).await.unwrap(); + let deserialized: IngesterCapacityScore = serde_json::from_str(&value).unwrap(); + assert_eq!(deserialized.capacity_score, 6); + assert_eq!(deserialized.open_shard_count, 1); + } + + #[test] + fn test_wal_memory_capacity_delta_spans_lookback_window() { + let mut series = ts(); + + // Fill to exactly the lookback window length (6 readings), all same value. + for _ in 0..WAL_CAPACITY_LOOKBACK_WINDOW_LEN { + record(&mut series, 50, 100); + } + assert_eq!(series.delta(), Some(0.0)); + + // 7th reading fills the ring buffer. Delta spans 6 intervals. + record(&mut series, 0, 100); + assert_eq!(series.delta(), Some(0.50)); + + // 8th reading evicts the oldest 50-remaining. 
Delta still spans 6 intervals. + record(&mut series, 0, 100); + assert_eq!(series.delta(), Some(0.50)); + } +} diff --git a/quickwit/quickwit-ingest/src/ingest_v2/broadcast.rs b/quickwit/quickwit-ingest/src/ingest_v2/broadcast/local_shards.rs similarity index 91% rename from quickwit/quickwit-ingest/src/ingest_v2/broadcast.rs rename to quickwit/quickwit-ingest/src/ingest_v2/broadcast/local_shards.rs index 9bbbe94bb47..6ba10915f56 100644 --- a/quickwit/quickwit-ingest/src/ingest_v2/broadcast.rs +++ b/quickwit/quickwit-ingest/src/ingest_v2/broadcast/local_shards.rs @@ -18,6 +18,7 @@ use std::time::Duration; use bytesize::ByteSize; use quickwit_cluster::{Cluster, ListenerHandle}; use quickwit_common::pubsub::{Event, EventBroker}; +use quickwit_common::ring_buffer::RingBuffer; use quickwit_common::shared_consts::INGESTER_PRIMARY_SHARDS_PREFIX; use quickwit_common::sorted_iter::{KeyDiff, SortedByKeyIterator}; use quickwit_common::tower::{ConstantRate, Rate}; @@ -27,15 +28,10 @@ use serde::{Deserialize, Serialize, Serializer}; use tokio::task::JoinHandle; use tracing::{debug, warn}; -use super::metrics::INGEST_V2_METRICS; -use super::state::WeakIngesterState; +use super::{BROADCAST_INTERVAL_PERIOD, make_key, parse_key}; use crate::RateMibPerSec; - -const BROADCAST_INTERVAL_PERIOD: Duration = if cfg!(test) { - Duration::from_millis(50) -} else { - Duration::from_secs(5) -}; +use crate::ingest_v2::metrics::INGEST_V2_METRICS; +use crate::ingest_v2::state::WeakIngesterState; const ONE_MIB: ByteSize = ByteSize::mib(1); @@ -152,7 +148,7 @@ impl LocalShardsSnapshot { /// Takes a snapshot of the primary shards hosted by the ingester at regular intervals and /// broadcasts it to other nodes via Chitchat. 
-pub(super) struct BroadcastLocalShardsTask { +pub struct BroadcastLocalShardsTask { cluster: Cluster, weak_state: WeakIngesterState, shard_throughput_time_series_map: ShardThroughputTimeSeriesMap, @@ -229,36 +225,24 @@ impl ShardThroughputTimeSeriesMap { #[derive(Default)] struct ShardThroughputTimeSeries { shard_state: ShardState, - measurements: [ByteSize; SHARD_THROUGHPUT_LONG_TERM_WINDOW_LEN], - len: usize, + throughput: RingBuffer, } impl ShardThroughputTimeSeries { fn last(&self) -> ByteSize { - self.measurements.last().copied().unwrap_or_default() + self.throughput.last().unwrap_or_default() } fn average(&self) -> ByteSize { - if self.len == 0 { + if self.throughput.is_empty() { return ByteSize::default(); } - let sum = self - .measurements - .iter() - .rev() - .take(self.len) - .map(ByteSize::as_u64) - .sum::(); - ByteSize::b(sum / self.len as u64) + let sum = self.throughput.iter().map(ByteSize::as_u64).sum::(); + ByteSize::b(sum / self.throughput.len() as u64) } fn record(&mut self, new_throughput_measurement: ByteSize) { - self.len = (self.len + 1).min(SHARD_THROUGHPUT_LONG_TERM_WINDOW_LEN); - self.measurements.rotate_left(1); - let Some(last_measurement) = self.measurements.last_mut() else { - return; - }; - *last_measurement = new_throughput_measurement; + self.throughput.push_back(new_throughput_measurement); } } @@ -338,13 +322,13 @@ impl BroadcastLocalShardsTask { source_uid, shard_infos, } => { - let key = make_key(source_uid); + let key = make_key(INGESTER_PRIMARY_SHARDS_PREFIX, source_uid); let value = serde_json::to_string(&shard_infos) .expect("`ShardInfos` should be JSON serializable"); self.cluster.set_self_key_value(key, value).await; } ShardInfosChange::Removed { source_uid } => { - let key = make_key(source_uid); + let key = make_key(INGESTER_PRIMARY_SHARDS_PREFIX, source_uid); self.cluster.remove_self_key(&key).await; } } @@ -371,22 +355,6 @@ impl BroadcastLocalShardsTask { } } -fn make_key(source_uid: &SourceUid) -> String { - format!( 
- "{INGESTER_PRIMARY_SHARDS_PREFIX}{}:{}", - source_uid.index_uid, source_uid.source_id - ) -} - -fn parse_key(key: &str) -> Option { - let (index_uid_str, source_id_str) = key.rsplit_once(':')?; - - Some(SourceUid { - index_uid: index_uid_str.parse().ok()?, - source_id: source_id_str.to_string(), - }) -} - #[derive(Debug, Clone)] pub struct LocalShardsUpdate { pub leader_id: NodeId, @@ -429,10 +397,12 @@ mod tests { use std::sync::atomic::{AtomicUsize, Ordering}; use quickwit_cluster::{ChannelTransport, create_cluster_for_test}; + use quickwit_common::shared_consts::INGESTER_PRIMARY_SHARDS_PREFIX; use quickwit_proto::ingest::ShardState; - use quickwit_proto::types::{IndexUid, SourceId}; + use quickwit_proto::types::{IndexUid, NodeId, ShardId, SourceId, SourceUid}; use super::*; + use crate::RateMibPerSec; use crate::ingest_v2::models::IngesterShard; use crate::ingest_v2::state::IngesterState; @@ -626,30 +596,6 @@ mod tests { assert!(value_opt.is_none()); } - #[test] - fn test_make_key() { - let source_uid = SourceUid { - index_uid: IndexUid::for_test("test-index", 0), - source_id: SourceId::from("test-source"), - }; - let key = make_key(&source_uid); - assert_eq!( - key, - "ingester.primary_shards:test-index:00000000000000000000000000:test-source" - ); - } - - #[test] - fn test_parse_key() { - let key = "test-index:00000000000000000000000000:test-source"; - let source_uid = parse_key(key).unwrap(); - assert_eq!( - &source_uid.index_uid.to_string(), - "test-index:00000000000000000000000000" - ); - assert_eq!(source_uid.source_id, "test-source".to_string()); - } - #[tokio::test] async fn test_local_shards_update_listener() { let transport = ChannelTransport::default(); @@ -686,7 +632,7 @@ mod tests { index_uid: index_uid.clone(), source_id: SourceId::from("test-source"), }; - let key = make_key(&source_uid); + let key = make_key(INGESTER_PRIMARY_SHARDS_PREFIX, &source_uid); let value = serde_json::to_string(&vec![ShardInfo { shard_id: ShardId::from(1), shard_state: 
ShardState::Open, diff --git a/quickwit/quickwit-ingest/src/ingest_v2/broadcast/mod.rs b/quickwit/quickwit-ingest/src/ingest_v2/broadcast/mod.rs new file mode 100644 index 00000000000..d2184a0e392 --- /dev/null +++ b/quickwit/quickwit-ingest/src/ingest_v2/broadcast/mod.rs @@ -0,0 +1,76 @@ +// Copyright 2021-Present Datadog, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#[allow(dead_code)] +mod ingester_capacity_score; +mod local_shards; + +use std::time::Duration; + +use quickwit_proto::types::SourceUid; + +pub(in crate::ingest_v2) const BROADCAST_INTERVAL_PERIOD: Duration = if cfg!(test) { + Duration::from_millis(50) +} else { + Duration::from_secs(5) +}; + +pub use local_shards::{ + BroadcastLocalShardsTask, LocalShardsUpdate, ShardInfo, ShardInfos, + setup_local_shards_update_listener, +}; + +fn make_key(prefix: &str, source_uid: &SourceUid) -> String { + format!("{prefix}{}:{}", source_uid.index_uid, source_uid.source_id) +} + +fn parse_key(key: &str) -> Option { + let (index_uid_str, source_id_str) = key.rsplit_once(':')?; + Some(SourceUid { + index_uid: index_uid_str.parse().ok()?, + source_id: source_id_str.to_string(), + }) +} + +#[cfg(test)] +mod tests { + use quickwit_common::shared_consts::INGESTER_PRIMARY_SHARDS_PREFIX; + use quickwit_proto::types::{IndexUid, SourceId, SourceUid}; + + use super::*; + + #[test] + fn test_make_key() { + let source_uid = SourceUid { + index_uid: IndexUid::for_test("test-index", 0), + source_id: 
SourceId::from("test-source"), + }; + let key = make_key(INGESTER_PRIMARY_SHARDS_PREFIX, &source_uid); + assert_eq!( + key, + "ingester.primary_shards:test-index:00000000000000000000000000:test-source" + ); + } + + #[test] + fn test_parse_key() { + let key = "test-index:00000000000000000000000000:test-source"; + let source_uid = parse_key(key).unwrap(); + assert_eq!( + &source_uid.index_uid.to_string(), + "test-index:00000000000000000000000000" + ); + assert_eq!(source_uid.source_id, "test-source".to_string()); + } +} diff --git a/quickwit/quickwit-ingest/src/ingest_v2/state.rs b/quickwit/quickwit-ingest/src/ingest_v2/state.rs index 591ef4f704f..bf1c648c6cb 100644 --- a/quickwit/quickwit-ingest/src/ingest_v2/state.rs +++ b/quickwit/quickwit-ingest/src/ingest_v2/state.rs @@ -19,6 +19,7 @@ use std::path::Path; use std::sync::{Arc, Weak}; use std::time::{Duration, Instant}; +use itertools::Itertools; use mrecordlog::error::{DeleteQueueError, TruncateError}; use quickwit_common::pretty::PrettyDisplay; use quickwit_common::rate_limiter::{RateLimiter, RateLimiterSettings}; @@ -87,6 +88,17 @@ impl InnerIngesterState { .max_by_key(|(available_permits, _)| *available_permits) .map(|(_, shard)| shard) } + + pub fn get_open_shard_counts(&self) -> Vec<(IndexUid, SourceId, usize)> { + self.shards + .values() + .filter(|shard| shard.is_advertisable && !shard.is_replica() && shard.is_open()) + .map(|shard| (shard.index_uid.clone(), shard.source_id.clone())) + .counts() + .into_iter() + .map(|((index_uid, source_id), count)| (index_uid, source_id, count)) + .collect() + } } impl IngesterState { @@ -467,7 +479,7 @@ impl WeakIngesterState { #[cfg(test)] mod tests { use bytesize::ByteSize; - use quickwit_proto::types::ShardId; + use quickwit_proto::types::{NodeId, ShardId, SourceId}; use tokio::time::timeout; use super::*; @@ -642,4 +654,77 @@ mod tests { locked_state.find_most_capacity_shard_mut(&index_uid, &SourceId::from("other-source")); assert!(shard_opt.is_none()); } + + fn 
open_shard( + index_uid: IndexUid, + source_id: SourceId, + shard_id: ShardId, + is_replica: bool, + ) -> IngesterShard { + let builder = if is_replica { + IngesterShard::new_replica(index_uid, source_id, shard_id, NodeId::from("test-leader")) + } else { + IngesterShard::new_solo(index_uid, source_id, shard_id) + }; + builder.advertisable().build() + } + + #[tokio::test] + async fn test_get_open_shard_counts() { + let (_temp_dir, state) = IngesterState::for_test().await; + let mut state_guard = state.lock_partially().await.unwrap(); + + let index_a = IndexUid::for_test("index-a", 0); + let index_b = IndexUid::for_test("index-b", 0); + let index_c = IndexUid::for_test("index-c", 0); + + // (index-a, source-a): 1 open solo shard. + let s = open_shard( + index_a.clone(), + SourceId::from("source-a"), + ShardId::from(1), + false, + ); + state_guard.shards.insert(s.queue_id(), s); + + // (index-b, source-b): 1 open solo + 1 replica. Only the solo should be counted. + let s = open_shard( + index_b.clone(), + SourceId::from("source-b"), + ShardId::from(2), + false, + ); + state_guard.shards.insert(s.queue_id(), s); + let s = open_shard( + index_b.clone(), + SourceId::from("source-b"), + ShardId::from(3), + true, + ); + state_guard.shards.insert(s.queue_id(), s); + + // (index-c, source-c): 2 open solo shards. 
+ let s = open_shard( + index_c.clone(), + SourceId::from("source-c"), + ShardId::from(4), + false, + ); + state_guard.shards.insert(s.queue_id(), s); + let s = open_shard( + index_c.clone(), + SourceId::from("source-c"), + ShardId::from(5), + false, + ); + state_guard.shards.insert(s.queue_id(), s); + + let mut counts = state_guard.get_open_shard_counts(); + counts.sort_by(|a, b| a.0.cmp(&b.0)); + + assert_eq!(counts.len(), 3); + assert_eq!(counts[0], (index_a, SourceId::from("source-a"), 1)); + assert_eq!(counts[1], (index_b, SourceId::from("source-b"), 1)); + assert_eq!(counts[2], (index_c, SourceId::from("source-c"), 2)); + } } From 6b22b1d4e066ffb2b7d01568a53ad73962cc87a7 Mon Sep 17 00:00:00 2001 From: nadav-govari Date: Thu, 19 Feb 2026 13:44:03 -0500 Subject: [PATCH 2/9] Implement node based routing table (#6159) --- .../broadcast/ingester_capacity_score.rs | 79 ++-- .../src/ingest_v2/broadcast/mod.rs | 4 + .../quickwit-ingest/src/ingest_v2/ingester.rs | 5 +- quickwit/quickwit-ingest/src/ingest_v2/mod.rs | 7 +- .../src/ingest_v2/node_routing_table.rs | 417 ++++++++++++++++++ .../quickwit-ingest/src/ingest_v2/router.rs | 28 +- .../quickwit-ingest/src/ingest_v2/state.rs | 4 +- 7 files changed, 499 insertions(+), 45 deletions(-) create mode 100644 quickwit/quickwit-ingest/src/ingest_v2/node_routing_table.rs diff --git a/quickwit/quickwit-ingest/src/ingest_v2/broadcast/ingester_capacity_score.rs b/quickwit/quickwit-ingest/src/ingest_v2/broadcast/ingester_capacity_score.rs index 6f8abc66ef8..1927eb788f7 100644 --- a/quickwit/quickwit-ingest/src/ingest_v2/broadcast/ingester_capacity_score.rs +++ b/quickwit/quickwit-ingest/src/ingest_v2/broadcast/ingester_capacity_score.rs @@ -42,23 +42,22 @@ const WAL_CAPACITY_LOOKBACK_WINDOW_LEN: usize = 6; const WAL_CAPACITY_READINGS_LEN: usize = WAL_CAPACITY_LOOKBACK_WINDOW_LEN + 1; struct WalMemoryCapacityTimeSeries { + memory_capacity: ByteSize, readings: RingBuffer, } impl WalMemoryCapacityTimeSeries { - fn new() -> Self { + 
fn new(memory_capacity: ByteSize) -> Self { + #[cfg(not(test))] + assert!(memory_capacity.as_u64() > 0); Self { + memory_capacity, readings: RingBuffer::default(), } } - fn record(&mut self, memory_used: ByteSize, memory_allocated: ByteSize) { - let allocated = memory_allocated.as_u64(); - if allocated == 0 { - self.readings.push_back(1.0); - return; - } - let remaining = 1.0 - (memory_used.as_u64() as f64 / allocated as f64); + fn record(&mut self, memory_used: ByteSize) { + let remaining = 1.0 - (memory_used.as_u64() as f64 / self.memory_capacity.as_u64() as f64); self.readings.push_back(remaining.clamp(0.0, 1.0)); } @@ -121,23 +120,27 @@ pub struct IngesterCapacityScore { /// Periodically snapshots the ingester's WAL memory usage and open shard counts, computes /// a capacity score, and broadcasts it to other nodes via Chitchat. -pub(crate) struct BroadcastIngesterCapacityScoreTask { +pub struct BroadcastIngesterCapacityScoreTask { cluster: Cluster, weak_state: WeakIngesterState, wal_capacity_time_series: WalMemoryCapacityTimeSeries, } impl BroadcastIngesterCapacityScoreTask { - pub fn spawn(cluster: Cluster, weak_state: WeakIngesterState) -> JoinHandle<()> { + pub fn spawn( + cluster: Cluster, + weak_state: WeakIngesterState, + memory_capacity: ByteSize, + ) -> JoinHandle<()> { let mut broadcaster = Self { cluster, weak_state, - wal_capacity_time_series: WalMemoryCapacityTimeSeries::new(), + wal_capacity_time_series: WalMemoryCapacityTimeSeries::new(memory_capacity), }; tokio::spawn(async move { broadcaster.run().await }) } - async fn snapshot(&self) -> Result> { + async fn snapshot(&self) -> Result> { let state = self .weak_state .upgrade() @@ -155,10 +158,9 @@ impl BroadcastIngesterCapacityScoreTask { .map_err(|_| anyhow::anyhow!("failed to acquire ingester state lock"))?; let usage = guard.mrecordlog.resource_usage(); let memory_used = ByteSize::b(usage.memory_used_bytes as u64); - let memory_allocated = ByteSize::b(usage.memory_allocated_bytes as u64); let 
open_shard_counts = guard.get_open_shard_counts(); - Ok(Some((memory_used, memory_allocated, open_shard_counts))) + Ok(Some((memory_used, open_shard_counts))) } async fn run(&mut self) { @@ -168,7 +170,7 @@ impl BroadcastIngesterCapacityScoreTask { loop { interval.tick().await; - let (memory_used, memory_allocated, open_shard_counts) = match self.snapshot().await { + let (memory_used, open_shard_counts) = match self.snapshot().await { Ok(Some(snapshot)) => snapshot, Ok(None) => continue, Err(error) => { @@ -177,8 +179,7 @@ impl BroadcastIngesterCapacityScoreTask { } }; - self.wal_capacity_time_series - .record(memory_used, memory_allocated); + self.wal_capacity_time_series.record(memory_used); let remaining_capacity = self.wal_capacity_time_series.current().unwrap_or(1.0); let capacity_delta = self.wal_capacity_time_series.delta().unwrap_or(0.0); @@ -272,30 +273,31 @@ mod tests { use crate::ingest_v2::state::IngesterState; fn ts() -> WalMemoryCapacityTimeSeries { - WalMemoryCapacityTimeSeries::new() + WalMemoryCapacityTimeSeries::new(ByteSize::b(100)) } - /// Helper: record a reading with `used` out of `allocated` bytes. - fn record(series: &mut WalMemoryCapacityTimeSeries, used: u64, allocated: u64) { - series.record(ByteSize::b(used), ByteSize::b(allocated)); + /// Helper: record a reading with `used` bytes against the series' fixed capacity. 
+ fn record(series: &mut WalMemoryCapacityTimeSeries, used: u64) { + series.record(ByteSize::b(used)); } #[test] fn test_wal_memory_capacity_current_after_record() { - let mut series = ts(); + let mut series = WalMemoryCapacityTimeSeries::new(ByteSize::b(256)); // 192 of 256 used => 25% remaining - record(&mut series, 192, 256); + series.record(ByteSize::b(192)); assert_eq!(series.current(), Some(0.25)); // 16 of 256 used => 93.75% remaining - record(&mut series, 16, 256); + series.record(ByteSize::b(16)); assert_eq!(series.current(), Some(0.9375)); } #[test] fn test_wal_memory_capacity_record_saturates_at_zero() { let mut series = ts(); - record(&mut series, 200, 100); + // 200 used out of 100 capacity => clamped to 0.0 + record(&mut series, 200); assert_eq!(series.current(), Some(0.0)); } @@ -303,9 +305,9 @@ mod tests { fn test_wal_memory_capacity_delta_growing() { let mut series = ts(); // oldest: 60 of 100 used => 40% remaining - record(&mut series, 60, 100); + record(&mut series, 60); // current: 20 of 100 used => 80% remaining - record(&mut series, 20, 100); + record(&mut series, 20); // delta = 0.80 - 0.40 = 0.40 assert_eq!(series.delta(), Some(0.40)); } @@ -314,9 +316,9 @@ mod tests { fn test_wal_memory_capacity_delta_shrinking() { let mut series = ts(); // oldest: 20 of 100 used => 80% remaining - record(&mut series, 20, 100); + record(&mut series, 20); // current: 60 of 100 used => 40% remaining - record(&mut series, 60, 100); + record(&mut series, 60); // delta = 0.40 - 0.80 = -0.40 assert_eq!(series.delta(), Some(-0.40)); } @@ -326,7 +328,7 @@ mod tests { // Node A: capacity draining — usage increases 10, 20, ..., 70 over 7 ticks. let mut node_a = ts(); for used in (10..=70).step_by(10) { - record(&mut node_a, used, 100); + record(&mut node_a, used); } let a_remaining = node_a.current().unwrap(); let a_delta = node_a.delta().unwrap(); @@ -335,7 +337,7 @@ mod tests { // Node B: steady at 50% usage over 7 ticks. 
let mut node_b = ts(); for _ in 0..7 { - record(&mut node_b, 50, 100); + record(&mut node_b, 50); } let b_remaining = node_b.current().unwrap(); let b_delta = node_b.delta().unwrap(); @@ -361,7 +363,7 @@ mod tests { let task = BroadcastIngesterCapacityScoreTask { cluster, weak_state, - wal_capacity_time_series: WalMemoryCapacityTimeSeries::new(), + wal_capacity_time_series: WalMemoryCapacityTimeSeries::new(ByteSize::mb(1)), }; assert!(task.snapshot().await.is_err()); } @@ -388,14 +390,13 @@ mod tests { let open_shard_counts = state_guard.get_open_shard_counts(); drop(state_guard); - // Simulate 500 of 1000 bytes used => 50% remaining, 0 delta => score = 6 + // Simulate 500 of 1000 bytes capacity used => 50% remaining, 0 delta => score = 6 let mut task = BroadcastIngesterCapacityScoreTask { cluster: cluster.clone(), weak_state: state.weak(), - wal_capacity_time_series: WalMemoryCapacityTimeSeries::new(), + wal_capacity_time_series: WalMemoryCapacityTimeSeries::new(ByteSize::b(1000)), }; - task.wal_capacity_time_series - .record(ByteSize::b(500), ByteSize::b(1000)); + task.wal_capacity_time_series.record(ByteSize::b(500)); let remaining = task.wal_capacity_time_series.current().unwrap(); let delta = task.wal_capacity_time_series.delta().unwrap(); @@ -440,16 +441,16 @@ mod tests { // Fill to exactly the lookback window length (6 readings), all same value. for _ in 0..WAL_CAPACITY_LOOKBACK_WINDOW_LEN { - record(&mut series, 50, 100); + record(&mut series, 50); } assert_eq!(series.delta(), Some(0.0)); // 7th reading fills the ring buffer. Delta spans 6 intervals. - record(&mut series, 0, 100); + record(&mut series, 0); assert_eq!(series.delta(), Some(0.50)); // 8th reading evicts the oldest 50-remaining. Delta still spans 6 intervals. 
- record(&mut series, 0, 100); + record(&mut series, 0); assert_eq!(series.delta(), Some(0.50)); } } diff --git a/quickwit/quickwit-ingest/src/ingest_v2/broadcast/mod.rs b/quickwit/quickwit-ingest/src/ingest_v2/broadcast/mod.rs index d2184a0e392..18a00209de1 100644 --- a/quickwit/quickwit-ingest/src/ingest_v2/broadcast/mod.rs +++ b/quickwit/quickwit-ingest/src/ingest_v2/broadcast/mod.rs @@ -26,6 +26,10 @@ pub(in crate::ingest_v2) const BROADCAST_INTERVAL_PERIOD: Duration = if cfg!(tes Duration::from_secs(5) }; +pub use ingester_capacity_score::{ + BroadcastIngesterCapacityScoreTask, IngesterCapacityScoreUpdate, + setup_ingester_capacity_update_listener, +}; pub use local_shards::{ BroadcastLocalShardsTask, LocalShardsUpdate, ShardInfo, ShardInfos, setup_local_shards_update_listener, diff --git a/quickwit/quickwit-ingest/src/ingest_v2/ingester.rs b/quickwit/quickwit-ingest/src/ingest_v2/ingester.rs index 65c268881ac..fc1a44f19bb 100644 --- a/quickwit/quickwit-ingest/src/ingest_v2/ingester.rs +++ b/quickwit/quickwit-ingest/src/ingest_v2/ingester.rs @@ -59,7 +59,7 @@ use tokio::time::{sleep, timeout}; use tracing::{debug, error, info, warn}; use super::IngesterPool; -use super::broadcast::BroadcastLocalShardsTask; +use super::broadcast::{BroadcastIngesterCapacityScoreTask, BroadcastLocalShardsTask}; use super::doc_mapper::validate_doc_batch; use super::fetch::FetchStreamTask; use super::idle::CloseIdleShardsTask; @@ -144,7 +144,8 @@ impl Ingester { let state = IngesterState::load(wal_dir_path, rate_limiter_settings); let weak_state = state.weak(); - BroadcastLocalShardsTask::spawn(cluster, weak_state.clone()); + BroadcastLocalShardsTask::spawn(cluster.clone(), weak_state.clone()); + BroadcastIngesterCapacityScoreTask::spawn(cluster, weak_state.clone(), memory_capacity); CloseIdleShardsTask::spawn(weak_state, idle_shard_timeout); let ingester = Self { diff --git a/quickwit/quickwit-ingest/src/ingest_v2/mod.rs b/quickwit/quickwit-ingest/src/ingest_v2/mod.rs index 
c8543faf793..3a801763feb 100644 --- a/quickwit/quickwit-ingest/src/ingest_v2/mod.rs +++ b/quickwit/quickwit-ingest/src/ingest_v2/mod.rs @@ -22,6 +22,8 @@ mod metrics; mod models; mod mrecord; mod mrecordlog_utils; +#[allow(dead_code)] +mod node_routing_table; mod publish_tracker; mod rate_meter; mod replication; @@ -36,7 +38,10 @@ use std::ops::{Add, AddAssign}; use std::time::Duration; use std::{env, fmt}; -pub use broadcast::{LocalShardsUpdate, ShardInfo, ShardInfos, setup_local_shards_update_listener}; +pub use broadcast::{ + LocalShardsUpdate, ShardInfo, ShardInfos, setup_ingester_capacity_update_listener, + setup_local_shards_update_listener, +}; use bytes::buf::Writer; use bytes::{BufMut, BytesMut}; use bytesize::ByteSize; diff --git a/quickwit/quickwit-ingest/src/ingest_v2/node_routing_table.rs b/quickwit/quickwit-ingest/src/ingest_v2/node_routing_table.rs new file mode 100644 index 00000000000..2e49e26b783 --- /dev/null +++ b/quickwit/quickwit-ingest/src/ingest_v2/node_routing_table.rs @@ -0,0 +1,417 @@ +// Copyright 2021-Present Datadog, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use std::collections::{HashMap, HashSet}; + +use quickwit_proto::ingest::Shard; +use quickwit_proto::types::{IndexId, IndexUid, NodeId, SourceId, SourceUid}; +use rand::rng; +use rand::seq::IndexedRandom; + +use crate::IngesterPool; + +/// A single ingester node's routing-relevant data for a specific (index, source) pair. 
+/// Each entry is self-describing: it carries its own node_id, index_uid, and source_id +/// so it can always be attributed back to a specific source on a specific node. +#[derive(Debug, Clone)] +pub(super) struct IngesterNode { + pub node_id: NodeId, + pub index_uid: IndexUid, + pub source_id: SourceId, + /// Score from 0-10. Higher means more available capacity. + pub capacity_score: usize, + /// Number of open shards on this node for this (index, source) pair. Tiebreaker for power of + /// two choices comparison - we favor a node with more open shards. + pub open_shard_count: usize, +} + +#[derive(Debug)] +pub(super) struct RoutingEntry { + nodes: HashMap, +} + +/// Given a slice of candidates, picks the better of two random choices. +/// Higher capacity_score wins; tiebreak on more open_shard_count (more landing spots). +fn power_of_two_choices<'a>(candidates: &[&'a IngesterNode]) -> &'a IngesterNode { + debug_assert!(candidates.len() >= 2); + let mut iter = candidates.choose_multiple(&mut rng(), 2); + let (&a, &b) = (iter.next().unwrap(), iter.next().unwrap()); + + if (a.capacity_score, a.open_shard_count) >= (b.capacity_score, b.open_shard_count) { + a + } else { + b + } +} + +impl RoutingEntry { + /// Pick an ingester node to persist the request to. Uses power of two choices based on reported + /// ingester capacity, if more than one eligible node exists. 
+ pub fn pick_node( + &self, + ingester_pool: &IngesterPool, + unavailable_leaders: &HashSet, + ) -> Option<&IngesterNode> { + let eligible: Vec<&IngesterNode> = self + .nodes + .values() + .filter(|node| { + node.capacity_score > 0 + && node.open_shard_count > 0 + && ingester_pool.contains_key(&node.node_id) + && !unavailable_leaders.contains(&node.node_id) + }) + .collect(); + + match eligible.len() { + 0 => None, + 1 => Some(eligible[0]), + _ => Some(power_of_two_choices(&eligible)), + } + } +} + +#[derive(Debug, Default)] +pub(super) struct NodeBasedRoutingTable { + table: HashMap<(IndexId, SourceId), RoutingEntry>, +} + +impl NodeBasedRoutingTable { + pub fn find_entry(&self, index_id: &str, source_id: &str) -> Option<&RoutingEntry> { + let key = (index_id.to_string(), source_id.to_string()); + self.table.get(&key) + } + + pub fn debug_info(&self) -> HashMap> { + let mut per_index: HashMap> = HashMap::new(); + for ((index_id, source_id), entry) in &self.table { + for (node_id, node) in &entry.nodes { + per_index + .entry(index_id.clone()) + .or_default() + .push(serde_json::json!({ + "source_id": source_id, + "node_id": node_id, + "capacity_score": node.capacity_score, + "open_shard_count": node.open_shard_count, + })); + } + } + per_index + } + + pub fn has_open_nodes( + &self, + index_id: &str, + source_id: &str, + ingester_pool: &IngesterPool, + unavailable_leaders: &HashSet, + ) -> bool { + let key = (index_id.to_string(), source_id.to_string()); + let Some(entry) = self.table.get(&key) else { + return false; + }; + entry.nodes.values().any(|node| { + node.capacity_score > 0 + && node.open_shard_count > 0 + && ingester_pool.contains_key(&node.node_id) + && !unavailable_leaders.contains(&node.node_id) + }) + } + + /// Applies a capacity update from the IngesterCapacityScoreUpdate broadcast. This is the + /// primary way the table learns about node availability and capacity. 
+ pub fn apply_capacity_update( + &mut self, + node_id: NodeId, + source_uid: SourceUid, + capacity_score: usize, + open_shard_count: usize, + ) { + let key = ( + source_uid.index_uid.index_id.to_string(), + source_uid.source_id.clone(), + ); + + let entry = self.table.entry(key).or_insert_with(|| RoutingEntry { + nodes: HashMap::new(), + }); + + let ingester_node = IngesterNode { + node_id: node_id.clone(), + index_uid: source_uid.index_uid, + source_id: source_uid.source_id, + capacity_score, + open_shard_count, + }; + entry.nodes.insert(node_id, ingester_node); + } + + /// Merges nodes from a GetOrCreateOpenShards control plane response into the + /// table. Only adds nodes that aren't already present — existing nodes keep + /// their real capacity scores from the broadcast. + /// TODO: New nodes get a default capacity_score of 5 until GetOrCreateOpenShards contains + /// capacity scores. + pub fn merge_from_shards( + &mut self, + index_uid: IndexUid, + source_id: SourceId, + shards: Vec, + ) { + let key = (index_uid.index_id.to_string(), source_id.clone()); + + let mut per_leader_count: HashMap = HashMap::new(); + for shard in &shards { + if shard.is_open() { + *per_leader_count + .entry(NodeId::from(shard.leader_id.clone())) + .or_default() += 1; + } + } + + let entry = self.table.entry(key).or_insert_with(|| RoutingEntry { + nodes: HashMap::new(), + }); + + for (node_id, open_shard_count) in per_leader_count { + if entry.nodes.contains_key(&node_id) { + continue; + } + let ingester_node = IngesterNode { + node_id: node_id.clone(), + index_uid: index_uid.clone(), + source_id: source_id.clone(), + capacity_score: 5, + open_shard_count, + }; + entry.nodes.insert(node_id, ingester_node); + } + } +} + +#[cfg(test)] +mod tests { + use quickwit_proto::ingest::ShardState; + use quickwit_proto::ingest::ingester::IngesterServiceClient; + use quickwit_proto::types::ShardId; + + use super::*; + + fn source_uid(index_id: &str, incarnation_id: u128, source_id: &str) -> 
SourceUid { + SourceUid { + index_uid: IndexUid::for_test(index_id, incarnation_id), + source_id: source_id.to_string(), + } + } + + #[test] + fn test_apply_capacity_update() { + let mut table = NodeBasedRoutingTable::default(); + let uid = source_uid("test-index", 0, "test-source"); + let key = ("test-index".to_string(), "test-source".to_string()); + + // Insert first node. + table.apply_capacity_update("node-1".into(), uid.clone(), 8, 3); + let entry = table.table.get(&key).unwrap(); + assert_eq!(entry.nodes.len(), 1); + assert_eq!(entry.nodes.get("node-1").unwrap().capacity_score, 8); + + // Update existing node. + table.apply_capacity_update("node-1".into(), uid.clone(), 4, 5); + let node = table.table.get(&key).unwrap().nodes.get("node-1").unwrap(); + assert_eq!(node.capacity_score, 4); + assert_eq!(node.open_shard_count, 5); + + // Add second node. + table.apply_capacity_update("node-2".into(), uid.clone(), 6, 2); + assert_eq!(table.table.get(&key).unwrap().nodes.len(), 2); + + // Zero shards: node stays in table but becomes ineligible for routing. + table.apply_capacity_update("node-1".into(), uid.clone(), 0, 0); + let entry = table.table.get(&key).unwrap(); + assert_eq!(entry.nodes.len(), 2); + assert_eq!(entry.nodes.get("node-1").unwrap().open_shard_count, 0); + assert_eq!(entry.nodes.get("node-1").unwrap().capacity_score, 0); + } + + #[test] + fn test_has_open_nodes() { + let mut table = NodeBasedRoutingTable::default(); + let pool = IngesterPool::default(); + let uid = source_uid("test-index", 0, "test-source"); + + // Empty table. + assert!(!table.has_open_nodes("test-index", "test-source", &pool, &HashSet::new())); + + // Node exists but is not in pool. + table.apply_capacity_update("node-1".into(), uid.clone(), 8, 3); + assert!(!table.has_open_nodes("test-index", "test-source", &pool, &HashSet::new())); + + // Node is in pool → true. 
+ pool.insert("node-1".into(), IngesterServiceClient::mocked()); + assert!(table.has_open_nodes("test-index", "test-source", &pool, &HashSet::new())); + + // Node is unavailable → false. + let unavailable: HashSet = HashSet::from(["node-1".into()]); + assert!(!table.has_open_nodes("test-index", "test-source", &pool, &unavailable)); + + // Second node available → true despite first being unavailable. + table.apply_capacity_update("node-2".into(), uid.clone(), 6, 2); + pool.insert("node-2".into(), IngesterServiceClient::mocked()); + assert!(table.has_open_nodes("test-index", "test-source", &pool, &unavailable)); + + // Node with capacity_score=0 is not eligible. + table.apply_capacity_update("node-2".into(), uid, 0, 2); + assert!(!table.has_open_nodes("test-index", "test-source", &pool, &unavailable)); + } + + #[test] + fn test_pick_node() { + let mut table = NodeBasedRoutingTable::default(); + let pool = IngesterPool::default(); + let uid = source_uid("test-index", 0, "test-source"); + let key = ("test-index".to_string(), "test-source".to_string()); + + // Node exists but not in pool → None. + table.apply_capacity_update("node-1".into(), uid.clone(), 8, 3); + assert!( + table + .table + .get(&key) + .unwrap() + .pick_node(&pool, &HashSet::new()) + .is_none() + ); + + // Single node in pool → picks it. + pool.insert("node-1".into(), IngesterServiceClient::mocked()); + let picked = table + .table + .get(&key) + .unwrap() + .pick_node(&pool, &HashSet::new()) + .unwrap(); + assert_eq!(picked.node_id, NodeId::from("node-1")); + + // Multiple nodes → something is returned. + table.apply_capacity_update("node-2".into(), uid.clone(), 2, 1); + pool.insert("node-2".into(), IngesterServiceClient::mocked()); + assert!( + table + .table + .get(&key) + .unwrap() + .pick_node(&pool, &HashSet::new()) + .is_some() + ); + + // Node with capacity_score=0 is skipped. 
+ table.apply_capacity_update("node-1".into(), uid.clone(), 0, 3); + table.apply_capacity_update("node-2".into(), uid, 0, 1); + assert!( + table + .table + .get(&key) + .unwrap() + .pick_node(&pool, &HashSet::new()) + .is_none() + ); + } + + #[test] + fn test_power_of_two_choices() { + // 3 candidates: best appears in the random pair 2/3 of the time and always + // wins when it does, so it should win ~67% of 1000 runs. Asserting > 550 + // is ~7.5 standard deviations from the mean — effectively impossible to flake. + let high = IngesterNode { + node_id: "high".into(), + index_uid: IndexUid::for_test("idx", 0), + source_id: "src".into(), + capacity_score: 9, + open_shard_count: 2, + }; + let mid = IngesterNode { + node_id: "mid".into(), + index_uid: IndexUid::for_test("idx", 0), + source_id: "src".into(), + capacity_score: 5, + open_shard_count: 2, + }; + let low = IngesterNode { + node_id: "low".into(), + index_uid: IndexUid::for_test("idx", 0), + source_id: "src".into(), + capacity_score: 1, + open_shard_count: 2, + }; + let candidates: Vec<&IngesterNode> = vec![&high, &mid, &low]; + + let mut high_wins = 0; + for _ in 0..1000 { + if power_of_two_choices(&candidates).node_id == "high" { + high_wins += 1; + } + } + assert!(high_wins > 550, "high won only {high_wins}/1000 times"); + } + + #[test] + fn test_merge_from_shards() { + let mut table = NodeBasedRoutingTable::default(); + let index_uid = IndexUid::for_test("test-index", 0); + let key = ("test-index".to_string(), "test-source".to_string()); + + let make_shard = |id: u64, leader: &str, open: bool| Shard { + index_uid: Some(index_uid.clone()), + source_id: "test-source".to_string(), + shard_id: Some(ShardId::from(id)), + shard_state: if open { + ShardState::Open as i32 + } else { + ShardState::Closed as i32 + }, + leader_id: leader.to_string(), + ..Default::default() + }; + + // Two open shards on node-1, one open on node-2, one closed (ignored). 
+ let shards = vec![ + make_shard(1, "node-1", true), + make_shard(2, "node-1", true), + make_shard(3, "node-2", true), + make_shard(4, "node-2", false), + ]; + table.merge_from_shards(index_uid.clone(), "test-source".into(), shards); + + let entry = table.table.get(&key).unwrap(); + assert_eq!(entry.nodes.len(), 2); + + let n1 = entry.nodes.get("node-1").unwrap(); + assert_eq!(n1.open_shard_count, 2); + assert_eq!(n1.capacity_score, 5); + + let n2 = entry.nodes.get("node-2").unwrap(); + assert_eq!(n2.open_shard_count, 1); + + // Merging again adds new nodes but preserves existing ones. + let shards = vec![make_shard(10, "node-3", true)]; + table.merge_from_shards(index_uid, "test-source".into(), shards); + + let entry = table.table.get(&key).unwrap(); + assert_eq!(entry.nodes.len(), 3); + assert!(entry.nodes.contains_key("node-1")); + assert!(entry.nodes.contains_key("node-2")); + assert!(entry.nodes.contains_key("node-3")); + } +} diff --git a/quickwit/quickwit-ingest/src/ingest_v2/router.rs b/quickwit/quickwit-ingest/src/ingest_v2/router.rs index 67ad31a2722..ccd00f0209c 100644 --- a/quickwit/quickwit-ingest/src/ingest_v2/router.rs +++ b/quickwit/quickwit-ingest/src/ingest_v2/router.rs @@ -43,12 +43,13 @@ use tokio::sync::{Mutex, Semaphore}; use tokio::time::error::Elapsed; use tracing::{error, info}; -use super::broadcast::LocalShardsUpdate; +use super::broadcast::{IngesterCapacityScoreUpdate, LocalShardsUpdate}; use super::debouncing::{ DebouncedGetOrCreateOpenShardsRequest, GetOrCreateOpenShardsRequestDebouncer, }; use super::ingester::PERSIST_REQUEST_TIMEOUT; use super::metrics::IngestResultMetrics; +use super::node_routing_table::NodeBasedRoutingTable; use super::routing_table::{NextOpenShardError, RoutingTable}; use super::workbench::IngestWorkbench; use super::{IngesterPool, pending_subrequests}; @@ -105,6 +106,9 @@ struct RouterState { debouncer: GetOrCreateOpenShardsRequestDebouncer, // Holds the routing table mapping index and source IDs to shards. 
routing_table: RoutingTable, + // Node-based routing table, populated by capacity broadcasts. + // Not yet used for routing — will replace `routing_table` in a follow-up PR. + node_routing_table: NodeBasedRoutingTable, } impl fmt::Debug for IngestRouter { @@ -130,6 +134,7 @@ impl IngestRouter { self_node_id: self_node_id.clone(), table: HashMap::default(), }, + node_routing_table: NodeBasedRoutingTable::default(), })); let ingest_semaphore_permits = get_ingest_router_buffer_size().as_u64() as usize; let ingest_semaphore = Arc::new(Semaphore::new(ingest_semaphore_permits)); @@ -151,7 +156,10 @@ impl IngestRouter { .subscribe::(weak_router_state.clone()) .forever(); self.event_broker - .subscribe::(weak_router_state) + .subscribe::(weak_router_state.clone()) + .forever(); + self.event_broker + .subscribe::(weak_router_state) .forever(); } @@ -694,6 +702,22 @@ impl EventSubscriber for WeakRouterState { } } +#[async_trait] +impl EventSubscriber for WeakRouterState { + async fn handle_event(&mut self, update: IngesterCapacityScoreUpdate) { + let Some(state) = self.0.upgrade() else { + return; + }; + let mut state_guard = state.lock().await; + state_guard.node_routing_table.apply_capacity_update( + update.node_id, + update.source_uid, + update.capacity_score, + update.open_shard_count, + ); + } +} + pub(super) struct PersistRequestSummary { pub leader_id: NodeId, pub subrequest_ids: Vec, diff --git a/quickwit/quickwit-ingest/src/ingest_v2/state.rs b/quickwit/quickwit-ingest/src/ingest_v2/state.rs index bf1c648c6cb..a14f4ae9a44 100644 --- a/quickwit/quickwit-ingest/src/ingest_v2/state.rs +++ b/quickwit/quickwit-ingest/src/ingest_v2/state.rs @@ -154,8 +154,10 @@ impl IngesterState { /// queues. Empty queues are deleted, while non-empty queues are recovered. However, the /// corresponding shards are closed and become read-only. 
pub async fn init(&self, wal_dir_path: &Path, rate_limiter_settings: RateLimiterSettings) { - let mut inner_guard = self.inner.lock().await; + // Acquire locks in the same order as `lock_fully` (mrecordlog first, then inner) to + // prevent ABBA deadlocks with the broadcast capacity task. let mut mrecordlog_guard = self.mrecordlog.write().await; + let mut inner_guard = self.inner.lock().await; let now = Instant::now(); From adb619a9a597dbf1bd105b67ee05cb20eb1b9108 Mon Sep 17 00:00:00 2001 From: nadav-govari Date: Thu, 26 Feb 2026 13:56:54 -0500 Subject: [PATCH 3/9] Use new node based routing table for routing decisions (#6163) --- .../broadcast/ingester_capacity_score.rs | 40 +- .../src/ingest_v2/debouncing.rs | 4 +- .../quickwit-ingest/src/ingest_v2/ingester.rs | 48 +- quickwit/quickwit-ingest/src/ingest_v2/mod.rs | 1 + .../src/ingest_v2/node_routing_table.rs | 2 +- .../quickwit-ingest/src/ingest_v2/router.rs | 1247 +++++------------ .../src/ingest_v2/workbench.rs | 14 +- .../protos/quickwit/ingester.proto | 5 +- .../quickwit/quickwit.ingest.ingester.rs | 15 +- quickwit/quickwit-proto/src/ingest/mod.rs | 9 +- quickwit/quickwit-serve/src/lib.rs | 8 +- .../tag_fields/0002_negative_tags.yaml | 8 +- .../scenarii/tag_fields/_setup.quickwit.yaml | 10 +- .../tag_fields/_teardown.quickwit.yaml | 2 +- 14 files changed, 432 insertions(+), 981 deletions(-) diff --git a/quickwit/quickwit-ingest/src/ingest_v2/broadcast/ingester_capacity_score.rs b/quickwit/quickwit-ingest/src/ingest_v2/broadcast/ingester_capacity_score.rs index 1927eb788f7..9531db17deb 100644 --- a/quickwit/quickwit-ingest/src/ingest_v2/broadcast/ingester_capacity_score.rs +++ b/quickwit/quickwit-ingest/src/ingest_v2/broadcast/ingester_capacity_score.rs @@ -41,12 +41,12 @@ const WAL_CAPACITY_LOOKBACK_WINDOW_LEN: usize = 6; /// reading would be discarded when the next reading is inserted. 
const WAL_CAPACITY_READINGS_LEN: usize = WAL_CAPACITY_LOOKBACK_WINDOW_LEN + 1; -struct WalMemoryCapacityTimeSeries { +struct WalDiskCapacityTimeSeries { memory_capacity: ByteSize, readings: RingBuffer, } -impl WalMemoryCapacityTimeSeries { +impl WalDiskCapacityTimeSeries { fn new(memory_capacity: ByteSize) -> Self { #[cfg(not(test))] assert!(memory_capacity.as_u64() > 0); @@ -123,19 +123,19 @@ pub struct IngesterCapacityScore { pub struct BroadcastIngesterCapacityScoreTask { cluster: Cluster, weak_state: WeakIngesterState, - wal_capacity_time_series: WalMemoryCapacityTimeSeries, + wal_capacity_time_series: WalDiskCapacityTimeSeries, } impl BroadcastIngesterCapacityScoreTask { pub fn spawn( cluster: Cluster, weak_state: WeakIngesterState, - memory_capacity: ByteSize, + disk_capacity: ByteSize, ) -> JoinHandle<()> { let mut broadcaster = Self { cluster, weak_state, - wal_capacity_time_series: WalMemoryCapacityTimeSeries::new(memory_capacity), + wal_capacity_time_series: WalDiskCapacityTimeSeries::new(disk_capacity), }; tokio::spawn(async move { broadcaster.run().await }) } @@ -157,10 +157,10 @@ impl BroadcastIngesterCapacityScoreTask { .await .map_err(|_| anyhow::anyhow!("failed to acquire ingester state lock"))?; let usage = guard.mrecordlog.resource_usage(); - let memory_used = ByteSize::b(usage.memory_used_bytes as u64); + let disk_used = ByteSize::b(usage.disk_used_bytes as u64); let open_shard_counts = guard.get_open_shard_counts(); - Ok(Some((memory_used, open_shard_counts))) + Ok(Some((disk_used, open_shard_counts))) } async fn run(&mut self) { @@ -170,7 +170,7 @@ impl BroadcastIngesterCapacityScoreTask { loop { interval.tick().await; - let (memory_used, open_shard_counts) = match self.snapshot().await { + let (disk_used, open_shard_counts) = match self.snapshot().await { Ok(Some(snapshot)) => snapshot, Ok(None) => continue, Err(error) => { @@ -179,7 +179,7 @@ impl BroadcastIngesterCapacityScoreTask { } }; - self.wal_capacity_time_series.record(memory_used); + 
self.wal_capacity_time_series.record(disk_used); let remaining_capacity = self.wal_capacity_time_series.current().unwrap_or(1.0); let capacity_delta = self.wal_capacity_time_series.delta().unwrap_or(0.0); @@ -272,18 +272,18 @@ mod tests { use crate::ingest_v2::models::IngesterShard; use crate::ingest_v2::state::IngesterState; - fn ts() -> WalMemoryCapacityTimeSeries { - WalMemoryCapacityTimeSeries::new(ByteSize::b(100)) + fn ts() -> WalDiskCapacityTimeSeries { + WalDiskCapacityTimeSeries::new(ByteSize::b(100)) } /// Helper: record a reading with `used` bytes against the series' fixed capacity. - fn record(series: &mut WalMemoryCapacityTimeSeries, used: u64) { + fn record(series: &mut WalDiskCapacityTimeSeries, used: u64) { series.record(ByteSize::b(used)); } #[test] - fn test_wal_memory_capacity_current_after_record() { - let mut series = WalMemoryCapacityTimeSeries::new(ByteSize::b(256)); + fn test_wal_disk_capacity_current_after_record() { + let mut series = WalDiskCapacityTimeSeries::new(ByteSize::b(256)); // 192 of 256 used => 25% remaining series.record(ByteSize::b(192)); assert_eq!(series.current(), Some(0.25)); @@ -294,7 +294,7 @@ mod tests { } #[test] - fn test_wal_memory_capacity_record_saturates_at_zero() { + fn test_wal_disk_capacity_record_saturates_at_zero() { let mut series = ts(); // 200 used out of 100 capacity => clamped to 0.0 record(&mut series, 200); @@ -302,7 +302,7 @@ mod tests { } #[test] - fn test_wal_memory_capacity_delta_growing() { + fn test_wal_disk_capacity_delta_growing() { let mut series = ts(); // oldest: 60 of 100 used => 40% remaining record(&mut series, 60); @@ -313,7 +313,7 @@ mod tests { } #[test] - fn test_wal_memory_capacity_delta_shrinking() { + fn test_wal_disk_capacity_delta_shrinking() { let mut series = ts(); // oldest: 20 of 100 used => 80% remaining record(&mut series, 20); @@ -363,7 +363,7 @@ mod tests { let task = BroadcastIngesterCapacityScoreTask { cluster, weak_state, - wal_capacity_time_series: 
WalMemoryCapacityTimeSeries::new(ByteSize::mb(1)), + wal_capacity_time_series: WalDiskCapacityTimeSeries::new(ByteSize::mb(1)), }; assert!(task.snapshot().await.is_err()); } @@ -394,7 +394,7 @@ mod tests { let mut task = BroadcastIngesterCapacityScoreTask { cluster: cluster.clone(), weak_state: state.weak(), - wal_capacity_time_series: WalMemoryCapacityTimeSeries::new(ByteSize::b(1000)), + wal_capacity_time_series: WalDiskCapacityTimeSeries::new(ByteSize::b(1000)), }; task.wal_capacity_time_series.record(ByteSize::b(500)); @@ -436,7 +436,7 @@ mod tests { } #[test] - fn test_wal_memory_capacity_delta_spans_lookback_window() { + fn test_wal_disk_capacity_delta_spans_lookback_window() { let mut series = ts(); // Fill to exactly the lookback window length (6 readings), all same value. diff --git a/quickwit/quickwit-ingest/src/ingest_v2/debouncing.rs b/quickwit/quickwit-ingest/src/ingest_v2/debouncing.rs index 041f2928c45..19d6f5d691d 100644 --- a/quickwit/quickwit-ingest/src/ingest_v2/debouncing.rs +++ b/quickwit/quickwit-ingest/src/ingest_v2/debouncing.rs @@ -18,7 +18,6 @@ use std::sync::Arc; use quickwit_proto::control_plane::{ GetOrCreateOpenShardsRequest, GetOrCreateOpenShardsSubrequest, }; -use quickwit_proto::ingest::ShardIds; use quickwit_proto::types::{IndexId, SourceId}; use tokio::sync::{OwnedRwLockWriteGuard, RwLock}; @@ -69,7 +68,6 @@ impl GetOrCreateOpenShardsRequestDebouncer { #[derive(Default)] pub(super) struct DebouncedGetOrCreateOpenShardsRequest { subrequests: Vec, - pub closed_shards: Vec, pub unavailable_leaders: Vec, rendezvous: Rendezvous, } @@ -85,8 +83,8 @@ impl DebouncedGetOrCreateOpenShardsRequest { } let request = GetOrCreateOpenShardsRequest { subrequests: self.subrequests, - closed_shards: self.closed_shards, unavailable_leaders: self.unavailable_leaders, + ..Default::default() }; (Some(request), self.rendezvous) } diff --git a/quickwit/quickwit-ingest/src/ingest_v2/ingester.rs b/quickwit/quickwit-ingest/src/ingest_v2/ingester.rs index 
fc1a44f19bb..5cd231d5a4a 100644 --- a/quickwit/quickwit-ingest/src/ingest_v2/ingester.rs +++ b/quickwit/quickwit-ingest/src/ingest_v2/ingester.rs @@ -37,16 +37,7 @@ use quickwit_proto::control_plane::{ AdviseResetShardsRequest, ControlPlaneService, ControlPlaneServiceClient, }; use quickwit_proto::indexing::ShardPositionsUpdate; -use quickwit_proto::ingest::ingester::{ - AckReplicationMessage, CloseShardsRequest, CloseShardsResponse, DecommissionRequest, - DecommissionResponse, FetchMessage, IngesterService, IngesterServiceClient, - IngesterServiceStream, IngesterStatus, InitShardFailure, InitShardSuccess, InitShardsRequest, - InitShardsResponse, ObservationMessage, OpenFetchStreamRequest, OpenObservationStreamRequest, - OpenReplicationStreamRequest, OpenReplicationStreamResponse, PersistFailure, - PersistFailureReason, PersistRequest, PersistResponse, PersistSuccess, ReplicateFailureReason, - ReplicateSubrequest, RetainShardsForSource, RetainShardsRequest, RetainShardsResponse, - SynReplicationMessage, TruncateShardsRequest, TruncateShardsResponse, -}; +use quickwit_proto::ingest::ingester::*; use quickwit_proto::ingest::{ CommitTypeV2, DocBatchV2, IngestV2Error, IngestV2Result, ParseFailure, Shard, ShardIds, }; @@ -145,7 +136,7 @@ impl Ingester { let weak_state = state.weak(); BroadcastLocalShardsTask::spawn(cluster.clone(), weak_state.clone()); - BroadcastIngesterCapacityScoreTask::spawn(cluster, weak_state.clone(), memory_capacity); + BroadcastIngesterCapacityScoreTask::spawn(cluster, weak_state.clone(), disk_capacity); CloseIdleShardsTask::spawn(weak_state, idle_shard_timeout); let ingester = Self { @@ -469,7 +460,7 @@ impl Ingester { index_uid: subrequest.index_uid, source_id: subrequest.source_id, shard_id: subrequest.shard_id, - reason: PersistFailureReason::ShardClosed as i32, + reason: PersistFailureReason::NodeUnavailable as i32, }; persist_failures.push(persist_failure); } @@ -499,7 +490,7 @@ impl Ingester { index_uid: subrequest.index_uid, source_id: 
subrequest.source_id, shard_id: subrequest.shard_id, - reason: PersistFailureReason::ShardNotFound as i32, + reason: PersistFailureReason::NoShardsAvailable as i32, }; persist_failures.push(persist_failure); continue; @@ -558,7 +549,7 @@ impl Ingester { index_uid: subrequest.index_uid, source_id: subrequest.source_id, shard_id: Some(shard_id), - reason: PersistFailureReason::ShardRateLimited as i32, + reason: PersistFailureReason::NoShardsAvailable as i32, }; persist_failures.push(persist_failure); continue; @@ -673,7 +664,7 @@ impl Ingester { // TODO: Handle replication error: // 1. Close and evict all the shards hosted by the follower. // 2. Close and evict the replication client. - // 3. Return `PersistFailureReason::ShardClosed` to router. + // 3. Return `PersistFailureReason::NodeUnavailable` to router. continue; } }; @@ -689,14 +680,8 @@ impl Ingester { for replicate_failure in replicate_response.failures { // TODO: If the replica shard is closed, close the primary shard if it is not // already. 
- let persist_failure_reason = match replicate_failure.reason() { - ReplicateFailureReason::Unspecified => PersistFailureReason::Unspecified, - ReplicateFailureReason::ShardNotFound => { - PersistFailureReason::ShardNotFound - } - ReplicateFailureReason::ShardClosed => PersistFailureReason::ShardClosed, - ReplicateFailureReason::WalFull => PersistFailureReason::WalFull, - }; + let persist_failure_reason: PersistFailureReason = + replicate_failure.reason().into(); let persist_failure = PersistFailure { subrequest_id: replicate_failure.subrequest_id, index_uid: replicate_failure.index_uid, @@ -736,7 +721,7 @@ impl Ingester { "failed to persist records to shard `{queue_id}`: {io_error}" ); shards_to_close.insert(queue_id); - PersistFailureReason::ShardClosed + PersistFailureReason::NodeUnavailable } AppendDocBatchError::QueueNotFound(_) => { error!( @@ -744,7 +729,7 @@ impl Ingester { not found" ); shards_to_delete.insert(queue_id); - PersistFailureReason::ShardNotFound + PersistFailureReason::NodeUnavailable } }; let persist_failure = PersistFailure { @@ -2159,7 +2144,7 @@ mod tests { let persist_failure = &persist_response.failures[0]; assert_eq!( persist_failure.reason(), - PersistFailureReason::ShardRateLimited + PersistFailureReason::NoShardsAvailable ); } @@ -2222,7 +2207,10 @@ mod tests { assert_eq!(persist_failure.index_uid(), &index_uid); assert_eq!(persist_failure.source_id, "test-source"); assert_eq!(persist_failure.shard_id(), ShardId::from(1)); - assert_eq!(persist_failure.reason(), PersistFailureReason::ShardClosed,); + assert_eq!( + persist_failure.reason(), + PersistFailureReason::NodeUnavailable, + ); let state_guard = ingester.state.lock_fully().await.unwrap(); let shard = state_guard.shards.get(&queue_id).unwrap(); @@ -2274,7 +2262,7 @@ mod tests { assert_eq!(persist_failure.shard_id(), ShardId::from(1)); assert_eq!( persist_failure.reason(), - PersistFailureReason::ShardNotFound + PersistFailureReason::NodeUnavailable ); let state_guard = 
ingester.state.lock_fully().await.unwrap(); @@ -2704,7 +2692,7 @@ mod tests { assert_eq!(persist_failure.shard_id(), ShardId::from(1)); assert_eq!( persist_failure.reason(), - PersistFailureReason::ShardNotFound + PersistFailureReason::NoShardsAvailable ); let state_guard = ingester.state.lock_fully().await.unwrap(); @@ -2783,7 +2771,7 @@ mod tests { assert_eq!(persist_failure.shard_id(), ShardId::from(1)); assert_eq!( persist_failure.reason(), - PersistFailureReason::ShardRateLimited + PersistFailureReason::NoShardsAvailable ); let state_guard = ingester.state.lock_fully().await.unwrap(); diff --git a/quickwit/quickwit-ingest/src/ingest_v2/mod.rs b/quickwit/quickwit-ingest/src/ingest_v2/mod.rs index 3a801763feb..0bb3d6b6138 100644 --- a/quickwit/quickwit-ingest/src/ingest_v2/mod.rs +++ b/quickwit/quickwit-ingest/src/ingest_v2/mod.rs @@ -28,6 +28,7 @@ mod publish_tracker; mod rate_meter; mod replication; mod router; +#[allow(dead_code)] mod routing_table; mod state; mod workbench; diff --git a/quickwit/quickwit-ingest/src/ingest_v2/node_routing_table.rs b/quickwit/quickwit-ingest/src/ingest_v2/node_routing_table.rs index 2e49e26b783..f354011ede2 100644 --- a/quickwit/quickwit-ingest/src/ingest_v2/node_routing_table.rs +++ b/quickwit/quickwit-ingest/src/ingest_v2/node_routing_table.rs @@ -38,7 +38,7 @@ pub(super) struct IngesterNode { #[derive(Debug)] pub(super) struct RoutingEntry { - nodes: HashMap, + pub nodes: HashMap, } /// Given a slice of candidates, picks the better of two random choices. 
diff --git a/quickwit/quickwit-ingest/src/ingest_v2/router.rs b/quickwit/quickwit-ingest/src/ingest_v2/router.rs index ccd00f0209c..da3d989d93e 100644 --- a/quickwit/quickwit-ingest/src/ingest_v2/router.rs +++ b/quickwit/quickwit-ingest/src/ingest_v2/router.rs @@ -27,33 +27,29 @@ use quickwit_proto::control_plane::{ ControlPlaneService, ControlPlaneServiceClient, GetOrCreateOpenShardsRequest, GetOrCreateOpenShardsSubrequest, }; -use quickwit_proto::indexing::ShardPositionsUpdate; use quickwit_proto::ingest::ingester::{ IngesterService, PersistFailureReason, PersistRequest, PersistResponse, PersistSubrequest, }; use quickwit_proto::ingest::router::{ IngestFailureReason, IngestRequestV2, IngestResponseV2, IngestRouterService, }; -use quickwit_proto::ingest::{ - CommitTypeV2, IngestV2Error, IngestV2Result, RateLimitingCause, ShardState, -}; -use quickwit_proto::types::{IndexUid, NodeId, ShardId, SourceId, SubrequestId}; +use quickwit_proto::ingest::{CommitTypeV2, IngestV2Error, IngestV2Result, RateLimitingCause}; +use quickwit_proto::types::{NodeId, SubrequestId}; use serde_json::{Value as JsonValue, json}; use tokio::sync::{Mutex, Semaphore}; use tokio::time::error::Elapsed; use tracing::{error, info}; -use super::broadcast::{IngesterCapacityScoreUpdate, LocalShardsUpdate}; +use super::broadcast::IngesterCapacityScoreUpdate; use super::debouncing::{ DebouncedGetOrCreateOpenShardsRequest, GetOrCreateOpenShardsRequestDebouncer, }; use super::ingester::PERSIST_REQUEST_TIMEOUT; use super::metrics::IngestResultMetrics; use super::node_routing_table::NodeBasedRoutingTable; -use super::routing_table::{NextOpenShardError, RoutingTable}; use super::workbench::IngestWorkbench; use super::{IngesterPool, pending_subrequests}; -use crate::{LeaderId, get_ingest_router_buffer_size}; +use crate::get_ingest_router_buffer_size; /// Duration after which ingest requests time out with [`IngestV2Error::Timeout`]. 
fn ingest_request_timeout() -> Duration { @@ -102,12 +98,7 @@ pub struct IngestRouter { } struct RouterState { - // Debounces `GetOrCreateOpenShardsRequest` requests to the control plane. debouncer: GetOrCreateOpenShardsRequestDebouncer, - // Holds the routing table mapping index and source IDs to shards. - routing_table: RoutingTable, - // Node-based routing table, populated by capacity broadcasts. - // Not yet used for routing — will replace `routing_table` in a follow-up PR. node_routing_table: NodeBasedRoutingTable, } @@ -130,10 +121,6 @@ impl IngestRouter { ) -> Self { let state = Arc::new(Mutex::new(RouterState { debouncer: GetOrCreateOpenShardsRequestDebouncer::default(), - routing_table: RoutingTable { - self_node_id: self_node_id.clone(), - table: HashMap::default(), - }, node_routing_table: NodeBasedRoutingTable::default(), })); let ingest_semaphore_permits = get_ingest_router_buffer_size().as_u64() as usize; @@ -152,12 +139,6 @@ impl IngestRouter { pub fn subscribe(&self) { let weak_router_state = WeakRouterState(Arc::downgrade(&self.state)); - self.event_broker - .subscribe::(weak_router_state.clone()) - .forever(); - self.event_broker - .subscribe::(weak_router_state.clone()) - .forever(); self.event_broker .subscribe::(weak_router_state) .forever(); @@ -171,22 +152,19 @@ impl IngestRouter { ingester_pool: &IngesterPool, ) -> DebouncedGetOrCreateOpenShardsRequest { let mut debounced_request = DebouncedGetOrCreateOpenShardsRequest::default(); - - // `closed_shards` and `unavailable_leaders` are populated by calls to `has_open_shards` - // as we're looking for open shards to route the subrequests to. 
- let unavailable_leaders: &mut HashSet = &mut workbench.unavailable_leaders; + let unavailable_leaders = &workbench.unavailable_leaders; let mut state_guard = self.state.lock().await; for subrequest in pending_subrequests(&workbench.subworkbenches) { - if !state_guard.routing_table.has_open_shards( + if !state_guard.node_routing_table.has_open_nodes( &subrequest.index_id, &subrequest.source_id, ingester_pool, - &mut debounced_request.closed_shards, unavailable_leaders, ) { - // No shard available! Let's attempt to create one. + // No known nodes with open shards for this source. Ask the control + // plane to create shards so we have somewhere to route to. let acquire_result = state_guard .debouncer .acquire(&subrequest.index_id, &subrequest.source_id); @@ -208,9 +186,6 @@ impl IngestRouter { } drop(state_guard); - if !debounced_request.is_empty() && !debounced_request.closed_shards.is_empty() { - info!(closed_shards=?debounced_request.closed_shards, "reporting closed shard(s) to control plane"); - } if !debounced_request.is_empty() && !unavailable_leaders.is_empty() { info!(unavailable_leaders=?unavailable_leaders, "reporting unavailable leader(s) to control plane"); @@ -267,7 +242,7 @@ impl IngestRouter { let mut state_guard = self.state.lock().await; for success in response.successes { - state_guard.routing_table.replace_shards( + state_guard.node_routing_table.merge_from_shards( success.index_uid().clone(), success.source_id, success.open_shards, @@ -285,8 +260,7 @@ impl IngestRouter { workbench: &mut IngestWorkbench, mut persist_futures: FuturesUnordered>, ) { - let mut closed_shards: HashMap<(IndexUid, SourceId), Vec> = HashMap::new(); - let mut deleted_shards: HashMap<(IndexUid, SourceId), Vec> = HashMap::new(); + let mut unavailable_leaders: HashSet = HashSet::new(); while let Some((persist_summary, persist_result)) = persist_futures.next().await { match persist_result { @@ -298,33 +272,12 @@ impl IngestRouter { 
workbench.record_persist_failure(&persist_failure); match persist_failure.reason() { - PersistFailureReason::ShardClosed => { - let shard_id = persist_failure.shard_id().clone(); - let index_uid: IndexUid = persist_failure.index_uid().clone(); - let source_id: SourceId = persist_failure.source_id; - closed_shards - .entry((index_uid, source_id)) - .or_default() - .push(shard_id); - } - PersistFailureReason::ShardNotFound => { - let shard_id = persist_failure.shard_id().clone(); - let index_uid: IndexUid = persist_failure.index_uid().clone(); - let source_id: SourceId = persist_failure.source_id; - deleted_shards - .entry((index_uid, source_id)) - .or_default() - .push(shard_id); - } - PersistFailureReason::WalFull - | PersistFailureReason::ShardRateLimited => { - // Let's record that the shard is rate limited or that the ingester - // that hosts has its wal full. - // - // That way we will avoid to retry the persist request on the very - // same node. - let shard_id = persist_failure.shard_id().clone(); - workbench.rate_limited_shards.insert(shard_id); + PersistFailureReason::NoShardsAvailable => {} + PersistFailureReason::NodeUnavailable + | PersistFailureReason::WalFull + | PersistFailureReason::Timeout => { + unavailable_leaders + .insert(NodeId::from(persist_response.leader_id.clone())); } _ => {} } @@ -348,20 +301,7 @@ impl IngestRouter { } }; } - if !closed_shards.is_empty() || !deleted_shards.is_empty() { - let mut state_guard = self.state.lock().await; - - for ((index_uid, source_id), shard_ids) in closed_shards { - state_guard - .routing_table - .close_shards(&index_uid, source_id, &shard_ids); - } - for ((index_uid, source_id), shard_ids) in deleted_shards { - state_guard - .routing_table - .delete_shards(&index_uid, source_id, &shard_ids); - } - } + workbench.unavailable_leaders.extend(unavailable_leaders); } async fn batch_persist(&self, workbench: &mut IngestWorkbench, commit_type: CommitTypeV2) { @@ -373,47 +313,35 @@ impl IngestRouter { 
self.populate_routing_table_debounced(workbench, debounced_request) .await; - // Subrequests for which no shards are available to route the subrequests to. + let unavailable_leaders = &workbench.unavailable_leaders; let mut no_shards_available_subrequest_ids: Vec = Vec::new(); - // Subrequests for which the shards are rate limited. - let mut rate_limited_subrequest_ids: Vec = Vec::new(); - - let mut per_leader_persist_subrequests: HashMap<&LeaderId, Vec> = + let mut per_leader_persist_subrequests: HashMap<&NodeId, Vec> = HashMap::new(); - let rate_limited_shards: &HashSet = &workbench.rate_limited_shards; let state_guard = self.state.lock().await; for subrequest in pending_subrequests(&workbench.subworkbenches) { - let next_open_shard_res_opt = state_guard - .routing_table + let ingester_node = state_guard + .node_routing_table .find_entry(&subrequest.index_id, &subrequest.source_id) - .map(|entry| { - entry.next_open_shard_round_robin(&self.ingester_pool, rate_limited_shards) - }); - let next_open_shard = match next_open_shard_res_opt { - Some(Ok(next_open_shard)) => next_open_shard, - Some(Err(NextOpenShardError::RateLimited)) => { - rate_limited_subrequest_ids.push(subrequest.subrequest_id); - continue; - } - Some(Err(NextOpenShardError::NoShardsAvailable)) | None => { + .and_then(|entry| entry.pick_node(&self.ingester_pool, unavailable_leaders)); + + let ingester_node = match ingester_node { + Some(node) => node, + None => { no_shards_available_subrequest_ids.push(subrequest.subrequest_id); continue; } }; let persist_subrequest = PersistSubrequest { subrequest_id: subrequest.subrequest_id, - index_uid: next_open_shard.index_uid.clone().into(), - source_id: next_open_shard.source_id.clone(), - // We don't necessarily persist to this shard. We persist to the shard with the most - // capacity on that node. - // TODO: Clean this up. 
- shard_id: Some(next_open_shard.shard_id.clone()), + index_uid: Some(ingester_node.index_uid.clone()), + source_id: subrequest.source_id.clone(), + shard_id: None, doc_batch: subrequest.doc_batch.clone(), }; per_leader_persist_subrequests - .entry(&next_open_shard.leader_id) + .entry(&ingester_node.node_id) .or_default() .push(persist_subrequest); } @@ -461,9 +389,6 @@ impl IngestRouter { for subrequest_id in no_shards_available_subrequest_ids { workbench.record_no_shards_available(subrequest_id); } - for subrequest_id in rate_limited_subrequest_ids { - workbench.record_rate_limited(subrequest_id); - } self.process_persist_results(workbench, persist_futures) .await; } @@ -516,7 +441,7 @@ impl IngestRouter { pub async fn debug_info(&self) -> JsonValue { let state_guard = self.state.lock().await; - let routing_table_json = state_guard.routing_table.debug_info(); + let routing_table_json = state_guard.node_routing_table.debug_info(); json!({ "routing_table": routing_table_json, @@ -640,68 +565,6 @@ impl IngestRouterService for IngestRouter { #[derive(Clone)] struct WeakRouterState(Weak>); -#[async_trait] -impl EventSubscriber for WeakRouterState { - async fn handle_event(&mut self, local_shards_update: LocalShardsUpdate) { - let Some(state) = self.0.upgrade() else { - return; - }; - let leader_id = local_shards_update.leader_id; - let index_uid = local_shards_update.source_uid.index_uid; - let source_id = local_shards_update.source_uid.source_id; - - let mut open_shard_ids: Vec = Vec::new(); - let mut closed_shard_ids: Vec = Vec::new(); - - for shard_info in local_shards_update.shard_infos { - match shard_info.shard_state { - ShardState::Open => open_shard_ids.push(shard_info.shard_id), - ShardState::Closed => closed_shard_ids.push(shard_info.shard_id), - ShardState::Unavailable | ShardState::Unspecified => { - // Ingesters never broadcast the `Unavailable`` state because, from their point - // of view, they are never unavailable. 
- } - } - } - let mut state_guard = state.lock().await; - - state_guard - .routing_table - .close_shards(&index_uid, &source_id, &closed_shard_ids); - - state_guard.routing_table.insert_open_shards( - &leader_id, - index_uid, - source_id, - &open_shard_ids, - ); - } -} - -#[async_trait] -impl EventSubscriber for WeakRouterState { - async fn handle_event(&mut self, shard_positions_update: ShardPositionsUpdate) { - let Some(state) = self.0.upgrade() else { - return; - }; - let mut deleted_shard_ids: Vec = Vec::new(); - - for (shard_id, shard_position) in shard_positions_update.updated_shard_positions { - if shard_position.is_eof() { - deleted_shard_ids.push(shard_id); - } - } - let mut state_guard = state.lock().await; - - let index_uid = shard_positions_update.source_uid.index_uid; - let source_id = shard_positions_update.source_uid.source_id; - - state_guard - .routing_table - .delete_shards(&index_uid, &source_id, &deleted_shard_ids); - } -} - #[async_trait] impl EventSubscriber for WeakRouterState { async fn handle_event(&mut self, update: IngesterCapacityScoreUpdate) { @@ -725,9 +588,6 @@ pub(super) struct PersistRequestSummary { #[cfg(test)] mod tests { - use std::collections::BTreeSet; - - use mockall::Sequence; use quickwit_proto::control_plane::{ GetOrCreateOpenShardsFailure, GetOrCreateOpenShardsFailureReason, GetOrCreateOpenShardsResponse, GetOrCreateOpenShardsSuccess, MockControlPlaneService, @@ -737,15 +597,11 @@ mod tests { }; use quickwit_proto::ingest::router::IngestSubrequest; use quickwit_proto::ingest::{ - CommitTypeV2, DocBatchV2, ParseFailure, ParseFailureReason, Shard, ShardIds, ShardState, + CommitTypeV2, DocBatchV2, ParseFailure, ParseFailureReason, Shard, ShardState, }; - use quickwit_proto::types::{DocUid, Position, SourceUid}; - use tokio::task::yield_now; + use quickwit_proto::types::{DocUid, IndexUid, Position, ShardId, SourceUid}; use super::*; - use crate::RateMibPerSec; - use crate::ingest_v2::broadcast::ShardInfo; - use 
crate::ingest_v2::routing_table::{RoutingEntry, RoutingTableEntry}; use crate::ingest_v2::workbench::SubworkbenchFailure; #[tokio::test] @@ -770,34 +626,18 @@ mod tests { assert!(get_or_create_open_shard_request_opt.is_none()); assert!(rendezvous.is_empty()); - let mut state_guard = router.state.lock().await; - - let index_uid: IndexUid = IndexUid::for_test("test-index-0", 0); - state_guard.routing_table.table.insert( - ("test-index-0".into(), "test-source".into()), - RoutingTableEntry { - index_uid: index_uid.clone(), - source_id: "test-source".to_string(), - local_shards: vec![ - RoutingEntry { - index_uid: index_uid.clone(), - source_id: "test-source".to_string(), - shard_id: ShardId::from(1), - shard_state: ShardState::Closed, - leader_id: "test-ingester-0".into(), - }, - RoutingEntry { - index_uid: index_uid.clone(), - source_id: "test-source".to_string(), - shard_id: ShardId::from(2), - shard_state: ShardState::Open, - leader_id: "test-ingester-0".into(), - }, - ], - ..Default::default() - }, - ); - drop(state_guard); + { + let mut state_guard = router.state.lock().await; + state_guard.node_routing_table.apply_capacity_update( + "test-ingester-0".into(), + SourceUid { + index_uid: IndexUid::for_test("test-index-0", 0), + source_id: "test-source".to_string(), + }, + 8, + 1, + ); + } let ingest_subrequests: Vec = vec![ IngestSubrequest { @@ -833,24 +673,12 @@ mod tests { assert_eq!(subrequest.index_id, "test-index-1"); assert_eq!(subrequest.source_id, "test-source"); - assert_eq!(get_or_create_open_shard_request.closed_shards.len(), 1); - assert_eq!( - get_or_create_open_shard_request.closed_shards[0], - ShardIds { - index_uid: Some(IndexUid::for_test("test-index-0", 0)), - source_id: "test-source".to_string(), - shard_ids: vec![ShardId::from(1)], - } - ); - assert_eq!( - get_or_create_open_shard_request.unavailable_leaders.len(), - 1 - ); - assert_eq!( - get_or_create_open_shard_request.unavailable_leaders[0], - "test-ingester-0" + assert!( + 
get_or_create_open_shard_request + .unavailable_leaders + .is_empty() ); - assert_eq!(workbench.unavailable_leaders.len(), 1); + assert!(workbench.unavailable_leaders.is_empty()); let (get_or_create_open_shard_request_opt, rendezvous_2) = router .make_get_or_create_open_shard_request(&mut workbench, &ingester_pool) @@ -867,27 +695,26 @@ mod tests { ingester_pool.insert("test-ingester-0".into(), IngesterServiceClient::mocked()); { - // Ingester-0 has been marked as unavailable due to the previous requests. + // Ingester-0 is in pool and in table, but marked unavailable on the workbench + // (simulating a prior transport error). has_open_nodes returns false → both + // subrequests trigger CP request. + workbench + .unavailable_leaders + .insert("test-ingester-0".into()); let (get_or_create_open_shard_request_opt, _rendezvous) = router .make_get_or_create_open_shard_request(&mut workbench, &ingester_pool) .await .take(); let get_or_create_open_shard_request = get_or_create_open_shard_request_opt.unwrap(); assert_eq!(get_or_create_open_shard_request.subrequests.len(), 2); - assert_eq!(workbench.unavailable_leaders.len(), 1); assert_eq!( - workbench - .unavailable_leaders - .iter() - .next() - .unwrap() - .to_string(), - "test-ingester-0" + get_or_create_open_shard_request.unavailable_leaders.len(), + 1 ); } { - // With a fresh workbench, the ingester is not marked as unavailable, and present in the - // pool. + // Fresh workbench: ingester-0 is in pool, in table, and NOT unavailable. + // has_open_nodes returns true for index-0 → only index-1 triggers request. 
let mut workbench = IngestWorkbench::new(ingest_subrequests, 3); let (get_or_create_open_shard_request_opt, _rendezvous) = router .make_get_or_create_open_shard_request(&mut workbench, &ingester_pool) @@ -900,9 +727,10 @@ mod tests { assert_eq!(subrequest.index_id, "test-index-1"); assert_eq!(subrequest.source_id, "test-source"); - assert_eq!( - get_or_create_open_shard_request.unavailable_leaders.len(), - 0 + assert!( + get_or_create_open_shard_request + .unavailable_leaders + .is_empty() ); } } @@ -947,6 +775,7 @@ mod tests { source_id: "test-source".to_string(), shard_id: Some(ShardId::from(1)), shard_state: ShardState::Open as i32, + leader_id: "test-ingester-0".to_string(), ..Default::default() }], }, @@ -960,6 +789,7 @@ mod tests { source_id: "test-source".to_string(), shard_id: Some(ShardId::from(1)), shard_state: ShardState::Open as i32, + leader_id: "test-ingester-1".to_string(), ..Default::default() }, Shard { @@ -967,6 +797,7 @@ mod tests { source_id: "test-source".to_string(), shard_id: Some(ShardId::from(2)), shard_state: ShardState::Open as i32, + leader_id: "test-ingester-1".to_string(), ..Default::default() }, ], @@ -1057,23 +888,6 @@ mod tests { .populate_routing_table(&mut workbench, get_or_create_open_shards_request) .await; - let state_guard = router.state.lock().await; - let routing_table = &state_guard.routing_table; - assert_eq!(routing_table.len(), 2); - - let routing_entry_0 = routing_table - .find_entry("test-index-0", "test-source") - .unwrap(); - assert_eq!(routing_entry_0.len(), 1); - assert_eq!(routing_entry_0.all_shards()[0].shard_id, ShardId::from(1)); - - let routing_entry_1 = routing_table - .find_entry("test-index-1", "test-source") - .unwrap(); - assert_eq!(routing_entry_1.len(), 2); - assert_eq!(routing_entry_1.all_shards()[0].shard_id, ShardId::from(1)); - assert_eq!(routing_entry_1.all_shards()[1].shard_id, ShardId::from(2)); - let subworkbench = workbench.subworkbenches.get(&2).unwrap(); assert!(matches!( 
subworkbench.last_failure_opt, @@ -1278,7 +1092,7 @@ mod tests { index_uid: Some(index_uid.clone()), source_id: "test-source".to_string(), shard_id: Some(ShardId::from(1)), - reason: PersistFailureReason::ShardRateLimited as i32, + reason: PersistFailureReason::NoShardsAvailable as i32, }], }); (persist_summary, persist_result) @@ -1294,89 +1108,6 @@ mod tests { )); } - #[tokio::test] - async fn test_router_process_persist_results_closes_and_deletes_shards() { - let self_node_id = "test-router".into(); - let control_plane = ControlPlaneServiceClient::from_mock(MockControlPlaneService::new()); - let ingester_pool = IngesterPool::default(); - let replication_factor = 1; - let router = IngestRouter::new( - self_node_id, - control_plane, - ingester_pool.clone(), - replication_factor, - EventBroker::default(), - ); - let index_uid: IndexUid = IndexUid::for_test("test-index-0", 0); - let mut state_guard = router.state.lock().await; - state_guard.routing_table.replace_shards( - index_uid.clone(), - "test-source", - vec![ - Shard { - index_uid: Some(index_uid.clone()), - shard_id: Some(ShardId::from(1)), - shard_state: ShardState::Open as i32, - leader_id: "test-ingester-0".to_string(), - ..Default::default() - }, - Shard { - index_uid: Some(index_uid.clone()), - shard_id: Some(ShardId::from(2)), - shard_state: ShardState::Open as i32, - leader_id: "test-ingester-0".to_string(), - ..Default::default() - }, - ], - ); - drop(state_guard); - - let mut workbench = IngestWorkbench::new(Vec::new(), 2); - let persist_futures = FuturesUnordered::new(); - - persist_futures.push(async { - let persist_summary = PersistRequestSummary { - leader_id: "test-ingester-0".into(), - subrequest_ids: vec![0], - }; - let persist_result = Ok::<_, IngestV2Error>(PersistResponse { - leader_id: "test-ingester-0".to_string(), - successes: Vec::new(), - failures: vec![ - PersistFailure { - subrequest_id: 0, - index_uid: Some(index_uid.clone()), - source_id: "test-source".to_string(), - shard_id: 
Some(ShardId::from(1)), - reason: PersistFailureReason::ShardNotFound as i32, - }, - PersistFailure { - subrequest_id: 1, - index_uid: Some(index_uid.clone()), - source_id: "test-source".to_string(), - shard_id: Some(ShardId::from(2)), - reason: PersistFailureReason::ShardClosed as i32, - }, - ], - }); - (persist_summary, persist_result) - }); - router - .process_persist_results(&mut workbench, persist_futures) - .await; - - let state_guard = router.state.lock().await; - let routing_table_entry = state_guard - .routing_table - .find_entry("test-index-0", "test-source") - .unwrap(); - assert_eq!(routing_table_entry.len(), 1); - - let shard = routing_table_entry.all_shards()[0]; - assert_eq!(shard.shard_id, ShardId::from(2)); - assert_eq!(shard.shard_state, ShardState::Closed); - } - #[tokio::test] async fn test_router_process_persist_results_does_not_remove_unavailable_leaders() { let self_node_id = "test-router".into(); @@ -1470,151 +1201,76 @@ mod tests { let self_node_id = "test-router".into(); let control_plane = ControlPlaneServiceClient::from_mock(MockControlPlaneService::new()); let ingester_pool = IngesterPool::default(); - let replication_factor = 1; let router = IngestRouter::new( self_node_id, control_plane, ingester_pool.clone(), - replication_factor, + 1, EventBroker::default(), ); - let index_uid: IndexUid = IndexUid::for_test("test-index-0", 0); - let index_uid2: IndexUid = IndexUid::for_test("test-index-1", 0); - let mut state_guard = router.state.lock().await; - state_guard.routing_table.replace_shards( - index_uid.clone(), - "test-source", - vec![Shard { - index_uid: Some(index_uid.clone()), - source_id: "test-source".to_string(), - shard_id: Some(ShardId::from(1)), - shard_state: ShardState::Open as i32, - leader_id: "test-ingester-0".to_string(), - ..Default::default() - }], - ); - state_guard.routing_table.replace_shards( - index_uid2.clone(), - "test-source", - vec![ - Shard { - index_uid: Some(index_uid2.clone()), + + let index_uid_0: IndexUid 
= IndexUid::for_test("test-index-0", 0); + let index_uid_1: IndexUid = IndexUid::for_test("test-index-1", 0); + { + let mut state_guard = router.state.lock().await; + state_guard.node_routing_table.merge_from_shards( + index_uid_0.clone(), + "test-source".to_string(), + vec![Shard { + index_uid: Some(index_uid_0.clone()), source_id: "test-source".to_string(), shard_id: Some(ShardId::from(1)), shard_state: ShardState::Open as i32, leader_id: "test-ingester-0".to_string(), - follower_id: Some("test-ingester-1".to_string()), ..Default::default() - }, - Shard { - index_uid: Some(index_uid2.clone()), + }], + ); + state_guard.node_routing_table.merge_from_shards( + index_uid_1.clone(), + "test-source".to_string(), + vec![Shard { + index_uid: Some(index_uid_1.clone()), source_id: "test-source".to_string(), - shard_id: Some(ShardId::from(2)), + shard_id: Some(ShardId::from(1)), shard_state: ShardState::Open as i32, leader_id: "test-ingester-1".to_string(), - follower_id: Some("test-ingester-2".to_string()), ..Default::default() - }, - ], - ); - drop(state_guard); + }], + ); + } + let index_uid_0_clone = index_uid_0.clone(); let mut mock_ingester_0 = MockIngesterService::new(); - let index_uid_clone = index_uid.clone(); - let index_uid2_clone = index_uid2.clone(); - mock_ingester_0 - .expect_persist() - .once() - .returning(move |request| { - assert_eq!(request.leader_id, "test-ingester-0"); - assert_eq!(request.subrequests.len(), 2); - assert_eq!(request.commit_type(), CommitTypeV2::Auto); - - let subrequest = &request.subrequests[0]; - assert_eq!(subrequest.subrequest_id, 0); - assert_eq!(subrequest.index_uid(), &index_uid_clone); - assert_eq!(subrequest.source_id, "test-source"); - assert_eq!(subrequest.shard_id(), ShardId::from(1)); - assert_eq!( - subrequest.doc_batch, - Some(DocBatchV2::for_test(["", "test-doc-foo", "test-doc-bar"])) - ); - - let subrequest = &request.subrequests[1]; - assert_eq!(subrequest.subrequest_id, 1); - assert_eq!(subrequest.index_uid(), 
&index_uid2_clone); - assert_eq!(subrequest.source_id, "test-source"); - assert_eq!(subrequest.shard_id(), ShardId::from(1)); - assert_eq!( - subrequest.doc_batch, - Some(DocBatchV2::for_test(["test-doc-qux"])) - ); - - let response = PersistResponse { - leader_id: request.leader_id, - successes: vec![ - PersistSuccess { - subrequest_id: 0, - index_uid: Some(index_uid_clone.clone()), - source_id: "test-source".to_string(), - shard_id: Some(ShardId::from(1)), - replication_position_inclusive: Some(Position::offset(1u64)), - num_persisted_docs: 2, - parse_failures: vec![ParseFailure { - doc_uid: Some(DocUid::for_test(0)), - reason: ParseFailureReason::InvalidJson as i32, - message: "invalid JSON".to_string(), - }], - }, - PersistSuccess { - subrequest_id: 1, - index_uid: Some(index_uid2_clone.clone()), - source_id: "test-source".to_string(), - shard_id: Some(ShardId::from(1)), - replication_position_inclusive: Some(Position::offset(0u64)), - num_persisted_docs: 1, - parse_failures: Vec::new(), - }, - ], - failures: Vec::new(), - }; - Ok(response) - }); mock_ingester_0 .expect_persist() .once() .returning(move |request| { assert_eq!(request.leader_id, "test-ingester-0"); assert_eq!(request.subrequests.len(), 1); - assert_eq!(request.commit_type(), CommitTypeV2::Auto); + assert!(request.subrequests[0].shard_id.is_none()); - let subrequest = &request.subrequests[0]; - assert_eq!(subrequest.subrequest_id, 0); - assert_eq!(subrequest.index_uid(), &index_uid); - assert_eq!(subrequest.source_id, "test-source"); - assert_eq!(subrequest.shard_id(), ShardId::from(1)); - assert_eq!( - subrequest.doc_batch, - Some(DocBatchV2::for_test(["test-doc-moo", "test-doc-baz"])) - ); - - let response = PersistResponse { + Ok(PersistResponse { leader_id: request.leader_id, successes: vec![PersistSuccess { subrequest_id: 0, - index_uid: Some(index_uid.clone()), + index_uid: Some(index_uid_0_clone.clone()), source_id: "test-source".to_string(), shard_id: Some(ShardId::from(1)), - 
replication_position_inclusive: Some(Position::offset(3u64)), - num_persisted_docs: 4, - parse_failures: Vec::new(), + replication_position_inclusive: Some(Position::offset(1u64)), + num_persisted_docs: 2, + parse_failures: vec![ParseFailure { + doc_uid: Some(DocUid::for_test(0)), + reason: ParseFailureReason::InvalidJson as i32, + message: "invalid JSON".to_string(), + }], }], failures: Vec::new(), - }; - Ok(response) + }) }); - let ingester_0 = IngesterServiceClient::from_mock(mock_ingester_0); - ingester_pool.insert("test-ingester-0".into(), ingester_0.clone()); + ingester_pool.insert( + "test-ingester-0".into(), + IngesterServiceClient::from_mock(mock_ingester_0), + ); let mut mock_ingester_1 = MockIngesterService::new(); mock_ingester_1 @@ -1623,136 +1279,94 @@ mod tests { .returning(move |request| { assert_eq!(request.leader_id, "test-ingester-1"); assert_eq!(request.subrequests.len(), 1); - assert_eq!(request.commit_type(), CommitTypeV2::Auto); - - let subrequest = &request.subrequests[0]; - assert_eq!(subrequest.subrequest_id, 1); - assert_eq!(subrequest.index_uid(), &index_uid2); - assert_eq!(subrequest.source_id, "test-source"); - assert_eq!(subrequest.shard_id(), ShardId::from(2)); - assert_eq!( - subrequest.doc_batch, - Some(DocBatchV2::for_test(["test-doc-tux"])) - ); + assert!(request.subrequests[0].shard_id.is_none()); - let response = PersistResponse { + Ok(PersistResponse { leader_id: request.leader_id, successes: vec![PersistSuccess { subrequest_id: 1, - index_uid: Some(index_uid2.clone()), + index_uid: Some(index_uid_1.clone()), source_id: "test-source".to_string(), - shard_id: Some(ShardId::from(2)), + shard_id: Some(ShardId::from(1)), replication_position_inclusive: Some(Position::offset(0u64)), num_persisted_docs: 1, parse_failures: Vec::new(), }], failures: Vec::new(), - }; - Ok(response) + }) }); - let ingester_1 = IngesterServiceClient::from_mock(mock_ingester_1); - ingester_pool.insert("test-ingester-1".into(), ingester_1); + 
ingester_pool.insert( + "test-ingester-1".into(), + IngesterServiceClient::from_mock(mock_ingester_1), + ); + + let response = router + .ingest(IngestRequestV2 { + subrequests: vec![ + IngestSubrequest { + subrequest_id: 0, + index_id: "test-index-0".to_string(), + source_id: "test-source".to_string(), + doc_batch: Some(DocBatchV2::for_test(["", "test-doc-foo", "test-doc-bar"])), + }, + IngestSubrequest { + subrequest_id: 1, + index_id: "test-index-1".to_string(), + source_id: "test-source".to_string(), + doc_batch: Some(DocBatchV2::for_test(["test-doc-qux"])), + }, + ], + commit_type: CommitTypeV2::Auto as i32, + }) + .await + .unwrap(); - let ingest_request = IngestRequestV2 { - subrequests: vec![ - IngestSubrequest { - subrequest_id: 0, - index_id: "test-index-0".to_string(), - source_id: "test-source".to_string(), - doc_batch: Some(DocBatchV2::for_test(["", "test-doc-foo", "test-doc-bar"])), - }, - IngestSubrequest { - subrequest_id: 1, - index_id: "test-index-1".to_string(), - source_id: "test-source".to_string(), - doc_batch: Some(DocBatchV2::for_test(["test-doc-qux"])), - }, - ], - commit_type: CommitTypeV2::Auto as i32, - }; - let response = router.ingest(ingest_request).await.unwrap(); assert_eq!(response.successes.len(), 2); assert_eq!(response.failures.len(), 0); let parse_failures = &response.successes[0].parse_failures; assert_eq!(parse_failures.len(), 1); - - let parse_failure = &parse_failures[0]; - assert_eq!(parse_failure.doc_uid(), DocUid::for_test(0)); - assert_eq!(parse_failure.reason(), ParseFailureReason::InvalidJson); - - let ingest_request = IngestRequestV2 { - subrequests: vec![ - IngestSubrequest { - subrequest_id: 0, - index_id: "test-index-0".to_string(), - source_id: "test-source".to_string(), - doc_batch: Some(DocBatchV2::for_test(["test-doc-moo", "test-doc-baz"])), - }, - IngestSubrequest { - subrequest_id: 1, - index_id: "test-index-1".to_string(), - source_id: "test-source".to_string(), - doc_batch: 
Some(DocBatchV2::for_test(["test-doc-tux"])), - }, - ], - commit_type: CommitTypeV2::Auto as i32, - }; - let response = router.ingest(ingest_request).await.unwrap(); - assert_eq!(response.successes.len(), 2); - assert_eq!(response.failures.len(), 0); - } + assert_eq!(parse_failures[0].doc_uid(), DocUid::for_test(0)); + assert_eq!(parse_failures[0].reason(), ParseFailureReason::InvalidJson); + } #[tokio::test] async fn test_router_ingest_retry() { let self_node_id = "test-router".into(); let control_plane = ControlPlaneServiceClient::from_mock(MockControlPlaneService::new()); let ingester_pool = IngesterPool::default(); - let replication_factor = 1; let router = IngestRouter::new( self_node_id, control_plane, ingester_pool.clone(), - replication_factor, + 1, EventBroker::default(), ); - let mut state_guard = router.state.lock().await; let index_uid: IndexUid = IndexUid::for_test("test-index-0", 0); - state_guard.routing_table.replace_shards( - index_uid.clone(), - "test-source", - vec![Shard { - index_uid: Some(index_uid.clone()), - source_id: "test-source".to_string(), - shard_id: Some(ShardId::from(1)), - shard_state: ShardState::Open as i32, - leader_id: "test-ingester-0".to_string(), - ..Default::default() - }], - ); - drop(state_guard); + { + let mut state_guard = router.state.lock().await; + state_guard.node_routing_table.merge_from_shards( + index_uid.clone(), + "test-source".to_string(), + vec![Shard { + index_uid: Some(index_uid.clone()), + source_id: "test-source".to_string(), + shard_id: Some(ShardId::from(1)), + shard_state: ShardState::Open as i32, + leader_id: "test-ingester-0".to_string(), + ..Default::default() + }], + ); + } let mut mock_ingester_0 = MockIngesterService::new(); let index_uid_clone = index_uid.clone(); + // First attempt: returns NoShardsAvailable (transient, doesn't mark leader unavailable). 
mock_ingester_0 .expect_persist() .once() .returning(move |request| { - assert_eq!(request.leader_id, "test-ingester-0"); - assert_eq!(request.subrequests.len(), 1); - assert_eq!(request.commit_type(), CommitTypeV2::Auto); - - let subrequest = &request.subrequests[0]; - assert_eq!(subrequest.subrequest_id, 0); - assert_eq!(subrequest.index_uid(), &index_uid_clone); - assert_eq!(subrequest.source_id, "test-source"); - assert_eq!(subrequest.shard_id(), ShardId::from(1)); - assert_eq!( - subrequest.doc_batch, - Some(DocBatchV2::for_test(["test-doc-foo"])) - ); - - let response = PersistResponse { + Ok(PersistResponse { leader_id: request.leader_id, successes: Vec::new(), failures: vec![PersistFailure { @@ -1760,30 +1374,16 @@ mod tests { index_uid: Some(index_uid_clone.clone()), source_id: "test-source".to_string(), shard_id: Some(ShardId::from(1)), - reason: PersistFailureReason::Timeout as i32, + reason: PersistFailureReason::NoShardsAvailable as i32, }], - }; - Ok(response) + }) }); + // Second attempt: succeeds. 
mock_ingester_0 .expect_persist() .once() .returning(move |request| { - assert_eq!(request.leader_id, "test-ingester-0"); - assert_eq!(request.subrequests.len(), 1); - assert_eq!(request.commit_type(), CommitTypeV2::Auto); - - let subrequest = &request.subrequests[0]; - assert_eq!(subrequest.subrequest_id, 0); - assert_eq!(subrequest.index_uid(), &index_uid); - assert_eq!(subrequest.source_id, "test-source"); - assert_eq!(subrequest.shard_id(), ShardId::from(1)); - assert_eq!( - subrequest.doc_batch, - Some(DocBatchV2::for_test(["test-doc-foo"])) - ); - - let response = PersistResponse { + Ok(PersistResponse { leader_id: request.leader_id, successes: vec![PersistSuccess { subrequest_id: 0, @@ -1795,115 +1395,27 @@ mod tests { parse_failures: Vec::new(), }], failures: Vec::new(), - }; - Ok(response) + }) }); - let ingester_0 = IngesterServiceClient::from_mock(mock_ingester_0); - ingester_pool.insert("test-ingester-0".into(), ingester_0.clone()); - - let ingest_request = IngestRequestV2 { - subrequests: vec![IngestSubrequest { - subrequest_id: 0, - index_id: "test-index-0".to_string(), - source_id: "test-source".to_string(), - doc_batch: Some(DocBatchV2::for_test(["test-doc-foo"])), - }], - commit_type: CommitTypeV2::Auto as i32, - }; - router.ingest(ingest_request).await.unwrap(); - } - - #[tokio::test] - async fn test_router_updates_routing_table_on_chitchat_events() { - let self_node_id = "test-router".into(); - let control_plane = ControlPlaneServiceClient::from_mock(MockControlPlaneService::new()); - let ingester_pool = IngesterPool::default(); - let replication_factor = 1; - let event_broker = EventBroker::default(); - let router = IngestRouter::new( - self_node_id, - control_plane, - ingester_pool.clone(), - replication_factor, - event_broker.clone(), - ); - router.subscribe(); - let index_uid: IndexUid = IndexUid::for_test("test-index-0", 0); - - let mut state_guard = router.state.lock().await; - state_guard.routing_table.replace_shards( - index_uid.clone(), 
- "test-source", - vec![Shard { - index_uid: Some(index_uid.clone()), - shard_id: Some(ShardId::from(1)), - shard_state: ShardState::Open as i32, - leader_id: "test-ingester".to_string(), - ..Default::default() - }], + ingester_pool.insert( + "test-ingester-0".into(), + IngesterServiceClient::from_mock(mock_ingester_0), ); - drop(state_guard); - - let local_shards_update = LocalShardsUpdate { - leader_id: "test-ingester".into(), - source_uid: SourceUid { - index_uid: index_uid.clone(), - source_id: "test-source".to_string(), - }, - shard_infos: BTreeSet::from_iter([ - ShardInfo { - shard_id: ShardId::from(1), - shard_state: ShardState::Closed, - short_term_ingestion_rate: RateMibPerSec(0), - long_term_ingestion_rate: RateMibPerSec(0), - }, - ShardInfo { - shard_id: ShardId::from(2), - shard_state: ShardState::Open, - short_term_ingestion_rate: RateMibPerSec(0), - long_term_ingestion_rate: RateMibPerSec(0), - }, - ]), - }; - event_broker.publish(local_shards_update); - - // Yield so that the event is processed. - yield_now().await; - - let state_guard = router.state.lock().await; - let shards = state_guard - .routing_table - .find_entry("test-index-0", "test-source") - .unwrap() - .all_shards(); - assert_eq!(shards.len(), 2); - assert_eq!(shards[0].shard_id, ShardId::from(1)); - assert_eq!(shards[0].shard_state, ShardState::Closed); - assert_eq!(shards[1].shard_id, ShardId::from(2)); - assert_eq!(shards[1].shard_state, ShardState::Open); - drop(state_guard); - let shard_positions_update = ShardPositionsUpdate { - source_uid: SourceUid { - index_uid: index_uid.clone(), - source_id: "test-source".to_string(), - }, - updated_shard_positions: vec![(ShardId::from(1), Position::eof(0u64))], - }; - event_broker.publish(shard_positions_update); - - // Yield so that the event is processed. 
- yield_now().await; - - let state_guard = router.state.lock().await; - let shards = state_guard - .routing_table - .find_entry("test-index-0", "test-source") - .unwrap() - .all_shards(); - assert_eq!(shards.len(), 1); - assert_eq!(shards[0].shard_id, ShardId::from(2)); - drop(state_guard); + let response = router + .ingest(IngestRequestV2 { + subrequests: vec![IngestSubrequest { + subrequest_id: 0, + index_id: "test-index-0".to_string(), + source_id: "test-source".to_string(), + doc_batch: Some(DocBatchV2::for_test(["test-doc-foo"])), + }], + commit_type: CommitTypeV2::Auto as i32, + }) + .await + .unwrap(); + assert_eq!(response.successes.len(), 1); + assert_eq!(response.failures.len(), 0); } #[tokio::test] @@ -1922,42 +1434,48 @@ mod tests { let index_uid_0: IndexUid = IndexUid::for_test("test-index-0", 0); let index_uid_1: IndexUid = IndexUid::for_test("test-index-1", 0); - let mut state_guard = router.state.lock().await; - state_guard.routing_table.replace_shards( - index_uid_0.clone(), - "test-source", - vec![Shard { - index_uid: Some(index_uid_0.clone()), - shard_id: Some(ShardId::from(1)), - shard_state: ShardState::Open as i32, - leader_id: "test-ingester".to_string(), - ..Default::default() - }], - ); - state_guard.routing_table.replace_shards( - index_uid_1.clone(), - "test-source", - vec![Shard { - index_uid: Some(index_uid_1.clone()), - shard_id: Some(ShardId::from(2)), - shard_state: ShardState::Open as i32, - leader_id: "test-ingester".to_string(), - ..Default::default() - }], - ); - drop(state_guard); + { + let mut state_guard = router.state.lock().await; + state_guard.node_routing_table.merge_from_shards( + index_uid_0.clone(), + "test-source".to_string(), + vec![Shard { + index_uid: Some(index_uid_0.clone()), + shard_id: Some(ShardId::from(1)), + shard_state: ShardState::Open as i32, + leader_id: "test-ingester-0".to_string(), + ..Default::default() + }], + ); + state_guard.node_routing_table.merge_from_shards( + index_uid_1.clone(), + 
"test-source".to_string(), + vec![Shard { + index_uid: Some(index_uid_1.clone()), + shard_id: Some(ShardId::from(2)), + shard_state: ShardState::Open as i32, + leader_id: "test-ingester-1".to_string(), + ..Default::default() + }], + ); + } let debug_info = router.debug_info().await; let routing_table = &debug_info["routing_table"]; assert_eq!(routing_table.as_object().unwrap().len(), 2); - assert_eq!(routing_table["test-index-0"].as_array().unwrap().len(), 1); - assert_eq!(routing_table["test-index-1"].as_array().unwrap().len(), 1); + let index_0_entries = routing_table["test-index-0"].as_array().unwrap(); + assert_eq!(index_0_entries.len(), 1); + assert_eq!(index_0_entries[0]["node_id"], "test-ingester-0"); + assert_eq!(index_0_entries[0]["capacity_score"], 5); + + let index_1_entries = routing_table["test-index-1"].as_array().unwrap(); + assert_eq!(index_1_entries.len(), 1); + assert_eq!(index_1_entries[0]["node_id"], "test-ingester-1"); } #[tokio::test] - async fn test_router_does_not_retry_rate_limited_shards() { - // We avoid retrying a shard limited shard at the scale of a workbench. 
+ async fn test_router_returns_rate_limited_failure() { let self_node_id = "test-router".into(); let control_plane = ControlPlaneServiceClient::from_mock(MockControlPlaneService::new()); let ingester_pool = IngesterPool::default(); @@ -1969,138 +1487,51 @@ mod tests { replication_factor, EventBroker::default(), ); - let mut state_guard = router.state.lock().await; let index_uid: IndexUid = IndexUid::for_test("test-index-0", 0); - - state_guard.routing_table.replace_shards( - index_uid.clone(), - "test-source", - vec![ - Shard { + { + let mut state_guard = router.state.lock().await; + state_guard.node_routing_table.merge_from_shards( + index_uid.clone(), + "test-source".to_string(), + vec![Shard { index_uid: Some(index_uid.clone()), source_id: "test-source".to_string(), shard_id: Some(ShardId::from(1)), shard_state: ShardState::Open as i32, leader_id: "test-ingester-0".to_string(), ..Default::default() - }, - Shard { - index_uid: Some(index_uid.clone()), - source_id: "test-source".to_string(), - shard_id: Some(ShardId::from(2)), - shard_state: ShardState::Open as i32, - leader_id: "test-ingester-0".to_string(), - ..Default::default() - }, - ], - ); - drop(state_guard); - - // We have two shards. - // - shard 1 is rate limited - // - shard 2 is timeout. - // We expect a retry on shard 2 that is then successful. 
- let mut seq = Sequence::new(); + }], + ); + } let mut mock_ingester_0 = MockIngesterService::new(); - mock_ingester_0 - .expect_persist() - .times(1) - .returning(move |request| { - assert_eq!(request.leader_id, "test-ingester-0"); - assert_eq!(request.commit_type(), CommitTypeV2::Auto); - assert_eq!(request.subrequests.len(), 1); - let subrequest = &request.subrequests[0]; - assert_eq!(subrequest.subrequest_id, 0); - let index_uid = subrequest.index_uid().clone(); - assert_eq!(subrequest.source_id, "test-source"); - assert_eq!(subrequest.shard_id(), ShardId::from(1)); - assert_eq!( - subrequest.doc_batch, - Some(DocBatchV2::for_test(["test-doc-foo"])) - ); - - let response = PersistResponse { - leader_id: request.leader_id, - successes: Vec::new(), - failures: vec![PersistFailure { - subrequest_id: 0, - index_uid: Some(index_uid), - source_id: "test-source".to_string(), - shard_id: Some(ShardId::from(1)), - reason: PersistFailureReason::ShardRateLimited as i32, - }], - }; - Ok(response) - }) - .in_sequence(&mut seq); - - mock_ingester_0 - .expect_persist() - .times(1) - .returning(move |request| { - assert_eq!(request.leader_id, "test-ingester-0"); - assert_eq!(request.commit_type(), CommitTypeV2::Auto); - assert_eq!(request.subrequests.len(), 1); - let subrequest = &request.subrequests[0]; - assert_eq!(subrequest.subrequest_id, 0); - let index_uid = subrequest.index_uid().clone(); - assert_eq!(subrequest.source_id, "test-source"); - assert_eq!(subrequest.shard_id(), ShardId::from(2)); - assert_eq!( - subrequest.doc_batch, - Some(DocBatchV2::for_test(["test-doc-foo"])) - ); - - let response = PersistResponse { - leader_id: request.leader_id, - successes: Vec::new(), - failures: vec![PersistFailure { - subrequest_id: 0, - index_uid: Some(index_uid), - source_id: "test-source".to_string(), - shard_id: Some(ShardId::from(1)), - reason: PersistFailureReason::Timeout as i32, - }], - }; - Ok(response) - }) - .in_sequence(&mut seq); - - mock_ingester_0 - 
.expect_persist() - .times(1) - .returning(move |request| { - assert_eq!(request.leader_id, "test-ingester-0"); - assert_eq!(request.commit_type(), CommitTypeV2::Auto); - assert_eq!(request.subrequests.len(), 1); - let subrequest = &request.subrequests[0]; - assert_eq!(subrequest.subrequest_id, 0); - let index_uid = subrequest.index_uid().clone(); - assert_eq!(subrequest.source_id, "test-source"); - assert_eq!(subrequest.shard_id(), ShardId::from(2)); - assert_eq!( - subrequest.doc_batch, - Some(DocBatchV2::for_test(["test-doc-foo"])) - ); - - let response = PersistResponse { - leader_id: request.leader_id, - successes: vec![PersistSuccess { - subrequest_id: 0, - index_uid: Some(index_uid), - source_id: "test-source".to_string(), - shard_id: Some(ShardId::from(1)), - num_persisted_docs: 1, - replication_position_inclusive: Some(Position::offset(0u64)), - parse_failures: Vec::new(), - }], - failures: Vec::new(), - }; - Ok(response) - }) - .in_sequence(&mut seq); + mock_ingester_0.expect_persist().returning(move |request| { + assert_eq!(request.leader_id, "test-ingester-0"); + assert_eq!(request.commit_type(), CommitTypeV2::Auto); + assert_eq!(request.subrequests.len(), 1); + let subrequest = &request.subrequests[0]; + assert_eq!(subrequest.subrequest_id, 0); + let index_uid = subrequest.index_uid().clone(); + assert_eq!(subrequest.source_id, "test-source"); + assert!(subrequest.shard_id.is_none()); + assert_eq!( + subrequest.doc_batch, + Some(DocBatchV2::for_test(["test-doc-foo"])) + ); + let response = PersistResponse { + leader_id: request.leader_id, + successes: Vec::new(), + failures: vec![PersistFailure { + subrequest_id: 0, + index_uid: Some(index_uid), + source_id: "test-source".to_string(), + shard_id: Some(ShardId::from(1)), + reason: PersistFailureReason::NoShardsAvailable as i32, + }], + }; + Ok(response) + }); let ingester_0 = IngesterServiceClient::from_mock(mock_ingester_0); ingester_pool.insert("test-ingester-0".into(), ingester_0.clone()); @@ 
-2113,88 +1544,130 @@ mod tests { }], commit_type: CommitTypeV2::Auto as i32, }; - router.ingest(ingest_request).await.unwrap(); + let ingest_response = router.ingest(ingest_request).await.unwrap(); + assert_eq!(ingest_response.successes.len(), 0); + assert_eq!(ingest_response.failures.len(), 1); + assert_eq!( + ingest_response.failures[0].reason(), + IngestFailureReason::NoShardsAvailable + ); } #[tokio::test] - async fn test_router_returns_rate_limited_failure() { - let self_node_id = "test-router".into(); - let control_plane = ControlPlaneServiceClient::from_mock(MockControlPlaneService::new()); - let ingester_pool = IngesterPool::default(); - let replication_factor = 1; + async fn test_router_updates_node_routing_table_on_capacity_update() { + let event_broker = EventBroker::default(); let router = IngestRouter::new( - self_node_id, - control_plane, - ingester_pool.clone(), - replication_factor, - EventBroker::default(), + "test-router".into(), + ControlPlaneServiceClient::from_mock(MockControlPlaneService::new()), + IngesterPool::default(), + 1, + event_broker.clone(), ); - let mut state_guard = router.state.lock().await; - let index_uid: IndexUid = IndexUid::for_test("test-index-0", 0); + router.subscribe(); - state_guard.routing_table.replace_shards( - index_uid.clone(), - "test-source", - vec![Shard { - index_uid: Some(index_uid.clone()), + event_broker.publish(IngesterCapacityScoreUpdate { + node_id: "test-ingester-0".into(), + source_uid: SourceUid { + index_uid: IndexUid::for_test("test-index", 0), source_id: "test-source".to_string(), - shard_id: Some(ShardId::from(1)), - shard_state: ShardState::Open as i32, - leader_id: "test-ingester-0".to_string(), - ..Default::default() - }], - ); - drop(state_guard); - - let mut mock_ingester_0 = MockIngesterService::new(); - mock_ingester_0 - .expect_persist() - .times(1) - .returning(move |request| { - assert_eq!(request.leader_id, "test-ingester-0"); - assert_eq!(request.commit_type(), CommitTypeV2::Auto); - 
assert_eq!(request.subrequests.len(), 1); - let subrequest = &request.subrequests[0]; - assert_eq!(subrequest.subrequest_id, 0); - let index_uid = subrequest.index_uid().clone(); - assert_eq!(subrequest.source_id, "test-source"); - assert_eq!(subrequest.shard_id(), ShardId::from(1)); - assert_eq!( - subrequest.doc_batch, - Some(DocBatchV2::for_test(["test-doc-foo"])) - ); + }, + capacity_score: 7, + open_shard_count: 3, + }); + // Give the async subscriber a moment to process. + tokio::time::sleep(Duration::from_millis(10)).await; - let response = PersistResponse { - leader_id: request.leader_id, - successes: Vec::new(), - failures: vec![PersistFailure { - subrequest_id: 0, - index_uid: Some(index_uid), - source_id: "test-source".to_string(), - shard_id: Some(ShardId::from(1)), - reason: PersistFailureReason::ShardRateLimited as i32, - }], - }; - Ok(response) - }); - let ingester_0 = IngesterServiceClient::from_mock(mock_ingester_0); - ingester_pool.insert("test-ingester-0".into(), ingester_0.clone()); + let state_guard = router.state.lock().await; + let entry = state_guard + .node_routing_table + .find_entry("test-index", "test-source") + .unwrap(); + let node = entry.nodes.get("test-ingester-0").unwrap(); + assert_eq!(node.capacity_score, 7); + assert_eq!(node.open_shard_count, 3); + } - let ingest_request = IngestRequestV2 { - subrequests: vec![IngestSubrequest { + #[tokio::test] + async fn test_router_process_persist_results_marks_unavailable_on_persist_failure() { + let router = IngestRouter::new( + "test-router".into(), + ControlPlaneServiceClient::from_mock(MockControlPlaneService::new()), + IngesterPool::default(), + 1, + EventBroker::default(), + ); + let ingest_subrequests = vec![ + IngestSubrequest { subrequest_id: 0, index_id: "test-index-0".to_string(), source_id: "test-source".to_string(), - doc_batch: Some(DocBatchV2::for_test(["test-doc-foo"])), - }], - commit_type: CommitTypeV2::Auto as i32, - }; - let ingest_response = 
router.ingest(ingest_request).await.unwrap(); - assert_eq!(ingest_response.successes.len(), 0); - assert_eq!(ingest_response.failures.len(), 1); - assert_eq!( - ingest_response.failures[0].reason(), - IngestFailureReason::ShardRateLimited + ..Default::default() + }, + IngestSubrequest { + subrequest_id: 1, + index_id: "test-index-1".to_string(), + source_id: "test-source".to_string(), + ..Default::default() + }, + ]; + let mut workbench = IngestWorkbench::new(ingest_subrequests, 2); + + // NoShardsAvailable does NOT mark the leader as unavailable. + let persist_futures = FuturesUnordered::new(); + persist_futures.push(async { + let summary = PersistRequestSummary { + leader_id: "test-ingester-0".into(), + subrequest_ids: vec![0], + }; + let result = Ok::<_, IngestV2Error>(PersistResponse { + leader_id: "test-ingester-0".to_string(), + successes: Vec::new(), + failures: vec![PersistFailure { + subrequest_id: 0, + index_uid: Some(IndexUid::for_test("test-index-0", 0)), + source_id: "test-source".to_string(), + shard_id: Some(ShardId::from(1)), + reason: PersistFailureReason::NoShardsAvailable as i32, + }], + }); + (summary, result) + }); + router + .process_persist_results(&mut workbench, persist_futures) + .await; + assert!( + !workbench + .unavailable_leaders + .contains(&NodeId::from("test-ingester-0")) + ); + + // NodeUnavailable DOES mark the leader as unavailable. 
+ let persist_futures = FuturesUnordered::new(); + persist_futures.push(async { + let summary = PersistRequestSummary { + leader_id: "test-ingester-1".into(), + subrequest_ids: vec![1], + }; + let result = Ok::<_, IngestV2Error>(PersistResponse { + leader_id: "test-ingester-1".to_string(), + successes: Vec::new(), + failures: vec![PersistFailure { + subrequest_id: 1, + index_uid: Some(IndexUid::for_test("test-index-1", 0)), + source_id: "test-source".to_string(), + shard_id: Some(ShardId::from(1)), + reason: PersistFailureReason::NodeUnavailable as i32, + }], + }); + (summary, result) + }); + router + .process_persist_results(&mut workbench, persist_futures) + .await; + assert!( + workbench + .unavailable_leaders + .contains(&NodeId::from("test-ingester-1")) ); } } diff --git a/quickwit/quickwit-ingest/src/ingest_v2/workbench.rs b/quickwit/quickwit-ingest/src/ingest_v2/workbench.rs index 311aef138d8..3e7b22969e8 100644 --- a/quickwit/quickwit-ingest/src/ingest_v2/workbench.rs +++ b/quickwit/quickwit-ingest/src/ingest_v2/workbench.rs @@ -24,7 +24,7 @@ use quickwit_proto::ingest::router::{ IngestFailure, IngestFailureReason, IngestResponseV2, IngestSubrequest, IngestSuccess, }; use quickwit_proto::ingest::{IngestV2Error, RateLimitingCause}; -use quickwit_proto::types::{NodeId, ShardId, SubrequestId}; +use quickwit_proto::types::{NodeId, SubrequestId}; use tracing::warn; use super::publish_tracker::PublishTracker; @@ -35,7 +35,6 @@ use super::router::PersistRequestSummary; #[derive(Default)] pub(super) struct IngestWorkbench { pub subworkbenches: BTreeMap, - pub rate_limited_shards: HashSet, pub num_successes: usize, /// The number of batch persist attempts. This is not sum of the number of attempts for each /// subrequest. 
@@ -228,13 +227,6 @@ impl IngestWorkbench { self.record_failure(subrequest_id, SubworkbenchFailure::NoShardsAvailable); } - pub fn record_rate_limited(&mut self, subrequest_id: SubrequestId) { - self.record_failure( - subrequest_id, - SubworkbenchFailure::RateLimited(RateLimitingCause::ShardRateLimiting), - ); - } - /// Marks a node as unavailable for the span of the workbench. /// /// Remaining attempts will treat the node as if it was not in the ingester pool. @@ -433,7 +425,7 @@ mod tests { assert!(!subworkbench.last_failure_is_transient()); subworkbench.last_failure_opt = Some(SubworkbenchFailure::Persist( - PersistFailureReason::ShardRateLimited, + PersistFailureReason::NoShardsAvailable, )); assert!(subworkbench.is_pending()); assert!(subworkbench.last_failure_is_transient()); @@ -807,7 +799,7 @@ mod tests { let persist_failure = PersistFailure { subrequest_id: 42, - reason: PersistFailureReason::ShardRateLimited as i32, + reason: PersistFailureReason::NoShardsAvailable as i32, ..Default::default() }; workbench.record_persist_failure(&persist_failure); diff --git a/quickwit/quickwit-proto/protos/quickwit/ingester.proto b/quickwit/quickwit-proto/protos/quickwit/ingester.proto index a5b651d94d8..25a4705d58a 100644 --- a/quickwit/quickwit-proto/protos/quickwit/ingester.proto +++ b/quickwit/quickwit-proto/protos/quickwit/ingester.proto @@ -96,11 +96,10 @@ message PersistSuccess { enum PersistFailureReason { PERSIST_FAILURE_REASON_UNSPECIFIED = 0; - PERSIST_FAILURE_REASON_SHARD_NOT_FOUND = 1; - PERSIST_FAILURE_REASON_SHARD_CLOSED = 2; - PERSIST_FAILURE_REASON_SHARD_RATE_LIMITED = 3; PERSIST_FAILURE_REASON_WAL_FULL = 4; PERSIST_FAILURE_REASON_TIMEOUT = 5; + PERSIST_FAILURE_REASON_NO_SHARDS_AVAILABLE = 6; + PERSIST_FAILURE_REASON_NODE_UNAVAILABLE = 7; } message PersistFailure { diff --git a/quickwit/quickwit-proto/src/codegen/quickwit/quickwit.ingest.ingester.rs b/quickwit/quickwit-proto/src/codegen/quickwit/quickwit.ingest.ingester.rs index 018e19a39a9..07b8d5b64a1 
100644 --- a/quickwit/quickwit-proto/src/codegen/quickwit/quickwit.ingest.ingester.rs +++ b/quickwit/quickwit-proto/src/codegen/quickwit/quickwit.ingest.ingester.rs @@ -397,11 +397,10 @@ pub struct ObservationMessage { #[repr(i32)] pub enum PersistFailureReason { Unspecified = 0, - ShardNotFound = 1, - ShardClosed = 2, - ShardRateLimited = 3, WalFull = 4, Timeout = 5, + NoShardsAvailable = 6, + NodeUnavailable = 7, } impl PersistFailureReason { /// String value of the enum field names used in the ProtoBuf definition. @@ -411,22 +410,20 @@ impl PersistFailureReason { pub fn as_str_name(&self) -> &'static str { match self { Self::Unspecified => "PERSIST_FAILURE_REASON_UNSPECIFIED", - Self::ShardNotFound => "PERSIST_FAILURE_REASON_SHARD_NOT_FOUND", - Self::ShardClosed => "PERSIST_FAILURE_REASON_SHARD_CLOSED", - Self::ShardRateLimited => "PERSIST_FAILURE_REASON_SHARD_RATE_LIMITED", Self::WalFull => "PERSIST_FAILURE_REASON_WAL_FULL", Self::Timeout => "PERSIST_FAILURE_REASON_TIMEOUT", + Self::NoShardsAvailable => "PERSIST_FAILURE_REASON_NO_SHARDS_AVAILABLE", + Self::NodeUnavailable => "PERSIST_FAILURE_REASON_NODE_UNAVAILABLE", } } /// Creates an enum from field names used in the ProtoBuf definition. 
pub fn from_str_name(value: &str) -> ::core::option::Option { match value { "PERSIST_FAILURE_REASON_UNSPECIFIED" => Some(Self::Unspecified), - "PERSIST_FAILURE_REASON_SHARD_NOT_FOUND" => Some(Self::ShardNotFound), - "PERSIST_FAILURE_REASON_SHARD_CLOSED" => Some(Self::ShardClosed), - "PERSIST_FAILURE_REASON_SHARD_RATE_LIMITED" => Some(Self::ShardRateLimited), "PERSIST_FAILURE_REASON_WAL_FULL" => Some(Self::WalFull), "PERSIST_FAILURE_REASON_TIMEOUT" => Some(Self::Timeout), + "PERSIST_FAILURE_REASON_NO_SHARDS_AVAILABLE" => Some(Self::NoShardsAvailable), + "PERSIST_FAILURE_REASON_NODE_UNAVAILABLE" => Some(Self::NodeUnavailable), _ => None, } } diff --git a/quickwit/quickwit-proto/src/ingest/mod.rs b/quickwit/quickwit-proto/src/ingest/mod.rs index fda347d7931..6d529d79fbd 100644 --- a/quickwit/quickwit-proto/src/ingest/mod.rs +++ b/quickwit/quickwit-proto/src/ingest/mod.rs @@ -313,11 +313,10 @@ impl From for IngestFailureReason { fn from(reason: PersistFailureReason) -> Self { match reason { PersistFailureReason::Unspecified => IngestFailureReason::Unspecified, - PersistFailureReason::ShardNotFound => IngestFailureReason::NoShardsAvailable, - PersistFailureReason::ShardClosed => IngestFailureReason::NoShardsAvailable, + PersistFailureReason::NoShardsAvailable => IngestFailureReason::NoShardsAvailable, PersistFailureReason::WalFull => IngestFailureReason::WalFull, - PersistFailureReason::ShardRateLimited => IngestFailureReason::ShardRateLimited, PersistFailureReason::Timeout => IngestFailureReason::Timeout, + PersistFailureReason::NodeUnavailable => IngestFailureReason::NoShardsAvailable, } } } @@ -326,8 +325,8 @@ impl From for PersistFailureReason { fn from(reason: ReplicateFailureReason) -> Self { match reason { ReplicateFailureReason::Unspecified => PersistFailureReason::Unspecified, - ReplicateFailureReason::ShardNotFound => PersistFailureReason::ShardNotFound, - ReplicateFailureReason::ShardClosed => PersistFailureReason::ShardClosed, + 
ReplicateFailureReason::ShardNotFound => PersistFailureReason::NoShardsAvailable, + ReplicateFailureReason::ShardClosed => PersistFailureReason::NoShardsAvailable, ReplicateFailureReason::WalFull => PersistFailureReason::WalFull, } } diff --git a/quickwit/quickwit-serve/src/lib.rs b/quickwit/quickwit-serve/src/lib.rs index ca4520ff0ce..60515bc819f 100644 --- a/quickwit/quickwit-serve/src/lib.rs +++ b/quickwit/quickwit-serve/src/lib.rs @@ -82,8 +82,9 @@ use quickwit_indexing::models::ShardPositionsService; use quickwit_indexing::start_indexing_service; use quickwit_ingest::{ GetMemoryCapacity, IngestRequest, IngestRouter, IngestServiceClient, Ingester, IngesterPool, - LocalShardsUpdate, get_idle_shard_timeout, setup_local_shards_update_listener, - start_ingest_api_service, wait_for_ingester_decommission, wait_for_ingester_status, + LocalShardsUpdate, get_idle_shard_timeout, setup_ingester_capacity_update_listener, + setup_local_shards_update_listener, start_ingest_api_service, wait_for_ingester_decommission, + wait_for_ingester_status, }; use quickwit_jaeger::JaegerService; use quickwit_janitor::{JanitorService, start_janitor_service}; @@ -906,6 +907,9 @@ async fn setup_ingest_v2( event_broker.clone(), ); ingest_router.subscribe(); + setup_ingester_capacity_update_listener(cluster.clone(), event_broker.clone()) + .await + .forever(); let ingest_router_service = IngestRouterServiceClient::tower() .stack_layer(INGEST_GRPC_SERVER_METRICS_LAYER.clone()) diff --git a/quickwit/rest-api-tests/scenarii/tag_fields/0002_negative_tags.yaml b/quickwit/rest-api-tests/scenarii/tag_fields/0002_negative_tags.yaml index f1f900c342c..99f0e5ed285 100644 --- a/quickwit/rest-api-tests/scenarii/tag_fields/0002_negative_tags.yaml +++ b/quickwit/rest-api-tests/scenarii/tag_fields/0002_negative_tags.yaml @@ -1,23 +1,23 @@ # regression test for https://github.com/quickwit-oss/quickwit/issues/4698 -endpoint: simple/search +endpoint: tag-simple/search params: query: "tag:1" expected: num_hits: 
3 --- -endpoint: simple/search +endpoint: tag-simple/search params: query: "-tag:2" expected: num_hits: 4 --- -endpoint: simple/search +endpoint: tag-simple/search params: query: "tag:2" expected: num_hits: 1 --- -endpoint: simple/search +endpoint: tag-simple/search params: query: "-tag:1" expected: diff --git a/quickwit/rest-api-tests/scenarii/tag_fields/_setup.quickwit.yaml b/quickwit/rest-api-tests/scenarii/tag_fields/_setup.quickwit.yaml index 1208ca48343..4ae0b2eb465 100644 --- a/quickwit/rest-api-tests/scenarii/tag_fields/_setup.quickwit.yaml +++ b/quickwit/rest-api-tests/scenarii/tag_fields/_setup.quickwit.yaml @@ -4,14 +4,14 @@ endpoint: indexes/allowedtypes status_code: null --- method: DELETE -endpoint: indexes/simple +endpoint: indexes/tag-simple status_code: null --- method: POST endpoint: indexes/ json: version: "0.7" - index_id: simple + index_id: tag-simple doc_mapping: field_mappings: - name: seq @@ -21,7 +21,7 @@ json: tag_fields: ["tag"] --- method: POST -endpoint: simple/ingest +endpoint: tag-simple/ingest params: commit: force ndjson: @@ -29,7 +29,7 @@ ndjson: - {"seq": 2, "tag": 2} --- method: POST -endpoint: simple/ingest +endpoint: tag-simple/ingest params: commit: force ndjson: @@ -37,7 +37,7 @@ ndjson: - {"seq": 3, "tag": null} --- method: POST -endpoint: simple/ingest +endpoint: tag-simple/ingest params: commit: force ndjson: diff --git a/quickwit/rest-api-tests/scenarii/tag_fields/_teardown.quickwit.yaml b/quickwit/rest-api-tests/scenarii/tag_fields/_teardown.quickwit.yaml index fa0ca391b51..0c3ac8fd6a4 100644 --- a/quickwit/rest-api-tests/scenarii/tag_fields/_teardown.quickwit.yaml +++ b/quickwit/rest-api-tests/scenarii/tag_fields/_teardown.quickwit.yaml @@ -3,4 +3,4 @@ endpoint: indexes/allowedtypes status_code: null --- method: DELETE -endpoint: indexes/simple +endpoint: indexes/tag-simple From 006f951c25379fc664bab2220ccc69583d2347ba Mon Sep 17 00:00:00 2001 From: nadav-govari Date: Tue, 3 Mar 2026 13:28:39 -0500 Subject: [PATCH 4/9] 
Piggyback routing update on persist response (#6173) --- .github/workflows/ci.yml | 28 ++- .../broadcast/ingester_capacity_score.rs | 233 ++---------------- .../src/ingest_v2/debouncing.rs | 4 +- .../quickwit-ingest/src/ingest_v2/ingester.rs | 28 ++- quickwit/quickwit-ingest/src/ingest_v2/mod.rs | 7 +- .../src/ingest_v2/node_routing_table.rs | 190 +++++++++----- .../quickwit-ingest/src/ingest_v2/router.rs | 162 +++++++++++- .../quickwit-ingest/src/ingest_v2/state.rs | 158 ++++++++---- .../src/ingest_v2/wal_capacity_timeseries.rs | 214 ++++++++++++++++ .../src/ingest_v2/workbench.rs | 3 +- .../protos/quickwit/ingester.proto | 13 + .../quickwit/quickwit.ingest.ingester.rs | 22 ++ quickwit/quickwit-proto/src/getters.rs | 1 + 13 files changed, 701 insertions(+), 362 deletions(-) create mode 100644 quickwit/quickwit-ingest/src/ingest_v2/wal_capacity_timeseries.rs diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index e4a12d69521..ada6218f295 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -55,16 +55,24 @@ jobs: - name: Cleanup Disk Space run: | df -h - sudo rm -rf /usr/share/dotnet - sudo rm -rf /usr/local/lib/android - sudo rm -rf /usr/share/swift - sudo rm -rf /usr/local/.ghcup - sudo rm -rf /opt/hostedtoolcache/CodeQL - df -h + + if [ "$(df -BG / | awk 'NR==2 {gsub("G","",$4); print $4}')" -lt 30 ]; then + echo "Less than 30GiB available. Running cleanup..." + sudo rm -rf /usr/share/dotnet + sudo rm -rf /usr/local/lib/android + sudo rm -rf /usr/share/swift + sudo rm -rf /usr/local/.ghcup + sudo rm -rf /opt/hostedtoolcache/CodeQL + df -h + else + echo "30GiB or more available. Skipping cleanup." 
+ fi - uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6.0.1 - name: Install Ubuntu packages - run: sudo apt-get -y install protobuf-compiler + run: | + sudo apt-get update + sudo apt-get -y install protobuf-compiler - uses: actions/setup-python@83679a892e2d95755f2dac6acb0bfd1e9ac5d548 # v.6.1.0 with: python-version: '3.11' @@ -90,7 +98,7 @@ jobs: workspaces: "./quickwit -> target" - name: Install nextest if: always() && steps.modified.outputs.rust_src == 'true' - uses: taiki-e/cache-cargo-install-action@34ce5120836e5f9f1508d8713d7fdea0e8facd6f # v3.0.1 + uses: taiki-e/install-action@aba36d755ec7ca22d38b12111787c26115943952 with: tool: cargo-nextest - name: cargo build @@ -132,7 +140,9 @@ jobs: - .github/workflows/ci.yml - name: Install Ubuntu packages if: always() && steps.modified.outputs.rust_src == 'true' - run: sudo apt-get -y install protobuf-compiler + run: | + sudo apt-get update + sudo apt-get -y install protobuf-compiler - name: Setup nightly Rust Toolchain (for rustfmt) if: steps.modified.outputs.rust_src == 'true' uses: dtolnay/rust-toolchain@f7ccc83f9ed1e5b9c81d8a67d7ad1a747e22a561 # master diff --git a/quickwit/quickwit-ingest/src/ingest_v2/broadcast/ingester_capacity_score.rs b/quickwit/quickwit-ingest/src/ingest_v2/broadcast/ingester_capacity_score.rs index 9531db17deb..482f5f58886 100644 --- a/quickwit/quickwit-ingest/src/ingest_v2/broadcast/ingester_capacity_score.rs +++ b/quickwit/quickwit-ingest/src/ingest_v2/broadcast/ingester_capacity_score.rs @@ -18,100 +18,17 @@ use anyhow::{Context, Result}; use bytesize::ByteSize; use quickwit_cluster::{Cluster, ListenerHandle}; use quickwit_common::pubsub::{Event, EventBroker}; -use quickwit_common::ring_buffer::RingBuffer; use quickwit_common::shared_consts::INGESTER_CAPACITY_SCORE_PREFIX; use quickwit_proto::ingest::ingester::IngesterStatus; -use quickwit_proto::types::{IndexUid, NodeId, SourceId, SourceUid}; +use quickwit_proto::types::{NodeId, SourceUid}; use serde::{Deserialize, 
Serialize}; use tokio::task::JoinHandle; use tracing::{info, warn}; use super::{BROADCAST_INTERVAL_PERIOD, make_key, parse_key}; +use crate::OpenShardCounts; use crate::ingest_v2::state::WeakIngesterState; -pub type OpenShardCounts = Vec<(IndexUid, SourceId, usize)>; - -/// The lookback window length is meant to capture readings far enough back in time to give -/// a rough rate of change estimate. At size 6, with broadcast interval of 5 seconds, this would be -/// 30 seconds of readings. -const WAL_CAPACITY_LOOKBACK_WINDOW_LEN: usize = 6; - -/// The ring buffer stores one extra element so that `delta()` can compare the newest reading -/// with the one that is exactly `WAL_CAPACITY_LOOKBACK_WINDOW_LEN` steps ago. Otherwise, that -/// reading would be discarded when the next reading is inserted. -const WAL_CAPACITY_READINGS_LEN: usize = WAL_CAPACITY_LOOKBACK_WINDOW_LEN + 1; - -struct WalDiskCapacityTimeSeries { - memory_capacity: ByteSize, - readings: RingBuffer, -} - -impl WalDiskCapacityTimeSeries { - fn new(memory_capacity: ByteSize) -> Self { - #[cfg(not(test))] - assert!(memory_capacity.as_u64() > 0); - Self { - memory_capacity, - readings: RingBuffer::default(), - } - } - - fn record(&mut self, memory_used: ByteSize) { - let remaining = 1.0 - (memory_used.as_u64() as f64 / self.memory_capacity.as_u64() as f64); - self.readings.push_back(remaining.clamp(0.0, 1.0)); - } - - fn current(&self) -> Option { - self.readings.last() - } - - /// How much remaining capacity changed between the oldest and newest readings. - /// Positive = improving, negative = draining. - fn delta(&self) -> Option { - let current = self.readings.last()?; - let oldest = self.readings.front()?; - Some(current - oldest) - } -} - -/// Computes a capacity score from 0 to 10 using a PD controller. -/// -/// The score has two components: -/// -/// - **P (proportional):** How much WAL capacity remains right now. 
An ingester with 100% free -/// capacity gets `PROPORTIONAL_WEIGHT` points; 50% gets half; and so on. If remaining capacity -/// drops to `MIN_PERMISSIBLE_CAPACITY` or below, the score is immediately 0. -/// -/// - **D (derivative):** Up to `DERIVATIVE_WEIGHT` bonus points based on how fast remaining -/// capacity is changing over the lookback window. A higher drain rate is worse, so we invert it: -/// `drain / MAX_DRAIN_RATE` normalizes the drain to a 0–1 penalty, and subtracting from 1 -/// converts it into a 0–1 bonus. Multiplied by `DERIVATIVE_WEIGHT`, a stable node gets the full -/// bonus and a node draining at `MAX_DRAIN_RATE` or faster gets nothing. -/// -/// Putting it together: a completely idle ingester scores 10 (8 + 2). -/// One that is full but stable scores ~2. One that is draining rapidly scores less. -/// A score of 0 means the ingester is at or below minimum permissible capacity. -/// -/// Below this remaining capacity fraction, the score is immediately 0. -const MIN_PERMISSIBLE_CAPACITY: f64 = 0.05; -/// Weight of the proportional term (max points from P). -const PROPORTIONAL_WEIGHT: f64 = 8.0; -/// Weight of the derivative term (max points from D). -const DERIVATIVE_WEIGHT: f64 = 2.0; -/// The drain rate (as a fraction of total capacity over the lookback window) at which the -/// derivative penalty is fully applied. Drain rates beyond this are clamped. 
-const MAX_DRAIN_RATE: f64 = 0.10; - -fn compute_capacity_score(remaining_capacity: f64, capacity_delta: f64) -> usize { - if remaining_capacity <= MIN_PERMISSIBLE_CAPACITY { - return 0; - } - let p = PROPORTIONAL_WEIGHT * remaining_capacity; - let drain = (-capacity_delta).clamp(0.0, MAX_DRAIN_RATE); - let d = DERIVATIVE_WEIGHT * (1.0 - drain / MAX_DRAIN_RATE); - (p + d).clamp(0.0, 10.0) as usize -} - #[derive(Debug, Clone, Default, Serialize, Deserialize)] pub struct IngesterCapacityScore { pub capacity_score: usize, @@ -123,24 +40,18 @@ pub struct IngesterCapacityScore { pub struct BroadcastIngesterCapacityScoreTask { cluster: Cluster, weak_state: WeakIngesterState, - wal_capacity_time_series: WalDiskCapacityTimeSeries, } impl BroadcastIngesterCapacityScoreTask { - pub fn spawn( - cluster: Cluster, - weak_state: WeakIngesterState, - disk_capacity: ByteSize, - ) -> JoinHandle<()> { + pub fn spawn(cluster: Cluster, weak_state: WeakIngesterState) -> JoinHandle<()> { let mut broadcaster = Self { cluster, weak_state, - wal_capacity_time_series: WalDiskCapacityTimeSeries::new(disk_capacity), }; tokio::spawn(async move { broadcaster.run().await }) } - async fn snapshot(&self) -> Result> { + async fn snapshot(&self) -> Result> { let state = self .weak_state .upgrade() @@ -152,15 +63,16 @@ impl BroadcastIngesterCapacityScoreTask { return Ok(None); } - let guard = state + let mut guard = state .lock_fully() .await .map_err(|_| anyhow::anyhow!("failed to acquire ingester state lock"))?; let usage = guard.mrecordlog.resource_usage(); let disk_used = ByteSize::b(usage.disk_used_bytes as u64); - let open_shard_counts = guard.get_open_shard_counts(); + let capacity_score = guard.wal_capacity_time_series.record_and_score(disk_used); + let (open_shard_counts, _) = guard.get_shard_snapshot(); - Ok(Some((disk_used, open_shard_counts))) + Ok(Some((capacity_score, open_shard_counts))) } async fn run(&mut self) { @@ -170,7 +82,7 @@ impl BroadcastIngesterCapacityScoreTask { loop { 
interval.tick().await; - let (disk_used, open_shard_counts) = match self.snapshot().await { + let (capacity_score, open_shard_counts) = match self.snapshot().await { Ok(Some(snapshot)) => snapshot, Ok(None) => continue, Err(error) => { @@ -179,12 +91,6 @@ impl BroadcastIngesterCapacityScoreTask { } }; - self.wal_capacity_time_series.record(disk_used); - - let remaining_capacity = self.wal_capacity_time_series.current().unwrap_or(1.0); - let capacity_delta = self.wal_capacity_time_series.delta().unwrap_or(0.0); - let capacity_score = compute_capacity_score(remaining_capacity, capacity_delta); - previous_sources = self .broadcast_capacity(capacity_score, &open_shard_counts, &previous_sources) .await; @@ -266,90 +172,12 @@ mod tests { use std::sync::atomic::{AtomicUsize, Ordering}; use quickwit_cluster::{ChannelTransport, create_cluster_for_test}; - use quickwit_proto::types::ShardId; + use quickwit_proto::types::{IndexUid, ShardId, SourceId}; use super::*; use crate::ingest_v2::models::IngesterShard; use crate::ingest_v2::state::IngesterState; - fn ts() -> WalDiskCapacityTimeSeries { - WalDiskCapacityTimeSeries::new(ByteSize::b(100)) - } - - /// Helper: record a reading with `used` bytes against the series' fixed capacity. 
- fn record(series: &mut WalDiskCapacityTimeSeries, used: u64) { - series.record(ByteSize::b(used)); - } - - #[test] - fn test_wal_disk_capacity_current_after_record() { - let mut series = WalDiskCapacityTimeSeries::new(ByteSize::b(256)); - // 192 of 256 used => 25% remaining - series.record(ByteSize::b(192)); - assert_eq!(series.current(), Some(0.25)); - - // 16 of 256 used => 93.75% remaining - series.record(ByteSize::b(16)); - assert_eq!(series.current(), Some(0.9375)); - } - - #[test] - fn test_wal_disk_capacity_record_saturates_at_zero() { - let mut series = ts(); - // 200 used out of 100 capacity => clamped to 0.0 - record(&mut series, 200); - assert_eq!(series.current(), Some(0.0)); - } - - #[test] - fn test_wal_disk_capacity_delta_growing() { - let mut series = ts(); - // oldest: 60 of 100 used => 40% remaining - record(&mut series, 60); - // current: 20 of 100 used => 80% remaining - record(&mut series, 20); - // delta = 0.80 - 0.40 = 0.40 - assert_eq!(series.delta(), Some(0.40)); - } - - #[test] - fn test_wal_disk_capacity_delta_shrinking() { - let mut series = ts(); - // oldest: 20 of 100 used => 80% remaining - record(&mut series, 20); - // current: 60 of 100 used => 40% remaining - record(&mut series, 60); - // delta = 0.40 - 0.80 = -0.40 - assert_eq!(series.delta(), Some(-0.40)); - } - - #[test] - fn test_capacity_score_draining_vs_stable() { - // Node A: capacity draining — usage increases 10, 20, ..., 70 over 7 ticks. - let mut node_a = ts(); - for used in (10..=70).step_by(10) { - record(&mut node_a, used); - } - let a_remaining = node_a.current().unwrap(); - let a_delta = node_a.delta().unwrap(); - let a_score = compute_capacity_score(a_remaining, a_delta); - - // Node B: steady at 50% usage over 7 ticks. 
- let mut node_b = ts(); - for _ in 0..7 { - record(&mut node_b, 50); - } - let b_remaining = node_b.current().unwrap(); - let b_delta = node_b.delta().unwrap(); - let b_score = compute_capacity_score(b_remaining, b_delta); - - // p=2.4, d=0 (max drain) => 2 - assert_eq!(a_score, 2); - // p=4, d=2 (stable) => 6 - assert_eq!(b_score, 6); - assert!(b_score > a_score); - } - #[tokio::test] async fn test_snapshot_state_dropped() { let transport = ChannelTransport::default(); @@ -363,7 +191,6 @@ mod tests { let task = BroadcastIngesterCapacityScoreTask { cluster, weak_state, - wal_capacity_time_series: WalDiskCapacityTimeSeries::new(ByteSize::mb(1)), }; assert!(task.snapshot().await.is_err()); } @@ -376,7 +203,9 @@ mod tests { .unwrap(); let event_broker = EventBroker::default(); - let (_temp_dir, state) = IngesterState::for_test().await; + // Use 1000 bytes disk capacity so 500 used => 50% remaining, 0 delta => score = 6 + let (_temp_dir, state) = + IngesterState::for_test_with_disk_capacity(ByteSize::b(1000)).await; let index_uid = IndexUid::for_test("test-index", 0); let mut state_guard = state.lock_partially().await.unwrap(); let shard = IngesterShard::new_solo( @@ -387,21 +216,18 @@ mod tests { .advertisable() .build(); state_guard.shards.insert(shard.queue_id(), shard); - let open_shard_counts = state_guard.get_open_shard_counts(); + let (open_shard_counts, _) = state_guard.get_shard_snapshot(); + let capacity_score = state_guard + .wal_capacity_time_series + .record_and_score(ByteSize::b(500)); drop(state_guard); - // Simulate 500 of 1000 bytes capacity used => 50% remaining, 0 delta => score = 6 - let mut task = BroadcastIngesterCapacityScoreTask { + assert_eq!(capacity_score, 6); + + let task = BroadcastIngesterCapacityScoreTask { cluster: cluster.clone(), weak_state: state.weak(), - wal_capacity_time_series: WalDiskCapacityTimeSeries::new(ByteSize::b(1000)), }; - task.wal_capacity_time_series.record(ByteSize::b(500)); - - let remaining = 
task.wal_capacity_time_series.current().unwrap(); - let delta = task.wal_capacity_time_series.delta().unwrap(); - let capacity_score = compute_capacity_score(remaining, delta); - assert_eq!(capacity_score, 6); let update_counter = Arc::new(AtomicUsize::new(0)); let update_counter_clone = update_counter.clone(); @@ -434,23 +260,4 @@ mod tests { assert_eq!(deserialized.capacity_score, 6); assert_eq!(deserialized.open_shard_count, 1); } - - #[test] - fn test_wal_disk_capacity_delta_spans_lookback_window() { - let mut series = ts(); - - // Fill to exactly the lookback window length (6 readings), all same value. - for _ in 0..WAL_CAPACITY_LOOKBACK_WINDOW_LEN { - record(&mut series, 50); - } - assert_eq!(series.delta(), Some(0.0)); - - // 7th reading fills the ring buffer. Delta spans 6 intervals. - record(&mut series, 0); - assert_eq!(series.delta(), Some(0.50)); - - // 8th reading evicts the oldest 50-remaining. Delta still spans 6 intervals. - record(&mut series, 0); - assert_eq!(series.delta(), Some(0.50)); - } } diff --git a/quickwit/quickwit-ingest/src/ingest_v2/debouncing.rs b/quickwit/quickwit-ingest/src/ingest_v2/debouncing.rs index 19d6f5d691d..041f2928c45 100644 --- a/quickwit/quickwit-ingest/src/ingest_v2/debouncing.rs +++ b/quickwit/quickwit-ingest/src/ingest_v2/debouncing.rs @@ -18,6 +18,7 @@ use std::sync::Arc; use quickwit_proto::control_plane::{ GetOrCreateOpenShardsRequest, GetOrCreateOpenShardsSubrequest, }; +use quickwit_proto::ingest::ShardIds; use quickwit_proto::types::{IndexId, SourceId}; use tokio::sync::{OwnedRwLockWriteGuard, RwLock}; @@ -68,6 +69,7 @@ impl GetOrCreateOpenShardsRequestDebouncer { #[derive(Default)] pub(super) struct DebouncedGetOrCreateOpenShardsRequest { subrequests: Vec, + pub closed_shards: Vec, pub unavailable_leaders: Vec, rendezvous: Rendezvous, } @@ -83,8 +85,8 @@ impl DebouncedGetOrCreateOpenShardsRequest { } let request = GetOrCreateOpenShardsRequest { subrequests: self.subrequests, + closed_shards: self.closed_shards, 
unavailable_leaders: self.unavailable_leaders, - ..Default::default() }; (Some(request), self.rendezvous) } diff --git a/quickwit/quickwit-ingest/src/ingest_v2/ingester.rs b/quickwit/quickwit-ingest/src/ingest_v2/ingester.rs index 5cd231d5a4a..0eafd689b66 100644 --- a/quickwit/quickwit-ingest/src/ingest_v2/ingester.rs +++ b/quickwit/quickwit-ingest/src/ingest_v2/ingester.rs @@ -132,11 +132,11 @@ impl Ingester { idle_shard_timeout: Duration, ) -> IngestV2Result { let self_node_id: NodeId = cluster.self_node_id().into(); - let state = IngesterState::load(wal_dir_path, rate_limiter_settings); + let state = IngesterState::load(wal_dir_path, disk_capacity, rate_limiter_settings); let weak_state = state.weak(); BroadcastLocalShardsTask::spawn(cluster.clone(), weak_state.clone()); - BroadcastIngesterCapacityScoreTask::spawn(cluster, weak_state.clone(), disk_capacity); + BroadcastIngesterCapacityScoreTask::spawn(cluster, weak_state.clone()); CloseIdleShardsTask::spawn(weak_state, idle_shard_timeout); let ingester = Self { @@ -468,6 +468,7 @@ impl Ingester { leader_id: leader_id.into(), successes: Vec::new(), failures: persist_failures, + routing_update: None, }; return Ok(persist_response); } @@ -788,15 +789,33 @@ impl Ingester { } } let wal_usage = state_guard.mrecordlog.resource_usage(); - drop(state_guard); - let disk_used = wal_usage.disk_used_bytes as u64; + let (open_shard_counts, closed_shards) = state_guard.get_shard_snapshot(); + let capacity_score = state_guard + .wal_capacity_time_series + .score(ByteSize::b(disk_used)) as u32; + drop(state_guard); if disk_used >= self.disk_capacity.as_u64() * 90 / 100 { self.background_reset_shards(); } report_wal_usage(wal_usage); + let source_shard_updates = open_shard_counts + .into_iter() + .map(|(index_uid, source_id, count)| SourceShardUpdate { + index_uid: Some(index_uid), + source_id, + open_shard_count: count as u32, + }) + .collect(); + + let routing_update = RoutingUpdate { + capacity_score, + source_shard_updates, + 
closed_shards, + }; + #[cfg(test)] { persist_successes.sort_by_key(|success| success.subrequest_id); @@ -807,6 +826,7 @@ impl Ingester { leader_id, successes: persist_successes, failures: persist_failures, + routing_update: Some(routing_update), }; Ok(persist_response) } diff --git a/quickwit/quickwit-ingest/src/ingest_v2/mod.rs b/quickwit/quickwit-ingest/src/ingest_v2/mod.rs index 0bb3d6b6138..f6c1cd42732 100644 --- a/quickwit/quickwit-ingest/src/ingest_v2/mod.rs +++ b/quickwit/quickwit-ingest/src/ingest_v2/mod.rs @@ -31,6 +31,7 @@ mod router; #[allow(dead_code)] mod routing_table; mod state; +mod wal_capacity_timeseries; mod workbench; use std::collections::HashMap; @@ -50,7 +51,9 @@ use quickwit_common::tower::Pool; use quickwit_proto::ingest::ingester::IngesterServiceClient; use quickwit_proto::ingest::router::{IngestRequestV2, IngestSubrequest}; use quickwit_proto::ingest::{CommitTypeV2, DocBatchV2}; -use quickwit_proto::types::{DocUid, DocUidGenerator, IndexId, NodeId, SubrequestId}; +use quickwit_proto::types::{ + DocUid, DocUidGenerator, IndexId, IndexUid, NodeId, SourceId, SubrequestId, +}; use serde::Serialize; use tracing::{error, info}; use workbench::pending_subrequests; @@ -70,6 +73,8 @@ pub type LeaderId = NodeId; pub type FollowerId = NodeId; +pub type OpenShardCounts = Vec<(IndexUid, SourceId, usize)>; + const IDLE_SHARD_TIMEOUT_ENV_KEY: &str = "QW_IDLE_SHARD_TIMEOUT_SECS"; const DEFAULT_IDLE_SHARD_TIMEOUT: Duration = Duration::from_secs(15 * 60); // 15 minutes diff --git a/quickwit/quickwit-ingest/src/ingest_v2/node_routing_table.rs b/quickwit/quickwit-ingest/src/ingest_v2/node_routing_table.rs index f354011ede2..6b23fcd1aca 100644 --- a/quickwit/quickwit-ingest/src/ingest_v2/node_routing_table.rs +++ b/quickwit/quickwit-ingest/src/ingest_v2/node_routing_table.rs @@ -14,8 +14,9 @@ use std::collections::{HashMap, HashSet}; +use itertools::Itertools; use quickwit_proto::ingest::Shard; -use quickwit_proto::types::{IndexId, IndexUid, NodeId, SourceId, 
SourceUid}; +use quickwit_proto::types::{IndexId, IndexUid, NodeId, SourceId}; use rand::rng; use rand::seq::IndexedRandom; @@ -36,7 +37,7 @@ pub(super) struct IngesterNode { pub open_shard_count: usize, } -#[derive(Debug)] +#[derive(Debug, Default)] pub(super) struct RoutingEntry { pub nodes: HashMap, } @@ -135,67 +136,59 @@ impl NodeBasedRoutingTable { pub fn apply_capacity_update( &mut self, node_id: NodeId, - source_uid: SourceUid, + index_uid: IndexUid, + source_id: SourceId, capacity_score: usize, open_shard_count: usize, ) { - let key = ( - source_uid.index_uid.index_id.to_string(), - source_uid.source_id.clone(), - ); - - let entry = self.table.entry(key).or_insert_with(|| RoutingEntry { - nodes: HashMap::new(), - }); + let key = (index_uid.index_id.to_string(), source_id.clone()); + let entry = self.table.entry(key).or_default(); let ingester_node = IngesterNode { node_id: node_id.clone(), - index_uid: source_uid.index_uid, - source_id: source_uid.source_id, + index_uid, + source_id, capacity_score, open_shard_count, }; entry.nodes.insert(node_id, ingester_node); } - /// Merges nodes from a GetOrCreateOpenShards control plane response into the - /// table. Only adds nodes that aren't already present — existing nodes keep - /// their real capacity scores from the broadcast. - /// TODO: New nodes get a default capacity_score of 5 until GetOrCreateOpenShards contains - /// capacity scores. + /// Merges routing updates from a GetOrCreateOpenShards control plane response into the + /// table. For existing nodes, updates their open shard count, including if the count is 0, from + /// the CP response while preserving capacity scores if they already exist. + /// New nodes get a default capacity_score of 5. 
pub fn merge_from_shards( &mut self, index_uid: IndexUid, source_id: SourceId, shards: Vec, ) { - let key = (index_uid.index_id.to_string(), source_id.clone()); - - let mut per_leader_count: HashMap = HashMap::new(); - for shard in &shards { - if shard.is_open() { - *per_leader_count - .entry(NodeId::from(shard.leader_id.clone())) - .or_default() += 1; - } - } + let per_leader_count: HashMap = shards + .iter() + .map(|shard| { + let num_open_shards = shard.is_open() as usize; + let leader_id = NodeId::from(shard.leader_id.clone()); + (leader_id, num_open_shards) + }) + .into_grouping_map() + .sum(); - let entry = self.table.entry(key).or_insert_with(|| RoutingEntry { - nodes: HashMap::new(), - }); + let key = (index_uid.index_id.to_string(), source_id.clone()); + let entry = self.table.entry(key).or_default(); for (node_id, open_shard_count) in per_leader_count { - if entry.nodes.contains_key(&node_id) { - continue; - } - let ingester_node = IngesterNode { - node_id: node_id.clone(), - index_uid: index_uid.clone(), - source_id: source_id.clone(), - capacity_score: 5, - open_shard_count, - }; - entry.nodes.insert(node_id, ingester_node); + entry + .nodes + .entry(node_id.clone()) + .and_modify(|node| node.open_shard_count = open_shard_count) + .or_insert_with(|| IngesterNode { + node_id, + index_uid: index_uid.clone(), + source_id: source_id.clone(), + capacity_score: 5, + open_shard_count, + }); } } } @@ -208,37 +201,53 @@ mod tests { use super::*; - fn source_uid(index_id: &str, incarnation_id: u128, source_id: &str) -> SourceUid { - SourceUid { - index_uid: IndexUid::for_test(index_id, incarnation_id), - source_id: source_id.to_string(), - } - } - #[test] fn test_apply_capacity_update() { let mut table = NodeBasedRoutingTable::default(); - let uid = source_uid("test-index", 0, "test-source"); - let key = ("test-index".to_string(), "test-source".to_string()); + let key = ("test-index".to_string(), "test-source".into()); // Insert first node. 
- table.apply_capacity_update("node-1".into(), uid.clone(), 8, 3); + table.apply_capacity_update( + "node-1".into(), + IndexUid::for_test("test-index", 0), + "test-source".into(), + 8, + 3, + ); let entry = table.table.get(&key).unwrap(); assert_eq!(entry.nodes.len(), 1); assert_eq!(entry.nodes.get("node-1").unwrap().capacity_score, 8); // Update existing node. - table.apply_capacity_update("node-1".into(), uid.clone(), 4, 5); + table.apply_capacity_update( + "node-1".into(), + IndexUid::for_test("test-index", 0), + "test-source".into(), + 4, + 5, + ); let node = table.table.get(&key).unwrap().nodes.get("node-1").unwrap(); assert_eq!(node.capacity_score, 4); assert_eq!(node.open_shard_count, 5); // Add second node. - table.apply_capacity_update("node-2".into(), uid.clone(), 6, 2); + table.apply_capacity_update( + "node-2".into(), + IndexUid::for_test("test-index", 0), + "test-source".into(), + 6, + 2, + ); assert_eq!(table.table.get(&key).unwrap().nodes.len(), 2); // Zero shards: node stays in table but becomes ineligible for routing. - table.apply_capacity_update("node-1".into(), uid.clone(), 0, 0); + table.apply_capacity_update( + "node-1".into(), + IndexUid::for_test("test-index", 0), + "test-source".into(), + 0, + 0, + ); let entry = table.table.get(&key).unwrap(); assert_eq!(entry.nodes.len(), 2); assert_eq!(entry.nodes.get("node-1").unwrap().open_shard_count, 0); @@ -249,13 +258,18 @@ mod tests { fn test_has_open_nodes() { let mut table = NodeBasedRoutingTable::default(); let pool = IngesterPool::default(); - let uid = source_uid("test-index", 0, "test-source"); // Empty table. assert!(!table.has_open_nodes("test-index", "test-source", &pool, &HashSet::new())); // Node exists but is not in pool. 
- table.apply_capacity_update("node-1".into(), uid.clone(), 8, 3); + table.apply_capacity_update( + "node-1".into(), + IndexUid::for_test("test-index", 0), + "test-source".into(), + 8, + 3, + ); assert!(!table.has_open_nodes("test-index", "test-source", &pool, &HashSet::new())); // Node is in pool → true. @@ -267,12 +281,24 @@ mod tests { assert!(!table.has_open_nodes("test-index", "test-source", &pool, &unavailable)); // Second node available → true despite first being unavailable. - table.apply_capacity_update("node-2".into(), uid.clone(), 6, 2); + table.apply_capacity_update( + "node-2".into(), + IndexUid::for_test("test-index", 0), + "test-source".into(), + 6, + 2, + ); pool.insert("node-2".into(), IngesterServiceClient::mocked()); assert!(table.has_open_nodes("test-index", "test-source", &pool, &unavailable)); // Node with capacity_score=0 is not eligible. - table.apply_capacity_update("node-2".into(), uid, 0, 2); + table.apply_capacity_update( + "node-2".into(), + IndexUid::for_test("test-index", 0), + "test-source".into(), + 0, + 2, + ); assert!(!table.has_open_nodes("test-index", "test-source", &pool, &unavailable)); } @@ -280,11 +306,16 @@ mod tests { fn test_pick_node() { let mut table = NodeBasedRoutingTable::default(); let pool = IngesterPool::default(); - let uid = source_uid("test-index", 0, "test-source"); let key = ("test-index".to_string(), "test-source".to_string()); // Node exists but not in pool → None. - table.apply_capacity_update("node-1".into(), uid.clone(), 8, 3); + table.apply_capacity_update( + "node-1".into(), + IndexUid::for_test("test-index", 0), + "test-source".into(), + 8, + 3, + ); assert!( table .table @@ -305,7 +336,13 @@ mod tests { assert_eq!(picked.node_id, NodeId::from("node-1")); // Multiple nodes → something is returned. 
- table.apply_capacity_update("node-2".into(), uid.clone(), 2, 1); + table.apply_capacity_update( + "node-2".into(), + IndexUid::for_test("test-index", 0), + "test-source".into(), + 2, + 1, + ); pool.insert("node-2".into(), IngesterServiceClient::mocked()); assert!( table @@ -317,8 +354,20 @@ mod tests { ); // Node with capacity_score=0 is skipped. - table.apply_capacity_update("node-1".into(), uid.clone(), 0, 3); - table.apply_capacity_update("node-2".into(), uid, 0, 1); + table.apply_capacity_update( + "node-1".into(), + IndexUid::for_test("test-index", 0), + "test-source".into(), + 0, + 3, + ); + table.apply_capacity_update( + "node-2".into(), + IndexUid::for_test("test-index", 0), + "test-source".into(), + 0, + 1, + ); assert!( table .table @@ -385,17 +434,18 @@ mod tests { ..Default::default() }; - // Two open shards on node-1, one open on node-2, one closed (ignored). + // Two open shards on node-1, one open + one closed on node-2, only closed on node-3. let shards = vec![ make_shard(1, "node-1", true), make_shard(2, "node-1", true), make_shard(3, "node-2", true), make_shard(4, "node-2", false), + make_shard(5, "node-3", false), ]; table.merge_from_shards(index_uid.clone(), "test-source".into(), shards); let entry = table.table.get(&key).unwrap(); - assert_eq!(entry.nodes.len(), 2); + assert_eq!(entry.nodes.len(), 3); let n1 = entry.nodes.get("node-1").unwrap(); assert_eq!(n1.open_shard_count, 2); @@ -404,14 +454,18 @@ mod tests { let n2 = entry.nodes.get("node-2").unwrap(); assert_eq!(n2.open_shard_count, 1); + let n3 = entry.nodes.get("node-3").unwrap(); + assert_eq!(n3.open_shard_count, 0); + // Merging again adds new nodes but preserves existing ones. 
- let shards = vec![make_shard(10, "node-3", true)]; + let shards = vec![make_shard(10, "node-4", true)]; table.merge_from_shards(index_uid, "test-source".into(), shards); let entry = table.table.get(&key).unwrap(); - assert_eq!(entry.nodes.len(), 3); + assert_eq!(entry.nodes.len(), 4); assert!(entry.nodes.contains_key("node-1")); assert!(entry.nodes.contains_key("node-2")); assert!(entry.nodes.contains_key("node-3")); + assert!(entry.nodes.contains_key("node-4")); } } diff --git a/quickwit/quickwit-ingest/src/ingest_v2/router.rs b/quickwit/quickwit-ingest/src/ingest_v2/router.rs index da3d989d93e..1f5ca26865e 100644 --- a/quickwit/quickwit-ingest/src/ingest_v2/router.rs +++ b/quickwit/quickwit-ingest/src/ingest_v2/router.rs @@ -98,7 +98,9 @@ pub struct IngestRouter { } struct RouterState { + // Debounces `GetOrCreateOpenShardsRequest` requests to the control plane. debouncer: GetOrCreateOpenShardsRequestDebouncer, + // Routing table of nodes, their WAL capacity, and the number of open shards per source. 
node_routing_table: NodeBasedRoutingTable, } @@ -152,7 +154,7 @@ impl IngestRouter { ingester_pool: &IngesterPool, ) -> DebouncedGetOrCreateOpenShardsRequest { let mut debounced_request = DebouncedGetOrCreateOpenShardsRequest::default(); - let unavailable_leaders = &workbench.unavailable_leaders; + let unavailable_leaders: &HashSet = &workbench.unavailable_leaders; let mut state_guard = self.state.lock().await; @@ -186,6 +188,12 @@ impl IngestRouter { } drop(state_guard); + if !debounced_request.is_empty() && !workbench.closed_shards.is_empty() { + info!(closed_shards=?workbench.closed_shards, "reporting closed shard(s) to control plane"); + debounced_request + .closed_shards + .append(&mut workbench.closed_shards); + } if !debounced_request.is_empty() && !unavailable_leaders.is_empty() { info!(unavailable_leaders=?unavailable_leaders, "reporting unavailable leader(s) to control plane"); @@ -265,6 +273,8 @@ impl IngestRouter { while let Some((persist_summary, persist_result)) = persist_futures.next().await { match persist_result { Ok(persist_response) => { + let leader_id = NodeId::from(persist_response.leader_id.clone()); + for persist_success in persist_response.successes { workbench.record_persist_success(persist_success); } @@ -272,16 +282,37 @@ impl IngestRouter { workbench.record_persist_failure(&persist_failure); match persist_failure.reason() { - PersistFailureReason::NoShardsAvailable => {} + PersistFailureReason::NoShardsAvailable => { + // For non-critical failures, we don't mark the nodes unavailable; + // a routing update is piggybacked on PersistResponses, so shard + // counts and capacity scores will be fresh on the next try. 
+ } PersistFailureReason::NodeUnavailable | PersistFailureReason::WalFull | PersistFailureReason::Timeout => { - unavailable_leaders - .insert(NodeId::from(persist_response.leader_id.clone())); + unavailable_leaders.insert(leader_id.clone()); } _ => {} } } + + if let Some(routing_update) = persist_response.routing_update { + // Since we just talked to the node, we take advantage and use the + // opportunity to get a fresh routing update. + let mut state_guard = self.state.lock().await; + for shard_update in routing_update.source_shard_updates { + state_guard.node_routing_table.apply_capacity_update( + leader_id.clone(), + shard_update.index_uid().clone(), + shard_update.source_id, + routing_update.capacity_score as usize, + shard_update.open_shard_count as usize, + ); + } + drop(state_guard); + + workbench.closed_shards.extend(routing_update.closed_shards); + } } Err(persist_error) => { if workbench.is_last_attempt() { @@ -574,7 +605,8 @@ impl EventSubscriber for WeakRouterState { let mut state_guard = state.lock().await; state_guard.node_routing_table.apply_capacity_update( update.node_id, - update.source_uid, + update.source_uid.index_uid, + update.source_uid.source_id, update.capacity_score, update.open_shard_count, ); @@ -593,7 +625,8 @@ mod tests { GetOrCreateOpenShardsResponse, GetOrCreateOpenShardsSuccess, MockControlPlaneService, }; use quickwit_proto::ingest::ingester::{ - IngesterServiceClient, MockIngesterService, PersistFailure, PersistResponse, PersistSuccess, + IngesterServiceClient, MockIngesterService, PersistFailure, PersistResponse, + PersistSuccess, RoutingUpdate, SourceShardUpdate, }; use quickwit_proto::ingest::router::IngestSubrequest; use quickwit_proto::ingest::{ @@ -630,10 +663,8 @@ mod tests { let mut state_guard = router.state.lock().await; state_guard.node_routing_table.apply_capacity_update( "test-ingester-0".into(), - SourceUid { - index_uid: IndexUid::for_test("test-index-0", 0), - source_id: "test-source".to_string(), - }, + 
IndexUid::for_test("test-index-0", 0), + "test-source".to_string(), 8, 1, ); @@ -1042,6 +1073,11 @@ mod tests { ..Default::default() }], failures: Vec::new(), + routing_update: Some(RoutingUpdate { + capacity_score: 6, + source_shard_updates: Vec::new(), + ..Default::default() + }), }); (persist_summary, persist_result) }); @@ -1094,6 +1130,11 @@ mod tests { shard_id: Some(ShardId::from(1)), reason: PersistFailureReason::NoShardsAvailable as i32, }], + routing_update: Some(RoutingUpdate { + capacity_score: 6, + source_shard_updates: Vec::new(), + ..Default::default() + }), }); (persist_summary, persist_result) }); @@ -1265,6 +1306,11 @@ mod tests { }], }], failures: Vec::new(), + routing_update: Some(RoutingUpdate { + capacity_score: 6, + source_shard_updates: Vec::new(), + ..Default::default() + }), }) }); ingester_pool.insert( @@ -1293,6 +1339,11 @@ mod tests { parse_failures: Vec::new(), }], failures: Vec::new(), + routing_update: Some(RoutingUpdate { + capacity_score: 6, + source_shard_updates: Vec::new(), + ..Default::default() + }), }) }); ingester_pool.insert( @@ -1360,8 +1411,9 @@ mod tests { } let mut mock_ingester_0 = MockIngesterService::new(); - let index_uid_clone = index_uid.clone(); // First attempt: returns NoShardsAvailable (transient, doesn't mark leader unavailable). + // The response still reports capacity_score=6 and 1 open shard so the node stays routable. + let index_uid_clone = index_uid.clone(); mock_ingester_0 .expect_persist() .once() @@ -1376,6 +1428,15 @@ mod tests { shard_id: Some(ShardId::from(1)), reason: PersistFailureReason::NoShardsAvailable as i32, }], + routing_update: Some(RoutingUpdate { + capacity_score: 6, + source_shard_updates: vec![SourceShardUpdate { + index_uid: Some(index_uid_clone.clone()), + source_id: "test-source".to_string(), + open_shard_count: 1, + }], + ..Default::default() + }), }) }); // Second attempt: succeeds. 
@@ -1395,6 +1456,11 @@ mod tests { parse_failures: Vec::new(), }], failures: Vec::new(), + routing_update: Some(RoutingUpdate { + capacity_score: 6, + source_shard_updates: Vec::new(), + ..Default::default() + }), }) }); ingester_pool.insert( @@ -1524,11 +1590,20 @@ mod tests { successes: Vec::new(), failures: vec![PersistFailure { subrequest_id: 0, - index_uid: Some(index_uid), + index_uid: Some(index_uid.clone()), source_id: "test-source".to_string(), shard_id: Some(ShardId::from(1)), reason: PersistFailureReason::NoShardsAvailable as i32, }], + routing_update: Some(RoutingUpdate { + capacity_score: 6, + source_shard_updates: vec![SourceShardUpdate { + index_uid: Some(index_uid), + source_id: "test-source".to_string(), + open_shard_count: 1, + }], + ..Default::default() + }), }; Ok(response) }); @@ -1629,6 +1704,11 @@ mod tests { shard_id: Some(ShardId::from(1)), reason: PersistFailureReason::NoShardsAvailable as i32, }], + routing_update: Some(RoutingUpdate { + capacity_score: 6, + source_shard_updates: Vec::new(), + ..Default::default() + }), }); (summary, result) }); @@ -1658,6 +1738,11 @@ mod tests { shard_id: Some(ShardId::from(1)), reason: PersistFailureReason::NodeUnavailable as i32, }], + routing_update: Some(RoutingUpdate { + capacity_score: 6, + source_shard_updates: Vec::new(), + ..Default::default() + }), }); (summary, result) }); @@ -1670,4 +1755,57 @@ mod tests { .contains(&NodeId::from("test-ingester-1")) ); } + + #[tokio::test] + async fn test_router_process_persist_results_applies_piggybacked_routing_updates() { + let router = IngestRouter::new( + "test-router".into(), + ControlPlaneServiceClient::from_mock(MockControlPlaneService::new()), + IngesterPool::default(), + 1, + EventBroker::default(), + ); + let ingest_subrequests = vec![IngestSubrequest { + subrequest_id: 0, + index_id: "test-index".to_string(), + source_id: "test-source".to_string(), + ..Default::default() + }]; + let mut workbench = IngestWorkbench::new(ingest_subrequests, 2); + + 
let persist_futures = FuturesUnordered::new(); + persist_futures.push(async { + let summary = PersistRequestSummary { + leader_id: "test-ingester-0".into(), + subrequest_ids: vec![0], + }; + let result = Ok::<_, IngestV2Error>(PersistResponse { + leader_id: "test-ingester-0".to_string(), + successes: Vec::new(), + failures: Vec::new(), + routing_update: Some(RoutingUpdate { + capacity_score: 3, + source_shard_updates: vec![SourceShardUpdate { + index_uid: Some(IndexUid::for_test("test-index", 0)), + source_id: "test-source".to_string(), + open_shard_count: 2, + }], + ..Default::default() + }), + }); + (summary, result) + }); + router + .process_persist_results(&mut workbench, persist_futures) + .await; + + let state_guard = router.state.lock().await; + let entry = state_guard + .node_routing_table + .find_entry("test-index", "test-source") + .unwrap(); + let node = entry.nodes.get("test-ingester-0").unwrap(); + assert_eq!(node.capacity_score, 3); + assert_eq!(node.open_shard_count, 2); + } } diff --git a/quickwit/quickwit-ingest/src/ingest_v2/state.rs b/quickwit/quickwit-ingest/src/ingest_v2/state.rs index a14f4ae9a44..e158bce7c58 100644 --- a/quickwit/quickwit-ingest/src/ingest_v2/state.rs +++ b/quickwit/quickwit-ingest/src/ingest_v2/state.rs @@ -19,6 +19,7 @@ use std::path::Path; use std::sync::{Arc, Weak}; use std::time::{Duration, Instant}; +use bytesize::ByteSize; use itertools::Itertools; use mrecordlog::error::{DeleteQueueError, TruncateError}; use quickwit_common::pretty::PrettyDisplay; @@ -26,7 +27,7 @@ use quickwit_common::rate_limiter::{RateLimiter, RateLimiterSettings}; use quickwit_doc_mapper::DocMapper; use quickwit_proto::control_plane::AdviseResetShardsResponse; use quickwit_proto::ingest::ingester::IngesterStatus; -use quickwit_proto::ingest::{IngestV2Error, IngestV2Result, ShardState}; +use quickwit_proto::ingest::{IngestV2Error, IngestV2Result, ShardIds, ShardState}; use quickwit_proto::types::{DocMappingUid, IndexUid, Position, QueueId, 
SourceId, split_queue_id}; use tokio::sync::{Mutex, MutexGuard, RwLock, RwLockMappedWriteGuard, RwLockWriteGuard, watch}; use tracing::{error, info}; @@ -34,9 +35,10 @@ use tracing::{error, info}; use super::models::IngesterShard; use super::rate_meter::RateMeter; use super::replication::{ReplicationStreamTaskHandle, ReplicationTaskHandle}; +use super::wal_capacity_timeseries::WalDiskCapacityTimeSeries; use crate::ingest_v2::mrecordlog_utils::{force_delete_queue, queue_position_range}; use crate::mrecordlog_async::MultiRecordLogAsync; -use crate::{FollowerId, LeaderId}; +use crate::{FollowerId, LeaderId, OpenShardCounts}; /// Stores the state of the ingester and attempts to prevent deadlocks by exposing an API that /// guarantees that the internal data structures are always locked in the same order. @@ -59,6 +61,7 @@ pub(super) struct InnerIngesterState { pub replication_streams: HashMap, // Replication tasks running for each replication stream opened with leaders. pub replication_tasks: HashMap, + pub wal_capacity_time_series: WalDiskCapacityTimeSeries, status: IngesterStatus, status_tx: watch::Sender, } @@ -89,20 +92,45 @@ impl InnerIngesterState { .map(|(_, shard)| shard) } - pub fn get_open_shard_counts(&self) -> Vec<(IndexUid, SourceId, usize)> { - self.shards + /// Returns per-source open shard counts and closed shard IDs for all advertisable, + /// non-replica shards. 
+ pub fn get_shard_snapshot(&self) -> (OpenShardCounts, Vec) { + let grouped = self + .shards .values() - .filter(|shard| shard.is_advertisable && !shard.is_replica() && shard.is_open()) - .map(|shard| (shard.index_uid.clone(), shard.source_id.clone())) - .counts() - .into_iter() - .map(|((index_uid, source_id), count)| (index_uid, source_id, count)) - .collect() + .filter(|shard| shard.is_advertisable && !shard.is_replica()) + .map(|shard| ((shard.index_uid.clone(), shard.source_id.clone()), shard)) + .into_group_map(); + + let mut open_counts = Vec::new(); + let mut closed_shards = Vec::new(); + + for ((index_uid, source_id), shards) in grouped { + let mut open_count = 0; + let mut closed_ids = Vec::new(); + + for shard in shards { + if shard.is_open() { + open_count += 1; + } else if shard.is_closed() { + closed_ids.push(shard.shard_id.clone()); + } + } + open_counts.push((index_uid.clone(), source_id.clone(), open_count)); + if !closed_ids.is_empty() { + closed_shards.push(ShardIds { + index_uid: Some(index_uid), + source_id, + shard_ids: closed_ids, + }); + } + } + (open_counts, closed_shards) } } impl IngesterState { - fn new() -> Self { + fn new(disk_capacity: ByteSize) -> Self { let status = IngesterStatus::Initializing; let (status_tx, status_rx) = watch::channel(status); let inner = InnerIngesterState { @@ -110,6 +138,7 @@ impl IngesterState { doc_mappers: Default::default(), replication_streams: Default::default(), replication_tasks: Default::default(), + wal_capacity_time_series: WalDiskCapacityTimeSeries::new(disk_capacity), status, status_tx, }; @@ -123,8 +152,12 @@ impl IngesterState { } } - pub fn load(wal_dir_path: &Path, rate_limiter_settings: RateLimiterSettings) -> Self { - let state = Self::new(); + pub fn load( + wal_dir_path: &Path, + disk_capacity: ByteSize, + rate_limiter_settings: RateLimiterSettings, + ) -> Self { + let state = Self::new(disk_capacity); let state_clone = state.clone(); let wal_dir_path = wal_dir_path.to_path_buf(); @@ 
-138,8 +171,17 @@ impl IngesterState { #[cfg(test)] pub async fn for_test() -> (tempfile::TempDir, Self) { + Self::for_test_with_disk_capacity(ByteSize::mb(256)).await + } + + #[cfg(test)] + pub async fn for_test_with_disk_capacity(disk_capacity: ByteSize) -> (tempfile::TempDir, Self) { let temp_dir = tempfile::tempdir().unwrap(); - let mut state = IngesterState::load(temp_dir.path(), RateLimiterSettings::default()); + let mut state = IngesterState::load( + temp_dir.path(), + disk_capacity, + RateLimiterSettings::default(), + ); state .status_rx @@ -488,7 +530,7 @@ mod tests { #[tokio::test] async fn test_ingester_state_does_not_lock_while_initializing() { - let state = IngesterState::new(); + let state = IngesterState::new(ByteSize::mb(256)); let inner_guard = state.inner.lock().await; assert_eq!(inner_guard.status(), IngesterStatus::Initializing); @@ -503,7 +545,7 @@ mod tests { #[tokio::test] async fn test_ingester_state_failed() { - let state = IngesterState::new(); + let state = IngesterState::new(ByteSize::mb(256)); state.inner.lock().await.set_status(IngesterStatus::Failed); @@ -516,7 +558,7 @@ mod tests { #[tokio::test] async fn test_ingester_state_init() { - let mut state = IngesterState::new(); + let mut state = IngesterState::new(ByteSize::mb(256)); let temp_dir = tempfile::tempdir().unwrap(); state @@ -672,61 +714,71 @@ mod tests { } #[tokio::test] - async fn test_get_open_shard_counts() { + async fn test_get_shard_snapshot() { let (_temp_dir, state) = IngesterState::for_test().await; let mut state_guard = state.lock_partially().await.unwrap(); - let index_a = IndexUid::for_test("index-a", 0); - let index_b = IndexUid::for_test("index-b", 0); - let index_c = IndexUid::for_test("index-c", 0); + let index_uid = IndexUid::for_test("test-index", 0); - // (index-a, source-a): 1 open solo shard. + // source-a: 2 open shards + 1 closed shard + 1 replica (ignored). 
let s = open_shard( - index_a.clone(), - SourceId::from("source-a"), + index_uid.clone(), + "source-a".into(), ShardId::from(1), false, ); state_guard.shards.insert(s.queue_id(), s); - - // (index-b, source-b): 1 open solo + 1 replica. Only the solo should be counted. let s = open_shard( - index_b.clone(), - SourceId::from("source-b"), + index_uid.clone(), + "source-a".into(), ShardId::from(2), false, ); state_guard.shards.insert(s.queue_id(), s); - let s = open_shard( - index_b.clone(), - SourceId::from("source-b"), - ShardId::from(3), - true, - ); + let s = IngesterShard::new_solo(index_uid.clone(), "source-a".into(), ShardId::from(3)) + .with_state(ShardState::Closed) + .advertisable() + .build(); + state_guard.shards.insert(s.queue_id(), s); + let s = open_shard(index_uid.clone(), "source-a".into(), ShardId::from(4), true); state_guard.shards.insert(s.queue_id(), s); - // (index-c, source-c): 2 open solo shards. - let s = open_shard( - index_c.clone(), - SourceId::from("source-c"), - ShardId::from(4), - false, - ); + // source-b: 2 closed shards, no open shards. + let s = IngesterShard::new_solo(index_uid.clone(), "source-b".into(), ShardId::from(5)) + .with_state(ShardState::Closed) + .advertisable() + .build(); state_guard.shards.insert(s.queue_id(), s); - let s = open_shard( - index_c.clone(), - SourceId::from("source-c"), - ShardId::from(5), - false, - ); + let s = IngesterShard::new_solo(index_uid.clone(), "source-b".into(), ShardId::from(6)) + .with_state(ShardState::Closed) + .advertisable() + .build(); state_guard.shards.insert(s.queue_id(), s); - let mut counts = state_guard.get_open_shard_counts(); - counts.sort_by(|a, b| a.0.cmp(&b.0)); + let (mut open_counts, mut closed_shards) = state_guard.get_shard_snapshot(); + + // Open counts: source-a has 2, source-b has 0. 
+ open_counts.sort_by(|a, b| a.1.cmp(&b.1)); + assert_eq!(open_counts.len(), 2); + assert_eq!( + open_counts[0], + (index_uid.clone(), SourceId::from("source-a"), 2) + ); + assert_eq!( + open_counts[1], + (index_uid.clone(), SourceId::from("source-b"), 0) + ); + + // Closed shards: source-a has shard 3, source-b has shards 5 and 6. + closed_shards.sort_by(|a, b| a.source_id.cmp(&b.source_id)); + assert_eq!(closed_shards.len(), 2); + + assert_eq!(closed_shards[0].source_id, "source-a"); + assert_eq!(closed_shards[0].shard_ids, vec![ShardId::from(3)]); - assert_eq!(counts.len(), 3); - assert_eq!(counts[0], (index_a, SourceId::from("source-a"), 1)); - assert_eq!(counts[1], (index_b, SourceId::from("source-b"), 1)); - assert_eq!(counts[2], (index_c, SourceId::from("source-c"), 2)); + assert_eq!(closed_shards[1].source_id, "source-b"); + let mut source_b_ids = closed_shards[1].shard_ids.clone(); + source_b_ids.sort(); + assert_eq!(source_b_ids, vec![ShardId::from(5), ShardId::from(6)]); } } diff --git a/quickwit/quickwit-ingest/src/ingest_v2/wal_capacity_timeseries.rs b/quickwit/quickwit-ingest/src/ingest_v2/wal_capacity_timeseries.rs new file mode 100644 index 00000000000..58f030cbf74 --- /dev/null +++ b/quickwit/quickwit-ingest/src/ingest_v2/wal_capacity_timeseries.rs @@ -0,0 +1,214 @@ +// Copyright 2021-Present Datadog, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +use bytesize::ByteSize; +use quickwit_common::ring_buffer::RingBuffer; + +/// The lookback window length is meant to capture readings far enough back in time to give +/// a rough rate of change estimate. At size 6, with broadcast interval of 5 seconds, this would be +/// 30 seconds of readings. +const WAL_CAPACITY_LOOKBACK_WINDOW_LEN: usize = 6; + +/// The ring buffer stores one extra element so that `delta()` can compare the newest reading +/// with the one that is exactly `WAL_CAPACITY_LOOKBACK_WINDOW_LEN` steps ago. Otherwise, that +/// reading would be discarded when the next reading is inserted. +const WAL_CAPACITY_READINGS_LEN: usize = WAL_CAPACITY_LOOKBACK_WINDOW_LEN + 1; + +/// Rolling series of remaining WAL disk capacity readings, from which a capacity score is derived. +/// +/// Each reading is stored as the fraction of `disk_capacity` still free, clamped to `[0.0, 1.0]`. +pub struct WalDiskCapacityTimeSeries { + disk_capacity: ByteSize, + readings: RingBuffer, +} + +impl WalDiskCapacityTimeSeries { + /// Creates an empty time series for a WAL disk of size `disk_capacity`. + /// + /// # Panics + /// + /// Panics if `disk_capacity` is zero. The check is compiled out of test builds. + pub fn new(disk_capacity: ByteSize) -> Self { + #[cfg(not(test))] + assert!(disk_capacity.as_u64() > 0); + Self { + disk_capacity, + readings: RingBuffer::default(), + } + } + + /// Records a disk usage reading and returns the resulting capacity score. + /// + /// The score is computed from the clamped reading that was just recorded and from the delta + /// across the retained readings. + pub fn record_and_score(&mut self, disk_used: ByteSize) -> usize { + self.record(disk_used); + let remaining = self.current().unwrap_or(1.0); + let delta = self.delta().unwrap_or(0.0); + compute_capacity_score(remaining, delta) + } + + /// Computes a capacity score for the given disk usage without recording it. + /// + /// The delta comes only from previously recorded readings; `disk_used` does not affect it. + /// Unlike `record`, the remaining fraction is not clamped here, so a usage above the capacity + /// yields a negative fraction, which `compute_capacity_score` maps to 0. + pub fn score(&self, disk_used: ByteSize) -> usize { + let remaining = 1.0 - (disk_used.as_u64() as f64 / self.disk_capacity.as_u64() as f64); + let delta = self.delta().unwrap_or(0.0); + compute_capacity_score(remaining, delta) + } + + /// Appends the remaining capacity fraction for `disk_used`, clamped to `[0.0, 1.0]`. + fn record(&mut self, disk_used: ByteSize) { + let remaining = 1.0 - (disk_used.as_u64() as f64 / self.disk_capacity.as_u64() as f64); + self.readings.push_back(remaining.clamp(0.0, 1.0)); + } + + /// Returns the most recently recorded remaining capacity fraction, or `None` if nothing has + /// been recorded yet. + fn current(&self) -> Option { + self.readings.last() + } + + /// How much remaining capacity changed between the oldest and newest readings.
+ /// Positive = improving, negative = draining. + /// + /// Returns `None` if no reading has been recorded yet; with a single reading the delta is 0. + fn delta(&self) -> Option { + let current = self.readings.last()?; + let oldest = self.readings.front()?; + Some(current - oldest) + } +} + +// Scoring model implemented by `compute_capacity_score`: a capacity score from 0 to 10 computed +// with a PD controller. +// +// The score has two components: +// +// - **P (proportional):** How much WAL capacity remains right now. An ingester with 100% free +// capacity gets `PROPORTIONAL_WEIGHT` points; 50% gets half; and so on. If remaining capacity +// drops to `MIN_PERMISSIBLE_CAPACITY` or below, the score is immediately 0. +// +// - **D (derivative):** Up to `DERIVATIVE_WEIGHT` bonus points based on how fast remaining +// capacity is changing over the lookback window. A higher drain rate is worse, so we invert it: +// `drain / MAX_DRAIN_RATE` normalizes the drain to a 0–1 penalty, and subtracting from 1 +// converts it into a 0–1 bonus. Multiplied by `DERIVATIVE_WEIGHT`, a stable node gets the full +// bonus and a node draining at `MAX_DRAIN_RATE` or faster gets nothing. A node whose remaining +// capacity is growing counts as stable. +// +// Putting it together: a completely idle ingester scores 10 (8 + 2). +// One that is nearly full (just above `MIN_PERMISSIBLE_CAPACITY`) but stable scores ~2. One that +// is draining rapidly scores less. The sum of both terms is truncated to an integer, so a score +// of 0 means the ingester is at or below minimum permissible capacity, or close to it while +// draining fast. + +/// At or below this remaining capacity fraction, the score is immediately 0. +const MIN_PERMISSIBLE_CAPACITY: f64 = 0.05; +/// Weight of the proportional term (max points from P). +const PROPORTIONAL_WEIGHT: f64 = 8.0; +/// Weight of the derivative term (max points from D). +const DERIVATIVE_WEIGHT: f64 = 2.0; +/// The drain rate (as a fraction of total capacity over the lookback window) at which the +/// derivative penalty is fully applied. Drain rates beyond this are clamped.
+const MAX_DRAIN_RATE: f64 = 0.10; + +fn compute_capacity_score(remaining_capacity: f64, capacity_delta: f64) -> usize { + if remaining_capacity <= MIN_PERMISSIBLE_CAPACITY { + return 0; + } + let p = PROPORTIONAL_WEIGHT * remaining_capacity; + let drain = (-capacity_delta).clamp(0.0, MAX_DRAIN_RATE); + let d = DERIVATIVE_WEIGHT * (1.0 - drain / MAX_DRAIN_RATE); + (p + d).clamp(0.0, 10.0) as usize +} + +#[cfg(test)] +mod tests { + use super::*; + + fn ts() -> WalDiskCapacityTimeSeries { + WalDiskCapacityTimeSeries::new(ByteSize::b(100)) + } + + /// Helper: record a reading with `used` bytes against the series' fixed capacity. + fn record(series: &mut WalDiskCapacityTimeSeries, used: u64) { + series.record(ByteSize::b(used)); + } + + #[test] + fn test_wal_disk_capacity_current_after_record() { + let mut series = WalDiskCapacityTimeSeries::new(ByteSize::b(256)); + // 192 of 256 used => 25% remaining + series.record(ByteSize::b(192)); + assert_eq!(series.current(), Some(0.25)); + + // 16 of 256 used => 93.75% remaining + series.record(ByteSize::b(16)); + assert_eq!(series.current(), Some(0.9375)); + } + + #[test] + fn test_wal_disk_capacity_record_saturates_at_zero() { + let mut series = ts(); + // 200 used out of 100 capacity => clamped to 0.0 + record(&mut series, 200); + assert_eq!(series.current(), Some(0.0)); + } + + #[test] + fn test_wal_disk_capacity_delta_growing() { + let mut series = ts(); + // oldest: 60 of 100 used => 40% remaining + record(&mut series, 60); + // current: 20 of 100 used => 80% remaining + record(&mut series, 20); + // delta = 0.80 - 0.40 = 0.40 + assert_eq!(series.delta(), Some(0.40)); + } + + #[test] + fn test_wal_disk_capacity_delta_shrinking() { + let mut series = ts(); + // oldest: 20 of 100 used => 80% remaining + record(&mut series, 20); + // current: 60 of 100 used => 40% remaining + record(&mut series, 60); + // delta = 0.40 - 0.80 = -0.40 + assert_eq!(series.delta(), Some(-0.40)); + } + + #[test] + fn
test_capacity_score_draining_vs_stable() { + // Node A: capacity draining — usage increases 10, 20, ..., 70 over 7 ticks. + let mut node_a = ts(); + for used in (10..=70).step_by(10) { + record(&mut node_a, used); + } + let a_remaining = node_a.current().unwrap(); + let a_delta = node_a.delta().unwrap(); + let a_score = compute_capacity_score(a_remaining, a_delta); + + // Node B: steady at 50% usage over 7 ticks. + let mut node_b = ts(); + for _ in 0..7 { + record(&mut node_b, 50); + } + let b_remaining = node_b.current().unwrap(); + let b_delta = node_b.delta().unwrap(); + let b_score = compute_capacity_score(b_remaining, b_delta); + + // p=2.4, d=0 (max drain) => 2 + assert_eq!(a_score, 2); + // p=4, d=2 (stable) => 6 + assert_eq!(b_score, 6); + assert!(b_score > a_score); + } + + #[test] + fn test_wal_disk_capacity_delta_spans_lookback_window() { + let mut series = ts(); + + // Fill to exactly the lookback window length (6 readings), all same value. + for _ in 0..WAL_CAPACITY_LOOKBACK_WINDOW_LEN { + record(&mut series, 50); + } + assert_eq!(series.delta(), Some(0.0)); + + // 7th reading fills the ring buffer. Delta spans 6 intervals. + record(&mut series, 0); + assert_eq!(series.delta(), Some(0.50)); + + // 8th reading evicts the oldest 50-remaining. Delta still spans 6 intervals.
+ record(&mut series, 0); + assert_eq!(series.delta(), Some(0.50)); + } +} diff --git a/quickwit/quickwit-ingest/src/ingest_v2/workbench.rs b/quickwit/quickwit-ingest/src/ingest_v2/workbench.rs index 3e7b22969e8..fb52d8e8139 100644 --- a/quickwit/quickwit-ingest/src/ingest_v2/workbench.rs +++ b/quickwit/quickwit-ingest/src/ingest_v2/workbench.rs @@ -23,7 +23,7 @@ use quickwit_proto::ingest::ingester::{PersistFailure, PersistFailureReason, Per use quickwit_proto::ingest::router::{ IngestFailure, IngestFailureReason, IngestResponseV2, IngestSubrequest, IngestSuccess, }; -use quickwit_proto::ingest::{IngestV2Error, RateLimitingCause}; +use quickwit_proto::ingest::{IngestV2Error, RateLimitingCause, ShardIds}; use quickwit_proto::types::{NodeId, SubrequestId}; use tracing::warn; @@ -47,6 +47,7 @@ pub(super) struct IngestWorkbench { /// (The point here is to make sure we do not wait for the failure detection to kick the node /// out of the ingest node.) pub unavailable_leaders: HashSet, + pub closed_shards: Vec, publish_tracker: Option, } diff --git a/quickwit/quickwit-proto/protos/quickwit/ingester.proto b/quickwit/quickwit-proto/protos/quickwit/ingester.proto index 25a4705d58a..23ff6e6825d 100644 --- a/quickwit/quickwit-proto/protos/quickwit/ingester.proto +++ b/quickwit/quickwit-proto/protos/quickwit/ingester.proto @@ -81,6 +81,19 @@ message PersistResponse { string leader_id = 1; repeated PersistSuccess successes = 2; repeated PersistFailure failures = 3; + RoutingUpdate routing_update = 4; +} + +message RoutingUpdate { + uint32 capacity_score = 1; + repeated SourceShardUpdate source_shard_updates = 2; + repeated quickwit.ingest.ShardIds closed_shards = 3; +} + +message SourceShardUpdate { + quickwit.common.IndexUid index_uid = 1; + string source_id = 2; + uint32 open_shard_count = 3; } message PersistSuccess { diff --git a/quickwit/quickwit-proto/src/codegen/quickwit/quickwit.ingest.ingester.rs
b/quickwit/quickwit-proto/src/codegen/quickwit/quickwit.ingest.ingester.rs index 07b8d5b64a1..606dce48e2e 100644 --- a/quickwit/quickwit-proto/src/codegen/quickwit/quickwit.ingest.ingester.rs +++ b/quickwit/quickwit-proto/src/codegen/quickwit/quickwit.ingest.ingester.rs @@ -51,6 +51,28 @@ pub struct PersistResponse { pub successes: ::prost::alloc::vec::Vec, #[prost(message, repeated, tag = "3")] pub failures: ::prost::alloc::vec::Vec, + #[prost(message, optional, tag = "4")] + pub routing_update: ::core::option::Option, +} +#[derive(serde::Serialize, serde::Deserialize, utoipa::ToSchema)] +#[derive(Clone, PartialEq, ::prost::Message)] +pub struct RoutingUpdate { + #[prost(uint32, tag = "1")] + pub capacity_score: u32, + #[prost(message, repeated, tag = "2")] + pub source_shard_updates: ::prost::alloc::vec::Vec, + #[prost(message, repeated, tag = "3")] + pub closed_shards: ::prost::alloc::vec::Vec, +} +#[derive(serde::Serialize, serde::Deserialize, utoipa::ToSchema)] +#[derive(Clone, PartialEq, Eq, Hash, ::prost::Message)] +pub struct SourceShardUpdate { + #[prost(message, optional, tag = "1")] + pub index_uid: ::core::option::Option, + #[prost(string, tag = "2")] + pub source_id: ::prost::alloc::string::String, + #[prost(uint32, tag = "3")] + pub open_shard_count: u32, } #[derive(serde::Serialize, serde::Deserialize, utoipa::ToSchema)] #[derive(Clone, PartialEq, ::prost::Message)] diff --git a/quickwit/quickwit-proto/src/getters.rs b/quickwit/quickwit-proto/src/getters.rs index a327c1717a7..d33901bde47 100644 --- a/quickwit/quickwit-proto/src/getters.rs +++ b/quickwit/quickwit-proto/src/getters.rs @@ -111,6 +111,7 @@ generate_getters! 
{ ShardIds, ShardPKey, TruncateShardsSubrequest, + SourceShardUpdate, // Metastore API AcquireShardsRequest, From 115553b65086a831c9656968cedeefc2027b4957 Mon Sep 17 00:00:00 2001 From: nadav-govari Date: Tue, 3 Mar 2026 14:10:38 -0500 Subject: [PATCH 5/9] Remove unused shard_ids in persist protos (#6169) --- .../quickwit-ingest/src/ingest_v2/ingester.rs | 30 ------------------- quickwit/quickwit-ingest/src/ingest_v2/mod.rs | 1 - .../src/ingest_v2/node_routing_table.rs | 1 + .../quickwit-ingest/src/ingest_v2/router.rs | 9 ------ .../src/ingest_v2/workbench.rs | 2 -- .../protos/quickwit/ingester.proto | 2 -- .../quickwit/quickwit.ingest.ingester.rs | 4 --- quickwit/quickwit-proto/src/getters.rs | 2 -- .../quickwit-proto/src/ingest/ingester.rs | 6 ---- 9 files changed, 1 insertion(+), 56 deletions(-) diff --git a/quickwit/quickwit-ingest/src/ingest_v2/ingester.rs b/quickwit/quickwit-ingest/src/ingest_v2/ingester.rs index 0eafd689b66..a131e2c289a 100644 --- a/quickwit/quickwit-ingest/src/ingest_v2/ingester.rs +++ b/quickwit/quickwit-ingest/src/ingest_v2/ingester.rs @@ -459,7 +459,6 @@ impl Ingester { subrequest_id: subrequest.subrequest_id, index_uid: subrequest.index_uid, source_id: subrequest.source_id, - shard_id: subrequest.shard_id, reason: PersistFailureReason::NodeUnavailable as i32, }; persist_failures.push(persist_failure); @@ -490,7 +489,6 @@ impl Ingester { subrequest_id: subrequest.subrequest_id, index_uid: subrequest.index_uid, source_id: subrequest.source_id, - shard_id: subrequest.shard_id, reason: PersistFailureReason::NoShardsAvailable as i32, }; persist_failures.push(persist_failure); @@ -531,7 +529,6 @@ impl Ingester { subrequest_id: subrequest.subrequest_id, index_uid: subrequest.index_uid, source_id: subrequest.source_id, - shard_id: Some(shard_id), reason: PersistFailureReason::WalFull as i32, }; persist_failures.push(persist_failure); @@ -549,7 +546,6 @@ impl Ingester { subrequest_id: subrequest.subrequest_id, index_uid: subrequest.index_uid, 
source_id: subrequest.source_id, - shard_id: Some(shard_id), reason: PersistFailureReason::NoShardsAvailable as i32, }; persist_failures.push(persist_failure); @@ -687,7 +683,6 @@ impl Ingester { subrequest_id: replicate_failure.subrequest_id, index_uid: replicate_failure.index_uid, source_id: replicate_failure.source_id, - shard_id: replicate_failure.shard_id, reason: persist_failure_reason as i32, }; persist_failures.push(persist_failure); @@ -737,7 +732,6 @@ impl Ingester { subrequest_id: subrequest.subrequest_id, index_uid: subrequest.index_uid, source_id: subrequest.source_id, - shard_id: subrequest.shard_id, reason: reason as i32, }; persist_failures.push(persist_failure); @@ -1771,14 +1765,12 @@ mod tests { subrequest_id: 0, index_uid: Some(index_uid.clone()), source_id: "test-source".to_string(), - shard_id: Some(ShardId::from(1)), doc_batch: Some(DocBatchV2::for_test([r#"{"doc": "test-doc-010"}"#])), }, PersistSubrequest { subrequest_id: 1, index_uid: Some(index_uid2.clone()), source_id: "test-source".to_string(), - shard_id: Some(ShardId::from(1)), doc_batch: Some(DocBatchV2::for_test([ r#"{"doc": "test-doc-110"}"#, r#"{"doc": "test-doc-111"}"#, @@ -1795,7 +1787,6 @@ mod tests { assert_eq!(persist_success_0.subrequest_id, 0); assert_eq!(persist_success_0.index_uid(), &index_uid); assert_eq!(persist_success_0.source_id, "test-source"); - assert_eq!(persist_success_0.shard_id(), ShardId::from(1)); assert_eq!( persist_success_0.replication_position_inclusive, Some(Position::offset(1u64)) @@ -1805,7 +1796,6 @@ mod tests { assert_eq!(persist_success_1.subrequest_id, 1); assert_eq!(persist_success_1.index_uid(), &index_uid2); assert_eq!(persist_success_1.source_id, "test-source"); - assert_eq!(persist_success_1.shard_id(), ShardId::from(1)); assert_eq!( persist_success_1.replication_position_inclusive, Some(Position::offset(2u64)) @@ -1892,7 +1882,6 @@ mod tests { subrequest_id: 0, index_uid: Some(index_uid.clone()), source_id: "test-source".to_string(), - 
shard_id: Some(ShardId::from(0)), doc_batch: None, }], }; @@ -1905,7 +1894,6 @@ mod tests { assert_eq!(persist_success.subrequest_id, 0); assert_eq!(persist_success.index_uid(), &index_uid); assert_eq!(persist_success.source_id, "test-source"); - assert_eq!(persist_success.shard_id(), ShardId::from(0)); assert_eq!( persist_success.replication_position_inclusive, Some(Position::Beginning) @@ -1953,7 +1941,6 @@ mod tests { subrequest_id: 0, index_uid: Some(index_uid.clone()), source_id: "test-source".to_string(), - shard_id: Some(ShardId::from(0)), doc_batch: Some(DocBatchV2::for_test([ "", // invalid "[]", // invalid @@ -2028,7 +2015,6 @@ mod tests { subrequest_id: 0, index_uid: Some(index_uid.clone()), source_id: "test-source".to_string(), - shard_id: Some(ShardId::from(0)), doc_batch: Some(DocBatchV2::for_test([ "", // invalid "[]", // invalid @@ -2091,7 +2077,6 @@ mod tests { subrequest_id: 0, index_uid: Some(index_uid.clone()), source_id: "test-source".to_string(), - shard_id: Some(ShardId::from(0)), doc_batch: Some(DocBatchV2::for_test(["", "[]", r#"{"foo": "bar"}"#])), }], }; @@ -2152,7 +2137,6 @@ mod tests { subrequest_id: 0, index_uid: Some(index_uid.clone()), source_id: "test-source".to_string(), - shard_id: Some(ShardId::from(0)), doc_batch: Some(DocBatchV2::for_test(["", "[]", r#"{"foo": "bar"}"#])), }], }; @@ -2213,7 +2197,6 @@ mod tests { subrequest_id: 0, index_uid: Some(index_uid.clone()), source_id: "test-source".to_string(), - shard_id: Some(ShardId::from(1)), doc_batch: Some(DocBatchV2::for_test([r#"test-doc-foo"#])), }], }; @@ -2226,7 +2209,6 @@ mod tests { assert_eq!(persist_failure.subrequest_id, 0); assert_eq!(persist_failure.index_uid(), &index_uid); assert_eq!(persist_failure.source_id, "test-source"); - assert_eq!(persist_failure.shard_id(), ShardId::from(1)); assert_eq!( persist_failure.reason(), PersistFailureReason::NodeUnavailable, @@ -2266,7 +2248,6 @@ mod tests { subrequest_id: 0, index_uid: Some(index_uid.clone()), source_id: 
"test-source".to_string(), - shard_id: Some(ShardId::from(1)), doc_batch: Some(DocBatchV2::for_test([r#"{"doc": "test-doc-foo"}"#])), }], }; @@ -2279,7 +2260,6 @@ mod tests { assert_eq!(persist_failure.subrequest_id, 0); assert_eq!(persist_failure.index_uid(), &index_uid); assert_eq!(persist_failure.source_id, "test-source"); - assert_eq!(persist_failure.shard_id(), ShardId::from(1)); assert_eq!( persist_failure.reason(), PersistFailureReason::NodeUnavailable @@ -2362,14 +2342,12 @@ mod tests { subrequest_id: 0, index_uid: Some(index_uid.clone()), source_id: "test-source".to_string(), - shard_id: Some(ShardId::from(1)), doc_batch: Some(DocBatchV2::for_test([r#"{"doc": "test-doc-010"}"#])), }, PersistSubrequest { subrequest_id: 1, index_uid: Some(index_uid2.clone()), source_id: "test-source".to_string(), - shard_id: Some(ShardId::from(1)), doc_batch: Some(DocBatchV2::for_test([ r#"{"doc": "test-doc-110"}"#, r#"{"doc": "test-doc-111"}"#, @@ -2570,14 +2548,12 @@ mod tests { subrequest_id: 0, index_uid: Some(index_uid.clone()), source_id: "test-source".to_string(), - shard_id: Some(ShardId::from(1)), doc_batch: Some(DocBatchV2::for_test([r#"{"doc": "test-doc-010"}"#])), }, PersistSubrequest { subrequest_id: 1, index_uid: Some(index_uid2.clone()), source_id: "test-source".to_string(), - shard_id: Some(ShardId::from(1)), doc_batch: Some(DocBatchV2::for_test([ r#"{"doc": "test-doc-110"}"#, r#"{"doc": "test-doc-111"}"#, @@ -2696,7 +2672,6 @@ mod tests { subrequest_id: 0, index_uid: Some(index_uid.clone()), source_id: "test-source".to_string(), - shard_id: Some(ShardId::from(1)), doc_batch: Some(DocBatchV2::for_test([r#"{"doc": "test-doc-010"}"#])), }], }; @@ -2709,7 +2684,6 @@ mod tests { assert_eq!(persist_failure.subrequest_id, 0); assert_eq!(persist_failure.index_uid(), &index_uid); assert_eq!(persist_failure.source_id, "test-source"); - assert_eq!(persist_failure.shard_id(), ShardId::from(1)); assert_eq!( persist_failure.reason(), 
PersistFailureReason::NoShardsAvailable @@ -2775,7 +2749,6 @@ mod tests { subrequest_id: 0, index_uid: Some(index_uid.clone()), source_id: "test-source".to_string(), - shard_id: Some(ShardId::from(1)), doc_batch: Some(DocBatchV2::for_test([r#"{"doc": "test-doc-010"}"#])), }], }; @@ -2788,7 +2761,6 @@ mod tests { assert_eq!(persist_failure.subrequest_id, 0); assert_eq!(persist_failure.index_uid(), &index_uid); assert_eq!(persist_failure.source_id, "test-source"); - assert_eq!(persist_failure.shard_id(), ShardId::from(1)); assert_eq!( persist_failure.reason(), PersistFailureReason::NoShardsAvailable @@ -2856,7 +2828,6 @@ mod tests { subrequest_id: 0, index_uid: Some(index_uid.clone()), source_id: "test-source".to_string(), - shard_id: Some(ShardId::from(1)), doc_batch: Some(DocBatchV2::for_test([r#"{"doc": "test-doc-010"}"#])), }], }; @@ -2869,7 +2840,6 @@ mod tests { assert_eq!(persist_failure.subrequest_id, 0); assert_eq!(persist_failure.index_uid(), &index_uid); assert_eq!(persist_failure.source_id, "test-source"); - assert_eq!(persist_failure.shard_id(), ShardId::from(1)); assert_eq!(persist_failure.reason(), PersistFailureReason::WalFull); let state_guard = ingester.state.lock_fully().await.unwrap(); diff --git a/quickwit/quickwit-ingest/src/ingest_v2/mod.rs b/quickwit/quickwit-ingest/src/ingest_v2/mod.rs index f6c1cd42732..34051e62cd0 100644 --- a/quickwit/quickwit-ingest/src/ingest_v2/mod.rs +++ b/quickwit/quickwit-ingest/src/ingest_v2/mod.rs @@ -22,7 +22,6 @@ mod metrics; mod models; mod mrecord; mod mrecordlog_utils; -#[allow(dead_code)] mod node_routing_table; mod publish_tracker; mod rate_meter; diff --git a/quickwit/quickwit-ingest/src/ingest_v2/node_routing_table.rs b/quickwit/quickwit-ingest/src/ingest_v2/node_routing_table.rs index 6b23fcd1aca..72ee2ded5ec 100644 --- a/quickwit/quickwit-ingest/src/ingest_v2/node_routing_table.rs +++ b/quickwit/quickwit-ingest/src/ingest_v2/node_routing_table.rs @@ -29,6 +29,7 @@ use crate::IngesterPool; pub(super) 
struct IngesterNode { pub node_id: NodeId, pub index_uid: IndexUid, + #[allow(unused)] pub source_id: SourceId, /// Score from 0-10. Higher means more available capacity. pub capacity_score: usize, diff --git a/quickwit/quickwit-ingest/src/ingest_v2/router.rs b/quickwit/quickwit-ingest/src/ingest_v2/router.rs index 1f5ca26865e..5a473a1adb1 100644 --- a/quickwit/quickwit-ingest/src/ingest_v2/router.rs +++ b/quickwit/quickwit-ingest/src/ingest_v2/router.rs @@ -368,7 +368,6 @@ impl IngestRouter { subrequest_id: subrequest.subrequest_id, index_uid: Some(ingester_node.index_uid.clone()), source_id: subrequest.source_id.clone(), - shard_id: None, doc_batch: subrequest.doc_batch.clone(), }; per_leader_persist_subrequests @@ -1127,7 +1126,6 @@ mod tests { subrequest_id: 0, index_uid: Some(index_uid.clone()), source_id: "test-source".to_string(), - shard_id: Some(ShardId::from(1)), reason: PersistFailureReason::NoShardsAvailable as i32, }], routing_update: Some(RoutingUpdate { @@ -1288,7 +1286,6 @@ mod tests { .returning(move |request| { assert_eq!(request.leader_id, "test-ingester-0"); assert_eq!(request.subrequests.len(), 1); - assert!(request.subrequests[0].shard_id.is_none()); Ok(PersistResponse { leader_id: request.leader_id, @@ -1325,7 +1322,6 @@ mod tests { .returning(move |request| { assert_eq!(request.leader_id, "test-ingester-1"); assert_eq!(request.subrequests.len(), 1); - assert!(request.subrequests[0].shard_id.is_none()); Ok(PersistResponse { leader_id: request.leader_id, @@ -1425,7 +1421,6 @@ mod tests { subrequest_id: 0, index_uid: Some(index_uid_clone.clone()), source_id: "test-source".to_string(), - shard_id: Some(ShardId::from(1)), reason: PersistFailureReason::NoShardsAvailable as i32, }], routing_update: Some(RoutingUpdate { @@ -1579,7 +1574,6 @@ mod tests { assert_eq!(subrequest.subrequest_id, 0); let index_uid = subrequest.index_uid().clone(); assert_eq!(subrequest.source_id, "test-source"); - assert!(subrequest.shard_id.is_none()); assert_eq!( 
subrequest.doc_batch, Some(DocBatchV2::for_test(["test-doc-foo"])) @@ -1592,7 +1586,6 @@ mod tests { subrequest_id: 0, index_uid: Some(index_uid.clone()), source_id: "test-source".to_string(), - shard_id: Some(ShardId::from(1)), reason: PersistFailureReason::NoShardsAvailable as i32, }], routing_update: Some(RoutingUpdate { @@ -1701,7 +1694,6 @@ mod tests { subrequest_id: 0, index_uid: Some(IndexUid::for_test("test-index-0", 0)), source_id: "test-source".to_string(), - shard_id: Some(ShardId::from(1)), reason: PersistFailureReason::NoShardsAvailable as i32, }], routing_update: Some(RoutingUpdate { @@ -1735,7 +1727,6 @@ mod tests { subrequest_id: 1, index_uid: Some(IndexUid::for_test("test-index-1", 0)), source_id: "test-source".to_string(), - shard_id: Some(ShardId::from(1)), reason: PersistFailureReason::NodeUnavailable as i32, }], routing_update: Some(RoutingUpdate { diff --git a/quickwit/quickwit-ingest/src/ingest_v2/workbench.rs b/quickwit/quickwit-ingest/src/ingest_v2/workbench.rs index fb52d8e8139..50f92654efb 100644 --- a/quickwit/quickwit-ingest/src/ingest_v2/workbench.rs +++ b/quickwit/quickwit-ingest/src/ingest_v2/workbench.rs @@ -560,7 +560,6 @@ mod tests { let persist_failure = PersistFailure { subrequest_id: 1, - shard_id: Some(shard_id_2.clone()), ..Default::default() }; workbench.record_persist_failure(&persist_failure); @@ -807,7 +806,6 @@ mod tests { let persist_failure = PersistFailure { subrequest_id: 0, - shard_id: Some(ShardId::from(1)), reason: PersistFailureReason::WalFull as i32, ..Default::default() }; diff --git a/quickwit/quickwit-proto/protos/quickwit/ingester.proto b/quickwit/quickwit-proto/protos/quickwit/ingester.proto index 23ff6e6825d..04b649a16ed 100644 --- a/quickwit/quickwit-proto/protos/quickwit/ingester.proto +++ b/quickwit/quickwit-proto/protos/quickwit/ingester.proto @@ -73,7 +73,6 @@ message PersistSubrequest { uint32 subrequest_id = 1; quickwit.common.IndexUid index_uid = 2; string source_id = 3; - quickwit.ingest.ShardId 
shard_id = 4; quickwit.ingest.DocBatchV2 doc_batch = 5; } @@ -119,7 +118,6 @@ message PersistFailure { uint32 subrequest_id = 1; quickwit.common.IndexUid index_uid = 2; string source_id = 3; - quickwit.ingest.ShardId shard_id = 4; PersistFailureReason reason = 5; } diff --git a/quickwit/quickwit-proto/src/codegen/quickwit/quickwit.ingest.ingester.rs b/quickwit/quickwit-proto/src/codegen/quickwit/quickwit.ingest.ingester.rs index 606dce48e2e..1b5fcfd15c6 100644 --- a/quickwit/quickwit-proto/src/codegen/quickwit/quickwit.ingest.ingester.rs +++ b/quickwit/quickwit-proto/src/codegen/quickwit/quickwit.ingest.ingester.rs @@ -37,8 +37,6 @@ pub struct PersistSubrequest { pub index_uid: ::core::option::Option, #[prost(string, tag = "3")] pub source_id: ::prost::alloc::string::String, - #[prost(message, optional, tag = "4")] - pub shard_id: ::core::option::Option, #[prost(message, optional, tag = "5")] pub doc_batch: ::core::option::Option, } @@ -101,8 +99,6 @@ pub struct PersistFailure { pub index_uid: ::core::option::Option, #[prost(string, tag = "3")] pub source_id: ::prost::alloc::string::String, - #[prost(message, optional, tag = "4")] - pub shard_id: ::core::option::Option, #[prost(enumeration = "PersistFailureReason", tag = "5")] pub reason: i32, } diff --git a/quickwit/quickwit-proto/src/getters.rs b/quickwit/quickwit-proto/src/getters.rs index d33901bde47..4d8e6f058e6 100644 --- a/quickwit/quickwit-proto/src/getters.rs +++ b/quickwit/quickwit-proto/src/getters.rs @@ -210,8 +210,6 @@ generate_getters! 
{ InitShardFailure, OpenFetchStreamRequest, OpenShardSubrequest, - PersistFailure, - PersistSubrequest, PersistSuccess, ReplicateFailure, ReplicateSubrequest, diff --git a/quickwit/quickwit-proto/src/ingest/ingester.rs b/quickwit/quickwit-proto/src/ingest/ingester.rs index d2da3f8d9bd..3d6bb896e37 100644 --- a/quickwit/quickwit-proto/src/ingest/ingester.rs +++ b/quickwit/quickwit-proto/src/ingest/ingester.rs @@ -85,12 +85,6 @@ impl OpenFetchStreamRequest { } } -impl PersistSubrequest { - pub fn queue_id(&self) -> QueueId { - queue_id(self.index_uid(), &self.source_id, self.shard_id()) - } -} - impl PersistSuccess { pub fn queue_id(&self) -> QueueId { queue_id(self.index_uid(), &self.source_id, self.shard_id()) From 1d0e18e177eb36cedadda816d5c03f8317eba208 Mon Sep 17 00:00:00 2001 From: nadav-govari Date: Mon, 9 Mar 2026 14:47:24 -0400 Subject: [PATCH 6/9] Add availability zone awareness to node based routing (#6189) --- quickwit/quickwit-cluster/src/node.rs | 7 + .../src/rate_limited_tracing.rs | 9 +- .../src/control_plane.rs | 15 +- .../src/ingest/ingest_controller.rs | 78 ++++++--- .../src/source/ingest/mod.rs | 24 ++- .../quickwit-ingest/src/ingest_v2/fetch.rs | 29 +++- .../quickwit-ingest/src/ingest_v2/ingester.rs | 25 ++- quickwit/quickwit-ingest/src/ingest_v2/mod.rs | 9 +- .../src/ingest_v2/node_routing_table.rs | 163 +++++++++++------- .../quickwit-ingest/src/ingest_v2/router.rs | 94 +++++++--- .../src/ingest_v2/routing_table.rs | 40 ++++- quickwit/quickwit-proto/src/lib.rs | 3 +- quickwit/quickwit-serve/src/lib.rs | 27 +-- 13 files changed, 360 insertions(+), 163 deletions(-) diff --git a/quickwit/quickwit-cluster/src/node.rs b/quickwit/quickwit-cluster/src/node.rs index 4a8b11dbafc..52029348eb1 100644 --- a/quickwit/quickwit-cluster/src/node.rs +++ b/quickwit/quickwit-cluster/src/node.rs @@ -48,6 +48,7 @@ impl ClusterNode { indexing_capacity: member.indexing_cpu_capacity, is_ready: member.is_ready, is_self_node, + availability_zone: 
member.availability_zone, }; let node = ClusterNode { inner: Arc::new(inner), @@ -132,6 +133,10 @@ impl ClusterNode { pub fn is_self_node(&self) -> bool { self.inner.is_self_node } + + pub fn availability_zone(&self) -> Option<&str> { + self.inner.availability_zone.as_deref() + } } impl Debug for ClusterNode { @@ -153,6 +158,7 @@ impl PartialEq for ClusterNode { && self.inner.indexing_tasks == other.inner.indexing_tasks && self.inner.is_ready == other.inner.is_ready && self.inner.is_self_node == other.inner.is_self_node + && self.inner.availability_zone == other.inner.availability_zone } } @@ -165,4 +171,5 @@ struct InnerNode { indexing_capacity: CpuCapacity, is_ready: bool, is_self_node: bool, + availability_zone: Option, } diff --git a/quickwit/quickwit-common/src/rate_limited_tracing.rs b/quickwit/quickwit-common/src/rate_limited_tracing.rs index c9a323f9ec2..198c2bf8bdd 100644 --- a/quickwit/quickwit-common/src/rate_limited_tracing.rs +++ b/quickwit/quickwit-common/src/rate_limited_tracing.rs @@ -179,12 +179,13 @@ fn _check_macro_works() { #[doc(hidden)] pub use coarsetime::Instant as CoarsetimeInstant; +pub use rate_limited_debug; +pub use rate_limited_error; +pub use rate_limited_info; +pub use rate_limited_trace; #[doc(hidden)] pub use rate_limited_tracing; -pub use { - rate_limited_debug, rate_limited_error, rate_limited_info, rate_limited_trace, - rate_limited_warn, -}; +pub use rate_limited_warn; #[cfg(test)] mod tests { diff --git a/quickwit/quickwit-control-plane/src/control_plane.rs b/quickwit/quickwit-control-plane/src/control_plane.rs index e4c6995d639..33b0e00ddaa 100644 --- a/quickwit/quickwit-control-plane/src/control_plane.rs +++ b/quickwit/quickwit-control-plane/src/control_plane.rs @@ -1176,6 +1176,7 @@ mod tests { CLI_SOURCE_ID, INGEST_V2_SOURCE_ID, IndexConfig, KafkaSourceParams, SourceParams, }; use quickwit_indexing::IndexingService; + use quickwit_ingest::IngesterPoolEntry; use quickwit_metastore::{ CreateIndexRequestExt, IndexMetadata, 
ListIndexesMetadataResponseExt, }; @@ -1203,6 +1204,12 @@ mod tests { use super::*; use crate::IndexerNodeInfo; + fn ingester_pool_entry(client: IngesterServiceClient) -> IngesterPoolEntry { + IngesterPoolEntry { + client, + availability_zone: None, + } + } #[tokio::test] async fn test_control_plane_create_index() { let universe = Universe::with_accelerated_time(); @@ -2210,7 +2217,7 @@ mod tests { assert!(&retain_shards_for_source.shard_ids.is_empty()); Ok(RetainShardsResponse {}) }); - let ingester = IngesterServiceClient::from_mock(mock_ingester); + let ingester = ingester_pool_entry(IngesterServiceClient::from_mock(mock_ingester)); ingester_pool.insert("node1".into(), ingester); let cluster_config = ClusterConfig::for_test(); @@ -2256,7 +2263,7 @@ mod tests { ); Ok(RetainShardsResponse {}) }); - let ingester = IngesterServiceClient::from_mock(mock_ingester); + let ingester = ingester_pool_entry(IngesterServiceClient::from_mock(mock_ingester)); ingester_pool.insert("node1".into(), ingester); let mut index_0 = IndexMetadata::for_test("test-index-0", "ram:///test-index-0"); @@ -2552,7 +2559,7 @@ mod tests { }; Ok(response) }); - let ingester = IngesterServiceClient::from_mock(mock_ingester); + let ingester = ingester_pool_entry(IngesterServiceClient::from_mock(mock_ingester)); ingester_pool.insert(ingester_id, ingester); let mut mock_metastore = MockMetastoreService::new(); @@ -2706,7 +2713,7 @@ mod tests { }; Ok(response) }); - let ingester = IngesterServiceClient::from_mock(mock_ingester); + let ingester = ingester_pool_entry(IngesterServiceClient::from_mock(mock_ingester)); ingester_pool.insert(ingester_id, ingester); let mut mock_metastore = MockMetastoreService::new(); diff --git a/quickwit/quickwit-control-plane/src/ingest/ingest_controller.rs b/quickwit/quickwit-control-plane/src/ingest/ingest_controller.rs index 63295d61eca..e5e7ec8da0e 100644 --- a/quickwit/quickwit-control-plane/src/ingest/ingest_controller.rs +++ 
b/quickwit/quickwit-control-plane/src/ingest/ingest_controller.rs @@ -329,7 +329,7 @@ impl IngestController { fn sync_with_ingester(&self, ingester: &NodeId, model: &ControlPlaneModel) -> WaitHandle { info!(ingester = %ingester, "sync_with_ingester"); let (wait_drop_guard, wait_handle) = WaitHandle::new(); - let Some(ingester_client) = self.ingester_pool.get(ingester) else { + let Some(ingester_client) = self.ingester_pool.get(ingester).map(|h| h.client) else { // TODO: (Maybe) We should mark the ingester as unavailable, and stop advertise its // shard to routers. warn!("failed to sync with ingester `{ingester}`: not available"); @@ -639,7 +639,7 @@ impl IngestController { } }) .collect(); - let Some(leader) = self.ingester_pool.get(&leader_id) else { + let Some(leader) = self.ingester_pool.get(&leader_id).map(|h| h.client) else { warn!("failed to init shards: ingester `{leader_id}` is unavailable"); failures.extend(init_shard_failures); continue; @@ -898,7 +898,7 @@ impl IngestController { return Ok(()); }; info!("scaling down shard {shard_id} from {leader_id}"); - let Some(ingester) = self.ingester_pool.get(&leader_id) else { + let Some(ingester) = self.ingester_pool.get(&leader_id).map(|h| h.client) else { model.release_scaling_permits(&source_uid, ScalingMode::Down); return Ok(()); }; @@ -1211,7 +1211,7 @@ impl IngestController { let mut close_shards_futures = FuturesUnordered::new(); for (leader_id, shard_pkeys) in per_leader_shards_to_close { - let Some(ingester) = self.ingester_pool.get(&leader_id) else { + let Some(ingester) = self.ingester_pool.get(&leader_id).map(|h| h.client) else { warn!("failed to close shards: ingester `{leader_id}` is unavailable"); continue; }; @@ -1313,7 +1313,7 @@ mod tests { use quickwit_common::shared_consts::DEFAULT_SHARD_THROUGHPUT_LIMIT; use quickwit_common::tower::DelayLayer; use quickwit_config::{DocMapping, INGEST_V2_SOURCE_ID, SourceConfig}; - use quickwit_ingest::{RateMibPerSec, ShardInfo}; + use 
quickwit_ingest::{IngesterPoolEntry, RateMibPerSec, ShardInfo}; use quickwit_metastore::IndexMetadata; use quickwit_proto::control_plane::GetOrCreateOpenShardsSubrequest; use quickwit_proto::ingest::ingester::{ @@ -1328,6 +1328,13 @@ mod tests { use super::*; + fn ingester_pool_entry(client: IngesterServiceClient) -> IngesterPoolEntry { + IngesterPoolEntry { + client, + availability_zone: None, + } + } + const TEST_SHARD_THROUGHPUT_LIMIT_MIB: f32 = DEFAULT_SHARD_THROUGHPUT_LIMIT.as_u64() as f32 / quickwit_common::shared_consts::MIB as f32; @@ -1385,7 +1392,10 @@ mod tests { let ingester = IngesterServiceClient::from_mock(mock_ingester); let ingester_pool = IngesterPool::default(); - ingester_pool.insert(NodeId::from("test-ingester-1"), ingester.clone()); + ingester_pool.insert( + NodeId::from("test-ingester-1"), + ingester_pool_entry(ingester.clone()), + ); let mut mock_ingester = MockIngesterService::new(); let index_uid_1_clone = index_uid_1.clone(); @@ -1413,7 +1423,10 @@ mod tests { Ok(response) }); let ingester = IngesterServiceClient::from_mock(mock_ingester); - ingester_pool.insert(NodeId::from("test-ingester-2"), ingester.clone()); + ingester_pool.insert( + NodeId::from("test-ingester-2"), + ingester_pool_entry(ingester.clone()), + ); let replication_factor = 2; let mut controller = IngestController::new( @@ -1599,7 +1612,10 @@ mod tests { let ingester = IngesterServiceClient::from_mock(mock_ingester); let ingester_pool = IngesterPool::default(); - ingester_pool.insert(NodeId::from("test-ingester-1"), ingester.clone()); + ingester_pool.insert( + NodeId::from("test-ingester-1"), + ingester_pool_entry(ingester.clone()), + ); let replication_factor = 1; let mut controller = IngestController::new( @@ -1712,7 +1728,7 @@ mod tests { ingester_pool.insert( NodeId::from("test-ingester-1"), - IngesterServiceClient::mocked(), + ingester_pool_entry(IngesterServiceClient::mocked()), ); let leader_follower_pairs_opt = @@ -1722,7 +1738,10 @@ mod tests { // find any 
solution. assert!(leader_follower_pairs_opt.is_none()); - ingester_pool.insert("test-ingester-2".into(), IngesterServiceClient::mocked()); + ingester_pool.insert( + "test-ingester-2".into(), + ingester_pool_entry(IngesterServiceClient::mocked()), + ); let leader_follower_pairs = controller .allocate_shards(0, &FnvHashSet::default(), &model) @@ -1841,7 +1860,10 @@ mod tests { Some(NodeId::from("test-ingester-1")) ); - ingester_pool.insert("test-ingester-3".into(), IngesterServiceClient::mocked()); + ingester_pool.insert( + "test-ingester-3".into(), + ingester_pool_entry(IngesterServiceClient::mocked()), + ); let unavailable_leaders = FnvHashSet::from_iter([NodeId::from("test-ingester-2")]); let leader_follower_pairs = controller .allocate_shards(4, &unavailable_leaders, &model) @@ -1934,7 +1956,7 @@ mod tests { Ok(response) }); let ingester_0 = IngesterServiceClient::from_mock(mock_ingester_0); - ingester_pool.insert(ingester_id_0, ingester_0); + ingester_pool.insert(ingester_id_0, ingester_pool_entry(ingester_0)); let ingester_id_1 = NodeId::from("test-ingester-1"); let mut mock_ingester_1 = MockIngesterService::new(); @@ -1955,7 +1977,7 @@ mod tests { Err(IngestV2Error::Internal("internal error".to_string())) }); - let ingester_1 = IngesterServiceClient::from_mock(mock_ingester_1); + let ingester_1 = ingester_pool_entry(IngesterServiceClient::from_mock(mock_ingester_1)); ingester_pool.insert(ingester_id_1, ingester_1); let ingester_id_2 = NodeId::from("test-ingester-2"); @@ -1965,7 +1987,7 @@ mod tests { let ingester_2 = IngesterServiceClient::tower() .stack_init_shards_layer(DelayLayer::new(INIT_SHARDS_REQUEST_TIMEOUT * 2)) .build_from_mock(mock_ingester_2); - ingester_pool.insert(ingester_id_2, ingester_2); + ingester_pool.insert(ingester_id_2, ingester_pool_entry(ingester_2)); let init_shards_response = controller .init_shards(Vec::new(), &Progress::default()) @@ -2173,7 +2195,7 @@ mod tests { ingester_pool.insert( NodeId::from("test-ingester-1"), - 
IngesterServiceClient::from_mock(mock_ingester), + ingester_pool_entry(IngesterServiceClient::from_mock(mock_ingester)), ); let source_uids: HashMap = HashMap::from_iter([(source_uid.clone(), 1)]); let unavailable_leaders = FnvHashSet::default(); @@ -2343,7 +2365,7 @@ mod tests { "failed to close shards".to_string(), )) }); - let ingester = IngesterServiceClient::from_mock(mock_ingester); + let ingester = ingester_pool_entry(IngesterServiceClient::from_mock(mock_ingester)); ingester_pool.insert("test-ingester".into(), ingester); let shard_infos = BTreeSet::from_iter([ @@ -2496,7 +2518,7 @@ mod tests { }, ); - let ingester = IngesterServiceClient::from_mock(mock_ingester); + let ingester = ingester_pool_entry(IngesterServiceClient::from_mock(mock_ingester)); ingester_pool.insert("test-ingester".into(), ingester); let shard_infos = BTreeSet::from_iter([ShardInfo { @@ -2642,7 +2664,7 @@ mod tests { }; Ok(response) }); - let ingester = IngesterServiceClient::from_mock(mock_ingester); + let ingester = ingester_pool_entry(IngesterServiceClient::from_mock(mock_ingester)); ingester_pool.insert("test-ingester".into(), ingester); // Test failed to open shards. @@ -2764,7 +2786,7 @@ mod tests { }; Ok(response) }); - let ingester = IngesterServiceClient::from_mock(mock_ingester); + let ingester = ingester_pool_entry(IngesterServiceClient::from_mock(mock_ingester)); ingester_pool.insert("test-ingester".into(), ingester); // Test failed to close shard. 
@@ -2996,15 +3018,15 @@ mod tests { }); ingester_pool.insert( "node-1".into(), - IngesterServiceClient::from_mock(mock_ingester_1), + ingester_pool_entry(IngesterServiceClient::from_mock(mock_ingester_1)), ); ingester_pool.insert( "node-2".into(), - IngesterServiceClient::from_mock(mock_ingester_2), + ingester_pool_entry(IngesterServiceClient::from_mock(mock_ingester_2)), ); ingester_pool.insert( "node-3".into(), - IngesterServiceClient::from_mock(mock_ingester_3), + ingester_pool_entry(IngesterServiceClient::from_mock(mock_ingester_3)), ); let node_id = "node-1".into(); let wait_handle = controller.sync_with_ingester(&node_id, &model); @@ -3134,7 +3156,7 @@ mod tests { Ok(response) }); let ingester_0 = IngesterServiceClient::from_mock(mock_ingester_0); - ingester_pool.insert(ingester_id_0.clone(), ingester_0); + ingester_pool.insert(ingester_id_0.clone(), ingester_pool_entry(ingester_0)); let ingester_id_1 = NodeId::from("test-ingester-1"); let mut mock_ingester_1 = MockIngesterService::new(); @@ -3152,7 +3174,7 @@ mod tests { Err(IngestV2Error::Internal("internal error".to_string())) }); let ingester_1 = IngesterServiceClient::from_mock(mock_ingester_1); - ingester_pool.insert(ingester_id_1.clone(), ingester_1); + ingester_pool.insert(ingester_id_1.clone(), ingester_pool_entry(ingester_1)); let ingester_id_2 = NodeId::from("test-ingester-2"); let mut mock_ingester_2 = MockIngesterService::new(); @@ -3161,7 +3183,7 @@ mod tests { let ingester_2 = IngesterServiceClient::tower() .stack_close_shards_layer(DelayLayer::new(CLOSE_SHARDS_REQUEST_TIMEOUT * 2)) .build_from_mock(mock_ingester_2); - ingester_pool.insert(ingester_id_2.clone(), ingester_2); + ingester_pool.insert(ingester_id_2.clone(), ingester_pool_entry(ingester_2)); // In this test: // - ingester 0 will close shard 0 successfully and fail to close shard 1; @@ -3341,7 +3363,7 @@ mod tests { Ok(response) }); let ingester_0 = IngesterServiceClient::from_mock(mock_ingester_0); - 
ingester_pool.insert(ingester_id_0.clone(), ingester_0); + ingester_pool.insert(ingester_id_0.clone(), ingester_pool_entry(ingester_0)); let ingester_id_1 = NodeId::from("test-ingester-1"); let mut mock_ingester_1 = MockIngesterService::new(); @@ -3382,7 +3404,7 @@ mod tests { }; Ok(response) }); - let ingester_1 = IngesterServiceClient::from_mock(mock_ingester_1); + let ingester_1 = ingester_pool_entry(IngesterServiceClient::from_mock(mock_ingester_1)); ingester_pool.insert(ingester_id_1.clone(), ingester_1); let close_shards_task = controller @@ -3561,7 +3583,7 @@ mod tests { let ingester_pool = IngesterPool::default(); let mock_ingester = MockIngesterService::new(); - let ingester_client = IngesterServiceClient::from_mock(mock_ingester); + let ingester_client = ingester_pool_entry(IngesterServiceClient::from_mock(mock_ingester)); let active_ids: Vec = (0..available_ingester_shards.len()) .map(|i| format!("active-ingester-{}", i)) diff --git a/quickwit/quickwit-indexing/src/source/ingest/mod.rs b/quickwit/quickwit-indexing/src/source/ingest/mod.rs index 63c746aabe0..6f9551ee406 100644 --- a/quickwit/quickwit-indexing/src/source/ingest/mod.rs +++ b/quickwit/quickwit-indexing/src/source/ingest/mod.rs @@ -345,7 +345,7 @@ impl IngestSource { .push(truncate_shards_subrequest); } for (ingester_id, truncate_subrequests) in per_ingester_truncate_subrequests { - let Some(ingester) = self.ingester_pool.get(ingester_id) else { + let Some(ingester) = self.ingester_pool.get(ingester_id).map(|h| h.client) else { warn!("failed to truncate shard(s): ingester `{ingester_id}` is unavailable"); continue; }; @@ -672,6 +672,7 @@ mod tests { use quickwit_common::metrics::MEMORY_METRICS; use quickwit_common::stream_utils::InFlightValue; use quickwit_config::{IndexingSettings, SourceConfig, SourceParams}; + use quickwit_ingest::IngesterPoolEntry; use quickwit_proto::indexing::IndexingPipelineId; use quickwit_proto::ingest::ingester::{ FetchMessage, IngesterServiceClient, 
MockIngesterService, TruncateShardsResponse, @@ -687,6 +688,13 @@ mod tests { use crate::models::RawDocBatch; use crate::source::SourceActor; + fn ingester_pool_entry(client: IngesterServiceClient) -> IngesterPoolEntry { + IngesterPoolEntry { + client, + availability_zone: None, + } + } + // In this test, we simulate a source to which we sequentially assign the following set of // shards [] // [1] (triggers a reset, and the creation of a publish lock) @@ -929,7 +937,7 @@ mod tests { Ok(response) }); - let ingester_0 = IngesterServiceClient::from_mock(mock_ingester_0); + let ingester_0 = ingester_pool_entry(IngesterServiceClient::from_mock(mock_ingester_0)); ingester_pool.insert("test-ingester-0".into(), ingester_0.clone()); let event_broker = EventBroker::default(); @@ -1126,7 +1134,7 @@ mod tests { Ok(response) }); - let ingester_0 = IngesterServiceClient::from_mock(mock_ingester_0); + let ingester_0 = ingester_pool_entry(IngesterServiceClient::from_mock(mock_ingester_0)); ingester_pool.insert("test-ingester-0".into(), ingester_0.clone()); let event_broker = EventBroker::default(); @@ -1291,7 +1299,7 @@ mod tests { Ok(response) }); - let ingester_0 = IngesterServiceClient::from_mock(mock_ingester_0); + let ingester_0 = ingester_pool_entry(IngesterServiceClient::from_mock(mock_ingester_0)); ingester_pool.insert("test-ingester-0".into(), ingester_0.clone()); let event_broker = EventBroker::default(); @@ -1599,7 +1607,7 @@ mod tests { }) }); - let ingester_0 = IngesterServiceClient::from_mock(mock_ingester_0); + let ingester_0 = ingester_pool_entry(IngesterServiceClient::from_mock(mock_ingester_0)); ingester_pool.insert("test-ingester-0".into(), ingester_0.clone()); let event_broker = EventBroker::default(); @@ -1699,7 +1707,7 @@ mod tests { Ok(TruncateShardsResponse {}) }); - let ingester_0 = IngesterServiceClient::from_mock(mock_ingester_0); + let ingester_0 = ingester_pool_entry(IngesterServiceClient::from_mock(mock_ingester_0)); 
ingester_pool.insert("test-ingester-0".into(), ingester_0.clone()); let mut mock_ingester_1 = MockIngesterService::new(); @@ -1726,7 +1734,7 @@ mod tests { Ok(TruncateShardsResponse {}) }); - let ingester_1 = IngesterServiceClient::from_mock(mock_ingester_1); + let ingester_1 = ingester_pool_entry(IngesterServiceClient::from_mock(mock_ingester_1)); ingester_pool.insert("test-ingester-1".into(), ingester_1.clone()); let mut mock_ingester_3 = MockIngesterService::new(); @@ -1746,7 +1754,7 @@ mod tests { Ok(TruncateShardsResponse {}) }); - let ingester_3 = IngesterServiceClient::from_mock(mock_ingester_3); + let ingester_3 = ingester_pool_entry(IngesterServiceClient::from_mock(mock_ingester_3)); ingester_pool.insert("test-ingester-3".into(), ingester_3.clone()); let event_broker = EventBroker::default(); diff --git a/quickwit/quickwit-ingest/src/ingest_v2/fetch.rs b/quickwit/quickwit-ingest/src/ingest_v2/fetch.rs index 6e8d085e35d..f6f0145d175 100644 --- a/quickwit/quickwit-ingest/src/ingest_v2/fetch.rs +++ b/quickwit/quickwit-ingest/src/ingest_v2/fetch.rs @@ -495,7 +495,11 @@ async fn fault_tolerant_fetch_stream( shard_id: Some(shard_id.clone()), from_position_exclusive: Some(from_position_exclusive.clone()), }; - let mut fetch_stream = match ingester.open_fetch_stream(open_fetch_stream_request).await { + let mut fetch_stream = match ingester + .client + .open_fetch_stream(open_fetch_stream_request) + .await + { Ok(fetch_stream) => fetch_stream, Err(not_found_error @ IngestV2Error::ShardNotFound { .. 
}) => { error!( @@ -628,7 +632,7 @@ pub(super) mod tests { use tokio::time::timeout; use super::*; - use crate::MRecord; + use crate::{IngesterPoolEntry, MRecord}; pub fn into_fetch_payload(fetch_message: FetchMessage) -> FetchPayload { match fetch_message.message.unwrap() { @@ -644,6 +648,13 @@ pub(super) mod tests { } } + fn ingester_pool_entry(client: IngesterServiceClient) -> IngesterPoolEntry { + IngesterPoolEntry { + client, + availability_zone: None, + } + } + #[tokio::test] async fn test_fetch_task_happy_path() { let tempdir = tempfile::tempdir().unwrap(); @@ -1325,7 +1336,7 @@ pub(super) mod tests { Ok(service_stream_1) }); - let ingester_1 = IngesterServiceClient::from_mock(mock_ingester_1); + let ingester_1 = ingester_pool_entry(IngesterServiceClient::from_mock(mock_ingester_1)); ingester_pool.insert("test-ingester-1".into(), ingester_1); @@ -1425,7 +1436,7 @@ pub(super) mod tests { "open fetch stream error".to_string(), )) }); - let ingester_0 = IngesterServiceClient::from_mock(mock_ingester_0); + let ingester_0 = ingester_pool_entry(IngesterServiceClient::from_mock(mock_ingester_0)); let mut mock_ingester_1 = MockIngesterService::new(); let index_uid_clone = index_uid.clone(); @@ -1440,7 +1451,7 @@ pub(super) mod tests { Ok(service_stream_1) }); - let ingester_1 = IngesterServiceClient::from_mock(mock_ingester_1); + let ingester_1 = ingester_pool_entry(IngesterServiceClient::from_mock(mock_ingester_1)); ingester_pool.insert("test-ingester-0".into(), ingester_0); ingester_pool.insert("test-ingester-1".into(), ingester_1); @@ -1540,7 +1551,7 @@ pub(super) mod tests { Ok(service_stream_0) }); - let ingester_0 = IngesterServiceClient::from_mock(mock_ingester_0); + let ingester_0 = ingester_pool_entry(IngesterServiceClient::from_mock(mock_ingester_0)); let mut mock_ingester_1 = MockIngesterService::new(); let index_uid_clone = index_uid.clone(); @@ -1555,7 +1566,7 @@ pub(super) mod tests { Ok(service_stream_1) }); - let ingester_1 = 
IngesterServiceClient::from_mock(mock_ingester_1); + let ingester_1 = ingester_pool_entry(IngesterServiceClient::from_mock(mock_ingester_1)); ingester_pool.insert("test-ingester-0".into(), ingester_0); ingester_pool.insert("test-ingester-1".into(), ingester_1); @@ -1658,7 +1669,7 @@ pub(super) mod tests { shard_id: ShardId::from(1), }) }); - let ingester_0 = IngesterServiceClient::from_mock(mock_ingester_0); + let ingester_0 = ingester_pool_entry(IngesterServiceClient::from_mock(mock_ingester_0)); ingester_pool.insert("test-ingester-0".into(), ingester_0); fault_tolerant_fetch_stream( @@ -1746,7 +1757,7 @@ pub(super) mod tests { Ok(service_stream_2) }); - let ingester = IngesterServiceClient::from_mock(mock_ingester); + let ingester = ingester_pool_entry(IngesterServiceClient::from_mock(mock_ingester)); ingester_pool.insert("test-ingester".into(), ingester); diff --git a/quickwit/quickwit-ingest/src/ingest_v2/ingester.rs b/quickwit/quickwit-ingest/src/ingest_v2/ingester.rs index a131e2c289a..df51758a4ca 100644 --- a/quickwit/quickwit-ingest/src/ingest_v2/ingester.rs +++ b/quickwit/quickwit-ingest/src/ingest_v2/ingester.rs @@ -31,7 +31,6 @@ use quickwit_common::metrics::{GaugeGuard, MEMORY_METRICS}; use quickwit_common::pretty::PrettyDisplay; use quickwit_common::pubsub::{EventBroker, EventSubscriber}; use quickwit_common::rate_limiter::{RateLimiter, RateLimiterSettings}; -use quickwit_common::tower::Pool; use quickwit_common::{ServiceStream, rate_limited_error, rate_limited_warn}; use quickwit_proto::control_plane::{ AdviseResetShardsRequest, ControlPlaneService, ControlPlaneServiceClient, @@ -123,7 +122,7 @@ impl Ingester { pub async fn try_new( cluster: Cluster, control_plane: ControlPlaneServiceClient, - ingester_pool: Pool, + ingester_pool: IngesterPool, wal_dir_path: &Path, disk_capacity: ByteSize, memory_capacity: ByteSize, @@ -390,6 +389,7 @@ impl Ingester { IngestV2Error::Unavailable(message) })?; let mut ack_replication_stream = ingester + .client 
.open_replication_stream(syn_replication_stream) .await?; ack_replication_stream @@ -1311,11 +1311,11 @@ mod tests { use tonic::transport::{Endpoint, Server}; use super::*; - use crate::MRecord; use crate::ingest_v2::DEFAULT_IDLE_SHARD_TIMEOUT; use crate::ingest_v2::broadcast::ShardInfos; use crate::ingest_v2::doc_mapper::try_build_doc_mapper; use crate::ingest_v2::fetch::tests::{into_fetch_eof, into_fetch_payload}; + use crate::{IngesterPoolEntry, MRecord}; const MAX_GRPC_MESSAGE_SIZE: ByteSize = ByteSize::mib(1); @@ -2284,10 +2284,14 @@ mod tests { .build() .await; - leader_ctx.ingester_pool.insert( - follower_ctx.node_id.clone(), - IngesterServiceClient::new(follower.clone()), - ); + let ingester_pool_entry = IngesterPoolEntry { + client: IngesterServiceClient::new(follower.clone()), + availability_zone: None, + }; + + leader_ctx + .ingester_pool + .insert(follower_ctx.node_id.clone(), ingester_pool_entry); let index_uid = IndexUid::for_test("test-index", 0); let index_uid2: IndexUid = IndexUid::for_test("test-index", 1); @@ -2491,9 +2495,14 @@ mod tests { None, ); + let ingester_pool_entry = IngesterPoolEntry { + client: follower_grpc_client, + availability_zone: None, + }; + leader_ctx .ingester_pool - .insert(follower_ctx.node_id.clone(), follower_grpc_client); + .insert(follower_ctx.node_id.clone(), ingester_pool_entry); let index_uid = IndexUid::for_test("test-index", 0); let index_uid2: IndexUid = IndexUid::for_test("test-index", 1); diff --git a/quickwit/quickwit-ingest/src/ingest_v2/mod.rs b/quickwit/quickwit-ingest/src/ingest_v2/mod.rs index 34051e62cd0..d5432936e58 100644 --- a/quickwit/quickwit-ingest/src/ingest_v2/mod.rs +++ b/quickwit/quickwit-ingest/src/ingest_v2/mod.rs @@ -63,7 +63,14 @@ use self::mrecord::MRECORD_HEADER_LEN; pub use self::mrecord::{MRecord, decoded_mrecords}; pub use self::router::IngestRouter; -pub type IngesterPool = Pool; +/// An ingester as represented in the pool, bundling the gRPC client with node metadata. 
+#[derive(Debug, Clone)] +pub struct IngesterPoolEntry { + pub client: IngesterServiceClient, + pub availability_zone: Option, +} + +pub type IngesterPool = Pool; /// Identifies an ingester client, typically a source, for logging and debugging purposes. pub type ClientId = String; diff --git a/quickwit/quickwit-ingest/src/ingest_v2/node_routing_table.rs b/quickwit/quickwit-ingest/src/ingest_v2/node_routing_table.rs index 72ee2ded5ec..68299358f77 100644 --- a/quickwit/quickwit-ingest/src/ingest_v2/node_routing_table.rs +++ b/quickwit/quickwit-ingest/src/ingest_v2/node_routing_table.rs @@ -57,15 +57,25 @@ fn power_of_two_choices<'a>(candidates: &[&'a IngesterNode]) -> &'a IngesterNode } } +fn pick_from(candidates: Vec<&IngesterNode>) -> Option<&IngesterNode> { + match candidates.len() { + 0 => None, + 1 => Some(candidates[0]), + _ => Some(power_of_two_choices(&candidates)), + } +} + impl RoutingEntry { /// Pick an ingester node to persist the request to. Uses power of two choices based on reported - /// ingester capacity, if more than one eligible node exists. - pub fn pick_node( + /// ingester capacity, if more than one eligible node exists. Prefers nodes in the same + /// availability zone, falling back to remote nodes. 
+ fn pick_node( &self, ingester_pool: &IngesterPool, unavailable_leaders: &HashSet, + self_availability_zone: &Option, ) -> Option<&IngesterNode> { - let eligible: Vec<&IngesterNode> = self + let (local_ingesters, remote_ingesters): (Vec<&IngesterNode>, Vec<&IngesterNode>) = self .nodes .values() .filter(|node| { @@ -74,31 +84,55 @@ impl RoutingEntry { && ingester_pool.contains_key(&node.node_id) && !unavailable_leaders.contains(&node.node_id) }) - .collect(); - - match eligible.len() { - 0 => None, - 1 => Some(eligible[0]), - _ => Some(power_of_two_choices(&eligible)), - } + .partition(|node| { + let node_az = ingester_pool + .get(&node.node_id) + .and_then(|h| h.availability_zone); + node_az == *self_availability_zone + }); + + pick_from(local_ingesters).or_else(|| pick_from(remote_ingesters)) } } #[derive(Debug, Default)] pub(super) struct NodeBasedRoutingTable { table: HashMap<(IndexId, SourceId), RoutingEntry>, + self_availability_zone: Option, } impl NodeBasedRoutingTable { - pub fn find_entry(&self, index_id: &str, source_id: &str) -> Option<&RoutingEntry> { + pub fn new(self_availability_zone: Option) -> Self { + Self { + self_availability_zone, + ..Default::default() + } + } + + pub fn pick_node( + &self, + index_id: &str, + source_id: &str, + ingester_pool: &IngesterPool, + unavailable_leaders: &HashSet, + ) -> Option<&IngesterNode> { let key = (index_id.to_string(), source_id.to_string()); - self.table.get(&key) + let entry = self.table.get(&key)?; + entry.pick_node( + ingester_pool, + unavailable_leaders, + &self.self_availability_zone, + ) } - pub fn debug_info(&self) -> HashMap> { + pub fn debug_info( + &self, + ingester_pool: &IngesterPool, + ) -> HashMap> { let mut per_index: HashMap> = HashMap::new(); for ((index_id, source_id), entry) in &self.table { for (node_id, node) in &entry.nodes { + let az = ingester_pool.get(node_id).and_then(|h| h.availability_zone); per_index .entry(index_id.clone()) .or_default() @@ -107,6 +141,7 @@ impl 
NodeBasedRoutingTable { "node_id": node_id, "capacity_score": node.capacity_score, "open_shard_count": node.open_shard_count, + "availability_zone": az, })); } } @@ -201,6 +236,14 @@ mod tests { use quickwit_proto::types::ShardId; use super::*; + use crate::IngesterPoolEntry; + + fn mocked_ingester(availability_zone: Option<&str>) -> IngesterPoolEntry { + IngesterPoolEntry { + client: IngesterServiceClient::mocked(), + availability_zone: availability_zone.map(|s| s.to_string()), + } + } #[test] fn test_apply_capacity_update() { @@ -274,7 +317,7 @@ mod tests { assert!(!table.has_open_nodes("test-index", "test-source", &pool, &HashSet::new())); // Node is in pool → true. - pool.insert("node-1".into(), IngesterServiceClient::mocked()); + pool.insert("node-1".into(), mocked_ingester(None)); assert!(table.has_open_nodes("test-index", "test-source", &pool, &HashSet::new())); // Node is unavailable → false. @@ -289,7 +332,7 @@ mod tests { 6, 2, ); - pool.insert("node-2".into(), IngesterServiceClient::mocked()); + pool.insert("node-2".into(), mocked_ingester(None)); assert!(table.has_open_nodes("test-index", "test-source", &pool, &unavailable)); // Node with capacity_score=0 is not eligible. @@ -304,77 +347,81 @@ mod tests { } #[test] - fn test_pick_node() { - let mut table = NodeBasedRoutingTable::default(); + fn test_pick_node_prefers_same_az() { + let mut table = NodeBasedRoutingTable::new(Some("az-1".to_string())); let pool = IngesterPool::default(); - let key = ("test-index".to_string(), "test-source".to_string()); - // Node exists but not in pool → None. 
table.apply_capacity_update( "node-1".into(), IndexUid::for_test("test-index", 0), "test-source".into(), - 8, - 3, + 5, + 1, ); - assert!( - table - .table - .get(&key) - .unwrap() - .pick_node(&pool, &HashSet::new()) - .is_none() + table.apply_capacity_update( + "node-2".into(), + IndexUid::for_test("test-index", 0), + "test-source".into(), + 5, + 1, ); + pool.insert("node-1".into(), mocked_ingester(Some("az-1"))); + pool.insert("node-2".into(), mocked_ingester(Some("az-2"))); - // Single node in pool → picks it. - pool.insert("node-1".into(), IngesterServiceClient::mocked()); let picked = table - .table - .get(&key) - .unwrap() - .pick_node(&pool, &HashSet::new()) + .pick_node("test-index", "test-source", &pool, &HashSet::new()) .unwrap(); assert_eq!(picked.node_id, NodeId::from("node-1")); + } + + #[test] + fn test_pick_node_falls_back_to_cross_az() { + let mut table = NodeBasedRoutingTable::new(Some("az-1".to_string())); + let pool = IngesterPool::default(); - // Multiple nodes → something is returned. table.apply_capacity_update( "node-2".into(), IndexUid::for_test("test-index", 0), "test-source".into(), - 2, + 5, 1, ); - pool.insert("node-2".into(), IngesterServiceClient::mocked()); - assert!( - table - .table - .get(&key) - .unwrap() - .pick_node(&pool, &HashSet::new()) - .is_some() - ); + pool.insert("node-2".into(), mocked_ingester(Some("az-2"))); + + let picked = table + .pick_node("test-index", "test-source", &pool, &HashSet::new()) + .unwrap(); + assert_eq!(picked.node_id, NodeId::from("node-2")); + } + + #[test] + fn test_pick_node_no_az_awareness() { + let mut table = NodeBasedRoutingTable::default(); + let pool = IngesterPool::default(); - // Node with capacity_score=0 is skipped. 
table.apply_capacity_update( "node-1".into(), IndexUid::for_test("test-index", 0), "test-source".into(), - 0, - 3, - ); - table.apply_capacity_update( - "node-2".into(), - IndexUid::for_test("test-index", 0), - "test-source".into(), - 0, + 5, 1, ); + pool.insert("node-1".into(), mocked_ingester(Some("az-1"))); + + let picked = table + .pick_node("test-index", "test-source", &pool, &HashSet::new()) + .unwrap(); + assert_eq!(picked.node_id, NodeId::from("node-1")); + } + + #[test] + fn test_pick_node_missing_entry() { + let table = NodeBasedRoutingTable::new(Some("az-1".to_string())); + let pool = IngesterPool::default(); + assert!( table - .table - .get(&key) - .unwrap() - .pick_node(&pool, &HashSet::new()) + .pick_node("nonexistent", "source", &pool, &HashSet::new()) .is_none() ); } diff --git a/quickwit/quickwit-ingest/src/ingest_v2/router.rs b/quickwit/quickwit-ingest/src/ingest_v2/router.rs index 5a473a1adb1..a1f5bf86302 100644 --- a/quickwit/quickwit-ingest/src/ingest_v2/router.rs +++ b/quickwit/quickwit-ingest/src/ingest_v2/router.rs @@ -120,10 +120,11 @@ impl IngestRouter { ingester_pool: IngesterPool, replication_factor: usize, event_broker: EventBroker, + self_availability_zone: Option, ) -> Self { let state = Arc::new(Mutex::new(RouterState { debouncer: GetOrCreateOpenShardsRequestDebouncer::default(), - node_routing_table: NodeBasedRoutingTable::default(), + node_routing_table: NodeBasedRoutingTable::new(self_availability_zone), })); let ingest_semaphore_permits = get_ingest_router_buffer_size().as_u64() as usize; let ingest_semaphore = Arc::new(Semaphore::new(ingest_semaphore_permits)); @@ -352,10 +353,12 @@ impl IngestRouter { let state_guard = self.state.lock().await; for subrequest in pending_subrequests(&workbench.subworkbenches) { - let ingester_node = state_guard - .node_routing_table - .find_entry(&subrequest.index_id, &subrequest.source_id) - .and_then(|entry| entry.pick_node(&self.ingester_pool, unavailable_leaders)); + let ingester_node = 
state_guard.node_routing_table.pick_node( + &subrequest.index_id, + &subrequest.source_id, + &self.ingester_pool, + unavailable_leaders, + ); let ingester_node = match ingester_node { Some(node) => node, @@ -383,7 +386,7 @@ impl IngestRouter { .iter() .map(|subrequest| subrequest.subrequest_id) .collect(); - let Some(ingester) = self.ingester_pool.get(&leader_id) else { + let Some(ingester) = self.ingester_pool.get(&leader_id).map(|h| h.client) else { no_shards_available_subrequest_ids.extend(subrequest_ids); continue; }; @@ -471,7 +474,9 @@ impl IngestRouter { pub async fn debug_info(&self) -> JsonValue { let state_guard = self.state.lock().await; - let routing_table_json = state_guard.node_routing_table.debug_info(); + let routing_table_json = state_guard + .node_routing_table + .debug_info(&self.ingester_pool); json!({ "routing_table": routing_table_json, @@ -634,8 +639,16 @@ mod tests { use quickwit_proto::types::{DocUid, IndexUid, Position, ShardId, SourceUid}; use super::*; + use crate::IngesterPoolEntry; use crate::ingest_v2::workbench::SubworkbenchFailure; + fn mocked_ingester() -> IngesterPoolEntry { + IngesterPoolEntry { + client: IngesterServiceClient::mocked(), + availability_zone: None, + } + } + #[tokio::test] async fn test_router_make_get_or_create_open_shard_request() { let self_node_id = "test-router".into(); @@ -649,6 +662,7 @@ mod tests { ingester_pool.clone(), replication_factor, EventBroker::default(), + Some("test-az".to_string()), ); let mut workbench = IngestWorkbench::default(); let (get_or_create_open_shard_request_opt, rendezvous) = router @@ -723,7 +737,7 @@ mod tests { drop(rendezvous_1); drop(rendezvous_2); - ingester_pool.insert("test-ingester-0".into(), IngesterServiceClient::mocked()); + ingester_pool.insert("test-ingester-0".into(), mocked_ingester()); { // Ingester-0 is in pool and in table, but marked unavailable on the workbench // (simulating a prior transport error). 
has_open_nodes returns false → both @@ -859,6 +873,7 @@ mod tests { ingester_pool.clone(), replication_factor, EventBroker::default(), + Some("test-az".to_string()), ); let ingest_subrequests = vec![ IngestSubrequest { @@ -957,6 +972,7 @@ mod tests { ingester_pool.clone(), replication_factor, EventBroker::default(), + Some("test-az".to_string()), ); let ingest_subrequests = vec![IngestSubrequest { subrequest_id: 0, @@ -1016,6 +1032,7 @@ mod tests { ingester_pool.clone(), replication_factor, EventBroker::default(), + Some("test-az".to_string()), ); let ingest_subrequests = vec![IngestSubrequest { subrequest_id: 0, @@ -1046,6 +1063,7 @@ mod tests { ingester_pool.clone(), replication_factor, EventBroker::default(), + Some("test-az".to_string()), ); let ingest_subrequests = vec![IngestSubrequest { subrequest_id: 0, @@ -1103,6 +1121,7 @@ mod tests { ingester_pool.clone(), replication_factor, EventBroker::default(), + Some("test-az".to_string()), ); let ingest_subrequests = vec![IngestSubrequest { subrequest_id: 0, @@ -1153,8 +1172,8 @@ mod tests { let control_plane = ControlPlaneServiceClient::from_mock(MockControlPlaneService::new()); let ingester_pool = IngesterPool::default(); - ingester_pool.insert("test-ingester-0".into(), IngesterServiceClient::mocked()); - ingester_pool.insert("test-ingester-1".into(), IngesterServiceClient::mocked()); + ingester_pool.insert("test-ingester-0".into(), mocked_ingester()); + ingester_pool.insert("test-ingester-1".into(), mocked_ingester()); let replication_factor = 1; let router = IngestRouter::new( @@ -1163,6 +1182,7 @@ mod tests { ingester_pool.clone(), replication_factor, EventBroker::default(), + Some("test-az".to_string()), ); let ingest_subrequests = vec![ IngestSubrequest { @@ -1246,6 +1266,7 @@ mod tests { ingester_pool.clone(), 1, EventBroker::default(), + Some("test-az".to_string()), ); let index_uid_0: IndexUid = IndexUid::for_test("test-index-0", 0); @@ -1312,7 +1333,10 @@ mod tests { }); ingester_pool.insert( 
"test-ingester-0".into(), - IngesterServiceClient::from_mock(mock_ingester_0), + IngesterPoolEntry { + client: IngesterServiceClient::from_mock(mock_ingester_0), + availability_zone: None, + }, ); let mut mock_ingester_1 = MockIngesterService::new(); @@ -1344,7 +1368,10 @@ mod tests { }); ingester_pool.insert( "test-ingester-1".into(), - IngesterServiceClient::from_mock(mock_ingester_1), + IngesterPoolEntry { + client: IngesterServiceClient::from_mock(mock_ingester_1), + availability_zone: None, + }, ); let response = router @@ -1388,6 +1415,7 @@ mod tests { ingester_pool.clone(), 1, EventBroker::default(), + Some("test-az".to_string()), ); let index_uid: IndexUid = IndexUid::for_test("test-index-0", 0); { @@ -1460,7 +1488,10 @@ mod tests { }); ingester_pool.insert( "test-ingester-0".into(), - IngesterServiceClient::from_mock(mock_ingester_0), + IngesterPoolEntry { + client: IngesterServiceClient::from_mock(mock_ingester_0), + availability_zone: None, + }, ); let response = router @@ -1491,6 +1522,7 @@ mod tests { ingester_pool.clone(), replication_factor, EventBroker::default(), + Some("test-az".to_string()), ); let index_uid_0: IndexUid = IndexUid::for_test("test-index-0", 0); let index_uid_1: IndexUid = IndexUid::for_test("test-index-1", 0); @@ -1547,6 +1579,7 @@ mod tests { ingester_pool.clone(), replication_factor, EventBroker::default(), + Some("test-az".to_string()), ); let index_uid: IndexUid = IndexUid::for_test("test-index-0", 0); { @@ -1601,7 +1634,13 @@ mod tests { Ok(response) }); let ingester_0 = IngesterServiceClient::from_mock(mock_ingester_0); - ingester_pool.insert("test-ingester-0".into(), ingester_0.clone()); + ingester_pool.insert( + "test-ingester-0".into(), + IngesterPoolEntry { + client: ingester_0.clone(), + availability_zone: None, + }, + ); let ingest_request = IngestRequestV2 { subrequests: vec![IngestSubrequest { @@ -1624,12 +1663,14 @@ mod tests { #[tokio::test] async fn test_router_updates_node_routing_table_on_capacity_update() { let 
event_broker = EventBroker::default(); + let ingester_pool = IngesterPool::default(); let router = IngestRouter::new( "test-router".into(), ControlPlaneServiceClient::from_mock(MockControlPlaneService::new()), - IngesterPool::default(), + ingester_pool.clone(), 1, event_broker.clone(), + Some("test-az".to_string()), ); router.subscribe(); @@ -1645,14 +1686,13 @@ mod tests { // Give the async subscriber a moment to process. tokio::time::sleep(Duration::from_millis(10)).await; + ingester_pool.insert("test-ingester-0".into(), mocked_ingester()); let state_guard = router.state.lock().await; - let entry = state_guard + let node = state_guard .node_routing_table - .find_entry("test-index", "test-source") + .pick_node("test-index", "test-source", &ingester_pool, &HashSet::new()) .unwrap(); - let node = entry.nodes.get("test-ingester-0").unwrap(); - assert_eq!(node.capacity_score, 7); - assert_eq!(node.open_shard_count, 3); + assert_eq!(node.node_id, NodeId::from("test-ingester-0")); } #[tokio::test] @@ -1663,6 +1703,7 @@ mod tests { IngesterPool::default(), 1, EventBroker::default(), + Some("test-az".to_string()), ); let ingest_subrequests = vec![ IngestSubrequest { @@ -1749,12 +1790,14 @@ mod tests { #[tokio::test] async fn test_router_process_persist_results_applies_piggybacked_routing_updates() { + let ingester_pool = IngesterPool::default(); let router = IngestRouter::new( "test-router".into(), ControlPlaneServiceClient::from_mock(MockControlPlaneService::new()), - IngesterPool::default(), + ingester_pool.clone(), 1, EventBroker::default(), + Some("test-az".to_string()), ); let ingest_subrequests = vec![IngestSubrequest { subrequest_id: 0, @@ -1790,13 +1833,12 @@ mod tests { .process_persist_results(&mut workbench, persist_futures) .await; + ingester_pool.insert("test-ingester-0".into(), mocked_ingester()); let state_guard = router.state.lock().await; - let entry = state_guard + let node = state_guard .node_routing_table - .find_entry("test-index", "test-source") + 
.pick_node("test-index", "test-source", &ingester_pool, &HashSet::new()) .unwrap(); - let node = entry.nodes.get("test-ingester-0").unwrap(); - assert_eq!(node.capacity_score, 3); - assert_eq!(node.open_shard_count, 2); + assert_eq!(node.node_id, NodeId::from("test-ingester-0")); } } diff --git a/quickwit/quickwit-ingest/src/ingest_v2/routing_table.rs b/quickwit/quickwit-ingest/src/ingest_v2/routing_table.rs index 987d754ed69..4b4150d6e98 100644 --- a/quickwit/quickwit-ingest/src/ingest_v2/routing_table.rs +++ b/quickwit/quickwit-ingest/src/ingest_v2/routing_table.rs @@ -584,8 +584,20 @@ mod tests { assert!(closed_shard_ids.is_empty()); assert!(unavailable_leaders.is_empty()); - ingester_pool.insert("test-ingester-0".into(), IngesterServiceClient::mocked()); - ingester_pool.insert("test-ingester-1".into(), IngesterServiceClient::mocked()); + ingester_pool.insert( + "test-ingester-0".into(), + crate::IngesterPoolEntry { + client: IngesterServiceClient::mocked(), + availability_zone: None, + }, + ); + ingester_pool.insert( + "test-ingester-1".into(), + crate::IngesterPoolEntry { + client: IngesterServiceClient::mocked(), + availability_zone: None, + }, + ); let table_entry = RoutingTableEntry { index_uid: index_uid.clone(), @@ -675,8 +687,20 @@ mod tests { .unwrap_err(); assert_eq!(error, NextOpenShardError::NoShardsAvailable); - ingester_pool.insert("test-ingester-0".into(), IngesterServiceClient::mocked()); - ingester_pool.insert("test-ingester-1".into(), IngesterServiceClient::mocked()); + ingester_pool.insert( + "test-ingester-0".into(), + crate::IngesterPoolEntry { + client: IngesterServiceClient::mocked(), + availability_zone: None, + }, + ); + ingester_pool.insert( + "test-ingester-1".into(), + crate::IngesterPoolEntry { + client: IngesterServiceClient::mocked(), + availability_zone: None, + }, + ); let table_entry = RoutingTableEntry { index_uid: index_uid.clone(), @@ -795,7 +819,13 @@ mod tests { let source_id: SourceId = "test-source".into(); let 
ingester_pool = IngesterPool::default(); - ingester_pool.insert("test-ingester-0".into(), IngesterServiceClient::mocked()); + ingester_pool.insert( + "test-ingester-0".into(), + crate::IngesterPoolEntry { + client: IngesterServiceClient::mocked(), + availability_zone: None, + }, + ); let rate_limited_shards = HashSet::from_iter([ShardId::from(1)]); diff --git a/quickwit/quickwit-proto/src/lib.rs b/quickwit/quickwit-proto/src/lib.rs index f4ddb734d2a..f89fdb97687 100644 --- a/quickwit/quickwit-proto/src/lib.rs +++ b/quickwit/quickwit-proto/src/lib.rs @@ -28,7 +28,8 @@ use tracing_opentelemetry::OpenTelemetrySpanExt; pub mod cluster; pub mod control_plane; -pub use {bytes, tonic}; +pub use bytes; +pub use tonic; pub mod developer; pub mod error; mod getters; diff --git a/quickwit/quickwit-serve/src/lib.rs b/quickwit/quickwit-serve/src/lib.rs index 60515bc819f..78bdfee57da 100644 --- a/quickwit/quickwit-serve/src/lib.rs +++ b/quickwit/quickwit-serve/src/lib.rs @@ -82,9 +82,9 @@ use quickwit_indexing::models::ShardPositionsService; use quickwit_indexing::start_indexing_service; use quickwit_ingest::{ GetMemoryCapacity, IngestRequest, IngestRouter, IngestServiceClient, Ingester, IngesterPool, - LocalShardsUpdate, get_idle_shard_timeout, setup_ingester_capacity_update_listener, - setup_local_shards_update_listener, start_ingest_api_service, wait_for_ingester_decommission, - wait_for_ingester_status, + IngesterPoolEntry, LocalShardsUpdate, get_idle_shard_timeout, + setup_ingester_capacity_update_listener, setup_local_shards_update_listener, + start_ingest_api_service, wait_for_ingester_decommission, wait_for_ingester_status, }; use quickwit_jaeger::JaegerService; use quickwit_janitor::{JanitorService, start_janitor_service}; @@ -905,6 +905,7 @@ async fn setup_ingest_v2( ingester_pool.clone(), replication_factor, event_broker.clone(), + node_config.availability_zone.clone(), ); ingest_router.subscribe(); setup_ingester_capacity_update_listener(cluster.clone(), 
event_broker.clone()) @@ -968,21 +969,21 @@ async fn setup_ingest_v2( chitchat_id.node_id, ); let node_id: NodeId = node.node_id().into(); + let availability_zone = node.availability_zone().map(|az| az.to_string()); - if node.is_self_node() { + let client = if node.is_self_node() { // Here, since the service is available locally, we bypass the network stack // and use the instance directly. However, we still want client-side // metrics, so we use both metrics layers. let ingester = ingester_opt_clone_clone .expect("ingester service should be initialized"); - let ingester_service = ingester_service_layer_stack( + ingester_service_layer_stack( IngesterServiceClient::tower() .stack_layer(INGEST_GRPC_CLIENT_METRICS_LAYER.clone()), ) - .build(ingester); - Some(Change::Insert(node_id, ingester_service)) + .build(ingester) } else { - let ingester_service = IngesterServiceClient::tower() + IngesterServiceClient::tower() .stack_layer(INGEST_GRPC_CLIENT_METRICS_LAYER.clone()) .stack_layer(TimeoutLayer::new(GRPC_INGESTER_SERVICE_TIMEOUT)) .build_from_channel( @@ -990,9 +991,13 @@ async fn setup_ingest_v2( node.channel(), max_message_size, grpc_compression_encoding_opt, - ); - Some(Change::Insert(node_id, ingester_service)) - } + ) + }; + let ingester_pool_entry = IngesterPoolEntry { + client, + availability_zone, + }; + Some(Change::Insert(node_id, ingester_pool_entry)) } ClusterChange::Remove(node) if node.is_indexer() => { let chitchat_id = node.chitchat_id(); From 4c35484a788b1ec42437c92599efd8d5d7fe37b0 Mon Sep 17 00:00:00 2001 From: nadav-govari Date: Mon, 9 Mar 2026 14:57:42 -0400 Subject: [PATCH 7/9] Remove old routing table; Take both disk and memory WAL readings (#6193) * Remove old routing table; Take both disk and memory WAL readings * Add az-aware ingest attempts metric (#6194) --- ...er_capacity_score.rs => capacity_score.rs} | 9 +- .../src/ingest_v2/broadcast/mod.rs | 4 +- .../quickwit-ingest/src/ingest_v2/ingester.rs | 13 +- 
.../quickwit-ingest/src/ingest_v2/metrics.rs | 8 + quickwit/quickwit-ingest/src/ingest_v2/mod.rs | 4 +- .../src/ingest_v2/node_routing_table.rs | 519 ------ .../quickwit-ingest/src/ingest_v2/router.rs | 49 +- .../src/ingest_v2/routing_table.rs | 1440 +++++------------ .../quickwit-ingest/src/ingest_v2/state.rs | 18 +- ..._timeseries.rs => wal_capacity_tracker.rs} | 75 +- 10 files changed, 561 insertions(+), 1578 deletions(-) rename quickwit/quickwit-ingest/src/ingest_v2/broadcast/{ingester_capacity_score.rs => capacity_score.rs} (97%) delete mode 100644 quickwit/quickwit-ingest/src/ingest_v2/node_routing_table.rs rename quickwit/quickwit-ingest/src/ingest_v2/{wal_capacity_timeseries.rs => wal_capacity_tracker.rs} (76%) diff --git a/quickwit/quickwit-ingest/src/ingest_v2/broadcast/ingester_capacity_score.rs b/quickwit/quickwit-ingest/src/ingest_v2/broadcast/capacity_score.rs similarity index 97% rename from quickwit/quickwit-ingest/src/ingest_v2/broadcast/ingester_capacity_score.rs rename to quickwit/quickwit-ingest/src/ingest_v2/broadcast/capacity_score.rs index 482f5f58886..d9f456b7201 100644 --- a/quickwit/quickwit-ingest/src/ingest_v2/broadcast/ingester_capacity_score.rs +++ b/quickwit/quickwit-ingest/src/ingest_v2/broadcast/capacity_score.rs @@ -69,7 +69,10 @@ impl BroadcastIngesterCapacityScoreTask { .map_err(|_| anyhow::anyhow!("failed to acquire ingester state lock"))?; let usage = guard.mrecordlog.resource_usage(); let disk_used = ByteSize::b(usage.disk_used_bytes as u64); - let capacity_score = guard.wal_capacity_time_series.record_and_score(disk_used); + let memory_used = ByteSize::b(usage.memory_used_bytes as u64); + let capacity_score = guard + .wal_capacity_tracker + .record_and_score(disk_used, memory_used); let (open_shard_counts, _) = guard.get_shard_snapshot(); Ok(Some((capacity_score, open_shard_counts))) @@ -218,8 +221,8 @@ mod tests { state_guard.shards.insert(shard.queue_id(), shard); let (open_shard_counts, _) = 
state_guard.get_shard_snapshot(); let capacity_score = state_guard - .wal_capacity_time_series - .record_and_score(ByteSize::b(500)); + .wal_capacity_tracker + .record_and_score(ByteSize::b(500), ByteSize::b(0)); drop(state_guard); assert_eq!(capacity_score, 6); diff --git a/quickwit/quickwit-ingest/src/ingest_v2/broadcast/mod.rs b/quickwit/quickwit-ingest/src/ingest_v2/broadcast/mod.rs index 18a00209de1..b579382af78 100644 --- a/quickwit/quickwit-ingest/src/ingest_v2/broadcast/mod.rs +++ b/quickwit/quickwit-ingest/src/ingest_v2/broadcast/mod.rs @@ -13,7 +13,7 @@ // limitations under the License. #[allow(dead_code)] -mod ingester_capacity_score; +mod capacity_score; mod local_shards; use std::time::Duration; @@ -26,7 +26,7 @@ pub(in crate::ingest_v2) const BROADCAST_INTERVAL_PERIOD: Duration = if cfg!(tes Duration::from_secs(5) }; -pub use ingester_capacity_score::{ +pub use capacity_score::{ BroadcastIngesterCapacityScoreTask, IngesterCapacityScoreUpdate, setup_ingester_capacity_update_listener, }; diff --git a/quickwit/quickwit-ingest/src/ingest_v2/ingester.rs b/quickwit/quickwit-ingest/src/ingest_v2/ingester.rs index df51758a4ca..b77cefa7f38 100644 --- a/quickwit/quickwit-ingest/src/ingest_v2/ingester.rs +++ b/quickwit/quickwit-ingest/src/ingest_v2/ingester.rs @@ -131,7 +131,12 @@ impl Ingester { idle_shard_timeout: Duration, ) -> IngestV2Result { let self_node_id: NodeId = cluster.self_node_id().into(); - let state = IngesterState::load(wal_dir_path, disk_capacity, rate_limiter_settings); + let state = IngesterState::load( + wal_dir_path, + disk_capacity, + memory_capacity, + rate_limiter_settings, + ); let weak_state = state.weak(); BroadcastLocalShardsTask::spawn(cluster.clone(), weak_state.clone()); @@ -784,10 +789,12 @@ impl Ingester { } let wal_usage = state_guard.mrecordlog.resource_usage(); let disk_used = wal_usage.disk_used_bytes as u64; + let memory_used = wal_usage.memory_used_bytes as u64; let (open_shard_counts, closed_shards) = 
state_guard.get_shard_snapshot(); let capacity_score = state_guard - .wal_capacity_time_series - .score(ByteSize::b(disk_used)) as u32; + .wal_capacity_tracker + .score(ByteSize::b(disk_used), ByteSize::b(memory_used)) + as u32; drop(state_guard); if disk_used >= self.disk_capacity.as_u64() * 90 / 100 { diff --git a/quickwit/quickwit-ingest/src/ingest_v2/metrics.rs b/quickwit/quickwit-ingest/src/ingest_v2/metrics.rs index 1fb32c0b2fd..87975a3c462 100644 --- a/quickwit/quickwit-ingest/src/ingest_v2/metrics.rs +++ b/quickwit/quickwit-ingest/src/ingest_v2/metrics.rs @@ -82,12 +82,20 @@ pub(super) struct IngestV2Metrics { pub wal_disk_used_bytes: IntGauge, pub wal_memory_used_bytes: IntGauge, pub ingest_results: IngestResultMetrics, + pub ingest_attempts: IntCounterVec<1>, } impl Default for IngestV2Metrics { fn default() -> Self { Self { ingest_results: IngestResultMetrics::default(), + ingest_attempts: new_counter_vec::<1>( + "ingest_attempts", + "Number of routing attempts by AZ locality", + "ingest", + &[], + ["az_routing"], + ), reset_shards_operations_total: new_counter_vec( "reset_shards_operations_total", "Total number of reset shards operations performed.", diff --git a/quickwit/quickwit-ingest/src/ingest_v2/mod.rs b/quickwit/quickwit-ingest/src/ingest_v2/mod.rs index d5432936e58..151bf219fb7 100644 --- a/quickwit/quickwit-ingest/src/ingest_v2/mod.rs +++ b/quickwit/quickwit-ingest/src/ingest_v2/mod.rs @@ -22,15 +22,13 @@ mod metrics; mod models; mod mrecord; mod mrecordlog_utils; -mod node_routing_table; mod publish_tracker; mod rate_meter; mod replication; mod router; -#[allow(dead_code)] mod routing_table; mod state; -mod wal_capacity_timeseries; +mod wal_capacity_tracker; mod workbench; use std::collections::HashMap; diff --git a/quickwit/quickwit-ingest/src/ingest_v2/node_routing_table.rs b/quickwit/quickwit-ingest/src/ingest_v2/node_routing_table.rs deleted file mode 100644 index 68299358f77..00000000000 --- 
a/quickwit/quickwit-ingest/src/ingest_v2/node_routing_table.rs +++ /dev/null @@ -1,519 +0,0 @@ -// Copyright 2021-Present Datadog, Inc. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -use std::collections::{HashMap, HashSet}; - -use itertools::Itertools; -use quickwit_proto::ingest::Shard; -use quickwit_proto::types::{IndexId, IndexUid, NodeId, SourceId}; -use rand::rng; -use rand::seq::IndexedRandom; - -use crate::IngesterPool; - -/// A single ingester node's routing-relevant data for a specific (index, source) pair. -/// Each entry is self-describing: it carries its own node_id, index_uid, and source_id -/// so it can always be attributed back to a specific source on a specific node. -#[derive(Debug, Clone)] -pub(super) struct IngesterNode { - pub node_id: NodeId, - pub index_uid: IndexUid, - #[allow(unused)] - pub source_id: SourceId, - /// Score from 0-10. Higher means more available capacity. - pub capacity_score: usize, - /// Number of open shards on this node for this (index, source) pair. Tiebreaker for power of - /// two choices comparison - we favor a node with more open shards. - pub open_shard_count: usize, -} - -#[derive(Debug, Default)] -pub(super) struct RoutingEntry { - pub nodes: HashMap, -} - -/// Given a slice of candidates, picks the better of two random choices. -/// Higher capacity_score wins; tiebreak on more open_shard_count (more landing spots). 
-fn power_of_two_choices<'a>(candidates: &[&'a IngesterNode]) -> &'a IngesterNode { - debug_assert!(candidates.len() >= 2); - let mut iter = candidates.choose_multiple(&mut rng(), 2); - let (&a, &b) = (iter.next().unwrap(), iter.next().unwrap()); - - if (a.capacity_score, a.open_shard_count) >= (b.capacity_score, b.open_shard_count) { - a - } else { - b - } -} - -fn pick_from(candidates: Vec<&IngesterNode>) -> Option<&IngesterNode> { - match candidates.len() { - 0 => None, - 1 => Some(candidates[0]), - _ => Some(power_of_two_choices(&candidates)), - } -} - -impl RoutingEntry { - /// Pick an ingester node to persist the request to. Uses power of two choices based on reported - /// ingester capacity, if more than one eligible node exists. Prefers nodes in the same - /// availability zone, falling back to remote nodes. - fn pick_node( - &self, - ingester_pool: &IngesterPool, - unavailable_leaders: &HashSet, - self_availability_zone: &Option, - ) -> Option<&IngesterNode> { - let (local_ingesters, remote_ingesters): (Vec<&IngesterNode>, Vec<&IngesterNode>) = self - .nodes - .values() - .filter(|node| { - node.capacity_score > 0 - && node.open_shard_count > 0 - && ingester_pool.contains_key(&node.node_id) - && !unavailable_leaders.contains(&node.node_id) - }) - .partition(|node| { - let node_az = ingester_pool - .get(&node.node_id) - .and_then(|h| h.availability_zone); - node_az == *self_availability_zone - }); - - pick_from(local_ingesters).or_else(|| pick_from(remote_ingesters)) - } -} - -#[derive(Debug, Default)] -pub(super) struct NodeBasedRoutingTable { - table: HashMap<(IndexId, SourceId), RoutingEntry>, - self_availability_zone: Option, -} - -impl NodeBasedRoutingTable { - pub fn new(self_availability_zone: Option) -> Self { - Self { - self_availability_zone, - ..Default::default() - } - } - - pub fn pick_node( - &self, - index_id: &str, - source_id: &str, - ingester_pool: &IngesterPool, - unavailable_leaders: &HashSet, - ) -> Option<&IngesterNode> { - let key = 
(index_id.to_string(), source_id.to_string()); - let entry = self.table.get(&key)?; - entry.pick_node( - ingester_pool, - unavailable_leaders, - &self.self_availability_zone, - ) - } - - pub fn debug_info( - &self, - ingester_pool: &IngesterPool, - ) -> HashMap> { - let mut per_index: HashMap> = HashMap::new(); - for ((index_id, source_id), entry) in &self.table { - for (node_id, node) in &entry.nodes { - let az = ingester_pool.get(node_id).and_then(|h| h.availability_zone); - per_index - .entry(index_id.clone()) - .or_default() - .push(serde_json::json!({ - "source_id": source_id, - "node_id": node_id, - "capacity_score": node.capacity_score, - "open_shard_count": node.open_shard_count, - "availability_zone": az, - })); - } - } - per_index - } - - pub fn has_open_nodes( - &self, - index_id: &str, - source_id: &str, - ingester_pool: &IngesterPool, - unavailable_leaders: &HashSet, - ) -> bool { - let key = (index_id.to_string(), source_id.to_string()); - let Some(entry) = self.table.get(&key) else { - return false; - }; - entry.nodes.values().any(|node| { - node.capacity_score > 0 - && node.open_shard_count > 0 - && ingester_pool.contains_key(&node.node_id) - && !unavailable_leaders.contains(&node.node_id) - }) - } - - /// Applies a capacity update from the IngesterCapacityScoreUpdate broadcast. This is the - /// primary way the table learns about node availability and capacity. - pub fn apply_capacity_update( - &mut self, - node_id: NodeId, - index_uid: IndexUid, - source_id: SourceId, - capacity_score: usize, - open_shard_count: usize, - ) { - let key = (index_uid.index_id.to_string(), source_id.clone()); - - let entry = self.table.entry(key).or_default(); - let ingester_node = IngesterNode { - node_id: node_id.clone(), - index_uid, - source_id, - capacity_score, - open_shard_count, - }; - entry.nodes.insert(node_id, ingester_node); - } - - /// Merges routing updates from a GetOrCreateOpenShards control plane response into the - /// table. 
For existing nodes, updates their open shard count, including if the count is 0, from - /// the CP response while preserving capacity scores if they already exist. - /// New nodes get a default capacity_score of 5. - pub fn merge_from_shards( - &mut self, - index_uid: IndexUid, - source_id: SourceId, - shards: Vec, - ) { - let per_leader_count: HashMap = shards - .iter() - .map(|shard| { - let num_open_shards = shard.is_open() as usize; - let leader_id = NodeId::from(shard.leader_id.clone()); - (leader_id, num_open_shards) - }) - .into_grouping_map() - .sum(); - - let key = (index_uid.index_id.to_string(), source_id.clone()); - let entry = self.table.entry(key).or_default(); - - for (node_id, open_shard_count) in per_leader_count { - entry - .nodes - .entry(node_id.clone()) - .and_modify(|node| node.open_shard_count = open_shard_count) - .or_insert_with(|| IngesterNode { - node_id, - index_uid: index_uid.clone(), - source_id: source_id.clone(), - capacity_score: 5, - open_shard_count, - }); - } - } -} - -#[cfg(test)] -mod tests { - use quickwit_proto::ingest::ShardState; - use quickwit_proto::ingest::ingester::IngesterServiceClient; - use quickwit_proto::types::ShardId; - - use super::*; - use crate::IngesterPoolEntry; - - fn mocked_ingester(availability_zone: Option<&str>) -> IngesterPoolEntry { - IngesterPoolEntry { - client: IngesterServiceClient::mocked(), - availability_zone: availability_zone.map(|s| s.to_string()), - } - } - - #[test] - fn test_apply_capacity_update() { - let mut table = NodeBasedRoutingTable::default(); - let key = ("test-index".to_string(), "test-source".into()); - - // Insert first node. - table.apply_capacity_update( - "node-1".into(), - IndexUid::for_test("test-index", 0), - "test-source".into(), - 8, - 3, - ); - let entry = table.table.get(&key).unwrap(); - assert_eq!(entry.nodes.len(), 1); - assert_eq!(entry.nodes.get("node-1").unwrap().capacity_score, 8); - - // Update existing node. 
- table.apply_capacity_update( - "node-1".into(), - IndexUid::for_test("test-index", 0), - "test-source".into(), - 4, - 5, - ); - let node = table.table.get(&key).unwrap().nodes.get("node-1").unwrap(); - assert_eq!(node.capacity_score, 4); - assert_eq!(node.open_shard_count, 5); - - // Add second node. - table.apply_capacity_update( - "node-2".into(), - IndexUid::for_test("test-index", 0), - "test-source".into(), - 6, - 2, - ); - assert_eq!(table.table.get(&key).unwrap().nodes.len(), 2); - - // Zero shards: node stays in table but becomes ineligible for routing. - table.apply_capacity_update( - "node-1".into(), - IndexUid::for_test("test-index", 0), - "test-source".into(), - 0, - 0, - ); - let entry = table.table.get(&key).unwrap(); - assert_eq!(entry.nodes.len(), 2); - assert_eq!(entry.nodes.get("node-1").unwrap().open_shard_count, 0); - assert_eq!(entry.nodes.get("node-1").unwrap().capacity_score, 0); - } - - #[test] - fn test_has_open_nodes() { - let mut table = NodeBasedRoutingTable::default(); - let pool = IngesterPool::default(); - - // Empty table. - assert!(!table.has_open_nodes("test-index", "test-source", &pool, &HashSet::new())); - - // Node exists but is not in pool. - table.apply_capacity_update( - "node-1".into(), - IndexUid::for_test("test-index", 0), - "test-source".into(), - 8, - 3, - ); - assert!(!table.has_open_nodes("test-index", "test-source", &pool, &HashSet::new())); - - // Node is in pool → true. - pool.insert("node-1".into(), mocked_ingester(None)); - assert!(table.has_open_nodes("test-index", "test-source", &pool, &HashSet::new())); - - // Node is unavailable → false. - let unavailable: HashSet = HashSet::from(["node-1".into()]); - assert!(!table.has_open_nodes("test-index", "test-source", &pool, &unavailable)); - - // Second node available → true despite first being unavailable. 
- table.apply_capacity_update( - "node-2".into(), - IndexUid::for_test("test-index", 0), - "test-source".into(), - 6, - 2, - ); - pool.insert("node-2".into(), mocked_ingester(None)); - assert!(table.has_open_nodes("test-index", "test-source", &pool, &unavailable)); - - // Node with capacity_score=0 is not eligible. - table.apply_capacity_update( - "node-2".into(), - IndexUid::for_test("test-index", 0), - "test-source".into(), - 0, - 2, - ); - assert!(!table.has_open_nodes("test-index", "test-source", &pool, &unavailable)); - } - - #[test] - fn test_pick_node_prefers_same_az() { - let mut table = NodeBasedRoutingTable::new(Some("az-1".to_string())); - let pool = IngesterPool::default(); - - table.apply_capacity_update( - "node-1".into(), - IndexUid::for_test("test-index", 0), - "test-source".into(), - 5, - 1, - ); - table.apply_capacity_update( - "node-2".into(), - IndexUid::for_test("test-index", 0), - "test-source".into(), - 5, - 1, - ); - pool.insert("node-1".into(), mocked_ingester(Some("az-1"))); - pool.insert("node-2".into(), mocked_ingester(Some("az-2"))); - - let picked = table - .pick_node("test-index", "test-source", &pool, &HashSet::new()) - .unwrap(); - assert_eq!(picked.node_id, NodeId::from("node-1")); - } - - #[test] - fn test_pick_node_falls_back_to_cross_az() { - let mut table = NodeBasedRoutingTable::new(Some("az-1".to_string())); - let pool = IngesterPool::default(); - - table.apply_capacity_update( - "node-2".into(), - IndexUid::for_test("test-index", 0), - "test-source".into(), - 5, - 1, - ); - pool.insert("node-2".into(), mocked_ingester(Some("az-2"))); - - let picked = table - .pick_node("test-index", "test-source", &pool, &HashSet::new()) - .unwrap(); - assert_eq!(picked.node_id, NodeId::from("node-2")); - } - - #[test] - fn test_pick_node_no_az_awareness() { - let mut table = NodeBasedRoutingTable::default(); - let pool = IngesterPool::default(); - - table.apply_capacity_update( - "node-1".into(), - IndexUid::for_test("test-index", 0), - 
"test-source".into(), - 5, - 1, - ); - pool.insert("node-1".into(), mocked_ingester(Some("az-1"))); - - let picked = table - .pick_node("test-index", "test-source", &pool, &HashSet::new()) - .unwrap(); - assert_eq!(picked.node_id, NodeId::from("node-1")); - } - - #[test] - fn test_pick_node_missing_entry() { - let table = NodeBasedRoutingTable::new(Some("az-1".to_string())); - let pool = IngesterPool::default(); - - assert!( - table - .pick_node("nonexistent", "source", &pool, &HashSet::new()) - .is_none() - ); - } - - #[test] - fn test_power_of_two_choices() { - // 3 candidates: best appears in the random pair 2/3 of the time and always - // wins when it does, so it should win ~67% of 1000 runs. Asserting > 550 - // is ~7.5 standard deviations from the mean — effectively impossible to flake. - let high = IngesterNode { - node_id: "high".into(), - index_uid: IndexUid::for_test("idx", 0), - source_id: "src".into(), - capacity_score: 9, - open_shard_count: 2, - }; - let mid = IngesterNode { - node_id: "mid".into(), - index_uid: IndexUid::for_test("idx", 0), - source_id: "src".into(), - capacity_score: 5, - open_shard_count: 2, - }; - let low = IngesterNode { - node_id: "low".into(), - index_uid: IndexUid::for_test("idx", 0), - source_id: "src".into(), - capacity_score: 1, - open_shard_count: 2, - }; - let candidates: Vec<&IngesterNode> = vec![&high, &mid, &low]; - - let mut high_wins = 0; - for _ in 0..1000 { - if power_of_two_choices(&candidates).node_id == "high" { - high_wins += 1; - } - } - assert!(high_wins > 550, "high won only {high_wins}/1000 times"); - } - - #[test] - fn test_merge_from_shards() { - let mut table = NodeBasedRoutingTable::default(); - let index_uid = IndexUid::for_test("test-index", 0); - let key = ("test-index".to_string(), "test-source".to_string()); - - let make_shard = |id: u64, leader: &str, open: bool| Shard { - index_uid: Some(index_uid.clone()), - source_id: "test-source".to_string(), - shard_id: Some(ShardId::from(id)), - 
shard_state: if open { - ShardState::Open as i32 - } else { - ShardState::Closed as i32 - }, - leader_id: leader.to_string(), - ..Default::default() - }; - - // Two open shards on node-1, one open + one closed on node-2, only closed on node-3. - let shards = vec![ - make_shard(1, "node-1", true), - make_shard(2, "node-1", true), - make_shard(3, "node-2", true), - make_shard(4, "node-2", false), - make_shard(5, "node-3", false), - ]; - table.merge_from_shards(index_uid.clone(), "test-source".into(), shards); - - let entry = table.table.get(&key).unwrap(); - assert_eq!(entry.nodes.len(), 3); - - let n1 = entry.nodes.get("node-1").unwrap(); - assert_eq!(n1.open_shard_count, 2); - assert_eq!(n1.capacity_score, 5); - - let n2 = entry.nodes.get("node-2").unwrap(); - assert_eq!(n2.open_shard_count, 1); - - let n3 = entry.nodes.get("node-3").unwrap(); - assert_eq!(n3.open_shard_count, 0); - - // Merging again adds new nodes but preserves existing ones. - let shards = vec![make_shard(10, "node-4", true)]; - table.merge_from_shards(index_uid, "test-source".into(), shards); - - let entry = table.table.get(&key).unwrap(); - assert_eq!(entry.nodes.len(), 4); - assert!(entry.nodes.contains_key("node-1")); - assert!(entry.nodes.contains_key("node-2")); - assert!(entry.nodes.contains_key("node-3")); - assert!(entry.nodes.contains_key("node-4")); - } -} diff --git a/quickwit/quickwit-ingest/src/ingest_v2/router.rs b/quickwit/quickwit-ingest/src/ingest_v2/router.rs index a1f5bf86302..46a476431f9 100644 --- a/quickwit/quickwit-ingest/src/ingest_v2/router.rs +++ b/quickwit/quickwit-ingest/src/ingest_v2/router.rs @@ -46,10 +46,11 @@ use super::debouncing::{ }; use super::ingester::PERSIST_REQUEST_TIMEOUT; use super::metrics::IngestResultMetrics; -use super::node_routing_table::NodeBasedRoutingTable; +use super::routing_table::RoutingTable; use super::workbench::IngestWorkbench; use super::{IngesterPool, pending_subrequests}; use crate::get_ingest_router_buffer_size; +use 
crate::ingest_v2::metrics::INGEST_V2_METRICS; /// Duration after which ingest requests time out with [`IngestV2Error::Timeout`]. fn ingest_request_timeout() -> Duration { @@ -101,7 +102,7 @@ struct RouterState { // Debounces `GetOrCreateOpenShardsRequest` requests to the control plane. debouncer: GetOrCreateOpenShardsRequestDebouncer, // Routing table of nodes, their WAL capacity, and the number of open shards per source. - node_routing_table: NodeBasedRoutingTable, + routing_table: RoutingTable, } impl fmt::Debug for IngestRouter { @@ -124,7 +125,7 @@ impl IngestRouter { ) -> Self { let state = Arc::new(Mutex::new(RouterState { debouncer: GetOrCreateOpenShardsRequestDebouncer::default(), - node_routing_table: NodeBasedRoutingTable::new(self_availability_zone), + routing_table: RoutingTable::new(self_availability_zone), })); let ingest_semaphore_permits = get_ingest_router_buffer_size().as_u64() as usize; let ingest_semaphore = Arc::new(Semaphore::new(ingest_semaphore_permits)); @@ -160,7 +161,7 @@ impl IngestRouter { let mut state_guard = self.state.lock().await; for subrequest in pending_subrequests(&workbench.subworkbenches) { - if !state_guard.node_routing_table.has_open_nodes( + if !state_guard.routing_table.has_open_nodes( &subrequest.index_id, &subrequest.source_id, ingester_pool, @@ -251,7 +252,7 @@ impl IngestRouter { let mut state_guard = self.state.lock().await; for success in response.successes { - state_guard.node_routing_table.merge_from_shards( + state_guard.routing_table.merge_from_shards( success.index_uid().clone(), success.source_id, success.open_shards, @@ -302,7 +303,7 @@ impl IngestRouter { // opportunity to get a fresh routing update. 
let mut state_guard = self.state.lock().await; for shard_update in routing_update.source_shard_updates { - state_guard.node_routing_table.apply_capacity_update( + state_guard.routing_table.apply_capacity_update( leader_id.clone(), shard_update.index_uid().clone(), shard_update.source_id, @@ -353,7 +354,7 @@ impl IngestRouter { let state_guard = self.state.lock().await; for subrequest in pending_subrequests(&workbench.subworkbenches) { - let ingester_node = state_guard.node_routing_table.pick_node( + let ingester_node = state_guard.routing_table.pick_node( &subrequest.index_id, &subrequest.source_id, &self.ingester_pool, @@ -367,6 +368,13 @@ impl IngestRouter { continue; } }; + let az_locality = state_guard + .routing_table + .classify_az_locality(&ingester_node.node_id, &self.ingester_pool); + INGEST_V2_METRICS + .ingest_attempts + .with_label_values([az_locality]) + .inc(); let persist_subrequest = PersistSubrequest { subrequest_id: subrequest.subrequest_id, index_uid: Some(ingester_node.index_uid.clone()), @@ -474,9 +482,7 @@ impl IngestRouter { pub async fn debug_info(&self) -> JsonValue { let state_guard = self.state.lock().await; - let routing_table_json = state_guard - .node_routing_table - .debug_info(&self.ingester_pool); + let routing_table_json = state_guard.routing_table.debug_info(&self.ingester_pool); json!({ "routing_table": routing_table_json, @@ -486,8 +492,7 @@ impl IngestRouter { fn update_ingest_metrics(ingest_result: &IngestV2Result, num_subrequests: usize) { let num_subrequests = num_subrequests as u64; - let ingest_results_metrics: &IngestResultMetrics = - &crate::ingest_v2::metrics::INGEST_V2_METRICS.ingest_results; + let ingest_results_metrics: &IngestResultMetrics = &INGEST_V2_METRICS.ingest_results; match ingest_result { Ok(ingest_response) => { ingest_results_metrics @@ -607,7 +612,7 @@ impl EventSubscriber for WeakRouterState { return; }; let mut state_guard = state.lock().await; - state_guard.node_routing_table.apply_capacity_update( + 
state_guard.routing_table.apply_capacity_update( update.node_id, update.source_uid.index_uid, update.source_uid.source_id, @@ -674,7 +679,7 @@ mod tests { { let mut state_guard = router.state.lock().await; - state_guard.node_routing_table.apply_capacity_update( + state_guard.routing_table.apply_capacity_update( "test-ingester-0".into(), IndexUid::for_test("test-index-0", 0), "test-source".to_string(), @@ -1273,7 +1278,7 @@ mod tests { let index_uid_1: IndexUid = IndexUid::for_test("test-index-1", 0); { let mut state_guard = router.state.lock().await; - state_guard.node_routing_table.merge_from_shards( + state_guard.routing_table.merge_from_shards( index_uid_0.clone(), "test-source".to_string(), vec![Shard { @@ -1285,7 +1290,7 @@ mod tests { ..Default::default() }], ); - state_guard.node_routing_table.merge_from_shards( + state_guard.routing_table.merge_from_shards( index_uid_1.clone(), "test-source".to_string(), vec![Shard { @@ -1420,7 +1425,7 @@ mod tests { let index_uid: IndexUid = IndexUid::for_test("test-index-0", 0); { let mut state_guard = router.state.lock().await; - state_guard.node_routing_table.merge_from_shards( + state_guard.routing_table.merge_from_shards( index_uid.clone(), "test-source".to_string(), vec![Shard { @@ -1529,7 +1534,7 @@ mod tests { { let mut state_guard = router.state.lock().await; - state_guard.node_routing_table.merge_from_shards( + state_guard.routing_table.merge_from_shards( index_uid_0.clone(), "test-source".to_string(), vec![Shard { @@ -1540,7 +1545,7 @@ mod tests { ..Default::default() }], ); - state_guard.node_routing_table.merge_from_shards( + state_guard.routing_table.merge_from_shards( index_uid_1.clone(), "test-source".to_string(), vec![Shard { @@ -1584,7 +1589,7 @@ mod tests { let index_uid: IndexUid = IndexUid::for_test("test-index-0", 0); { let mut state_guard = router.state.lock().await; - state_guard.node_routing_table.merge_from_shards( + state_guard.routing_table.merge_from_shards( index_uid.clone(), 
"test-source".to_string(), vec![Shard { @@ -1689,7 +1694,7 @@ mod tests { ingester_pool.insert("test-ingester-0".into(), mocked_ingester()); let state_guard = router.state.lock().await; let node = state_guard - .node_routing_table + .routing_table .pick_node("test-index", "test-source", &ingester_pool, &HashSet::new()) .unwrap(); assert_eq!(node.node_id, NodeId::from("test-ingester-0")); @@ -1836,7 +1841,7 @@ mod tests { ingester_pool.insert("test-ingester-0".into(), mocked_ingester()); let state_guard = router.state.lock().await; let node = state_guard - .node_routing_table + .routing_table .pick_node("test-index", "test-source", &ingester_pool, &HashSet::new()) .unwrap(); assert_eq!(node.node_id, NodeId::from("test-ingester-0")); diff --git a/quickwit/quickwit-ingest/src/ingest_v2/routing_table.rs b/quickwit/quickwit-ingest/src/ingest_v2/routing_table.rs index 4b4150d6e98..670822a6ae3 100644 --- a/quickwit/quickwit-ingest/src/ingest_v2/routing_table.rs +++ b/quickwit/quickwit-ingest/src/ingest_v2/routing_table.rs @@ -12,485 +12,238 @@ // See the License for the specific language governing permissions and // limitations under the License. -use std::collections::hash_map::Entry; use std::collections::{HashMap, HashSet}; -use std::sync::atomic::{AtomicUsize, Ordering}; -use quickwit_proto::ingest::{Shard, ShardIds, ShardState}; -use quickwit_proto::types::{IndexId, IndexUid, NodeId, ShardId, SourceId}; -use serde_json::{Value as JsonValue, json}; -use tracing::{info, warn}; +use itertools::Itertools; +use quickwit_proto::ingest::Shard; +use quickwit_proto::types::{IndexId, IndexUid, NodeId, SourceId}; +use rand::rng; +use rand::seq::IndexedRandom; use crate::IngesterPool; -#[derive(Debug)] -pub(super) struct RoutingEntry { +/// A single ingester node's routing-relevant data for a specific (index, source) pair. 
+/// Each entry is self-describing: it carries its own node_id, index_uid, and source_id +/// so it can always be attributed back to a specific source on a specific node. +#[derive(Debug, Clone)] +pub(super) struct IngesterNode { + pub node_id: NodeId, pub index_uid: IndexUid, + #[allow(unused)] pub source_id: SourceId, - pub shard_id: ShardId, - pub shard_state: ShardState, - pub leader_id: NodeId, + /// Score from 0-10. Higher means more available capacity. + pub capacity_score: usize, + /// Number of open shards on this node for this (index, source) pair. Tiebreaker for power of + /// two choices comparison - we favor a node with more open shards. + pub open_shard_count: usize, } -impl From for RoutingEntry { - fn from(shard: Shard) -> Self { - let shard_id = shard.shard_id().clone(); - let shard_state = shard.shard_state(); - Self { - index_uid: shard.index_uid().clone(), - source_id: shard.source_id, - shard_id, - shard_state, - leader_id: shard.leader_id.into(), - } - } +#[derive(Debug, Default)] +pub(super) struct RoutingEntry { + pub nodes: HashMap, } -/// The set of shards the router is aware of for the given index and source. -#[derive(Debug, Default)] -pub(super) struct RoutingTableEntry { - /// Index UID of the shards. - pub index_uid: IndexUid, - /// Source ID of the shards. - pub source_id: SourceId, - /// Shards located on this node. - pub local_shards: Vec, - pub local_round_robin_idx: AtomicUsize, - /// Shards located on remote nodes. - pub remote_shards: Vec, - pub remote_round_robin_idx: AtomicUsize, +/// Given a slice of candidates, picks the better of two random choices. +/// Higher capacity_score wins; tiebreak on more open_shard_count (more landing spots). 
+fn power_of_two_choices<'a>(candidates: &[&'a IngesterNode]) -> &'a IngesterNode { + debug_assert!(candidates.len() >= 2); + let mut iter = candidates.choose_multiple(&mut rng(), 2); + let (&a, &b) = (iter.next().unwrap(), iter.next().unwrap()); + + if (a.capacity_score, a.open_shard_count) >= (b.capacity_score, b.open_shard_count) { + a + } else { + b + } } -impl RoutingTableEntry { - /// Creates a new entry and ensures that the shards are open, unique, and sorted by shard ID. - fn new( - self_node_id: &NodeId, - index_uid: IndexUid, - source_id: SourceId, - mut shards: Vec, - ) -> Self { - let num_shards = shards.len(); +fn pick_from(candidates: Vec<&IngesterNode>) -> Option<&IngesterNode> { + match candidates.len() { + 0 => None, + 1 => Some(candidates[0]), + _ => Some(power_of_two_choices(&candidates)), + } +} - shards.sort_unstable_by(|left, right| left.shard_id.cmp(&right.shard_id)); - shards.dedup_by(|left, right| left.shard_id == right.shard_id); +impl RoutingEntry { + /// Pick an ingester node to persist the request to. Uses power of two choices based on reported + /// ingester capacity, if more than one eligible node exists. Prefers nodes in the same + /// availability zone, falling back to remote nodes. 
+ fn pick_node( + &self, + ingester_pool: &IngesterPool, + unavailable_leaders: &HashSet, + self_availability_zone: &Option, + ) -> Option<&IngesterNode> { + let (local_ingesters, remote_ingesters): (Vec<&IngesterNode>, Vec<&IngesterNode>) = self + .nodes + .values() + .filter(|node| { + node.capacity_score > 0 + && node.open_shard_count > 0 + && ingester_pool.contains_key(&node.node_id) + && !unavailable_leaders.contains(&node.node_id) + }) + .partition(|node| { + let node_az = ingester_pool + .get(&node.node_id) + .and_then(|h| h.availability_zone); + node_az == *self_availability_zone + }); - let (local_shards, remote_shards): (Vec<_>, Vec<_>) = shards - .into_iter() - .filter(|shard| shard.is_open()) - .map(RoutingEntry::from) - .partition(|shard| *self_node_id == shard.leader_id); + pick_from(local_ingesters).or_else(|| pick_from(remote_ingesters)) + } +} - if num_shards > local_shards.len() + remote_shards.len() { - warn!("input shards should not contain closed shards or duplicates"); - } +#[derive(Debug, Default)] +pub(super) struct RoutingTable { + table: HashMap<(IndexId, SourceId), RoutingEntry>, + self_availability_zone: Option, +} +impl RoutingTable { + pub fn new(self_availability_zone: Option) -> Self { Self { - index_uid, - source_id, - local_shards, - remote_shards, + self_availability_zone, ..Default::default() } } - fn empty(index_uid: IndexUid, source_id: SourceId) -> Self { - Self { - index_uid, - source_id, - ..Default::default() - } - } - - /// Returns `true` if at least one shard in the table entry is open and has a leader available. - /// As it goes through the list of shards in the entry, it populates `closed_shard_ids` and - /// `unavailable_leaders` with the shard IDs of the closed shards and the node ID of the - /// unavailable ingesters encountered along the way. 
- pub fn has_open_shards( + pub fn pick_node( &self, + index_id: &str, + source_id: &str, ingester_pool: &IngesterPool, - closed_shard_ids: &mut Vec, - unavailable_leaders: &mut HashSet, - ) -> bool { - let shards = self.local_shards.iter().chain(self.remote_shards.iter()); - - for shard in shards { - match shard.shard_state { - ShardState::Closed => { - closed_shard_ids.push(shard.shard_id.clone()); - continue; - } - ShardState::Unavailable | ShardState::Unspecified => { - continue; - } - ShardState::Open => { - if unavailable_leaders.contains(&shard.leader_id) { - continue; - } - if ingester_pool.contains_key(&shard.leader_id) { - return true; - } else { - let leader_id: NodeId = shard.leader_id.clone(); - unavailable_leaders.insert(leader_id); - } - } - } - } - false + unavailable_leaders: &HashSet, + ) -> Option<&IngesterNode> { + let key = (index_id.to_string(), source_id.to_string()); + let entry = self.table.get(&key)?; + entry.pick_node( + ingester_pool, + unavailable_leaders, + &self.self_availability_zone, + ) } - /// Returns the next open and available shard in the table entry in a round-robin fashion. 
- pub fn next_open_shard_round_robin( + pub fn classify_az_locality( &self, + target_node_id: &NodeId, ingester_pool: &IngesterPool, - rate_limited_shards: &HashSet, - ) -> Result<&RoutingEntry, NextOpenShardError> { - let mut error = NextOpenShardError::NoShardsAvailable; - - for (shards, round_robin_idx) in [ - (&self.local_shards, &self.local_round_robin_idx), - (&self.remote_shards, &self.remote_round_robin_idx), - ] { - if shards.is_empty() { - continue; - } - for _attempt in 0..shards.len() { - let shard_idx = round_robin_idx.fetch_add(1, Ordering::Relaxed); - let shard_routing_entry: &RoutingEntry = &shards[shard_idx % shards.len()]; - - if !shard_routing_entry.shard_state.is_open() { - continue; - } - if rate_limited_shards.contains(&shard_routing_entry.shard_id) { - error = NextOpenShardError::RateLimited; - continue; - } - if ingester_pool.contains_key(&shard_routing_entry.leader_id) { - return Ok(shard_routing_entry); - } - } - } - Err(error) - } - - /// Inserts the open shards the routing table is not aware of. - fn insert_open_shards( - &mut self, - self_node_id: &NodeId, - leader_id: &NodeId, - index_uid: &IndexUid, - shard_ids: &[ShardId], - ) { - match self.index_uid.cmp(index_uid) { - // If we receive an update for a new incarnation of the index, then we clear the entry - // and insert all the shards. - std::cmp::Ordering::Less => { - self.index_uid = index_uid.clone(); - self.clear_shards(); - } - // If we receive an update for a previous incarnation of the index, then we ignore it. 
- std::cmp::Ordering::Greater => { - return; - } - std::cmp::Ordering::Equal => {} + ) -> &'static str { + let Some(self_az) = &self.self_availability_zone else { + return "az_unaware"; }; - let target_shards = if self_node_id == leader_id { - &mut self.local_shards - } else { - &mut self.remote_shards - }; - let mut num_inserted_shards = 0; - let num_target_shards = target_shards.len(); - - if num_target_shards == 0 { - target_shards.reserve(num_target_shards); - target_shards.extend(shard_ids.iter().map(|shard_id| RoutingEntry { - index_uid: self.index_uid.clone(), - source_id: self.source_id.clone(), - shard_id: shard_id.clone(), - shard_state: ShardState::Open, - leader_id: leader_id.clone(), - })); - num_inserted_shards = target_shards.len(); - } else { - let shard_ids_range = target_shards[0].shard_id.clone() - ..=target_shards[num_target_shards - 1].shard_id.clone(); - - for shard_id in shard_ids { - // If we can't find the shard, then we insert it. - if shard_ids_range.contains(shard_id) { - continue; - } - if target_shards[..num_target_shards] - .binary_search_by(|shard| shard.shard_id.cmp(shard_id)) - .is_err() - { - target_shards.push(RoutingEntry { - index_uid: self.index_uid.clone(), - source_id: self.source_id.clone(), - shard_id: shard_id.clone(), - shard_state: ShardState::Open, - leader_id: leader_id.clone(), - }); - num_inserted_shards += 1; - } - } - } - if num_inserted_shards > 0 { - target_shards.sort_unstable_by(|left, right| left.shard_id.cmp(&right.shard_id)); - - info!( - index_uid=%self.index_uid, - source_id=%self.source_id, - "inserted {num_inserted_shards} shards into routing table" - ); + let target_az = ingester_pool + .get(target_node_id) + .and_then(|entry| entry.availability_zone); + match target_az { + Some(ref az) if az == self_az => "same_az", + Some(_) => "cross_az", + None => "az_unaware", } } - /// Clears local and remote shards. 
- fn clear_shards(&mut self) { - self.local_shards.clear(); - self.local_round_robin_idx = AtomicUsize::default(); - self.remote_shards.clear(); - self.remote_round_robin_idx = AtomicUsize::default(); - } - - /// Closes the shards identified by their shard IDs. - fn close_shards(&mut self, index_uid: &IndexUid, shard_ids: &[ShardId]) { - // If the shard table was just recently updated with shards for a new index UID, then we can - // safely discard this request. - if self.index_uid != *index_uid { - return; - } - for shards in [&mut self.local_shards, &mut self.remote_shards] { - if shards.is_empty() { - continue; - } - let num_shards = shards.len(); - let shard_ids_range = - shards[0].shard_id.clone()..=shards[num_shards - 1].shard_id.clone(); - - for shard_id in shard_ids { - if !shard_ids_range.contains(shard_id) { - continue; - } - if let Ok(shard_idx) = shards.binary_search_by(|shard| shard.shard_id.cmp(shard_id)) - { - shards[shard_idx].shard_state = ShardState::Closed; - } - } - } - } - - /// Shards the shards identified by their shard IDs. - fn delete_shards(&mut self, index_uid: &IndexUid, shard_ids: &[ShardId]) { - // If the shard table was just recently updated with shards for a new index UID, then we can - // safely discard this request. - if self.index_uid != *index_uid { - return; - } - for shards in [&mut self.local_shards, &mut self.remote_shards] { - if shards.is_empty() { - continue; - } - let num_shards = shards.len(); - let shard_ids_range = - shards[0].shard_id.clone()..=shards[num_shards - 1].shard_id.clone(); - let mut deleted_any = false; - - for shard_id in shard_ids { - if !shard_ids_range.contains(shard_id) { - continue; - } - if let Ok(shard_idx) = shards.binary_search_by(|shard| shard.shard_id.cmp(shard_id)) - { - // We use `Unspecified` as a tombstone. 
- shards[shard_idx].shard_state = ShardState::Unspecified; - deleted_any = true; - } - } - if deleted_any { - shards.retain(|shard| shard.shard_state != ShardState::Unspecified); + pub fn debug_info( + &self, + ingester_pool: &IngesterPool, + ) -> HashMap> { + let mut per_index: HashMap> = HashMap::new(); + for ((index_id, source_id), entry) in &self.table { + for (node_id, node) in &entry.nodes { + let az = ingester_pool.get(node_id).and_then(|h| h.availability_zone); + per_index + .entry(index_id.clone()) + .or_default() + .push(serde_json::json!({ + "source_id": source_id, + "node_id": node_id, + "capacity_score": node.capacity_score, + "open_shard_count": node.open_shard_count, + "availability_zone": az, + })); } } + per_index } - #[cfg(test)] - pub fn len(&self) -> usize { - self.local_shards.len() + self.remote_shards.len() - } - - #[cfg(test)] - pub fn all_shards(&self) -> Vec<&RoutingEntry> { - let mut shards = Vec::with_capacity(self.len()); - shards.extend(&self.local_shards); - shards.extend(&self.remote_shards); - shards - } -} - -#[derive(Debug, PartialEq, Eq)] -pub(super) enum NextOpenShardError { - NoShardsAvailable, - RateLimited, -} - -/// Stores the list of shards the router is aware of for each index and source. The resolution from -/// index and source to shards is performed using index ID (not index UID) and source ID. -#[derive(Debug)] -pub(super) struct RoutingTable { - pub self_node_id: NodeId, - pub table: HashMap<(IndexId, SourceId), RoutingTableEntry>, -} - -impl RoutingTable { - pub fn find_entry( + pub fn has_open_nodes( &self, - index_id: impl Into, - source_id: impl Into, - ) -> Option<&RoutingTableEntry> { - let key = (index_id.into(), source_id.into()); - self.table.get(&key) - } - - /// Returns `true` if the router already knows about a shard for a given source that has - /// an available `leader`. - /// - /// If this function returns false, it populates the set of unavailable leaders and closed - /// shards. 
These will be joined to the GetOrCreate shard request emitted to the control - /// plane. - pub fn has_open_shards( - &self, - index_id: impl Into, - source_id: impl Into, + index_id: &str, + source_id: &str, ingester_pool: &IngesterPool, - closed_shards: &mut Vec, - unavailable_leaders: &mut HashSet, + unavailable_leaders: &HashSet, ) -> bool { - let Some(entry) = self.find_entry(index_id, source_id) else { + let key = (index_id.to_string(), source_id.to_string()); + let Some(entry) = self.table.get(&key) else { return false; }; - let mut closed_shard_ids: Vec = Vec::new(); - - let result = - entry.has_open_shards(ingester_pool, &mut closed_shard_ids, unavailable_leaders); - - if !closed_shard_ids.is_empty() { - closed_shards.push(ShardIds { - index_uid: entry.index_uid.clone().into(), - source_id: entry.source_id.clone(), - shard_ids: closed_shard_ids, - }); - } - result + entry.nodes.values().any(|node| { + node.capacity_score > 0 + && node.open_shard_count > 0 + && ingester_pool.contains_key(&node.node_id) + && !unavailable_leaders.contains(&node.node_id) + }) } - /// Replaces the routing table entry for the source with the provided shards. - pub fn replace_shards( + /// Applies a capacity update from the IngesterCapacityScoreUpdate broadcast. This is the + /// primary way the table learns about node availability and capacity. 
+ pub fn apply_capacity_update( &mut self, + node_id: NodeId, index_uid: IndexUid, - source_id: impl Into, - shards: Vec, + source_id: SourceId, + capacity_score: usize, + open_shard_count: usize, ) { - let index_id: IndexId = index_uid.index_id.to_string(); - let source_id: SourceId = source_id.into(); - let key = (index_id, source_id.clone()); - - match self.table.entry(key) { - Entry::Vacant(entry) => { - entry.insert(RoutingTableEntry::new( - &self.self_node_id, - index_uid, - source_id, - shards, - )); - } - Entry::Occupied(mut entry) => { - assert!( - entry.get().index_uid <= index_uid, - "new index incarnation should be greater or equal" - ); + let key = (index_uid.index_id.to_string(), source_id.clone()); - entry.insert(RoutingTableEntry::new( - &self.self_node_id, - index_uid, - source_id, - shards, - )); - } + let entry = self.table.entry(key).or_default(); + let ingester_node = IngesterNode { + node_id: node_id.clone(), + index_uid, + source_id, + capacity_score, + open_shard_count, }; + entry.nodes.insert(node_id, ingester_node); } - /// Inserts the shards the routing table is not aware of. - pub fn insert_open_shards( + /// Merges routing updates from a GetOrCreateOpenShards control plane response into the + /// table. For existing nodes, updates their open shard count, including if the count is 0, from + /// the CP response while preserving capacity scores if they already exist. + /// New nodes get a default capacity_score of 5. + pub fn merge_from_shards( &mut self, - leader_id: &NodeId, index_uid: IndexUid, - source_id: impl Into, - shard_ids: &[ShardId], - ) { - let index_id: IndexId = index_uid.index_id.to_string(); - let source_id: SourceId = source_id.into(); - let key = (index_id, source_id.clone()); - - self.table - .entry(key.clone()) - .or_insert_with(|| RoutingTableEntry::empty(index_uid.clone(), source_id)) - .insert_open_shards(&self.self_node_id, leader_id, &index_uid, shard_ids); - } - - /// Closes the targeted shards. 
- pub fn close_shards( - &mut self, - index_uid: &IndexUid, - source_id: impl Into, - shard_ids: &[ShardId], - ) { - let key = (index_uid.index_id.clone(), source_id.into()); - if let Some(entry) = self.table.get_mut(&key) { - entry.close_shards(index_uid, shard_ids); - } - } - - /// Deletes the targeted shards. - pub fn delete_shards( - &mut self, - index_uid: &IndexUid, - source_id: impl Into, - shard_ids: &[ShardId], + source_id: SourceId, + shards: Vec, ) { - let key = (index_uid.index_id.clone(), source_id.into()); - if let Some(entry) = self.table.get_mut(&key) { - entry.delete_shards(index_uid, shard_ids); - } - } - - pub fn debug_info(&self) -> HashMap> { - let mut per_index_shards_json: HashMap> = HashMap::new(); - - for ((index_id, source_id), entry) in &self.table { - for (shards, is_local) in &[(&entry.local_shards, true), (&entry.remote_shards, false)] - { - let shards_json = shards.iter().map(|shard| { - json!({ - "index_uid": shard.index_uid, - "source_id": source_id, - "shard_id": shard.shard_id, - "shard_state": shard.shard_state.as_json_str_name(), - "is_local": is_local, - }) + let per_leader_count: HashMap = shards + .iter() + .map(|shard| { + let num_open_shards = shard.is_open() as usize; + let leader_id = NodeId::from(shard.leader_id.clone()); + (leader_id, num_open_shards) + }) + .into_grouping_map() + .sum(); + + let key = (index_uid.index_id.to_string(), source_id.clone()); + let entry = self.table.entry(key).or_default(); + + for (node_id, open_shard_count) in per_leader_count { + entry + .nodes + .entry(node_id.clone()) + .and_modify(|node| node.open_shard_count = open_shard_count) + .or_insert_with(|| IngesterNode { + node_id, + index_uid: index_uid.clone(), + source_id: source_id.clone(), + capacity_score: 5, + open_shard_count, }); - per_index_shards_json - .entry(index_id.clone()) - .or_default() - .extend(shards_json); - } } - per_index_shards_json - } - - #[cfg(test)] - pub fn len(&self) -> usize { - self.table.len() } } @@ 
-498,624 +251,315 @@ impl RoutingTable { mod tests { use quickwit_proto::ingest::ShardState; use quickwit_proto::ingest::ingester::IngesterServiceClient; + use quickwit_proto::types::ShardId; use super::*; + use crate::IngesterPoolEntry; + + fn mocked_ingester(availability_zone: Option<&str>) -> IngesterPoolEntry { + IngesterPoolEntry { + client: IngesterServiceClient::mocked(), + availability_zone: availability_zone.map(|s| s.to_string()), + } + } #[test] - fn test_routing_table_entry_new() { - let self_node_id: NodeId = "test-node-0".into(); - let index_uid = IndexUid::for_test("test-index", 0); - let source_id: SourceId = "test-source".into(); - let table_entry = RoutingTableEntry::new( - &self_node_id, - index_uid.clone(), - source_id.clone(), - Vec::new(), + fn test_apply_capacity_update() { + let mut table = RoutingTable::default(); + let key = ("test-index".to_string(), "test-source".into()); + + // Insert first node. + table.apply_capacity_update( + "node-1".into(), + IndexUid::for_test("test-index", 0), + "test-source".into(), + 8, + 3, ); - assert_eq!(table_entry.len(), 0); - - let index_uid: IndexUid = IndexUid::for_test("test-index", 0); - let shards = vec![ - Shard { - index_uid: Some(index_uid.clone()), - source_id: "test-source".to_string(), - shard_id: Some(ShardId::from(3)), - shard_state: ShardState::Open as i32, - leader_id: "test-node-0".to_string(), - ..Default::default() - }, - Shard { - index_uid: Some(index_uid.clone()), - source_id: "test-source".to_string(), - shard_id: Some(ShardId::from(1)), - shard_state: ShardState::Open as i32, - leader_id: "test-node-0".to_string(), - ..Default::default() - }, - Shard { - index_uid: Some(index_uid.clone()), - source_id: "test-source".to_string(), - shard_id: Some(ShardId::from(2)), - shard_state: ShardState::Open as i32, - leader_id: "test-node-1".to_string(), - ..Default::default() - }, - Shard { - index_uid: Some(index_uid.clone()), - source_id: "test-source".to_string(), - shard_id: 
Some(ShardId::from(1)), - shard_state: ShardState::Open as i32, - leader_id: "test-node-0".to_string(), - ..Default::default() - }, - Shard { - index_uid: Some(index_uid.clone()), - source_id: "test-source".to_string(), - shard_id: Some(ShardId::from(4)), - shard_state: ShardState::Closed as i32, - leader_id: "test-node-0".to_string(), - ..Default::default() - }, - ]; - let table_entry = RoutingTableEntry::new(&self_node_id, index_uid, source_id, shards); - assert_eq!(table_entry.local_shards.len(), 2); - assert_eq!(table_entry.local_shards[0].shard_id, ShardId::from(1)); - assert_eq!(table_entry.local_shards[1].shard_id, ShardId::from(3)); - - assert_eq!(table_entry.remote_shards.len(), 1); - assert_eq!(table_entry.remote_shards[0].shard_id, ShardId::from(2)); + let entry = table.table.get(&key).unwrap(); + assert_eq!(entry.nodes.len(), 1); + assert_eq!(entry.nodes.get("node-1").unwrap().capacity_score, 8); + + // Update existing node. + table.apply_capacity_update( + "node-1".into(), + IndexUid::for_test("test-index", 0), + "test-source".into(), + 4, + 5, + ); + let node = table.table.get(&key).unwrap().nodes.get("node-1").unwrap(); + assert_eq!(node.capacity_score, 4); + assert_eq!(node.open_shard_count, 5); + + // Add second node. + table.apply_capacity_update( + "node-2".into(), + IndexUid::for_test("test-index", 0), + "test-source".into(), + 6, + 2, + ); + assert_eq!(table.table.get(&key).unwrap().nodes.len(), 2); + + // Zero shards: node stays in table but becomes ineligible for routing. 
+ table.apply_capacity_update( + "node-1".into(), + IndexUid::for_test("test-index", 0), + "test-source".into(), + 0, + 0, + ); + let entry = table.table.get(&key).unwrap(); + assert_eq!(entry.nodes.len(), 2); + assert_eq!(entry.nodes.get("node-1").unwrap().open_shard_count, 0); + assert_eq!(entry.nodes.get("node-1").unwrap().capacity_score, 0); } #[test] - fn test_routing_table_entry_has_open_shards() { - let index_uid = IndexUid::for_test("test-index", 0); - let source_id: SourceId = "test-source".into(); - let table_entry = RoutingTableEntry::empty(index_uid.clone(), source_id.clone()); - - let mut closed_shard_ids = Vec::new(); - let ingester_pool = IngesterPool::default(); - let mut unavailable_leaders = HashSet::new(); - - assert!(!table_entry.has_open_shards( - &ingester_pool, - &mut closed_shard_ids, - &mut unavailable_leaders - )); - assert!(closed_shard_ids.is_empty()); - assert!(unavailable_leaders.is_empty()); - - ingester_pool.insert( - "test-ingester-0".into(), - crate::IngesterPoolEntry { - client: IngesterServiceClient::mocked(), - availability_zone: None, - }, + fn test_has_open_nodes() { + let mut table = RoutingTable::default(); + let pool = IngesterPool::default(); + + // Empty table. + assert!(!table.has_open_nodes("test-index", "test-source", &pool, &HashSet::new())); + + // Node exists but is not in pool. + table.apply_capacity_update( + "node-1".into(), + IndexUid::for_test("test-index", 0), + "test-source".into(), + 8, + 3, ); - ingester_pool.insert( - "test-ingester-1".into(), - crate::IngesterPoolEntry { - client: IngesterServiceClient::mocked(), - availability_zone: None, - }, + assert!(!table.has_open_nodes("test-index", "test-source", &pool, &HashSet::new())); + + // Node is in pool → true. + pool.insert("node-1".into(), mocked_ingester(None)); + assert!(table.has_open_nodes("test-index", "test-source", &pool, &HashSet::new())); + + // Node is unavailable → false. 
+ let unavailable: HashSet = HashSet::from(["node-1".into()]); + assert!(!table.has_open_nodes("test-index", "test-source", &pool, &unavailable)); + + // Second node available → true despite first being unavailable. + table.apply_capacity_update( + "node-2".into(), + IndexUid::for_test("test-index", 0), + "test-source".into(), + 6, + 2, ); - - let table_entry = RoutingTableEntry { - index_uid: index_uid.clone(), - source_id: source_id.clone(), - local_shards: vec![ - RoutingEntry { - index_uid: index_uid.clone(), - source_id: "test-source".to_string(), - shard_id: ShardId::from(1), - shard_state: ShardState::Closed, - leader_id: "test-ingester-0".into(), - }, - RoutingEntry { - index_uid: index_uid.clone(), - source_id: "test-source".to_string(), - shard_id: ShardId::from(2), - shard_state: ShardState::Open, - leader_id: "test-ingester-0".into(), - }, - ], - local_round_robin_idx: AtomicUsize::default(), - remote_shards: Vec::new(), - remote_round_robin_idx: AtomicUsize::default(), - }; - assert!(table_entry.has_open_shards( - &ingester_pool, - &mut closed_shard_ids, - &mut unavailable_leaders - )); - assert_eq!(closed_shard_ids.len(), 1); - assert_eq!(closed_shard_ids[0], ShardId::from(1)); - assert!(unavailable_leaders.is_empty()); - - closed_shard_ids.clear(); - - let table_entry = RoutingTableEntry { - index_uid: index_uid.clone(), - source_id, - local_shards: Vec::new(), - local_round_robin_idx: AtomicUsize::default(), - remote_shards: vec![ - RoutingEntry { - index_uid: index_uid.clone(), - source_id: "test-source".to_string(), - shard_id: ShardId::from(1), - shard_state: ShardState::Closed, - leader_id: "test-ingester-1".into(), - }, - RoutingEntry { - index_uid: index_uid.clone(), - source_id: "test-source".to_string(), - shard_id: ShardId::from(2), - shard_state: ShardState::Open, - leader_id: "test-ingester-2".into(), - }, - RoutingEntry { - index_uid: index_uid.clone(), - source_id: "test-source".to_string(), - shard_id: ShardId::from(3), - shard_state: 
ShardState::Open, - leader_id: "test-ingester-1".into(), - }, - ], - remote_round_robin_idx: AtomicUsize::default(), - }; - assert!(table_entry.has_open_shards( - &ingester_pool, - &mut closed_shard_ids, - &mut unavailable_leaders - )); - assert_eq!(closed_shard_ids.len(), 1); - assert_eq!(closed_shard_ids[0], ShardId::from(1)); - assert_eq!(unavailable_leaders.len(), 1); - assert!(unavailable_leaders.contains("test-ingester-2")); + pool.insert("node-2".into(), mocked_ingester(None)); + assert!(table.has_open_nodes("test-index", "test-source", &pool, &unavailable)); + + // Node with capacity_score=0 is not eligible. + table.apply_capacity_update( + "node-2".into(), + IndexUid::for_test("test-index", 0), + "test-source".into(), + 0, + 2, + ); + assert!(!table.has_open_nodes("test-index", "test-source", &pool, &unavailable)); } #[test] - fn test_routing_table_entry_next_open_shard_round_robin() { - let index_uid = IndexUid::for_test("test-index", 0); - let source_id: SourceId = "test-source".into(); - let table_entry = RoutingTableEntry::empty(index_uid.clone(), source_id.clone()); - let ingester_pool = IngesterPool::default(); - let mut rate_limited_shards = HashSet::new(); - - let error = table_entry - .next_open_shard_round_robin(&ingester_pool, &rate_limited_shards) - .unwrap_err(); - assert_eq!(error, NextOpenShardError::NoShardsAvailable); - - ingester_pool.insert( - "test-ingester-0".into(), - crate::IngesterPoolEntry { - client: IngesterServiceClient::mocked(), - availability_zone: None, - }, + fn test_pick_node_prefers_same_az() { + let mut table = RoutingTable::new(Some("az-1".to_string())); + let pool = IngesterPool::default(); + + table.apply_capacity_update( + "node-1".into(), + IndexUid::for_test("test-index", 0), + "test-source".into(), + 5, + 1, ); - ingester_pool.insert( - "test-ingester-1".into(), - crate::IngesterPoolEntry { - client: IngesterServiceClient::mocked(), - availability_zone: None, - }, + table.apply_capacity_update( + "node-2".into(), 
+ IndexUid::for_test("test-index", 0), + "test-source".into(), + 5, + 1, ); + pool.insert("node-1".into(), mocked_ingester(Some("az-1"))); + pool.insert("node-2".into(), mocked_ingester(Some("az-2"))); - let table_entry = RoutingTableEntry { - index_uid: index_uid.clone(), - source_id: source_id.clone(), - local_shards: vec![ - RoutingEntry { - index_uid: index_uid.clone(), - source_id: "test-source".to_string(), - shard_id: ShardId::from(1), - shard_state: ShardState::Closed, - leader_id: "test-ingester-0".into(), - }, - RoutingEntry { - index_uid: index_uid.clone(), - source_id: "test-source".to_string(), - shard_id: ShardId::from(2), - shard_state: ShardState::Open, - leader_id: "test-ingester-0".into(), - }, - RoutingEntry { - index_uid: index_uid.clone(), - source_id: "test-source".to_string(), - shard_id: ShardId::from(3), - shard_state: ShardState::Open, - leader_id: "test-ingester-0".into(), - }, - ], - local_round_robin_idx: AtomicUsize::default(), - remote_shards: Vec::new(), - remote_round_robin_idx: AtomicUsize::default(), - }; - let shard = table_entry - .next_open_shard_round_robin(&ingester_pool, &rate_limited_shards) - .unwrap(); - assert_eq!(shard.shard_id, ShardId::from(2)); - - let shard = table_entry - .next_open_shard_round_robin(&ingester_pool, &rate_limited_shards) - .unwrap(); - assert_eq!(shard.shard_id, ShardId::from(3)); - - let shard = table_entry - .next_open_shard_round_robin(&ingester_pool, &rate_limited_shards) - .unwrap(); - assert_eq!(shard.shard_id, ShardId::from(2)); - - let table_entry = RoutingTableEntry { - index_uid: index_uid.clone(), - source_id: source_id.clone(), - local_shards: vec![RoutingEntry { - index_uid: index_uid.clone(), - source_id: "test-source".to_string(), - shard_id: ShardId::from(1), - shard_state: ShardState::Closed, - leader_id: "test-ingester-0".into(), - }], - local_round_robin_idx: AtomicUsize::default(), - remote_shards: vec![ - RoutingEntry { - index_uid: index_uid.clone(), - source_id: 
"test-source".to_string(), - shard_id: ShardId::from(2), - shard_state: ShardState::Open, - leader_id: "test-ingester-1".into(), - }, - RoutingEntry { - index_uid: index_uid.clone(), - source_id: "test-source".to_string(), - shard_id: ShardId::from(3), - shard_state: ShardState::Closed, - leader_id: "test-ingester-1".into(), - }, - RoutingEntry { - index_uid: index_uid.clone(), - source_id: "test-source".to_string(), - shard_id: ShardId::from(4), - shard_state: ShardState::Open, - leader_id: "test-ingester-2".into(), - }, - RoutingEntry { - index_uid: index_uid.clone(), - source_id: "test-source".to_string(), - shard_id: ShardId::from(5), - shard_state: ShardState::Open, - leader_id: "test-ingester-1".into(), - }, - ], - remote_round_robin_idx: AtomicUsize::default(), - }; - let shard = table_entry - .next_open_shard_round_robin(&ingester_pool, &rate_limited_shards) - .unwrap(); - assert_eq!(shard.shard_id, ShardId::from(2)); - - let shard = table_entry - .next_open_shard_round_robin(&ingester_pool, &rate_limited_shards) - .unwrap(); - assert_eq!(shard.shard_id, ShardId::from(5)); - - let shard = table_entry - .next_open_shard_round_robin(&ingester_pool, &rate_limited_shards) - .unwrap(); - assert_eq!(shard.shard_id, ShardId::from(2)); - - rate_limited_shards.insert(ShardId::from(5)); - - let shard = table_entry - .next_open_shard_round_robin(&ingester_pool, &rate_limited_shards) + let picked = table + .pick_node("test-index", "test-source", &pool, &HashSet::new()) .unwrap(); - assert_eq!(shard.shard_id, ShardId::from(2)); + assert_eq!(picked.node_id, NodeId::from("node-1")); } #[test] - fn test_routing_table_entry_next_open_shard_round_robin_rate_limited_error() { - let index_uid = IndexUid::for_test("test-index", 0); - let source_id: SourceId = "test-source".into(); - - let ingester_pool = IngesterPool::default(); - ingester_pool.insert( - "test-ingester-0".into(), - crate::IngesterPoolEntry { - client: IngesterServiceClient::mocked(), - availability_zone: None, 
- }, + fn test_pick_node_falls_back_to_cross_az() { + let mut table = RoutingTable::new(Some("az-1".to_string())); + let pool = IngesterPool::default(); + + table.apply_capacity_update( + "node-2".into(), + IndexUid::for_test("test-index", 0), + "test-source".into(), + 5, + 1, ); + pool.insert("node-2".into(), mocked_ingester(Some("az-2"))); - let rate_limited_shards = HashSet::from_iter([ShardId::from(1)]); - - let table_entry = RoutingTableEntry { - index_uid: index_uid.clone(), - source_id: source_id.clone(), - local_shards: vec![RoutingEntry { - index_uid: index_uid.clone(), - source_id: "test-source".to_string(), - shard_id: ShardId::from(1), - shard_state: ShardState::Open, - leader_id: "test-ingester-0".into(), - }], - local_round_robin_idx: AtomicUsize::default(), - remote_shards: Vec::new(), - remote_round_robin_idx: AtomicUsize::default(), - }; - let error = table_entry - .next_open_shard_round_robin(&ingester_pool, &rate_limited_shards) - .unwrap_err(); - assert_eq!(error, NextOpenShardError::RateLimited); + let picked = table + .pick_node("test-index", "test-source", &pool, &HashSet::new()) + .unwrap(); + assert_eq!(picked.node_id, NodeId::from("node-2")); } #[test] - fn test_routing_table_entry_insert_open_shards() { - let index_uid_0 = IndexUid::for_test("test-index", 0); - let source_id: SourceId = "test-source".into(); - let mut table_entry = RoutingTableEntry::empty(index_uid_0.clone(), source_id.clone()); - - let local_node_id: NodeId = "test-ingester-0".into(); - let remote_node_id: NodeId = "test-ingester-1".into(); - table_entry.insert_open_shards(&local_node_id, &local_node_id, &index_uid_0, &[]); - - assert_eq!(table_entry.local_shards.len(), 0); - assert_eq!(table_entry.remote_shards.len(), 0); - - table_entry.insert_open_shards( - &local_node_id, - &local_node_id, - &index_uid_0, - &[ShardId::from(2)], + fn test_pick_node_no_az_awareness() { + let mut table = RoutingTable::default(); + let pool = IngesterPool::default(); + + 
table.apply_capacity_update( + "node-1".into(), + IndexUid::for_test("test-index", 0), + "test-source".into(), + 5, + 1, ); + pool.insert("node-1".into(), mocked_ingester(Some("az-1"))); - assert_eq!(table_entry.local_shards.len(), 1); - assert_eq!(table_entry.remote_shards.len(), 0); - - assert_eq!(table_entry.local_shards[0].index_uid, index_uid_0); - assert_eq!(table_entry.local_shards[0].source_id, source_id); - assert_eq!(table_entry.local_shards[0].shard_id, ShardId::from(2)); - assert_eq!(table_entry.local_shards[0].shard_state, ShardState::Open); - assert_eq!(table_entry.local_shards[0].leader_id, local_node_id); + let picked = table + .pick_node("test-index", "test-source", &pool, &HashSet::new()) + .unwrap(); + assert_eq!(picked.node_id, NodeId::from("node-1")); + } - table_entry.local_shards[0].shard_state = ShardState::Closed; - table_entry.insert_open_shards( - &local_node_id, - &local_node_id, - &index_uid_0, - &[ShardId::from(1), ShardId::from(2)], + #[test] + fn test_pick_node_missing_entry() { + let table = RoutingTable::new(Some("az-1".to_string())); + let pool = IngesterPool::default(); + + assert!( + table + .pick_node("nonexistent", "source", &pool, &HashSet::new()) + .is_none() ); + } - assert_eq!(table_entry.local_shards.len(), 2); - assert_eq!(table_entry.remote_shards.len(), 0); - - assert_eq!(table_entry.local_shards[0].shard_id, ShardId::from(1)); - assert_eq!(table_entry.local_shards[0].shard_state, ShardState::Open); - assert_eq!(table_entry.local_shards[1].shard_id, ShardId::from(2)); - assert_eq!(table_entry.local_shards[1].shard_state, ShardState::Closed); - - table_entry.local_shards.clear(); - table_entry.insert_open_shards( - &local_node_id, - &remote_node_id, - &index_uid_0, - &[ShardId::from(2)], - ); + #[test] + fn test_power_of_two_choices() { + // 3 candidates: best appears in the random pair 2/3 of the time and always + // wins when it does, so it should win ~67% of 1000 runs. 
Asserting > 550 + // is ~7.5 standard deviations from the mean — effectively impossible to flake. + let high = IngesterNode { + node_id: "high".into(), + index_uid: IndexUid::for_test("idx", 0), + source_id: "src".into(), + capacity_score: 9, + open_shard_count: 2, + }; + let mid = IngesterNode { + node_id: "mid".into(), + index_uid: IndexUid::for_test("idx", 0), + source_id: "src".into(), + capacity_score: 5, + open_shard_count: 2, + }; + let low = IngesterNode { + node_id: "low".into(), + index_uid: IndexUid::for_test("idx", 0), + source_id: "src".into(), + capacity_score: 1, + open_shard_count: 2, + }; + let candidates: Vec<&IngesterNode> = vec![&high, &mid, &low]; - assert_eq!(table_entry.local_shards.len(), 0); - assert_eq!(table_entry.remote_shards.len(), 1); + let mut high_wins = 0; + for _ in 0..1000 { + if power_of_two_choices(&candidates).node_id == "high" { + high_wins += 1; + } + } + assert!(high_wins > 550, "high won only {high_wins}/1000 times"); + } - assert_eq!(table_entry.remote_shards[0].index_uid, index_uid_0); - assert_eq!(table_entry.remote_shards[0].source_id, source_id); - assert_eq!(table_entry.remote_shards[0].shard_id, ShardId::from(2)); - assert_eq!(table_entry.remote_shards[0].shard_state, ShardState::Open); - assert_eq!(table_entry.remote_shards[0].leader_id, remote_node_id); + #[test] + fn test_merge_from_shards() { + let mut table = RoutingTable::default(); + let index_uid = IndexUid::for_test("test-index", 0); + let key = ("test-index".to_string(), "test-source".to_string()); + + let make_shard = |id: u64, leader: &str, open: bool| Shard { + index_uid: Some(index_uid.clone()), + source_id: "test-source".to_string(), + shard_id: Some(ShardId::from(id)), + shard_state: if open { + ShardState::Open as i32 + } else { + ShardState::Closed as i32 + }, + leader_id: leader.to_string(), + ..Default::default() + }; - table_entry.remote_shards[0].shard_state = ShardState::Closed; - table_entry.insert_open_shards( - &local_node_id, - 
&remote_node_id, - &index_uid_0, - &[ShardId::from(1), ShardId::from(2)], - ); + // Two open shards on node-1, one open + one closed on node-2, only closed on node-3. + let shards = vec![ + make_shard(1, "node-1", true), + make_shard(2, "node-1", true), + make_shard(3, "node-2", true), + make_shard(4, "node-2", false), + make_shard(5, "node-3", false), + ]; + table.merge_from_shards(index_uid.clone(), "test-source".into(), shards); - assert_eq!(table_entry.local_shards.len(), 0); - assert_eq!(table_entry.remote_shards.len(), 2); + let entry = table.table.get(&key).unwrap(); + assert_eq!(entry.nodes.len(), 3); - assert_eq!(table_entry.remote_shards[0].shard_id, ShardId::from(1)); - assert_eq!(table_entry.remote_shards[0].shard_state, ShardState::Open); - assert_eq!(table_entry.remote_shards[1].shard_id, ShardId::from(2)); - assert_eq!(table_entry.remote_shards[1].shard_state, ShardState::Closed); + let n1 = entry.nodes.get("node-1").unwrap(); + assert_eq!(n1.open_shard_count, 2); + assert_eq!(n1.capacity_score, 5); - // Update index incarnation. - let index_uid_1 = IndexUid::for_test("test-index", 1); - table_entry.insert_open_shards( - &local_node_id, - &local_node_id, - &index_uid_1, - &[ShardId::from(1)], - ); + let n2 = entry.nodes.get("node-2").unwrap(); + assert_eq!(n2.open_shard_count, 1); - assert_eq!(table_entry.index_uid, index_uid_1); - assert_eq!(table_entry.local_shards.len(), 1); - assert_eq!(table_entry.remote_shards.len(), 0); + let n3 = entry.nodes.get("node-3").unwrap(); + assert_eq!(n3.open_shard_count, 0); - assert_eq!(table_entry.local_shards[0].index_uid, index_uid_1); - assert_eq!(table_entry.local_shards[0].source_id, source_id); - assert_eq!(table_entry.local_shards[0].shard_id, ShardId::from(1)); - assert_eq!(table_entry.local_shards[0].shard_state, ShardState::Open); - assert_eq!(table_entry.local_shards[0].leader_id, local_node_id); + // Merging again adds new nodes but preserves existing ones. 
+ let shards = vec![make_shard(10, "node-4", true)]; + table.merge_from_shards(index_uid, "test-source".into(), shards); - // Ignore previous index incarnation. - table_entry.insert_open_shards( - &local_node_id, - &local_node_id, - &index_uid_0, - &[ShardId::from(12), ShardId::from(42), ShardId::from(1337)], - ); - assert_eq!(table_entry.index_uid, index_uid_1); - assert_eq!(table_entry.local_shards.len(), 1); - assert_eq!(table_entry.remote_shards.len(), 0); + let entry = table.table.get(&key).unwrap(); + assert_eq!(entry.nodes.len(), 4); + assert!(entry.nodes.contains_key("node-1")); + assert!(entry.nodes.contains_key("node-2")); + assert!(entry.nodes.contains_key("node-3")); + assert!(entry.nodes.contains_key("node-4")); } #[test] - fn test_routing_table_entry_close_shards() { - let index_uid = IndexUid::for_test("test-index", 0); - let source_id: SourceId = "test-source".into(); - - let mut table_entry = RoutingTableEntry::empty(index_uid.clone(), source_id.clone()); - table_entry.close_shards(&index_uid, &[]); - table_entry.close_shards(&index_uid, &[ShardId::from(1)]); - assert!(table_entry.local_shards.is_empty()); - assert!(table_entry.remote_shards.is_empty()); - - let mut table_entry = RoutingTableEntry { - index_uid: index_uid.clone(), - source_id: source_id.clone(), - local_shards: vec![ - RoutingEntry { - index_uid: index_uid.clone(), - source_id: "test-source".to_string(), - shard_id: ShardId::from(1), - shard_state: ShardState::Open, - leader_id: "test-ingester-0".into(), - }, - RoutingEntry { - index_uid: index_uid.clone(), - source_id: "test-source".to_string(), - shard_id: ShardId::from(2), - shard_state: ShardState::Open, - leader_id: "test-ingester-0".into(), - }, - RoutingEntry { - index_uid: index_uid.clone(), - source_id: "test-source".to_string(), - shard_id: ShardId::from(3), - shard_state: ShardState::Open, - leader_id: "test-ingester-0".into(), - }, - ], - local_round_robin_idx: AtomicUsize::default(), - remote_shards: vec![ - 
RoutingEntry { - index_uid: index_uid.clone(), - source_id: "test-source".to_string(), - shard_id: ShardId::from(5), - shard_state: ShardState::Open, - leader_id: "test-ingester-1".into(), - }, - RoutingEntry { - index_uid: index_uid.clone(), - source_id: "test-source".to_string(), - shard_id: ShardId::from(6), - shard_state: ShardState::Open, - leader_id: "test-ingester-1".into(), - }, - RoutingEntry { - index_uid: index_uid.clone(), - source_id: "test-source".to_string(), - shard_id: ShardId::from(7), - shard_state: ShardState::Open, - leader_id: "test-ingester-1".into(), - }, - ], - remote_round_robin_idx: AtomicUsize::default(), - }; - table_entry.close_shards( - &index_uid, - &[ - ShardId::from(1), - ShardId::from(3), - ShardId::from(4), - ShardId::from(6), - ShardId::from(8), - ], + fn test_classify_az_locality() { + let table = RoutingTable::new(Some("az-1".to_string())); + let pool = IngesterPool::default(); + pool.insert("node-local".into(), mocked_ingester(Some("az-1"))); + pool.insert("node-remote".into(), mocked_ingester(Some("az-2"))); + pool.insert("node-no-az".into(), mocked_ingester(None)); + + assert_eq!( + table.classify_az_locality(&"node-local".into(), &pool), + "same_az" + ); + assert_eq!( + table.classify_az_locality(&"node-remote".into(), &pool), + "cross_az" + ); + assert_eq!( + table.classify_az_locality(&"node-no-az".into(), &pool), + "az_unaware" ); - assert!(table_entry.local_shards[0].shard_state.is_closed()); - assert!(table_entry.local_shards[1].shard_state.is_open()); - assert!(table_entry.local_shards[2].shard_state.is_closed()); - assert!(table_entry.remote_shards[0].shard_state.is_open()); - assert!(table_entry.remote_shards[1].shard_state.is_closed()); - assert!(table_entry.remote_shards[2].shard_state.is_open()); - } - - #[test] - fn test_routing_table_entry_delete_shards() { - let index_uid = IndexUid::for_test("test-index", 0); - let source_id: SourceId = "test-source".into(); - - let mut table_entry = 
RoutingTableEntry::empty(index_uid.clone(), source_id.clone()); - table_entry.delete_shards(&index_uid, &[]); - table_entry.delete_shards(&index_uid, &[ShardId::from(1)]); - assert!(table_entry.local_shards.is_empty()); - assert!(table_entry.remote_shards.is_empty()); - let mut table_entry = RoutingTableEntry { - index_uid: index_uid.clone(), - source_id: source_id.clone(), - local_shards: vec![ - RoutingEntry { - index_uid: index_uid.clone(), - source_id: "test-source".to_string(), - shard_id: ShardId::from(1), - shard_state: ShardState::Open, - leader_id: "test-ingester-0".into(), - }, - RoutingEntry { - index_uid: index_uid.clone(), - source_id: "test-source".to_string(), - shard_id: ShardId::from(2), - shard_state: ShardState::Open, - leader_id: "test-ingester-0".into(), - }, - RoutingEntry { - index_uid: index_uid.clone(), - source_id: "test-source".to_string(), - shard_id: ShardId::from(3), - shard_state: ShardState::Open, - leader_id: "test-ingester-0".into(), - }, - ], - local_round_robin_idx: AtomicUsize::default(), - remote_shards: vec![ - RoutingEntry { - index_uid: index_uid.clone(), - source_id: "test-source".to_string(), - shard_id: ShardId::from(5), - shard_state: ShardState::Open, - leader_id: "test-ingester-1".into(), - }, - RoutingEntry { - index_uid: index_uid.clone(), - source_id: "test-source".to_string(), - shard_id: ShardId::from(6), - shard_state: ShardState::Open, - leader_id: "test-ingester-1".into(), - }, - RoutingEntry { - index_uid: index_uid.clone(), - source_id: "test-source".to_string(), - shard_id: ShardId::from(7), - shard_state: ShardState::Open, - leader_id: "test-ingester-1".into(), - }, - ], - remote_round_robin_idx: AtomicUsize::default(), - }; - table_entry.delete_shards( - &index_uid, - &[ - ShardId::from(1), - ShardId::from(3), - ShardId::from(4), - ShardId::from(6), - ShardId::from(8), - ], + let table_no_az = RoutingTable::default(); + assert_eq!( + table_no_az.classify_az_locality(&"node-local".into(), &pool), + 
"az_unaware" ); - assert_eq!(table_entry.local_shards.len(), 1); - assert_eq!(table_entry.local_shards[0].shard_id, ShardId::from(2)); - assert_eq!(table_entry.remote_shards.len(), 2); - assert_eq!(table_entry.remote_shards[0].shard_id, ShardId::from(5)); - assert_eq!(table_entry.remote_shards[1].shard_id, ShardId::from(7)); } } diff --git a/quickwit/quickwit-ingest/src/ingest_v2/state.rs b/quickwit/quickwit-ingest/src/ingest_v2/state.rs index e158bce7c58..b77de8d608a 100644 --- a/quickwit/quickwit-ingest/src/ingest_v2/state.rs +++ b/quickwit/quickwit-ingest/src/ingest_v2/state.rs @@ -35,7 +35,7 @@ use tracing::{error, info}; use super::models::IngesterShard; use super::rate_meter::RateMeter; use super::replication::{ReplicationStreamTaskHandle, ReplicationTaskHandle}; -use super::wal_capacity_timeseries::WalDiskCapacityTimeSeries; +use super::wal_capacity_tracker::WalCapacityTracker; use crate::ingest_v2::mrecordlog_utils::{force_delete_queue, queue_position_range}; use crate::mrecordlog_async::MultiRecordLogAsync; use crate::{FollowerId, LeaderId, OpenShardCounts}; @@ -61,7 +61,7 @@ pub(super) struct InnerIngesterState { pub replication_streams: HashMap, // Replication tasks running for each replication stream opened with leaders. 
pub replication_tasks: HashMap, - pub wal_capacity_time_series: WalDiskCapacityTimeSeries, + pub wal_capacity_tracker: WalCapacityTracker, status: IngesterStatus, status_tx: watch::Sender, } @@ -130,7 +130,7 @@ impl InnerIngesterState { } impl IngesterState { - fn new(disk_capacity: ByteSize) -> Self { + fn new(disk_capacity: ByteSize, memory_capacity: ByteSize) -> Self { let status = IngesterStatus::Initializing; let (status_tx, status_rx) = watch::channel(status); let inner = InnerIngesterState { @@ -138,7 +138,7 @@ impl IngesterState { doc_mappers: Default::default(), replication_streams: Default::default(), replication_tasks: Default::default(), - wal_capacity_time_series: WalDiskCapacityTimeSeries::new(disk_capacity), + wal_capacity_tracker: WalCapacityTracker::new(disk_capacity, memory_capacity), status, status_tx, }; @@ -155,9 +155,10 @@ impl IngesterState { pub fn load( wal_dir_path: &Path, disk_capacity: ByteSize, + memory_capacity: ByteSize, rate_limiter_settings: RateLimiterSettings, ) -> Self { - let state = Self::new(disk_capacity); + let state = Self::new(disk_capacity, memory_capacity); let state_clone = state.clone(); let wal_dir_path = wal_dir_path.to_path_buf(); @@ -180,6 +181,7 @@ impl IngesterState { let mut state = IngesterState::load( temp_dir.path(), disk_capacity, + ByteSize::mb(256), RateLimiterSettings::default(), ); @@ -530,7 +532,7 @@ mod tests { #[tokio::test] async fn test_ingester_state_does_not_lock_while_initializing() { - let state = IngesterState::new(ByteSize::mb(256)); + let state = IngesterState::new(ByteSize::mb(256), ByteSize::mb(256)); let inner_guard = state.inner.lock().await; assert_eq!(inner_guard.status(), IngesterStatus::Initializing); @@ -545,7 +547,7 @@ mod tests { #[tokio::test] async fn test_ingester_state_failed() { - let state = IngesterState::new(ByteSize::mb(256)); + let state = IngesterState::new(ByteSize::mb(256), ByteSize::mb(256)); state.inner.lock().await.set_status(IngesterStatus::Failed); @@ -558,7 
+560,7 @@ mod tests { #[tokio::test] async fn test_ingester_state_init() { - let mut state = IngesterState::new(ByteSize::mb(256)); + let mut state = IngesterState::new(ByteSize::mb(256), ByteSize::mb(256)); let temp_dir = tempfile::tempdir().unwrap(); state diff --git a/quickwit/quickwit-ingest/src/ingest_v2/wal_capacity_timeseries.rs b/quickwit/quickwit-ingest/src/ingest_v2/wal_capacity_tracker.rs similarity index 76% rename from quickwit/quickwit-ingest/src/ingest_v2/wal_capacity_timeseries.rs rename to quickwit/quickwit-ingest/src/ingest_v2/wal_capacity_tracker.rs index 58f030cbf74..f24e8254053 100644 --- a/quickwit/quickwit-ingest/src/ingest_v2/wal_capacity_timeseries.rs +++ b/quickwit/quickwit-ingest/src/ingest_v2/wal_capacity_tracker.rs @@ -25,38 +25,36 @@ const WAL_CAPACITY_LOOKBACK_WINDOW_LEN: usize = 6; /// reading would be discarded when the next reading is inserted. const WAL_CAPACITY_READINGS_LEN: usize = WAL_CAPACITY_LOOKBACK_WINDOW_LEN + 1; -pub struct WalDiskCapacityTimeSeries { - disk_capacity: ByteSize, +struct WalCapacityTimeSeries { + capacity: ByteSize, readings: RingBuffer, } -impl WalDiskCapacityTimeSeries { - pub fn new(disk_capacity: ByteSize) -> Self { +impl WalCapacityTimeSeries { + fn new(capacity: ByteSize) -> Self { #[cfg(not(test))] - assert!(disk_capacity.as_u64() > 0); + assert!(capacity.as_u64() > 0); Self { - disk_capacity, + capacity, readings: RingBuffer::default(), } } - /// Records a disk usage reading and returns the resulting capacity score. - pub fn record_and_score(&mut self, disk_used: ByteSize) -> usize { - self.record(disk_used); + fn record_and_score(&mut self, used: ByteSize) -> usize { + self.record(used); let remaining = self.current().unwrap_or(1.0); let delta = self.delta().unwrap_or(0.0); compute_capacity_score(remaining, delta) } - /// Computes a capacity score for the given disk usage without recording it. 
- pub fn score(&self, disk_used: ByteSize) -> usize { - let remaining = 1.0 - (disk_used.as_u64() as f64 / self.disk_capacity.as_u64() as f64); + fn score(&self, used: ByteSize) -> usize { + let remaining = 1.0 - (used.as_u64() as f64 / self.capacity.as_u64() as f64); let delta = self.delta().unwrap_or(0.0); compute_capacity_score(remaining, delta) } - fn record(&mut self, disk_used: ByteSize) { - let remaining = 1.0 - (disk_used.as_u64() as f64 / self.disk_capacity.as_u64() as f64); + fn record(&mut self, used: ByteSize) { + let remaining = 1.0 - (used.as_u64() as f64 / self.capacity.as_u64() as f64); self.readings.push_back(remaining.clamp(0.0, 1.0)); } @@ -64,8 +62,6 @@ impl WalDiskCapacityTimeSeries { self.readings.last() } - /// How much remaining capacity changed between the oldest and newest readings. - /// Positive = improving, negative = draining. fn delta(&self) -> Option { let current = self.readings.last()?; let oldest = self.readings.front()?; @@ -73,6 +69,35 @@ impl WalDiskCapacityTimeSeries { } } +pub struct WalCapacityTracker { + disk: WalCapacityTimeSeries, + memory: WalCapacityTimeSeries, +} + +impl WalCapacityTracker { + pub fn new(disk_capacity: ByteSize, memory_capacity: ByteSize) -> Self { + Self { + disk: WalCapacityTimeSeries::new(disk_capacity), + memory: WalCapacityTimeSeries::new(memory_capacity), + } + } + + /// Records disk and memory usage readings and returns the resulting capacity score. + /// The score is the minimum of the individual disk and memory scores. + pub fn record_and_score(&mut self, disk_used: ByteSize, memory_used: ByteSize) -> usize { + let disk_score = self.disk.record_and_score(disk_used); + let memory_score = self.memory.record_and_score(memory_used); + disk_score.min(memory_score) + } + + /// Computes a capacity score for the given usage without recording it. 
+ pub fn score(&self, disk_used: ByteSize, memory_used: ByteSize) -> usize { + let disk_score = self.disk.score(disk_used); + let memory_score = self.memory.score(memory_used); + disk_score.min(memory_score) + } +} + /// Computes a capacity score from 0 to 10 using a PD controller. /// /// The score has two components: @@ -115,18 +140,18 @@ fn compute_capacity_score(remaining_capacity: f64, capacity_delta: f64) -> usize mod tests { use super::*; - fn ts() -> WalDiskCapacityTimeSeries { - WalDiskCapacityTimeSeries::new(ByteSize::b(100)) + fn ts() -> WalCapacityTimeSeries { + WalCapacityTimeSeries::new(ByteSize::b(100)) } /// Helper: record a reading with `used` bytes against the series' fixed capacity. - fn record(series: &mut WalDiskCapacityTimeSeries, used: u64) { + fn record(series: &mut WalCapacityTimeSeries, used: u64) { series.record(ByteSize::b(used)); } #[test] fn test_wal_disk_capacity_current_after_record() { - let mut series = WalDiskCapacityTimeSeries::new(ByteSize::b(256)); + let mut series = WalCapacityTimeSeries::new(ByteSize::b(256)); // 192 of 256 used => 25% remaining series.record(ByteSize::b(192)); assert_eq!(series.current(), Some(0.25)); @@ -211,4 +236,14 @@ mod tests { record(&mut series, 0); assert_eq!(series.delta(), Some(0.50)); } + + #[test] + fn test_wal_capacity_tracker_returns_min() { + let mut tracker = WalCapacityTracker::new(ByteSize::b(100), ByteSize::b(100)); + // Disk 10% used (score 9), memory 90% used (score 2) → returns 2. 
+ assert_eq!( + tracker.record_and_score(ByteSize::b(10), ByteSize::b(90)), + 2 + ); + } } From 056113e2a0f22414e3aa52f77a453edb7a7189f0 Mon Sep 17 00:00:00 2001 From: Nadav Gov-Ari Date: Fri, 13 Mar 2026 15:51:21 -0400 Subject: [PATCH 8/9] PR comments --- .github/workflows/ci.yml | 1 - quickwit/quickwit-control-plane/Cargo.toml | 1 + .../src/control_plane.rs | 15 +-- .../src/ingest/ingest_controller.rs | 66 ++++++------- .../src/source/ingest/mod.rs | 22 ++--- .../src/ingest_v2/broadcast/capacity_score.rs | 3 +- .../quickwit-ingest/src/ingest_v2/fetch.rs | 22 ++--- .../quickwit-ingest/src/ingest_v2/ingester.rs | 92 +++++++++++++++++++ quickwit/quickwit-ingest/src/ingest_v2/mod.rs | 20 ++++ .../quickwit-ingest/src/ingest_v2/router.rs | 18 +--- .../protos/quickwit/ingester.proto | 2 + quickwit/quickwit-serve/src/lib.rs | 2 + 12 files changed, 171 insertions(+), 93 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 502950b5399..85093c255f6 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -56,7 +56,6 @@ jobs: run: | df -h - if [ "$(df -BG / | awk 'NR==2 {gsub("G","",$4); print $4}')" -lt 30 ]; then echo "Less than 30GiB available. Running cleanup..." 
sudo rm -rf /usr/share/dotnet diff --git a/quickwit/quickwit-control-plane/Cargo.toml b/quickwit/quickwit-control-plane/Cargo.toml index 2957c9858c4..abaf672e908 100644 --- a/quickwit/quickwit-control-plane/Cargo.toml +++ b/quickwit/quickwit-control-plane/Cargo.toml @@ -47,6 +47,7 @@ quickwit-cluster = { workspace = true, features = ["testsuite"] } quickwit-common = { workspace = true, features = ["testsuite"] } quickwit-config = { workspace = true, features = ["testsuite"] } quickwit-indexing = { workspace = true } +quickwit-ingest = { workspace = true, features = ["testsuite"] } quickwit-metastore = { workspace = true, features = ["testsuite"] } quickwit-proto = { workspace = true, features = ["testsuite"] } diff --git a/quickwit/quickwit-control-plane/src/control_plane.rs b/quickwit/quickwit-control-plane/src/control_plane.rs index 3603a2b3b7d..4e08b3a6b44 100644 --- a/quickwit/quickwit-control-plane/src/control_plane.rs +++ b/quickwit/quickwit-control-plane/src/control_plane.rs @@ -1216,13 +1216,6 @@ mod tests { use super::*; use crate::IndexerNodeInfo; - fn ingester_pool_entry(client: IngesterServiceClient) -> IngesterPoolEntry { - IngesterPoolEntry { - client, - status: IngesterStatus::Ready, - availability_zone: None, - } - } #[tokio::test] async fn test_control_plane_create_index() { let universe = Universe::with_accelerated_time(); @@ -2230,7 +2223,7 @@ mod tests { assert!(&retain_shards_for_source.shard_ids.is_empty()); Ok(RetainShardsResponse {}) }); - let ingester = ingester_pool_entry(IngesterServiceClient::from_mock(mock_ingester)); + let ingester = IngesterPoolEntry::ready_with_client(IngesterServiceClient::from_mock(mock_ingester)); ingester_pool.insert("node1".into(), ingester); let cluster_config = ClusterConfig::for_test(); @@ -2276,7 +2269,7 @@ mod tests { ); Ok(RetainShardsResponse {}) }); - let ingester = ingester_pool_entry(IngesterServiceClient::from_mock(mock_ingester)); + let ingester = 
IngesterPoolEntry::ready_with_client(IngesterServiceClient::from_mock(mock_ingester)); ingester_pool.insert("node1".into(), ingester); let mut index_0 = IndexMetadata::for_test("test-index-0", "ram:///test-index-0"); @@ -2651,7 +2644,7 @@ mod tests { }; Ok(response) }); - let ingester = ingester_pool_entry(IngesterServiceClient::from_mock(mock_ingester)); + let ingester = IngesterPoolEntry::ready_with_client(IngesterServiceClient::from_mock(mock_ingester)); ingester_pool.insert(ingester_id, ingester); let mut mock_metastore = MockMetastoreService::new(); @@ -2805,7 +2798,7 @@ mod tests { }; Ok(response) }); - let ingester = ingester_pool_entry(IngesterServiceClient::from_mock(mock_ingester)); + let ingester = IngesterPoolEntry::ready_with_client(IngesterServiceClient::from_mock(mock_ingester)); ingester_pool.insert(ingester_id, ingester); let mut mock_metastore = MockMetastoreService::new(); diff --git a/quickwit/quickwit-control-plane/src/ingest/ingest_controller.rs b/quickwit/quickwit-control-plane/src/ingest/ingest_controller.rs index 85def904c0a..94c2e23c87f 100644 --- a/quickwit/quickwit-control-plane/src/ingest/ingest_controller.rs +++ b/quickwit/quickwit-control-plane/src/ingest/ingest_controller.rs @@ -325,17 +325,17 @@ impl IngestController { /// Syncs the ingester in a fire and forget manner. /// /// The returned oneshot is just here for unit test to wait for the operation to terminate. 
- fn sync_with_ingester(&self, ingester_node: &NodeId, model: &ControlPlaneModel) -> WaitHandle { - info!(ingester = %ingester_node, "sync_with_ingester"); + fn sync_with_ingester(&self, ingester_id: &NodeId, model: &ControlPlaneModel) -> WaitHandle { + info!(ingester = %ingester_id, "sync_with_ingester"); let (wait_drop_guard, wait_handle) = WaitHandle::new(); - let Some(ingester) = self.ingester_pool.get(ingester_node) else { + let Some(ingester) = self.ingester_pool.get(ingester_id) else { // TODO: (Maybe) We should mark the ingester as unavailable, and stop advertise its // shard to routers. - warn!("failed to sync with ingester `{ingester_node}`: not available"); + warn!("failed to sync with ingester `{ingester_id}`: not available"); return wait_handle; }; let mut retain_shards_req = RetainShardsRequest::default(); - for (source_uid, shard_ids) in &*model.list_shards_for_node(ingester_node) { + for (source_uid, shard_ids) in &*model.list_shards_for_node(ingester_id) { let shards_for_source = RetainShardsForSource { index_uid: Some(source_uid.index_uid.clone()), source_id: source_uid.source_id.clone(), @@ -345,8 +345,8 @@ impl IngestController { .retain_shards_for_sources .push(shards_for_source); } - info!(ingester = %ingester_node, "retain shards ingester"); - let operation: String = format!("retain shards `{ingester_node}`"); + info!(%ingester_id, "retain shards ingester"); + let operation: String = format!("retain shards `{ingester_id}`"); fire_and_forget( async move { if let Err(retain_shards_err) = @@ -1341,14 +1341,6 @@ mod tests { use super::*; - fn ingester_pool_entry(client: IngesterServiceClient) -> IngesterPoolEntry { - IngesterPoolEntry { - client, - status: IngesterStatus::Ready, - availability_zone: None, - } - } - const TEST_SHARD_THROUGHPUT_LIMIT_MIB: f32 = DEFAULT_SHARD_THROUGHPUT_LIMIT.as_u64() as f32 / quickwit_common::shared_consts::MIB as f32; @@ -1408,7 +1400,7 @@ mod tests { let ingester_pool = IngesterPool::default(); 
ingester_pool.insert( NodeId::from("test-ingester-1"), - ingester_pool_entry(client.clone()), + IngesterPoolEntry::ready_with_client(client.clone()), ); let mut mock_ingester = MockIngesterService::new(); @@ -1439,7 +1431,7 @@ mod tests { let ingester = IngesterServiceClient::from_mock(mock_ingester); ingester_pool.insert( NodeId::from("test-ingester-2"), - ingester_pool_entry(ingester.clone()), + IngesterPoolEntry::ready_with_client(ingester.clone()), ); let replication_factor = 2; @@ -1628,7 +1620,7 @@ mod tests { let ingester_pool = IngesterPool::default(); ingester_pool.insert( NodeId::from("test-ingester-1"), - ingester_pool_entry(client.clone()), + IngesterPoolEntry::ready_with_client(client.clone()), ); let replication_factor = 1; @@ -1742,7 +1734,7 @@ mod tests { ingester_pool.insert( NodeId::from("test-ingester-1"), - ingester_pool_entry(IngesterServiceClient::mocked()), + IngesterPoolEntry::ready_with_client(IngesterServiceClient::mocked()), ); let leader_follower_pairs_opt = @@ -1754,7 +1746,7 @@ mod tests { ingester_pool.insert( "test-ingester-2".into(), - ingester_pool_entry(IngesterServiceClient::mocked()), + IngesterPoolEntry::ready_with_client(IngesterServiceClient::mocked()), ); let leader_follower_pairs = controller @@ -1876,7 +1868,7 @@ mod tests { ingester_pool.insert( "test-ingester-3".into(), - ingester_pool_entry(IngesterServiceClient::mocked()), + IngesterPoolEntry::ready_with_client(IngesterServiceClient::mocked()), ); let unavailable_leaders = FnvHashSet::from_iter([NodeId::from("test-ingester-2")]); let leader_follower_pairs = controller @@ -1970,7 +1962,7 @@ mod tests { Ok(response) }); let ingester_0 = IngesterServiceClient::from_mock(mock_ingester_0); - ingester_pool.insert(ingester_id_0, ingester_pool_entry(ingester_0)); + ingester_pool.insert(ingester_id_0, IngesterPoolEntry::ready_with_client(ingester_0)); let ingester_id_1 = NodeId::from("test-ingester-1"); let mut mock_ingester_1 = MockIngesterService::new(); @@ -1991,7 +1983,7 @@ 
mod tests { Err(IngestV2Error::Internal("internal error".to_string())) }); - let ingester_1 = ingester_pool_entry(IngesterServiceClient::from_mock(mock_ingester_1)); + let ingester_1 = IngesterPoolEntry::ready_with_client(IngesterServiceClient::from_mock(mock_ingester_1)); ingester_pool.insert(ingester_id_1, ingester_1); let ingester_id_2 = NodeId::from("test-ingester-2"); @@ -2001,7 +1993,7 @@ mod tests { let client_2 = IngesterServiceClient::tower() .stack_init_shards_layer(DelayLayer::new(INIT_SHARDS_REQUEST_TIMEOUT * 2)) .build_from_mock(mock_ingester_2); - ingester_pool.insert(ingester_id_2, ingester_pool_entry(client_2)); + ingester_pool.insert(ingester_id_2, IngesterPoolEntry::ready_with_client(client_2)); let init_shards_response = controller .init_shards(Vec::new(), &Progress::default()) @@ -2209,7 +2201,7 @@ mod tests { ingester_pool.insert( NodeId::from("test-ingester-1"), - ingester_pool_entry(IngesterServiceClient::from_mock(mock_ingester)), + IngesterPoolEntry::ready_with_client(IngesterServiceClient::from_mock(mock_ingester)), ); let source_uids: HashMap = HashMap::from_iter([(source_uid.clone(), 1)]); let unavailable_leaders = FnvHashSet::default(); @@ -2379,7 +2371,7 @@ mod tests { "failed to close shards".to_string(), )) }); - let ingester = ingester_pool_entry(IngesterServiceClient::from_mock(mock_ingester)); + let ingester = IngesterPoolEntry::ready_with_client(IngesterServiceClient::from_mock(mock_ingester)); ingester_pool.insert("test-ingester".into(), ingester); let shard_infos = BTreeSet::from_iter([ @@ -2532,7 +2524,7 @@ mod tests { }, ); - let ingester = ingester_pool_entry(IngesterServiceClient::from_mock(mock_ingester)); + let ingester = IngesterPoolEntry::ready_with_client(IngesterServiceClient::from_mock(mock_ingester)); ingester_pool.insert("test-ingester".into(), ingester); let shard_infos = BTreeSet::from_iter([ShardInfo { @@ -2678,7 +2670,7 @@ mod tests { }; Ok(response) }); - let ingester = 
ingester_pool_entry(IngesterServiceClient::from_mock(mock_ingester)); + let ingester = IngesterPoolEntry::ready_with_client(IngesterServiceClient::from_mock(mock_ingester)); ingester_pool.insert("test-ingester".into(), ingester); // Test failed to open shards. @@ -2800,7 +2792,7 @@ mod tests { }; Ok(response) }); - let ingester = ingester_pool_entry(IngesterServiceClient::from_mock(mock_ingester)); + let ingester = IngesterPoolEntry::ready_with_client(IngesterServiceClient::from_mock(mock_ingester)); ingester_pool.insert("test-ingester".into(), ingester); // Test failed to close shard. @@ -3032,15 +3024,15 @@ mod tests { }); ingester_pool.insert( "node-1".into(), - ingester_pool_entry(IngesterServiceClient::from_mock(mock_ingester_1)), + IngesterPoolEntry::ready_with_client(IngesterServiceClient::from_mock(mock_ingester_1)), ); ingester_pool.insert( "node-2".into(), - ingester_pool_entry(IngesterServiceClient::from_mock(mock_ingester_2)), + IngesterPoolEntry::ready_with_client(IngesterServiceClient::from_mock(mock_ingester_2)), ); ingester_pool.insert( "node-3".into(), - ingester_pool_entry(IngesterServiceClient::from_mock(mock_ingester_3)), + IngesterPoolEntry::ready_with_client(IngesterServiceClient::from_mock(mock_ingester_3)), ); let node_id = "node-1".into(); let wait_handle = controller.sync_with_ingester(&node_id, &model); @@ -3170,7 +3162,7 @@ mod tests { Ok(response) }); let ingester_0 = IngesterServiceClient::from_mock(mock_ingester_0); - ingester_pool.insert(ingester_id_0.clone(), ingester_pool_entry(ingester_0)); + ingester_pool.insert(ingester_id_0.clone(), IngesterPoolEntry::ready_with_client(ingester_0)); let ingester_id_1 = NodeId::from("test-ingester-1"); let mut mock_ingester_1 = MockIngesterService::new(); @@ -3188,7 +3180,7 @@ mod tests { Err(IngestV2Error::Internal("internal error".to_string())) }); let ingester_1 = IngesterServiceClient::from_mock(mock_ingester_1); - ingester_pool.insert(ingester_id_1.clone(), ingester_pool_entry(ingester_1)); 
+ ingester_pool.insert(ingester_id_1.clone(), IngesterPoolEntry::ready_with_client(ingester_1)); let ingester_id_2 = NodeId::from("test-ingester-2"); let mut mock_ingester_2 = MockIngesterService::new(); @@ -3197,7 +3189,7 @@ mod tests { let client_2 = IngesterServiceClient::tower() .stack_close_shards_layer(DelayLayer::new(CLOSE_SHARDS_REQUEST_TIMEOUT * 2)) .build_from_mock(mock_ingester_2); - ingester_pool.insert(ingester_id_2.clone(), ingester_pool_entry(client_2)); + ingester_pool.insert(ingester_id_2.clone(), IngesterPoolEntry::ready_with_client(client_2)); // In this test: // - ingester 0 will close shard 0 successfully and fail to close shard 1; @@ -3377,7 +3369,7 @@ mod tests { Ok(response) }); let ingester_0 = IngesterServiceClient::from_mock(mock_ingester_0); - ingester_pool.insert(ingester_id_0.clone(), ingester_pool_entry(ingester_0)); + ingester_pool.insert(ingester_id_0.clone(), IngesterPoolEntry::ready_with_client(ingester_0)); let ingester_id_1 = NodeId::from("test-ingester-1"); let mut mock_ingester_1 = MockIngesterService::new(); @@ -3418,7 +3410,7 @@ mod tests { }; Ok(response) }); - let ingester_1 = ingester_pool_entry(IngesterServiceClient::from_mock(mock_ingester_1)); + let ingester_1 = IngesterPoolEntry::ready_with_client(IngesterServiceClient::from_mock(mock_ingester_1)); ingester_pool.insert(ingester_id_1.clone(), ingester_1); let num_opened_shards = controller diff --git a/quickwit/quickwit-indexing/src/source/ingest/mod.rs b/quickwit/quickwit-indexing/src/source/ingest/mod.rs index 4d5ae8179ca..2aa02c09963 100644 --- a/quickwit/quickwit-indexing/src/source/ingest/mod.rs +++ b/quickwit/quickwit-indexing/src/source/ingest/mod.rs @@ -690,14 +690,6 @@ mod tests { use crate::models::RawDocBatch; use crate::source::SourceActor; - fn ingester_pool_entry(client: IngesterServiceClient) -> IngesterPoolEntry { - IngesterPoolEntry { - client, - status: IngesterStatus::Ready, - availability_zone: None, - } - } - // In this test, we simulate a source 
to which we sequentially assign the following set of // shards [] // [1] (triggers a reset, and the creation of a publish lock) @@ -940,7 +932,7 @@ mod tests { Ok(response) }); - let ingester_0 = ingester_pool_entry(IngesterServiceClient::from_mock(mock_ingester_0)); + let ingester_0 = IngesterPoolEntry::ready_with_client(IngesterServiceClient::from_mock(mock_ingester_0)); ingester_pool.insert("test-ingester-0".into(), ingester_0.clone()); let event_broker = EventBroker::default(); @@ -1137,7 +1129,7 @@ mod tests { Ok(response) }); - let ingester_0 = ingester_pool_entry(IngesterServiceClient::from_mock(mock_ingester_0)); + let ingester_0 = IngesterPoolEntry::ready_with_client(IngesterServiceClient::from_mock(mock_ingester_0)); ingester_pool.insert("test-ingester-0".into(), ingester_0.clone()); let event_broker = EventBroker::default(); @@ -1302,7 +1294,7 @@ mod tests { Ok(response) }); - let ingester_0 = ingester_pool_entry(IngesterServiceClient::from_mock(mock_ingester_0)); + let ingester_0 = IngesterPoolEntry::ready_with_client(IngesterServiceClient::from_mock(mock_ingester_0)); ingester_pool.insert("test-ingester-0".into(), ingester_0.clone()); let event_broker = EventBroker::default(); @@ -1610,7 +1602,7 @@ mod tests { }) }); - let ingester_0 = ingester_pool_entry(IngesterServiceClient::from_mock(mock_ingester_0)); + let ingester_0 = IngesterPoolEntry::ready_with_client(IngesterServiceClient::from_mock(mock_ingester_0)); ingester_pool.insert("test-ingester-0".into(), ingester_0.clone()); let event_broker = EventBroker::default(); @@ -1710,7 +1702,7 @@ mod tests { Ok(TruncateShardsResponse {}) }); - let ingester_0 = ingester_pool_entry(IngesterServiceClient::from_mock(mock_ingester_0)); + let ingester_0 = IngesterPoolEntry::ready_with_client(IngesterServiceClient::from_mock(mock_ingester_0)); ingester_pool.insert("test-ingester-0".into(), ingester_0.clone()); let mut mock_ingester_1 = MockIngesterService::new(); @@ -1737,7 +1729,7 @@ mod tests { 
Ok(TruncateShardsResponse {}) }); - let ingester_1 = ingester_pool_entry(IngesterServiceClient::from_mock(mock_ingester_1)); + let ingester_1 = IngesterPoolEntry::ready_with_client(IngesterServiceClient::from_mock(mock_ingester_1)); ingester_pool.insert("test-ingester-1".into(), ingester_1.clone()); let mut mock_ingester_3 = MockIngesterService::new(); @@ -1757,7 +1749,7 @@ mod tests { Ok(TruncateShardsResponse {}) }); - let ingester_3 = ingester_pool_entry(IngesterServiceClient::from_mock(mock_ingester_3)); + let ingester_3 = IngesterPoolEntry::ready_with_client(IngesterServiceClient::from_mock(mock_ingester_3)); ingester_pool.insert("test-ingester-3".into(), ingester_3.clone()); let event_broker = EventBroker::default(); diff --git a/quickwit/quickwit-ingest/src/ingest_v2/broadcast/capacity_score.rs b/quickwit/quickwit-ingest/src/ingest_v2/broadcast/capacity_score.rs index abaee5c7f5f..062539db797 100644 --- a/quickwit/quickwit-ingest/src/ingest_v2/broadcast/capacity_score.rs +++ b/quickwit/quickwit-ingest/src/ingest_v2/broadcast/capacity_score.rs @@ -66,7 +66,8 @@ impl BroadcastIngesterCapacityScoreTask { let mut guard = state .lock_fully() .await - .map_err(|_| anyhow::anyhow!("failed to acquire ingester state lock"))?; + .context("failed to acquire ingester state lock")?; + let usage = guard.mrecordlog.resource_usage(); let disk_used = ByteSize::b(usage.disk_used_bytes as u64); let memory_used = ByteSize::b(usage.memory_used_bytes as u64); diff --git a/quickwit/quickwit-ingest/src/ingest_v2/fetch.rs b/quickwit/quickwit-ingest/src/ingest_v2/fetch.rs index bed2c693a09..96970fc4e20 100644 --- a/quickwit/quickwit-ingest/src/ingest_v2/fetch.rs +++ b/quickwit/quickwit-ingest/src/ingest_v2/fetch.rs @@ -650,14 +650,6 @@ pub(super) mod tests { } } - fn ingester_pool_entry(client: IngesterServiceClient) -> IngesterPoolEntry { - IngesterPoolEntry { - client, - status: IngesterStatus::Ready, - availability_zone: None, - } - } - #[tokio::test] async fn 
test_fetch_task_happy_path() { let tempdir = tempfile::tempdir().unwrap(); @@ -1339,7 +1331,7 @@ pub(super) mod tests { Ok(service_stream_1) }); - let ingester_1 = ingester_pool_entry(IngesterServiceClient::from_mock(mock_ingester_1)); + let ingester_1 = IngesterPoolEntry::ready_with_client(IngesterServiceClient::from_mock(mock_ingester_1)); ingester_pool.insert("test-ingester-1".into(), ingester_1); let fetch_payload = FetchPayload { @@ -1438,7 +1430,7 @@ pub(super) mod tests { "open fetch stream error".to_string(), )) }); - let ingester_0 = ingester_pool_entry(IngesterServiceClient::from_mock(mock_ingester_0)); + let ingester_0 = IngesterPoolEntry::ready_with_client(IngesterServiceClient::from_mock(mock_ingester_0)); let mut mock_ingester_1 = MockIngesterService::new(); let index_uid_clone = index_uid.clone(); @@ -1453,7 +1445,7 @@ pub(super) mod tests { Ok(service_stream_1) }); - let ingester_1 = ingester_pool_entry(IngesterServiceClient::from_mock(mock_ingester_1)); + let ingester_1 = IngesterPoolEntry::ready_with_client(IngesterServiceClient::from_mock(mock_ingester_1)); ingester_pool.insert("test-ingester-0".into(), ingester_0); ingester_pool.insert("test-ingester-1".into(), ingester_1); @@ -1553,7 +1545,7 @@ pub(super) mod tests { Ok(service_stream_0) }); - let ingester_0 = ingester_pool_entry(IngesterServiceClient::from_mock(mock_ingester_0)); + let ingester_0 = IngesterPoolEntry::ready_with_client(IngesterServiceClient::from_mock(mock_ingester_0)); let mut mock_ingester_1 = MockIngesterService::new(); let index_uid_clone = index_uid.clone(); @@ -1568,7 +1560,7 @@ pub(super) mod tests { Ok(service_stream_1) }); - let ingester_1 = ingester_pool_entry(IngesterServiceClient::from_mock(mock_ingester_1)); + let ingester_1 = IngesterPoolEntry::ready_with_client(IngesterServiceClient::from_mock(mock_ingester_1)); ingester_pool.insert("test-ingester-0".into(), ingester_0); ingester_pool.insert("test-ingester-1".into(), ingester_1); @@ -1671,7 +1663,7 @@ pub(super) 
mod tests { shard_id: ShardId::from(1), }) }); - let ingester_0 = ingester_pool_entry(IngesterServiceClient::from_mock(mock_ingester_0)); + let ingester_0 = IngesterPoolEntry::ready_with_client(IngesterServiceClient::from_mock(mock_ingester_0)); ingester_pool.insert("test-ingester-0".into(), ingester_0); fault_tolerant_fetch_stream( @@ -1759,7 +1751,7 @@ pub(super) mod tests { Ok(service_stream_2) }); - let ingester = ingester_pool_entry(IngesterServiceClient::from_mock(mock_ingester)); + let ingester = IngesterPoolEntry::ready_with_client(IngesterServiceClient::from_mock(mock_ingester)); ingester_pool.insert("test-ingester".into(), ingester); diff --git a/quickwit/quickwit-ingest/src/ingest_v2/ingester.rs b/quickwit/quickwit-ingest/src/ingest_v2/ingester.rs index b01500398c0..5091784b885 100644 --- a/quickwit/quickwit-ingest/src/ingest_v2/ingester.rs +++ b/quickwit/quickwit-ingest/src/ingest_v2/ingester.rs @@ -2872,6 +2872,98 @@ mod tests { .assert_records_eq(&queue_id_01, .., &[]); } + #[tokio::test] + async fn test_ingester_persist_returns_routing_update() { + let (ingester_ctx, ingester) = IngesterForTest::default().build().await; + + let index_uid_0 = IndexUid::for_test("test-index-0", 0); + let index_uid_1 = IndexUid::for_test("test-index-1", 0); + let source_id = SourceId::from("test-source"); + + let doc_mapping_uid = DocMappingUid::random(); + let doc_mapping_json = format!( + r#"{{ + "doc_mapping_uid": "{doc_mapping_uid}" + }}"# + ); + let init_shards_request = InitShardsRequest { + subrequests: vec![ + InitShardSubrequest { + subrequest_id: 0, + shard: Some(Shard { + index_uid: Some(index_uid_0.clone()), + source_id: source_id.clone(), + shard_id: Some(ShardId::from(1)), + shard_state: ShardState::Open as i32, + leader_id: ingester_ctx.node_id.to_string(), + doc_mapping_uid: Some(doc_mapping_uid), + ..Default::default() + }), + doc_mapping_json: doc_mapping_json.clone(), + validate_docs: false, + }, + InitShardSubrequest { + subrequest_id: 1, + shard: 
Some(Shard { + index_uid: Some(index_uid_1.clone()), + source_id: source_id.clone(), + shard_id: Some(ShardId::from(1)), + shard_state: ShardState::Open as i32, + leader_id: ingester_ctx.node_id.to_string(), + doc_mapping_uid: Some(doc_mapping_uid), + ..Default::default() + }), + doc_mapping_json, + validate_docs: false, + }, + ], + }; + ingester.init_shards(init_shards_request).await.unwrap(); + + let persist_request = PersistRequest { + leader_id: ingester_ctx.node_id.to_string(), + commit_type: CommitTypeV2::Force as i32, + subrequests: vec![ + PersistSubrequest { + subrequest_id: 0, + index_uid: Some(index_uid_0.clone()), + source_id: source_id.clone(), + doc_batch: Some(DocBatchV2::for_test([r#"{"doc": "test-doc-010"}"#])), + }, + PersistSubrequest { + subrequest_id: 1, + index_uid: Some(index_uid_1.clone()), + source_id: source_id.clone(), + doc_batch: Some(DocBatchV2::for_test([r#"{"doc": "test-doc-110"}"#])), + }, + ], + }; + let persist_response = ingester.persist(persist_request).await.unwrap(); + assert_eq!(persist_response.successes.len(), 2); + + let routing_update = persist_response + .routing_update + .expect("routing update should be present"); + + assert!( + routing_update.capacity_score > 0, + "capacity score should be non-zero after a small persist" + ); + + let mut source_shard_updates = routing_update.source_shard_updates; + source_shard_updates.sort_by(|a, b| a.index_uid().cmp(b.index_uid())); + + assert_eq!(source_shard_updates.len(), 2); + assert_eq!(source_shard_updates[0].index_uid(), &index_uid_0); + assert_eq!(source_shard_updates[0].source_id, source_id.as_str()); + assert_eq!(source_shard_updates[0].open_shard_count, 1); + assert_eq!(source_shard_updates[1].index_uid(), &index_uid_1); + assert_eq!(source_shard_updates[1].source_id, source_id.as_str()); + assert_eq!(source_shard_updates[1].open_shard_count, 1); + + assert!(routing_update.closed_shards.is_empty()); + } + #[tokio::test] async fn test_ingester_open_replication_stream() { 
let (_ingester_ctx, ingester) = IngesterForTest::default() diff --git a/quickwit/quickwit-ingest/src/ingest_v2/mod.rs b/quickwit/quickwit-ingest/src/ingest_v2/mod.rs index 9e5122abff3..a2b1095bdf3 100644 --- a/quickwit/quickwit-ingest/src/ingest_v2/mod.rs +++ b/quickwit/quickwit-ingest/src/ingest_v2/mod.rs @@ -73,6 +73,26 @@ pub struct IngesterPoolEntry { pub availability_zone: Option, } +impl IngesterPoolEntry { + #[cfg(any(test, feature = "testsuite"))] + pub fn ready_with_client(client: IngesterServiceClient) -> Self { + IngesterPoolEntry { + client, + status: IngesterStatus::Ready, + availability_zone: None, + } + } + + #[cfg(any(test, feature = "testsuite"))] + pub fn mocked_ingester() -> Self { + IngesterPoolEntry { + client: IngesterServiceClient::mocked(), + status: IngesterStatus::Ready, + availability_zone: None, + } + } +} + pub type IngesterPool = Pool; /// Identifies an ingester client, typically a source, for logging and debugging purposes. diff --git a/quickwit/quickwit-ingest/src/ingest_v2/router.rs b/quickwit/quickwit-ingest/src/ingest_v2/router.rs index 8ee5b9ea863..ad37032e550 100644 --- a/quickwit/quickwit-ingest/src/ingest_v2/router.rs +++ b/quickwit/quickwit-ingest/src/ingest_v2/router.rs @@ -647,14 +647,6 @@ mod tests { use crate::IngesterPoolEntry; use crate::ingest_v2::workbench::SubworkbenchFailure; - fn mocked_ingester() -> IngesterPoolEntry { - IngesterPoolEntry { - client: IngesterServiceClient::mocked(), - status: IngesterStatus::Ready, - availability_zone: None, - } - } - #[tokio::test] async fn test_router_make_get_or_create_open_shard_request() { let self_node_id = "test-router".into(); @@ -743,7 +735,7 @@ mod tests { drop(rendezvous_1); drop(rendezvous_2); - ingester_pool.insert("test-ingester-0".into(), mocked_ingester()); + ingester_pool.insert("test-ingester-0".into(), IngesterPoolEntry::mocked_ingester()); { // Ingester-0 is in pool and in table, but marked unavailable on the workbench // (simulating a prior transport error). 
has_open_nodes returns false → both @@ -1178,8 +1170,8 @@ mod tests { let control_plane = ControlPlaneServiceClient::from_mock(MockControlPlaneService::new()); let ingester_pool = IngesterPool::default(); - ingester_pool.insert("test-ingester-0".into(), mocked_ingester()); - ingester_pool.insert("test-ingester-1".into(), mocked_ingester()); + ingester_pool.insert("test-ingester-0".into(), IngesterPoolEntry::mocked_ingester()); + ingester_pool.insert("test-ingester-1".into(), IngesterPoolEntry::mocked_ingester()); let replication_factor = 1; let router = IngestRouter::new( @@ -1696,7 +1688,7 @@ mod tests { // Give the async subscriber a moment to process. tokio::time::sleep(Duration::from_millis(10)).await; - ingester_pool.insert("test-ingester-0".into(), mocked_ingester()); + ingester_pool.insert("test-ingester-0".into(), IngesterPoolEntry::mocked_ingester()); let state_guard = router.state.lock().await; let node = state_guard .routing_table @@ -1843,7 +1835,7 @@ mod tests { .process_persist_results(&mut workbench, persist_futures) .await; - ingester_pool.insert("test-ingester-0".into(), mocked_ingester()); + ingester_pool.insert("test-ingester-0".into(), IngesterPoolEntry::mocked_ingester()); let state_guard = router.state.lock().await; let node = state_guard .routing_table diff --git a/quickwit/quickwit-proto/protos/quickwit/ingester.proto b/quickwit/quickwit-proto/protos/quickwit/ingester.proto index 57568f46b8b..d48bc553a62 100644 --- a/quickwit/quickwit-proto/protos/quickwit/ingester.proto +++ b/quickwit/quickwit-proto/protos/quickwit/ingester.proto @@ -74,6 +74,7 @@ message PersistSubrequest { quickwit.common.IndexUid index_uid = 2; string source_id = 3; quickwit.ingest.DocBatchV2 doc_batch = 5; + reserved 4; } message PersistResponse { @@ -119,6 +120,7 @@ message PersistFailure { quickwit.common.IndexUid index_uid = 2; string source_id = 3; PersistFailureReason reason = 5; + reserved 4; } message SynReplicationMessage { diff --git 
a/quickwit/quickwit-serve/src/lib.rs b/quickwit/quickwit-serve/src/lib.rs index d9b4bc0e3f2..3ad5bd72aa7 100644 --- a/quickwit/quickwit-serve/src/lib.rs +++ b/quickwit/quickwit-serve/src/lib.rs @@ -1013,6 +1013,8 @@ fn setup_ingester_pool( ); Some(change) } + // only update the ingester pool when the ingester status changes, to avoid + // unnecessary churn ClusterChange::Update { previous, updated } if updated.is_indexer() && previous.ingester_status() != updated.ingester_status() => From 1f17dfd0bbf58963024eb4695bff4edd6c6bab78 Mon Sep 17 00:00:00 2001 From: Nadav Gov-Ari Date: Mon, 16 Mar 2026 11:16:04 -0400 Subject: [PATCH 9/9] lints --- .../src/control_plane.rs | 12 +++-- .../src/ingest/ingest_controller.rs | 48 ++++++++++++++----- .../src/source/ingest/mod.rs | 24 ++++++---- .../quickwit-ingest/src/ingest_v2/fetch.rs | 25 ++++++---- .../quickwit-ingest/src/ingest_v2/router.rs | 25 ++++++++-- 5 files changed, 94 insertions(+), 40 deletions(-) diff --git a/quickwit/quickwit-control-plane/src/control_plane.rs b/quickwit/quickwit-control-plane/src/control_plane.rs index 4e08b3a6b44..1056aba6eb8 100644 --- a/quickwit/quickwit-control-plane/src/control_plane.rs +++ b/quickwit/quickwit-control-plane/src/control_plane.rs @@ -2223,7 +2223,8 @@ mod tests { assert!(&retain_shards_for_source.shard_ids.is_empty()); Ok(RetainShardsResponse {}) }); - let ingester = IngesterPoolEntry::ready_with_client(IngesterServiceClient::from_mock(mock_ingester)); + let ingester = + IngesterPoolEntry::ready_with_client(IngesterServiceClient::from_mock(mock_ingester)); ingester_pool.insert("node1".into(), ingester); let cluster_config = ClusterConfig::for_test(); @@ -2269,7 +2270,8 @@ mod tests { ); Ok(RetainShardsResponse {}) }); - let ingester = IngesterPoolEntry::ready_with_client(IngesterServiceClient::from_mock(mock_ingester)); + let ingester = + IngesterPoolEntry::ready_with_client(IngesterServiceClient::from_mock(mock_ingester)); ingester_pool.insert("node1".into(), ingester); let 
mut index_0 = IndexMetadata::for_test("test-index-0", "ram:///test-index-0"); @@ -2644,7 +2646,8 @@ mod tests { }; Ok(response) }); - let ingester = IngesterPoolEntry::ready_with_client(IngesterServiceClient::from_mock(mock_ingester)); + let ingester = + IngesterPoolEntry::ready_with_client(IngesterServiceClient::from_mock(mock_ingester)); ingester_pool.insert(ingester_id, ingester); let mut mock_metastore = MockMetastoreService::new(); @@ -2798,7 +2801,8 @@ mod tests { }; Ok(response) }); - let ingester = IngesterPoolEntry::ready_with_client(IngesterServiceClient::from_mock(mock_ingester)); + let ingester = + IngesterPoolEntry::ready_with_client(IngesterServiceClient::from_mock(mock_ingester)); ingester_pool.insert(ingester_id, ingester); let mut mock_metastore = MockMetastoreService::new(); diff --git a/quickwit/quickwit-control-plane/src/ingest/ingest_controller.rs b/quickwit/quickwit-control-plane/src/ingest/ingest_controller.rs index 94c2e23c87f..53d3ee4034f 100644 --- a/quickwit/quickwit-control-plane/src/ingest/ingest_controller.rs +++ b/quickwit/quickwit-control-plane/src/ingest/ingest_controller.rs @@ -1962,7 +1962,10 @@ mod tests { Ok(response) }); let ingester_0 = IngesterServiceClient::from_mock(mock_ingester_0); - ingester_pool.insert(ingester_id_0, IngesterPoolEntry::ready_with_client(ingester_0)); + ingester_pool.insert( + ingester_id_0, + IngesterPoolEntry::ready_with_client(ingester_0), + ); let ingester_id_1 = NodeId::from("test-ingester-1"); let mut mock_ingester_1 = MockIngesterService::new(); @@ -1983,7 +1986,8 @@ mod tests { Err(IngestV2Error::Internal("internal error".to_string())) }); - let ingester_1 = IngesterPoolEntry::ready_with_client(IngesterServiceClient::from_mock(mock_ingester_1)); + let ingester_1 = + IngesterPoolEntry::ready_with_client(IngesterServiceClient::from_mock(mock_ingester_1)); ingester_pool.insert(ingester_id_1, ingester_1); let ingester_id_2 = NodeId::from("test-ingester-2"); @@ -1993,7 +1997,10 @@ mod tests { let 
client_2 = IngesterServiceClient::tower() .stack_init_shards_layer(DelayLayer::new(INIT_SHARDS_REQUEST_TIMEOUT * 2)) .build_from_mock(mock_ingester_2); - ingester_pool.insert(ingester_id_2, IngesterPoolEntry::ready_with_client(client_2)); + ingester_pool.insert( + ingester_id_2, + IngesterPoolEntry::ready_with_client(client_2), + ); let init_shards_response = controller .init_shards(Vec::new(), &Progress::default()) @@ -2371,7 +2378,8 @@ mod tests { "failed to close shards".to_string(), )) }); - let ingester = IngesterPoolEntry::ready_with_client(IngesterServiceClient::from_mock(mock_ingester)); + let ingester = + IngesterPoolEntry::ready_with_client(IngesterServiceClient::from_mock(mock_ingester)); ingester_pool.insert("test-ingester".into(), ingester); let shard_infos = BTreeSet::from_iter([ @@ -2524,7 +2532,8 @@ mod tests { }, ); - let ingester = IngesterPoolEntry::ready_with_client(IngesterServiceClient::from_mock(mock_ingester)); + let ingester = + IngesterPoolEntry::ready_with_client(IngesterServiceClient::from_mock(mock_ingester)); ingester_pool.insert("test-ingester".into(), ingester); let shard_infos = BTreeSet::from_iter([ShardInfo { @@ -2670,7 +2679,8 @@ mod tests { }; Ok(response) }); - let ingester = IngesterPoolEntry::ready_with_client(IngesterServiceClient::from_mock(mock_ingester)); + let ingester = + IngesterPoolEntry::ready_with_client(IngesterServiceClient::from_mock(mock_ingester)); ingester_pool.insert("test-ingester".into(), ingester); // Test failed to open shards. @@ -2792,7 +2802,8 @@ mod tests { }; Ok(response) }); - let ingester = IngesterPoolEntry::ready_with_client(IngesterServiceClient::from_mock(mock_ingester)); + let ingester = + IngesterPoolEntry::ready_with_client(IngesterServiceClient::from_mock(mock_ingester)); ingester_pool.insert("test-ingester".into(), ingester); // Test failed to close shard. 
@@ -3162,7 +3173,10 @@ mod tests { Ok(response) }); let ingester_0 = IngesterServiceClient::from_mock(mock_ingester_0); - ingester_pool.insert(ingester_id_0.clone(), IngesterPoolEntry::ready_with_client(ingester_0)); + ingester_pool.insert( + ingester_id_0.clone(), + IngesterPoolEntry::ready_with_client(ingester_0), + ); let ingester_id_1 = NodeId::from("test-ingester-1"); let mut mock_ingester_1 = MockIngesterService::new(); @@ -3180,7 +3194,10 @@ mod tests { Err(IngestV2Error::Internal("internal error".to_string())) }); let ingester_1 = IngesterServiceClient::from_mock(mock_ingester_1); - ingester_pool.insert(ingester_id_1.clone(), IngesterPoolEntry::ready_with_client(ingester_1)); + ingester_pool.insert( + ingester_id_1.clone(), + IngesterPoolEntry::ready_with_client(ingester_1), + ); let ingester_id_2 = NodeId::from("test-ingester-2"); let mut mock_ingester_2 = MockIngesterService::new(); @@ -3189,7 +3206,10 @@ mod tests { let client_2 = IngesterServiceClient::tower() .stack_close_shards_layer(DelayLayer::new(CLOSE_SHARDS_REQUEST_TIMEOUT * 2)) .build_from_mock(mock_ingester_2); - ingester_pool.insert(ingester_id_2.clone(), IngesterPoolEntry::ready_with_client(client_2)); + ingester_pool.insert( + ingester_id_2.clone(), + IngesterPoolEntry::ready_with_client(client_2), + ); // In this test: // - ingester 0 will close shard 0 successfully and fail to close shard 1; @@ -3369,7 +3389,10 @@ mod tests { Ok(response) }); let ingester_0 = IngesterServiceClient::from_mock(mock_ingester_0); - ingester_pool.insert(ingester_id_0.clone(), IngesterPoolEntry::ready_with_client(ingester_0)); + ingester_pool.insert( + ingester_id_0.clone(), + IngesterPoolEntry::ready_with_client(ingester_0), + ); let ingester_id_1 = NodeId::from("test-ingester-1"); let mut mock_ingester_1 = MockIngesterService::new(); @@ -3410,7 +3433,8 @@ mod tests { }; Ok(response) }); - let ingester_1 = IngesterPoolEntry::ready_with_client(IngesterServiceClient::from_mock(mock_ingester_1)); + let ingester_1 
= + IngesterPoolEntry::ready_with_client(IngesterServiceClient::from_mock(mock_ingester_1)); ingester_pool.insert(ingester_id_1.clone(), ingester_1); let num_opened_shards = controller diff --git a/quickwit/quickwit-indexing/src/source/ingest/mod.rs b/quickwit/quickwit-indexing/src/source/ingest/mod.rs index 2aa02c09963..46b7ddd8e0e 100644 --- a/quickwit/quickwit-indexing/src/source/ingest/mod.rs +++ b/quickwit/quickwit-indexing/src/source/ingest/mod.rs @@ -676,8 +676,7 @@ mod tests { use quickwit_ingest::IngesterPoolEntry; use quickwit_proto::indexing::IndexingPipelineId; use quickwit_proto::ingest::ingester::{ - FetchMessage, IngesterServiceClient, IngesterStatus, MockIngesterService, - TruncateShardsResponse, + FetchMessage, IngesterServiceClient, MockIngesterService, TruncateShardsResponse, }; use quickwit_proto::ingest::{IngestV2Error, MRecordBatch, Shard, ShardState}; use quickwit_proto::metastore::{AcquireShardsResponse, MockMetastoreService}; @@ -932,7 +931,8 @@ mod tests { Ok(response) }); - let ingester_0 = IngesterPoolEntry::ready_with_client(IngesterServiceClient::from_mock(mock_ingester_0)); + let ingester_0 = + IngesterPoolEntry::ready_with_client(IngesterServiceClient::from_mock(mock_ingester_0)); ingester_pool.insert("test-ingester-0".into(), ingester_0.clone()); let event_broker = EventBroker::default(); @@ -1129,7 +1129,8 @@ mod tests { Ok(response) }); - let ingester_0 = IngesterPoolEntry::ready_with_client(IngesterServiceClient::from_mock(mock_ingester_0)); + let ingester_0 = + IngesterPoolEntry::ready_with_client(IngesterServiceClient::from_mock(mock_ingester_0)); ingester_pool.insert("test-ingester-0".into(), ingester_0.clone()); let event_broker = EventBroker::default(); @@ -1294,7 +1295,8 @@ mod tests { Ok(response) }); - let ingester_0 = IngesterPoolEntry::ready_with_client(IngesterServiceClient::from_mock(mock_ingester_0)); + let ingester_0 = + IngesterPoolEntry::ready_with_client(IngesterServiceClient::from_mock(mock_ingester_0)); 
ingester_pool.insert("test-ingester-0".into(), ingester_0.clone()); let event_broker = EventBroker::default(); @@ -1602,7 +1604,8 @@ mod tests { }) }); - let ingester_0 = IngesterPoolEntry::ready_with_client(IngesterServiceClient::from_mock(mock_ingester_0)); + let ingester_0 = + IngesterPoolEntry::ready_with_client(IngesterServiceClient::from_mock(mock_ingester_0)); ingester_pool.insert("test-ingester-0".into(), ingester_0.clone()); let event_broker = EventBroker::default(); @@ -1702,7 +1705,8 @@ mod tests { Ok(TruncateShardsResponse {}) }); - let ingester_0 = IngesterPoolEntry::ready_with_client(IngesterServiceClient::from_mock(mock_ingester_0)); + let ingester_0 = + IngesterPoolEntry::ready_with_client(IngesterServiceClient::from_mock(mock_ingester_0)); ingester_pool.insert("test-ingester-0".into(), ingester_0.clone()); let mut mock_ingester_1 = MockIngesterService::new(); @@ -1729,7 +1733,8 @@ mod tests { Ok(TruncateShardsResponse {}) }); - let ingester_1 = IngesterPoolEntry::ready_with_client(IngesterServiceClient::from_mock(mock_ingester_1)); + let ingester_1 = + IngesterPoolEntry::ready_with_client(IngesterServiceClient::from_mock(mock_ingester_1)); ingester_pool.insert("test-ingester-1".into(), ingester_1.clone()); let mut mock_ingester_3 = MockIngesterService::new(); @@ -1749,7 +1754,8 @@ mod tests { Ok(TruncateShardsResponse {}) }); - let ingester_3 = IngesterPoolEntry::ready_with_client(IngesterServiceClient::from_mock(mock_ingester_3)); + let ingester_3 = + IngesterPoolEntry::ready_with_client(IngesterServiceClient::from_mock(mock_ingester_3)); ingester_pool.insert("test-ingester-3".into(), ingester_3.clone()); let event_broker = EventBroker::default(); diff --git a/quickwit/quickwit-ingest/src/ingest_v2/fetch.rs b/quickwit/quickwit-ingest/src/ingest_v2/fetch.rs index 96970fc4e20..73c1fb2858d 100644 --- a/quickwit/quickwit-ingest/src/ingest_v2/fetch.rs +++ b/quickwit/quickwit-ingest/src/ingest_v2/fetch.rs @@ -627,9 +627,7 @@ pub(super) mod tests { use 
bytes::Bytes; use quickwit_proto::ingest::ShardState; - use quickwit_proto::ingest::ingester::{ - IngesterServiceClient, IngesterStatus, MockIngesterService, - }; + use quickwit_proto::ingest::ingester::{IngesterServiceClient, MockIngesterService}; use quickwit_proto::types::queue_id; use tokio::time::timeout; @@ -1331,7 +1329,8 @@ pub(super) mod tests { Ok(service_stream_1) }); - let ingester_1 = IngesterPoolEntry::ready_with_client(IngesterServiceClient::from_mock(mock_ingester_1)); + let ingester_1 = + IngesterPoolEntry::ready_with_client(IngesterServiceClient::from_mock(mock_ingester_1)); ingester_pool.insert("test-ingester-1".into(), ingester_1); let fetch_payload = FetchPayload { @@ -1430,7 +1429,8 @@ pub(super) mod tests { "open fetch stream error".to_string(), )) }); - let ingester_0 = IngesterPoolEntry::ready_with_client(IngesterServiceClient::from_mock(mock_ingester_0)); + let ingester_0 = + IngesterPoolEntry::ready_with_client(IngesterServiceClient::from_mock(mock_ingester_0)); let mut mock_ingester_1 = MockIngesterService::new(); let index_uid_clone = index_uid.clone(); @@ -1445,7 +1445,8 @@ pub(super) mod tests { Ok(service_stream_1) }); - let ingester_1 = IngesterPoolEntry::ready_with_client(IngesterServiceClient::from_mock(mock_ingester_1)); + let ingester_1 = + IngesterPoolEntry::ready_with_client(IngesterServiceClient::from_mock(mock_ingester_1)); ingester_pool.insert("test-ingester-0".into(), ingester_0); ingester_pool.insert("test-ingester-1".into(), ingester_1); @@ -1545,7 +1546,8 @@ pub(super) mod tests { Ok(service_stream_0) }); - let ingester_0 = IngesterPoolEntry::ready_with_client(IngesterServiceClient::from_mock(mock_ingester_0)); + let ingester_0 = + IngesterPoolEntry::ready_with_client(IngesterServiceClient::from_mock(mock_ingester_0)); let mut mock_ingester_1 = MockIngesterService::new(); let index_uid_clone = index_uid.clone(); @@ -1560,7 +1562,8 @@ pub(super) mod tests { Ok(service_stream_1) }); - let ingester_1 = 
IngesterPoolEntry::ready_with_client(IngesterServiceClient::from_mock(mock_ingester_1)); + let ingester_1 = + IngesterPoolEntry::ready_with_client(IngesterServiceClient::from_mock(mock_ingester_1)); ingester_pool.insert("test-ingester-0".into(), ingester_0); ingester_pool.insert("test-ingester-1".into(), ingester_1); @@ -1663,7 +1666,8 @@ pub(super) mod tests { shard_id: ShardId::from(1), }) }); - let ingester_0 = IngesterPoolEntry::ready_with_client(IngesterServiceClient::from_mock(mock_ingester_0)); + let ingester_0 = + IngesterPoolEntry::ready_with_client(IngesterServiceClient::from_mock(mock_ingester_0)); ingester_pool.insert("test-ingester-0".into(), ingester_0); fault_tolerant_fetch_stream( @@ -1751,7 +1755,8 @@ pub(super) mod tests { Ok(service_stream_2) }); - let ingester = IngesterPoolEntry::ready_with_client(IngesterServiceClient::from_mock(mock_ingester)); + let ingester = + IngesterPoolEntry::ready_with_client(IngesterServiceClient::from_mock(mock_ingester)); ingester_pool.insert("test-ingester".into(), ingester); diff --git a/quickwit/quickwit-ingest/src/ingest_v2/router.rs b/quickwit/quickwit-ingest/src/ingest_v2/router.rs index ad37032e550..581b648ae71 100644 --- a/quickwit/quickwit-ingest/src/ingest_v2/router.rs +++ b/quickwit/quickwit-ingest/src/ingest_v2/router.rs @@ -735,7 +735,10 @@ mod tests { drop(rendezvous_1); drop(rendezvous_2); - ingester_pool.insert("test-ingester-0".into(), IngesterPoolEntry::mocked_ingester()); + ingester_pool.insert( + "test-ingester-0".into(), + IngesterPoolEntry::mocked_ingester(), + ); { // Ingester-0 is in pool and in table, but marked unavailable on the workbench // (simulating a prior transport error). 
has_open_nodes returns false → both @@ -1170,8 +1173,14 @@ mod tests { let control_plane = ControlPlaneServiceClient::from_mock(MockControlPlaneService::new()); let ingester_pool = IngesterPool::default(); - ingester_pool.insert("test-ingester-0".into(), IngesterPoolEntry::mocked_ingester()); - ingester_pool.insert("test-ingester-1".into(), IngesterPoolEntry::mocked_ingester()); + ingester_pool.insert( + "test-ingester-0".into(), + IngesterPoolEntry::mocked_ingester(), + ); + ingester_pool.insert( + "test-ingester-1".into(), + IngesterPoolEntry::mocked_ingester(), + ); let replication_factor = 1; let router = IngestRouter::new( @@ -1688,7 +1697,10 @@ mod tests { // Give the async subscriber a moment to process. tokio::time::sleep(Duration::from_millis(10)).await; - ingester_pool.insert("test-ingester-0".into(), IngesterPoolEntry::mocked_ingester()); + ingester_pool.insert( + "test-ingester-0".into(), + IngesterPoolEntry::mocked_ingester(), + ); let state_guard = router.state.lock().await; let node = state_guard .routing_table @@ -1835,7 +1847,10 @@ mod tests { .process_persist_results(&mut workbench, persist_futures) .await; - ingester_pool.insert("test-ingester-0".into(), IngesterPoolEntry::mocked_ingester()); + ingester_pool.insert( + "test-ingester-0".into(), + IngesterPoolEntry::mocked_ingester(), + ); let state_guard = router.state.lock().await; let node = state_guard .routing_table