From 76cfc8432f1dce7bdb43d5ede37ccc618fb01126 Mon Sep 17 00:00:00 2001
From: nadav-govari <nadav.govari@datadoghq.com>
Date: Tue, 17 Feb 2026 11:45:55 -0500
Subject: [PATCH 1/9] Implement IngesterCapacityScore broadcast (#6152)

---
 quickwit/quickwit-common/src/lib.rs           |   1 +
 quickwit/quickwit-common/src/ring_buffer.rs   | 170 +++++++
 quickwit/quickwit-common/src/shared_consts.rs |   3 +
 quickwit/quickwit-ingest/Cargo.toml           |   2 +-
 .../broadcast/ingester_capacity_score.rs      | 455 ++++++++++++++++++
 .../local_shards.rs}                          |  88 +---
 .../src/ingest_v2/broadcast/mod.rs            |  76 +++
 .../quickwit-ingest/src/ingest_v2/state.rs    |  87 +++-
 8 files changed, 809 insertions(+), 73 deletions(-)
 create mode 100644 quickwit/quickwit-common/src/ring_buffer.rs
 create mode 100644 quickwit/quickwit-ingest/src/ingest_v2/broadcast/ingester_capacity_score.rs
 rename quickwit/quickwit-ingest/src/ingest_v2/{broadcast.rs => broadcast/local_shards.rs} (91%)
 create mode 100644 quickwit/quickwit-ingest/src/ingest_v2/broadcast/mod.rs
diff --git a/quickwit/quickwit-common/src/lib.rs b/quickwit/quickwit-common/src/lib.rs
index 0f3af2bc5ba..11147f975f9 100644
--- a/quickwit/quickwit-common/src/lib.rs
+++ b/quickwit/quickwit-common/src/lib.rs
@@ -36,6 +36,7 @@ pub mod rate_limited_tracing;
 pub mod rate_limiter;
 pub mod rendezvous_hasher;
 pub mod retry;
+pub mod ring_buffer;
 pub mod runtimes;
 pub mod shared_consts;
 pub mod sorted_iter;
diff --git a/quickwit/quickwit-common/src/ring_buffer.rs b/quickwit/quickwit-common/src/ring_buffer.rs
new file mode 100644
index 00000000000..5d884d8188f
--- /dev/null
+++ b/quickwit/quickwit-common/src/ring_buffer.rs
@@ -0,0 +1,170 @@
+// Copyright 2021-Present Datadog, Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+use std::fmt::{Debug, Formatter};
+
+/// Fixed-size buffer that keeps the last N elements pushed into it.
+///
+/// `head` is the write cursor. It advances by one on each push and wraps
+/// back to 0 when it reaches N, overwriting the oldest element.
+///
+/// ```text
+/// RingBuffer<u32, 4> after pushing 1, 2, 3, 4, 5, 6:
+///
+///   buffer = [5, 6, 3, 4]    head = 2    len = 4
+///                 ^
+///                 next write goes here
+///
+///   logical view (oldest → newest): [3, 4, 5, 6]
+/// ```
+pub struct RingBuffer<T, const N: usize> {
+    buffer: [T; N],
+    head: usize,
+    len: usize,
+}
+
+impl<T: Copy + Default, const N: usize> Default for RingBuffer<T, N> {
+    fn default() -> Self {
+        Self {
+            buffer: [T::default(); N],
+            head: 0,
+            len: 0,
+        }
+    }
+}
+
+impl<T: Copy + Default + Debug, const N: usize> Debug for RingBuffer<T, N> {
+    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
+        f.debug_list().entries(self.iter()).finish()
+    }
+}
+
+impl<T: Copy + Default, const N: usize> RingBuffer<T, N> {
+    /// Appends `value`, overwriting the oldest element when full; a no-op when `N == 0`.
+    pub fn push_back(&mut self, value: T) {
+        if let Some(slot) = self.buffer.get_mut(self.head) {
+            *slot = value;
+            self.head = (self.head + 1) % N;
+            self.len = (self.len + 1).min(N);
+        }
+    }
+
+    /// Returns the most recently pushed element, or `None` if the buffer is empty.
+    pub fn last(&self) -> Option<T> {
+        (self.len > 0).then(|| self.buffer[(self.head + N - 1) % N])
+    }
+
+    /// Returns the oldest element still stored, or `None` if the buffer is empty.
+    pub fn front(&self) -> Option<T> {
+        (self.len > 0).then(|| self.buffer[(self.head + N - self.len) % N])
+    }
+
+    /// Returns the number of elements currently stored (at most `N`).
+    pub fn len(&self) -> usize {
+        self.len
+    }
+
+    /// Returns `true` if the buffer holds no element.
+    pub fn is_empty(&self) -> bool {
+        self.len == 0
+    }
+
+    /// Iterates over the stored elements from oldest to newest.
+    pub fn iter(&self) -> impl Iterator<Item = &T> + '_ {
+        // Computed lazily: an empty zero-capacity buffer must never evaluate `% 0`.
+        (0..self.len).map(move |i| &self.buffer[(self.head + N - self.len + i) % N])
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    /// Collects the logical (oldest → newest) view of `ring`.
+    fn contents<const N: usize>(ring: &RingBuffer<u32, N>) -> Vec<u32> {
+        ring.iter().copied().collect()
+    }
+
+    #[test]
+    fn test_empty() {
+        let ring = RingBuffer::<u32, 4>::default();
+        assert!(ring.is_empty());
+        assert_eq!(ring.len(), 0);
+        assert_eq!(ring.last(), None);
+        assert_eq!(ring.front(), None);
+        assert_eq!(ring.iter().count(), 0);
+    }
+
+    #[test]
+    fn test_single_push() {
+        let mut ring = RingBuffer::<u32, 4>::default();
+        ring.push_back(10);
+        assert_eq!(ring.len(), 1);
+        assert!(!ring.is_empty());
+        assert_eq!(ring.last(), Some(10));
+        assert_eq!(ring.front(), Some(10));
+        assert_eq!(contents(&ring), vec![10]);
+    }
+
+    #[test]
+    fn test_partial_fill() {
+        let mut ring = RingBuffer::<u32, 4>::default();
+        (1..=3).for_each(|value| ring.push_back(value));
+        assert_eq!(ring.len(), 3);
+        assert_eq!(ring.last(), Some(3));
+        assert_eq!(ring.front(), Some(1));
+        assert_eq!(contents(&ring), vec![1, 2, 3]);
+    }
+
+    #[test]
+    fn test_exactly_full() {
+        let mut ring = RingBuffer::<u32, 4>::default();
+        // Filling all 4 slots must not evict anything.
+        (1..=4).for_each(|value| ring.push_back(value));
+        assert_eq!(ring.len(), 4);
+        assert_eq!(ring.last(), Some(4));
+        assert_eq!(ring.front(), Some(1));
+        assert_eq!(contents(&ring), vec![1, 2, 3, 4]);
+    }
+
+    #[test]
+    fn test_wrap_around() {
+        let mut ring = RingBuffer::<u32, 4>::default();
+        // Pushing 6 values into 4 slots overwrites the two oldest ones (1 and 2).
+        (1..=6).for_each(|value| ring.push_back(value));
+        assert_eq!(ring.len(), 4);
+        assert_eq!(ring.last(), Some(6));
+        assert_eq!(ring.front(), Some(3));
+        assert_eq!(contents(&ring), vec![3, 4, 5, 6]);
+    }
+
+    #[test]
+    fn test_many_wraps() {
+        let mut ring = RingBuffer::<u32, 3>::default();
+        // Only the last 3 of the 100 pushed values survive.
+        (1..=100).for_each(|value| ring.push_back(value));
+        assert_eq!(ring.len(), 3);
+        assert_eq!(ring.last(), Some(100));
+        assert_eq!(ring.front(), Some(98));
+        assert_eq!(contents(&ring), vec![98, 99, 100]);
+    }
+
+    #[test]
+    fn test_debug() {
+        let mut ring = RingBuffer::<u32, 3>::default();
+        ring.push_back(1);
+        ring.push_back(2);
+        assert_eq!(format!("{:?}", ring), "[1, 2]");
+    }
+}
diff --git a/quickwit/quickwit-common/src/shared_consts.rs b/quickwit/quickwit-common/src/shared_consts.rs
index 9923705f0b2..437058f28fb 100644
--- a/quickwit/quickwit-common/src/shared_consts.rs
+++ b/quickwit/quickwit-common/src/shared_consts.rs
@@ -64,6 +64,9 @@ pub const SCROLL_BATCH_LEN: usize = 1_000;
 /// Prefix used in chitchat to broadcast the list of primary shards hosted by a leader.
 pub const INGESTER_PRIMARY_SHARDS_PREFIX: &str = "ingester.primary_shards:";
 
+/// Prefix used in chitchat to broadcast per-source ingester capacity scores and open shard counts.
+pub const INGESTER_CAPACITY_SCORE_PREFIX: &str = "ingester.capacity_score:";
+
 /// File name for the encoded list of fields in the split
 pub const SPLIT_FIELDS_FILE_NAME: &str = "split_fields";
 
diff --git a/quickwit/quickwit-ingest/Cargo.toml b/quickwit/quickwit-ingest/Cargo.toml
index 3dfa0bf6c0c..3149f2aaaf3 100644
--- a/quickwit/quickwit-ingest/Cargo.toml
+++ b/quickwit/quickwit-ingest/Cargo.toml
@@ -18,6 +18,7 @@ bytesize = { workspace = true }
 fail = { workspace = true, optional = true }
 futures = { workspace = true }
 http = { workspace = true }
+itertools = { workspace = true }
 mockall = { workspace = true, optional = true }
 mrecordlog = { workspace = true }
 once_cell = { workspace = true }
@@ -43,7 +44,6 @@ quickwit-doc-mapper = { workspace = true, features = ["testsuite"] }
 quickwit-proto = { workspace = true }
 
 [dev-dependencies]
-itertools = { workspace = true }
 mockall = { workspace = true }
 rand = { workspace = true }
 rand_distr = { workspace = true }
diff --git a/quickwit/quickwit-ingest/src/ingest_v2/broadcast/ingester_capacity_score.rs b/quickwit/quickwit-ingest/src/ingest_v2/broadcast/ingester_capacity_score.rs
new file mode 100644
index 00000000000..6f8abc66ef8
--- /dev/null
+++ b/quickwit/quickwit-ingest/src/ingest_v2/broadcast/ingester_capacity_score.rs
@@ -0,0 +1,455 @@
+// Copyright 2021-Present Datadog, Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+use std::collections::BTreeSet;
+
+use anyhow::{Context, Result};
+use bytesize::ByteSize;
+use quickwit_cluster::{Cluster, ListenerHandle};
+use quickwit_common::pubsub::{Event, EventBroker};
+use quickwit_common::ring_buffer::RingBuffer;
+use quickwit_common::shared_consts::INGESTER_CAPACITY_SCORE_PREFIX;
+use quickwit_proto::ingest::ingester::IngesterStatus;
+use quickwit_proto::types::{IndexUid, NodeId, SourceId, SourceUid};
+use serde::{Deserialize, Serialize};
+use tokio::task::JoinHandle;
+use tracing::{info, warn};
+
+use super::{BROADCAST_INTERVAL_PERIOD, make_key, parse_key};
+use crate::ingest_v2::state::WeakIngesterState;
+
+pub type OpenShardCounts = Vec<(IndexUid, SourceId, usize)>;
+
+/// The lookback window length is meant to capture readings far enough back in time to give
+/// a rough rate of change estimate. At size 6, with broadcast interval of 5 seconds, this would be
+/// 30 seconds of readings.
+const WAL_CAPACITY_LOOKBACK_WINDOW_LEN: usize = 6;
+
+/// The ring buffer stores one extra element so that `delta()` can compare the newest reading
+/// with the one that is exactly `WAL_CAPACITY_LOOKBACK_WINDOW_LEN` steps ago. Otherwise, that
+/// reading would be discarded when the next reading is inserted.
+const WAL_CAPACITY_READINGS_LEN: usize = WAL_CAPACITY_LOOKBACK_WINDOW_LEN + 1;
+
+/// Time series of the fraction of WAL memory still available (`0.0` = full, `1.0` = empty).
+struct WalMemoryCapacityTimeSeries {
+    readings: RingBuffer<f64, WAL_CAPACITY_READINGS_LEN>,
+}
+
+impl WalMemoryCapacityTimeSeries {
+    fn new() -> Self {
+        let readings = RingBuffer::default();
+        Self { readings }
+    }
+
+    /// Records the remaining capacity `1 - used / allocated`, clamped to `[0, 1]`.
+    /// When nothing is allocated yet, the WAL is recorded as entirely free (`1.0`).
+    fn record(&mut self, memory_used: ByteSize, memory_allocated: ByteSize) {
+        let remaining = match memory_allocated.as_u64() {
+            0 => 1.0,
+            allocated => 1.0 - (memory_used.as_u64() as f64 / allocated as f64),
+        };
+        self.readings.push_back(remaining.clamp(0.0, 1.0));
+    }
+
+    /// Returns the latest remaining-capacity reading, if any.
+    fn current(&self) -> Option<f64> {
+        self.readings.last()
+    }
+
+    /// Difference in remaining capacity between the newest and the oldest stored readings.
+    /// Positive = improving, negative = draining.
+    fn delta(&self) -> Option<f64> {
+        let oldest = self.readings.front()?;
+        self.readings.last().map(|newest| newest - oldest)
+    }
+}
+
+/// At or below this remaining capacity fraction, the score is immediately 0.
+const MIN_PERMISSIBLE_CAPACITY: f64 = 0.05;
+/// Weight of the proportional term (max points from P).
+const PROPORTIONAL_WEIGHT: f64 = 8.0;
+/// Weight of the derivative term (max points from D).
+const DERIVATIVE_WEIGHT: f64 = 2.0;
+/// The drain rate (as a fraction of total capacity over the lookback window) at which the
+/// derivative penalty is fully applied. Drain rates beyond this are clamped.
+const MAX_DRAIN_RATE: f64 = 0.10;
+
+/// Computes a capacity score from 0 to 10 using a PD controller.
+///
+/// The score has two components:
+///
+/// - **P (proportional):** How much WAL capacity remains right now. An ingester with 100% free
+///   capacity gets `PROPORTIONAL_WEIGHT` points; 50% gets half; and so on. If remaining capacity
+///   drops to `MIN_PERMISSIBLE_CAPACITY` or below, the score is immediately 0.
+///
+/// - **D (derivative):** Up to `DERIVATIVE_WEIGHT` bonus points based on how fast remaining
+///   capacity is changing over the lookback window. A higher drain rate is worse, so we invert it:
+///   `drain / MAX_DRAIN_RATE` normalizes the drain to a 0–1 penalty, and subtracting from 1
+///   converts it into a 0–1 bonus. Multiplied by `DERIVATIVE_WEIGHT`, a stable node gets the full
+///   bonus and a node draining at `MAX_DRAIN_RATE` or faster gets nothing.
+///
+/// Putting it together: a completely idle ingester scores 10 (8 + 2).
+/// One that is almost full but stable scores ~2. One that is draining rapidly scores less.
+/// A score of 0 means the ingester is at or below minimum permissible capacity.
+/// The sum of both terms is truncated (not rounded) to an integer.
+fn compute_capacity_score(remaining_capacity: f64, capacity_delta: f64) -> usize {
+    if remaining_capacity <= MIN_PERMISSIBLE_CAPACITY {
+        return 0;
+    }
+    let p = PROPORTIONAL_WEIGHT * remaining_capacity;
+    let drain = (-capacity_delta).clamp(0.0, MAX_DRAIN_RATE);
+    let d = DERIVATIVE_WEIGHT * (1.0 - drain / MAX_DRAIN_RATE);
+    (p + d).clamp(0.0, 10.0) as usize
+}
+
+#[derive(Debug, Clone, Default, Serialize, Deserialize)]
+pub struct IngesterCapacityScore {
+    pub capacity_score: usize, // Score from 0 (WAL nearly full) to 10 (idle).
+    pub open_shard_count: usize, // Open shard count broadcast for one source.
+}
+
+/// Periodically snapshots the ingester's WAL memory usage and open shard counts, computes
+/// a capacity score, and broadcasts it to other nodes via Chitchat.
+pub(crate) struct BroadcastIngesterCapacityScoreTask {
+    cluster: Cluster,
+    weak_state: WeakIngesterState,
+    wal_capacity_time_series: WalMemoryCapacityTimeSeries,
+}
+
+impl BroadcastIngesterCapacityScoreTask {
+    /// Spawns the broadcast loop as a Tokio task and returns its handle.
+    pub fn spawn(cluster: Cluster, weak_state: WeakIngesterState) -> JoinHandle<()> {
+        let mut task = Self {
+            cluster,
+            weak_state,
+            wal_capacity_time_series: WalMemoryCapacityTimeSeries::new(),
+        };
+        tokio::spawn(async move { task.run().await })
+    }
+
+    /// Reads the WAL memory usage (used, allocated) and the open shard counts under the state
+    /// lock. Returns `Ok(None)` while the ingester is not ready yet.
+    async fn snapshot(&self) -> Result<Option<(ByteSize, ByteSize, OpenShardCounts)>> {
+        let state = self
+            .weak_state
+            .upgrade()
+            .context("ingester state has been dropped")?;
+
+        // `lock_fully` asserts that the ingester is ready. This task may run before the WAL is
+        // loaded, so check the status first and skip the snapshot until the ingester is ready.
+        if *state.status_rx.borrow() != IngesterStatus::Ready {
+            return Ok(None);
+        }
+        let state_guard = state
+            .lock_fully()
+            .await
+            .map_err(|_| anyhow::anyhow!("failed to acquire ingester state lock"))?;
+        let wal_usage = state_guard.mrecordlog.resource_usage();
+        let memory_used = ByteSize::b(wal_usage.memory_used_bytes as u64);
+        let memory_allocated = ByteSize::b(wal_usage.memory_allocated_bytes as u64);
+        let open_shard_counts = state_guard.get_open_shard_counts();
+        Ok(Some((memory_used, memory_allocated, open_shard_counts)))
+    }
+
+    /// Broadcast loop: one reading and one broadcast per tick; stops at the first snapshot error.
+    async fn run(&mut self) {
+        let mut ticker = tokio::time::interval(BROADCAST_INTERVAL_PERIOD);
+        let mut broadcast_sources: BTreeSet<SourceUid> = BTreeSet::new();
+
+        loop {
+            ticker.tick().await;
+
+            let (memory_used, memory_allocated, open_shard_counts) = match self.snapshot().await {
+                Ok(Some(snapshot)) => snapshot,
+                Ok(None) => continue,
+                Err(error) => {
+                    info!("stopping ingester capacity broadcast: {error}");
+                    return;
+                }
+            };
+            let time_series = &mut self.wal_capacity_time_series;
+            time_series.record(memory_used, memory_allocated);
+
+            let remaining_capacity = time_series.current().unwrap_or(1.0);
+            let capacity_delta = time_series.delta().unwrap_or(0.0);
+            let capacity_score = compute_capacity_score(remaining_capacity, capacity_delta);
+
+            broadcast_sources = self
+                .broadcast_capacity(capacity_score, &open_shard_counts, &broadcast_sources)
+                .await;
+        }
+    }
+
+    /// Sets one key per entry of `open_shard_counts` and removes the keys of `previous_sources`
+    /// that are no longer present. Returns the sources that were just broadcast.
+    async fn broadcast_capacity(
+        &self,
+        capacity_score: usize,
+        open_shard_counts: &OpenShardCounts,
+        previous_sources: &BTreeSet<SourceUid>,
+    ) -> BTreeSet<SourceUid> {
+        let mut current_sources = BTreeSet::new();
+        for (index_uid, source_id, open_shard_count) in open_shard_counts {
+            let source_uid = SourceUid {
+                index_uid: index_uid.clone(),
+                source_id: source_id.clone(),
+            };
+            let payload = IngesterCapacityScore {
+                capacity_score,
+                open_shard_count: *open_shard_count,
+            };
+            let value = serde_json::to_string(&payload)
+                .expect("`IngesterCapacityScore` should be JSON serializable");
+            let key = make_key(INGESTER_CAPACITY_SCORE_PREFIX, &source_uid);
+            self.cluster.set_self_key_value(key, value).await;
+            current_sources.insert(source_uid);
+        }
+        for stale_source in previous_sources.difference(&current_sources) {
+            let key = make_key(INGESTER_CAPACITY_SCORE_PREFIX, stale_source);
+            self.cluster.remove_self_key(&key).await;
+        }
+        current_sources
+    }
+}
+
+#[derive(Debug, Clone)]
+pub struct IngesterCapacityScoreUpdate {
+    pub node_id: NodeId, // Taken from `event.node` of the Chitchat event.
+    pub source_uid: SourceUid, // Parsed from the Chitchat key.
+    pub capacity_score: usize, // Copied from the decoded `IngesterCapacityScore`.
+    pub open_shard_count: usize, // Copied from the decoded `IngesterCapacityScore`.
+}
+
+impl Event for IngesterCapacityScoreUpdate {}
+
+/// Subscribes to capacity score keys and republishes each event as an
+/// [`IngesterCapacityScoreUpdate`]; keys or values that fail to parse are logged and skipped.
+pub async fn setup_ingester_capacity_update_listener(
+    cluster: Cluster,
+    event_broker: EventBroker,
+) -> ListenerHandle {
+    cluster
+        .subscribe(INGESTER_CAPACITY_SCORE_PREFIX, move |event| {
+            let Some(source_uid) = parse_key(event.key) else {
+                warn!("failed to parse source UID from key `{}`", event.key);
+                return;
+            };
+            let Ok(score) = serde_json::from_str::<IngesterCapacityScore>(event.value) else {
+                warn!("failed to parse ingester capacity `{}`", event.value);
+                return;
+            };
+            event_broker.publish(IngesterCapacityScoreUpdate {
+                node_id: event.node.node_id.clone().into(),
+                source_uid,
+                capacity_score: score.capacity_score,
+                open_shard_count: score.open_shard_count,
+            });
+        })
+        .await
+}
+
+#[cfg(test)]
+mod tests {
+    use std::sync::Arc;
+    use std::sync::atomic::{AtomicUsize, Ordering};
+
+    use quickwit_cluster::{ChannelTransport, create_cluster_for_test};
+    use quickwit_proto::types::ShardId;
+
+    use super::*;
+    use crate::ingest_v2::models::IngesterShard;
+    use crate::ingest_v2::state::IngesterState;
+
+    fn ts() -> WalMemoryCapacityTimeSeries {
+        WalMemoryCapacityTimeSeries::new()
+    }
+
+    /// Helper: records one reading of `used` bytes out of `allocated` bytes into `series`.
+    fn record(series: &mut WalMemoryCapacityTimeSeries, used: u64, allocated: u64) {
+        series.record(ByteSize::b(used), ByteSize::b(allocated));
+    }
+
+    #[test]
+    fn test_wal_memory_capacity_current_after_record() {
+        let mut series = ts();
+        // 192 of 256 used => 25% remaining
+        record(&mut series, 192, 256);
+        assert_eq!(series.current(), Some(0.25));
+
+        // 16 of 256 used => 93.75% remaining
+        record(&mut series, 16, 256);
+        assert_eq!(series.current(), Some(0.9375));
+    }
+
+    #[test]
+    fn test_wal_memory_capacity_record_saturates_at_zero() {
+        let mut series = ts();
+        record(&mut series, 200, 100); // used > allocated => negative value, clamped to 0
+        assert_eq!(series.current(), Some(0.0));
+    }
+
+    #[test]
+    fn test_wal_memory_capacity_delta_growing() {
+        let mut series = ts();
+        // oldest: 60 of 100 used => 40% remaining
+        record(&mut series, 60, 100);
+        // current: 20 of 100 used => 80% remaining
+        record(&mut series, 20, 100);
+        // delta = 0.80 - 0.40 = 0.40
+        assert_eq!(series.delta(), Some(0.40));
+    }
+
+    #[test]
+    fn test_wal_memory_capacity_delta_shrinking() {
+        let mut series = ts();
+        // oldest: 20 of 100 used => 80% remaining
+        record(&mut series, 20, 100);
+        // current: 60 of 100 used => 40% remaining
+        record(&mut series, 60, 100);
+        // delta = 0.40 - 0.80 = -0.40
+        assert_eq!(series.delta(), Some(-0.40));
+    }
+
+    #[test]
+    fn test_capacity_score_draining_vs_stable() {
+        // Node A: capacity draining — usage increases 10, 20, ..., 70 over 7 ticks.
+        let mut node_a = ts();
+        for used in (10..=70).step_by(10) {
+            record(&mut node_a, used, 100);
+        }
+        let a_remaining = node_a.current().unwrap();
+        let a_delta = node_a.delta().unwrap();
+        let a_score = compute_capacity_score(a_remaining, a_delta);
+
+        // Node B: steady at 50% usage over 7 ticks.
+        let mut node_b = ts();
+        for _ in 0..7 {
+            record(&mut node_b, 50, 100);
+        }
+        let b_remaining = node_b.current().unwrap();
+        let b_delta = node_b.delta().unwrap();
+        let b_score = compute_capacity_score(b_remaining, b_delta);
+
+        // p = 8 * 0.3 = 2.4, d = 0 (drain of 0.6 exceeds the max drain rate) => truncated to 2
+        assert_eq!(a_score, 2);
+        // p = 8 * 0.5 = 4, d = 2 (stable) => 6
+        assert_eq!(b_score, 6);
+        assert!(b_score > a_score);
+    }
+
+    #[tokio::test]
+    async fn test_snapshot_state_dropped() {
+        let transport = ChannelTransport::default();
+        let cluster = create_cluster_for_test(Vec::new(), &["indexer"], &transport, true)
+            .await
+            .unwrap();
+        let (_temp_dir, state) = IngesterState::for_test().await;
+        let weak_state = state.weak();
+        drop(state);
+
+        let task = BroadcastIngesterCapacityScoreTask {
+            cluster,
+            weak_state,
+            wal_capacity_time_series: WalMemoryCapacityTimeSeries::new(),
+        };
+        assert!(task.snapshot().await.is_err());
+    }
+
+    #[tokio::test]
+    async fn test_broadcast_ingester_capacity() {
+        let transport = ChannelTransport::default();
+        let cluster = create_cluster_for_test(Vec::new(), &["indexer"], &transport, true)
+            .await
+            .unwrap();
+        let event_broker = EventBroker::default();
+
+        let (_temp_dir, state) = IngesterState::for_test().await;
+        let index_uid = IndexUid::for_test("test-index", 0);
+        let mut state_guard = state.lock_partially().await.unwrap();
+        let shard = IngesterShard::new_solo(
+            index_uid.clone(),
+            SourceId::from("test-source"),
+            ShardId::from(0),
+        )
+        .advertisable()
+        .build();
+        state_guard.shards.insert(shard.queue_id(), shard);
+        let open_shard_counts = state_guard.get_open_shard_counts();
+        drop(state_guard);
+
+        // Simulate 500 of 1000 bytes used => 50% remaining, 0 delta => score = 4 + 2 = 6
+        let mut task = BroadcastIngesterCapacityScoreTask {
+            cluster: cluster.clone(),
+            weak_state: state.weak(),
+            wal_capacity_time_series: WalMemoryCapacityTimeSeries::new(),
+        };
+        task.wal_capacity_time_series
+            .record(ByteSize::b(500), ByteSize::b(1000));
+
+        let remaining = task.wal_capacity_time_series.current().unwrap();
+        let delta = task.wal_capacity_time_series.delta().unwrap();
+        let capacity_score = compute_capacity_score(remaining, delta);
+        assert_eq!(capacity_score, 6);
+
+        let update_counter = Arc::new(AtomicUsize::new(0));
+        let update_counter_clone = update_counter.clone();
+        let index_uid_clone = index_uid.clone();
+        let _sub = event_broker.subscribe(move |event: IngesterCapacityScoreUpdate| {
+            update_counter_clone.fetch_add(1, Ordering::Release);
+            assert_eq!(event.source_uid.index_uid, index_uid_clone);
+            assert_eq!(event.source_uid.source_id, "test-source");
+            assert_eq!(event.capacity_score, 6);
+            assert_eq!(event.open_shard_count, 1);
+        });
+
+        let _listener =
+            setup_ingester_capacity_update_listener(cluster.clone(), event_broker).await;
+
+        let previous_sources = BTreeSet::new();
+        task.broadcast_capacity(capacity_score, &open_shard_counts, &previous_sources)
+            .await;
+        tokio::time::sleep(BROADCAST_INTERVAL_PERIOD * 2).await;
+
+        assert_eq!(update_counter.load(Ordering::Acquire), 1);
+
+        let source_uid = SourceUid {
+            index_uid: index_uid.clone(),
+            source_id: SourceId::from("test-source"),
+        };
+        let key = make_key(INGESTER_CAPACITY_SCORE_PREFIX, &source_uid);
+        let value = cluster.get_self_key_value(&key).await.unwrap();
+        let deserialized: IngesterCapacityScore = serde_json::from_str(&value).unwrap();
+        assert_eq!(deserialized.capacity_score, 6);
+        assert_eq!(deserialized.open_shard_count, 1);
+    }
+
+    #[test]
+    fn test_wal_memory_capacity_delta_spans_lookback_window() {
+        let mut series = ts();
+
+        // Fill to exactly the lookback window length (6 readings), all same value.
+        for _ in 0..WAL_CAPACITY_LOOKBACK_WINDOW_LEN {
+            record(&mut series, 50, 100);
+        }
+        assert_eq!(series.delta(), Some(0.0));
+
+        // 7th reading (100% remaining) fills the ring buffer. Delta spans 6 intervals.
+        record(&mut series, 0, 100);
+        assert_eq!(series.delta(), Some(0.50));
+
+        // 8th reading evicts the oldest reading (50% remaining). Delta still spans 6 intervals.
+        record(&mut series, 0, 100);
+        assert_eq!(series.delta(), Some(0.50));
+    }
+}
diff --git a/quickwit/quickwit-ingest/src/ingest_v2/broadcast.rs b/quickwit/quickwit-ingest/src/ingest_v2/broadcast/local_shards.rs
similarity index 91%
rename from quickwit/quickwit-ingest/src/ingest_v2/broadcast.rs
rename to quickwit/quickwit-ingest/src/ingest_v2/broadcast/local_shards.rs
index 9bbbe94bb47..6ba10915f56 100644
--- a/quickwit/quickwit-ingest/src/ingest_v2/broadcast.rs
+++ b/quickwit/quickwit-ingest/src/ingest_v2/broadcast/local_shards.rs
@@ -18,6 +18,7 @@ use std::time::Duration;
 use bytesize::ByteSize;
 use quickwit_cluster::{Cluster, ListenerHandle};
 use quickwit_common::pubsub::{Event, EventBroker};
+use quickwit_common::ring_buffer::RingBuffer;
 use quickwit_common::shared_consts::INGESTER_PRIMARY_SHARDS_PREFIX;
 use quickwit_common::sorted_iter::{KeyDiff, SortedByKeyIterator};
 use quickwit_common::tower::{ConstantRate, Rate};
@@ -27,15 +28,10 @@ use serde::{Deserialize, Serialize, Serializer};
 use tokio::task::JoinHandle;
 use tracing::{debug, warn};
 
-use super::metrics::INGEST_V2_METRICS;
-use super::state::WeakIngesterState;
+use super::{BROADCAST_INTERVAL_PERIOD, make_key, parse_key};
 use crate::RateMibPerSec;
-
-const BROADCAST_INTERVAL_PERIOD: Duration = if cfg!(test) {
-    Duration::from_millis(50)
-} else {
-    Duration::from_secs(5)
-};
+use crate::ingest_v2::metrics::INGEST_V2_METRICS;
+use crate::ingest_v2::state::WeakIngesterState;
 
 const ONE_MIB: ByteSize = ByteSize::mib(1);
 
@@ -152,7 +148,7 @@ impl LocalShardsSnapshot {
 
 /// Takes a snapshot of the primary shards hosted by the ingester at regular intervals and
 /// broadcasts it to other nodes via Chitchat.
-pub(super) struct BroadcastLocalShardsTask {
+pub struct BroadcastLocalShardsTask {
     cluster: Cluster,
     weak_state: WeakIngesterState,
     shard_throughput_time_series_map: ShardThroughputTimeSeriesMap,
@@ -229,36 +225,24 @@ impl ShardThroughputTimeSeriesMap {
 #[derive(Default)]
 struct ShardThroughputTimeSeries {
     shard_state: ShardState,
-    measurements: [ByteSize; SHARD_THROUGHPUT_LONG_TERM_WINDOW_LEN],
-    len: usize,
+    throughput: RingBuffer<ByteSize, SHARD_THROUGHPUT_LONG_TERM_WINDOW_LEN>,
 }
 
 impl ShardThroughputTimeSeries {
     fn last(&self) -> ByteSize {
-        self.measurements.last().copied().unwrap_or_default()
+        self.throughput.last().unwrap_or_default()
     }
 
     fn average(&self) -> ByteSize {
-        if self.len == 0 {
+        if self.throughput.is_empty() {
             return ByteSize::default();
         }
-        let sum = self
-            .measurements
-            .iter()
-            .rev()
-            .take(self.len)
-            .map(ByteSize::as_u64)
-            .sum::<u64>();
-        ByteSize::b(sum / self.len as u64)
+        let sum = self.throughput.iter().map(ByteSize::as_u64).sum::<u64>();
+        ByteSize::b(sum / self.throughput.len() as u64)
     }
 
     fn record(&mut self, new_throughput_measurement: ByteSize) {
-        self.len = (self.len + 1).min(SHARD_THROUGHPUT_LONG_TERM_WINDOW_LEN);
-        self.measurements.rotate_left(1);
-        let Some(last_measurement) = self.measurements.last_mut() else {
-            return;
-        };
-        *last_measurement = new_throughput_measurement;
+        self.throughput.push_back(new_throughput_measurement);
     }
 }
 
@@ -338,13 +322,13 @@ impl BroadcastLocalShardsTask {
                     source_uid,
                     shard_infos,
                 } => {
-                    let key = make_key(source_uid);
+                    let key = make_key(INGESTER_PRIMARY_SHARDS_PREFIX, source_uid);
                     let value = serde_json::to_string(&shard_infos)
                         .expect("`ShardInfos` should be JSON serializable");
                     self.cluster.set_self_key_value(key, value).await;
                 }
                 ShardInfosChange::Removed { source_uid } => {
-                    let key = make_key(source_uid);
+                    let key = make_key(INGESTER_PRIMARY_SHARDS_PREFIX, source_uid);
                     self.cluster.remove_self_key(&key).await;
                 }
             }
@@ -371,22 +355,6 @@ impl BroadcastLocalShardsTask {
     }
 }
 
-fn make_key(source_uid: &SourceUid) -> String {
-    format!(
-        "{INGESTER_PRIMARY_SHARDS_PREFIX}{}:{}",
-        source_uid.index_uid, source_uid.source_id
-    )
-}
-
-fn parse_key(key: &str) -> Option<SourceUid> {
-    let (index_uid_str, source_id_str) = key.rsplit_once(':')?;
-
-    Some(SourceUid {
-        index_uid: index_uid_str.parse().ok()?,
-        source_id: source_id_str.to_string(),
-    })
-}
-
 #[derive(Debug, Clone)]
 pub struct LocalShardsUpdate {
     pub leader_id: NodeId,
@@ -429,10 +397,12 @@ mod tests {
     use std::sync::atomic::{AtomicUsize, Ordering};
 
     use quickwit_cluster::{ChannelTransport, create_cluster_for_test};
+    use quickwit_common::shared_consts::INGESTER_PRIMARY_SHARDS_PREFIX;
     use quickwit_proto::ingest::ShardState;
-    use quickwit_proto::types::{IndexUid, SourceId};
+    use quickwit_proto::types::{IndexUid, NodeId, ShardId, SourceId, SourceUid};
 
     use super::*;
+    use crate::RateMibPerSec;
     use crate::ingest_v2::models::IngesterShard;
     use crate::ingest_v2::state::IngesterState;
 
@@ -626,30 +596,6 @@ mod tests {
         assert!(value_opt.is_none());
     }
 
-    #[test]
-    fn test_make_key() {
-        let source_uid = SourceUid {
-            index_uid: IndexUid::for_test("test-index", 0),
-            source_id: SourceId::from("test-source"),
-        };
-        let key = make_key(&source_uid);
-        assert_eq!(
-            key,
-            "ingester.primary_shards:test-index:00000000000000000000000000:test-source"
-        );
-    }
-
-    #[test]
-    fn test_parse_key() {
-        let key = "test-index:00000000000000000000000000:test-source";
-        let source_uid = parse_key(key).unwrap();
-        assert_eq!(
-            &source_uid.index_uid.to_string(),
-            "test-index:00000000000000000000000000"
-        );
-        assert_eq!(source_uid.source_id, "test-source".to_string());
-    }
-
     #[tokio::test]
     async fn test_local_shards_update_listener() {
         let transport = ChannelTransport::default();
@@ -686,7 +632,7 @@ mod tests {
             index_uid: index_uid.clone(),
             source_id: SourceId::from("test-source"),
         };
-        let key = make_key(&source_uid);
+        let key = make_key(INGESTER_PRIMARY_SHARDS_PREFIX, &source_uid);
         let value = serde_json::to_string(&vec![ShardInfo {
             shard_id: ShardId::from(1),
             shard_state: ShardState::Open,
diff --git a/quickwit/quickwit-ingest/src/ingest_v2/broadcast/mod.rs b/quickwit/quickwit-ingest/src/ingest_v2/broadcast/mod.rs
new file mode 100644
index 00000000000..d2184a0e392
--- /dev/null
+++ b/quickwit/quickwit-ingest/src/ingest_v2/broadcast/mod.rs
@@ -0,0 +1,85 @@
+// Copyright 2021-Present Datadog, Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#[allow(dead_code)]
+mod ingester_capacity_score;
+mod local_shards;
+
+use std::time::Duration;
+
+use quickwit_proto::types::SourceUid;
+
+pub(in crate::ingest_v2) const BROADCAST_INTERVAL_PERIOD: Duration = if cfg!(test) {
+    Duration::from_millis(50)
+} else {
+    Duration::from_secs(5)
+};
+
+pub use local_shards::{
+    BroadcastLocalShardsTask, LocalShardsUpdate, ShardInfo, ShardInfos,
+    setup_local_shards_update_listener,
+};
+
+/// Builds the key `{prefix}{index_uid}:{source_id}` for `source_uid`.
+fn make_key(prefix: &str, source_uid: &SourceUid) -> String {
+    let index_uid = &source_uid.index_uid;
+    let source_id = &source_uid.source_id;
+    format!("{prefix}{index_uid}:{source_id}")
+}
+
+/// Parses a `{index_uid}:{source_id}` string back into a `SourceUid`.
+///
+/// The string is split at its last `:`. Returns `None` if it contains no `:` or if the part
+/// before that separator does not parse as an index UID.
+fn parse_key(key: &str) -> Option<SourceUid> {
+    let (raw_index_uid, raw_source_id) = key.rsplit_once(':')?;
+    let index_uid = raw_index_uid.parse().ok()?;
+    let source_id = raw_source_id.to_string();
+    Some(SourceUid {
+        index_uid,
+        source_id,
+    })
+}
+
+#[cfg(test)]
+mod tests {
+    use quickwit_common::shared_consts::INGESTER_PRIMARY_SHARDS_PREFIX;
+    use quickwit_proto::types::{IndexUid, SourceId, SourceUid};
+
+    use super::*;
+
+    #[test]
+    fn test_make_key() {
+        let index_uid = IndexUid::for_test("test-index", 0);
+        let source_id = SourceId::from("test-source");
+        let source_uid = SourceUid {
+            index_uid,
+            source_id,
+        };
+        assert_eq!(
+            make_key(INGESTER_PRIMARY_SHARDS_PREFIX, &source_uid),
+            "ingester.primary_shards:test-index:00000000000000000000000000:test-source"
+        );
+    }
+
+    #[test]
+    fn test_parse_key() {
+        let parsed = parse_key("test-index:00000000000000000000000000:test-source").unwrap();
+        assert_eq!(
+            parsed.index_uid.to_string(),
+            "test-index:00000000000000000000000000"
+        );
+        assert_eq!(parsed.source_id, "test-source");
+    }
+}
diff --git a/quickwit/quickwit-ingest/src/ingest_v2/state.rs b/quickwit/quickwit-ingest/src/ingest_v2/state.rs
index 591ef4f704f..bf1c648c6cb 100644
--- a/quickwit/quickwit-ingest/src/ingest_v2/state.rs
+++ b/quickwit/quickwit-ingest/src/ingest_v2/state.rs
@@ -19,6 +19,7 @@ use std::path::Path;
 use std::sync::{Arc, Weak};
 use std::time::{Duration, Instant};
 
+use itertools::Itertools;
 use mrecordlog::error::{DeleteQueueError, TruncateError};
 use quickwit_common::pretty::PrettyDisplay;
 use quickwit_common::rate_limiter::{RateLimiter, RateLimiterSettings};
@@ -87,6 +88,17 @@ impl InnerIngesterState {
             .max_by_key(|(available_permits, _)| *available_permits)
             .map(|(_, shard)| shard)
     }
+
+    pub fn get_open_shard_counts(&self) -> Vec<(IndexUid, SourceId, usize)> {
+        self.shards
+            .values()
+            .filter(|shard| shard.is_advertisable && !shard.is_replica() && shard.is_open())
+            .map(|shard| (shard.index_uid.clone(), shard.source_id.clone()))
+            .counts()
+            .into_iter()
+            .map(|((index_uid, source_id), count)| (index_uid, source_id, count))
+            .collect()
+    }
 }
 
 impl IngesterState {
@@ -467,7 +479,7 @@ impl WeakIngesterState {
 #[cfg(test)]
 mod tests {
     use bytesize::ByteSize;
-    use quickwit_proto::types::ShardId;
+    use quickwit_proto::types::{NodeId, ShardId, SourceId};
     use tokio::time::timeout;
 
     use super::*;
@@ -642,4 +654,77 @@ mod tests {
             locked_state.find_most_capacity_shard_mut(&index_uid, &SourceId::from("other-source"));
         assert!(shard_opt.is_none());
     }
+
+    fn open_shard(
+        index_uid: IndexUid,
+        source_id: SourceId,
+        shard_id: ShardId,
+        is_replica: bool,
+    ) -> IngesterShard {
+        let builder = if is_replica {
+            IngesterShard::new_replica(index_uid, source_id, shard_id, NodeId::from("test-leader"))
+        } else {
+            IngesterShard::new_solo(index_uid, source_id, shard_id)
+        };
+        builder.advertisable().build()
+    }
+
+    #[tokio::test]
+    async fn test_get_open_shard_counts() {
+        let (_temp_dir, state) = IngesterState::for_test().await;
+        let mut state_guard = state.lock_partially().await.unwrap();
+
+        let index_a = IndexUid::for_test("index-a", 0);
+        let index_b = IndexUid::for_test("index-b", 0);
+        let index_c = IndexUid::for_test("index-c", 0);
+
+        // (index-a, source-a): 1 open solo shard.
+        let s = open_shard(
+            index_a.clone(),
+            SourceId::from("source-a"),
+            ShardId::from(1),
+            false,
+        );
+        state_guard.shards.insert(s.queue_id(), s);
+
+        // (index-b, source-b): 1 open solo + 1 replica. Only the solo should be counted.
+        let s = open_shard(
+            index_b.clone(),
+            SourceId::from("source-b"),
+            ShardId::from(2),
+            false,
+        );
+        state_guard.shards.insert(s.queue_id(), s);
+        let s = open_shard(
+            index_b.clone(),
+            SourceId::from("source-b"),
+            ShardId::from(3),
+            true,
+        );
+        state_guard.shards.insert(s.queue_id(), s);
+
+        // (index-c, source-c): 2 open solo shards.
+        let s = open_shard(
+            index_c.clone(),
+            SourceId::from("source-c"),
+            ShardId::from(4),
+            false,
+        );
+        state_guard.shards.insert(s.queue_id(), s);
+        let s = open_shard(
+            index_c.clone(),
+            SourceId::from("source-c"),
+            ShardId::from(5),
+            false,
+        );
+        state_guard.shards.insert(s.queue_id(), s);
+
+        let mut counts = state_guard.get_open_shard_counts();
+        counts.sort_by(|a, b| a.0.cmp(&b.0));
+
+        assert_eq!(counts.len(), 3);
+        assert_eq!(counts[0], (index_a, SourceId::from("source-a"), 1));
+        assert_eq!(counts[1], (index_b, SourceId::from("source-b"), 1));
+        assert_eq!(counts[2], (index_c, SourceId::from("source-c"), 2));
+    }
 }

From 6b22b1d4e066ffb2b7d01568a53ad73962cc87a7 Mon Sep 17 00:00:00 2001
From: nadav-govari <nadav.govari@datadoghq.com>
Date: Thu, 19 Feb 2026 13:44:03 -0500
Subject: [PATCH 2/9] Implement node based routing table (#6159)

---
 .../broadcast/ingester_capacity_score.rs      |  79 ++--
 .../src/ingest_v2/broadcast/mod.rs            |   4 +
 .../quickwit-ingest/src/ingest_v2/ingester.rs |   5 +-
 quickwit/quickwit-ingest/src/ingest_v2/mod.rs |   7 +-
 .../src/ingest_v2/node_routing_table.rs       | 417 ++++++++++++++++++
 .../quickwit-ingest/src/ingest_v2/router.rs   |  28 +-
 .../quickwit-ingest/src/ingest_v2/state.rs    |   4 +-
 7 files changed, 499 insertions(+), 45 deletions(-)
 create mode 100644 quickwit/quickwit-ingest/src/ingest_v2/node_routing_table.rs

diff --git a/quickwit/quickwit-ingest/src/ingest_v2/broadcast/ingester_capacity_score.rs b/quickwit/quickwit-ingest/src/ingest_v2/broadcast/ingester_capacity_score.rs
index 6f8abc66ef8..1927eb788f7 100644
--- a/quickwit/quickwit-ingest/src/ingest_v2/broadcast/ingester_capacity_score.rs
+++ b/quickwit/quickwit-ingest/src/ingest_v2/broadcast/ingester_capacity_score.rs
@@ -42,23 +42,22 @@ const WAL_CAPACITY_LOOKBACK_WINDOW_LEN: usize = 6;
 const WAL_CAPACITY_READINGS_LEN: usize = WAL_CAPACITY_LOOKBACK_WINDOW_LEN + 1;
 
 struct WalMemoryCapacityTimeSeries {
+    memory_capacity: ByteSize,
     readings: RingBuffer<f64, WAL_CAPACITY_READINGS_LEN>,
 }
 
 impl WalMemoryCapacityTimeSeries {
-    fn new() -> Self {
+    fn new(memory_capacity: ByteSize) -> Self {
+        #[cfg(not(test))]
+        assert!(memory_capacity.as_u64() > 0);
         Self {
+            memory_capacity,
             readings: RingBuffer::default(),
         }
     }
 
-    fn record(&mut self, memory_used: ByteSize, memory_allocated: ByteSize) {
-        let allocated = memory_allocated.as_u64();
-        if allocated == 0 {
-            self.readings.push_back(1.0);
-            return;
-        }
-        let remaining = 1.0 - (memory_used.as_u64() as f64 / allocated as f64);
+    fn record(&mut self, memory_used: ByteSize) {
+        let remaining = 1.0 - (memory_used.as_u64() as f64 / self.memory_capacity.as_u64() as f64);
         self.readings.push_back(remaining.clamp(0.0, 1.0));
     }
 
@@ -121,23 +120,27 @@ pub struct IngesterCapacityScore {
 
 /// Periodically snapshots the ingester's WAL memory usage and open shard counts, computes
 /// a capacity score, and broadcasts it to other nodes via Chitchat.
-pub(crate) struct BroadcastIngesterCapacityScoreTask {
+pub struct BroadcastIngesterCapacityScoreTask {
     cluster: Cluster,
     weak_state: WeakIngesterState,
     wal_capacity_time_series: WalMemoryCapacityTimeSeries,
 }
 
 impl BroadcastIngesterCapacityScoreTask {
-    pub fn spawn(cluster: Cluster, weak_state: WeakIngesterState) -> JoinHandle<()> {
+    pub fn spawn(
+        cluster: Cluster,
+        weak_state: WeakIngesterState,
+        memory_capacity: ByteSize,
+    ) -> JoinHandle<()> {
         let mut broadcaster = Self {
             cluster,
             weak_state,
-            wal_capacity_time_series: WalMemoryCapacityTimeSeries::new(),
+            wal_capacity_time_series: WalMemoryCapacityTimeSeries::new(memory_capacity),
         };
         tokio::spawn(async move { broadcaster.run().await })
     }
 
-    async fn snapshot(&self) -> Result<Option<(ByteSize, ByteSize, OpenShardCounts)>> {
+    async fn snapshot(&self) -> Result<Option<(ByteSize, OpenShardCounts)>> {
         let state = self
             .weak_state
             .upgrade()
@@ -155,10 +158,9 @@ impl BroadcastIngesterCapacityScoreTask {
             .map_err(|_| anyhow::anyhow!("failed to acquire ingester state lock"))?;
         let usage = guard.mrecordlog.resource_usage();
         let memory_used = ByteSize::b(usage.memory_used_bytes as u64);
-        let memory_allocated = ByteSize::b(usage.memory_allocated_bytes as u64);
         let open_shard_counts = guard.get_open_shard_counts();
 
-        Ok(Some((memory_used, memory_allocated, open_shard_counts)))
+        Ok(Some((memory_used, open_shard_counts)))
     }
 
     async fn run(&mut self) {
@@ -168,7 +170,7 @@ impl BroadcastIngesterCapacityScoreTask {
         loop {
             interval.tick().await;
 
-            let (memory_used, memory_allocated, open_shard_counts) = match self.snapshot().await {
+            let (memory_used, open_shard_counts) = match self.snapshot().await {
                 Ok(Some(snapshot)) => snapshot,
                 Ok(None) => continue,
                 Err(error) => {
@@ -177,8 +179,7 @@ impl BroadcastIngesterCapacityScoreTask {
                 }
             };
 
-            self.wal_capacity_time_series
-                .record(memory_used, memory_allocated);
+            self.wal_capacity_time_series.record(memory_used);
 
             let remaining_capacity = self.wal_capacity_time_series.current().unwrap_or(1.0);
             let capacity_delta = self.wal_capacity_time_series.delta().unwrap_or(0.0);
@@ -272,30 +273,31 @@ mod tests {
     use crate::ingest_v2::state::IngesterState;
 
     fn ts() -> WalMemoryCapacityTimeSeries {
-        WalMemoryCapacityTimeSeries::new()
+        WalMemoryCapacityTimeSeries::new(ByteSize::b(100))
     }
 
-    /// Helper: record a reading with `used` out of `allocated` bytes.
-    fn record(series: &mut WalMemoryCapacityTimeSeries, used: u64, allocated: u64) {
-        series.record(ByteSize::b(used), ByteSize::b(allocated));
+    /// Helper: record a reading with `used` bytes against the series' fixed capacity.
+    fn record(series: &mut WalMemoryCapacityTimeSeries, used: u64) {
+        series.record(ByteSize::b(used));
     }
 
     #[test]
     fn test_wal_memory_capacity_current_after_record() {
-        let mut series = ts();
+        let mut series = WalMemoryCapacityTimeSeries::new(ByteSize::b(256));
         // 192 of 256 used => 25% remaining
-        record(&mut series, 192, 256);
+        series.record(ByteSize::b(192));
         assert_eq!(series.current(), Some(0.25));
 
         // 16 of 256 used => 93.75% remaining
-        record(&mut series, 16, 256);
+        series.record(ByteSize::b(16));
         assert_eq!(series.current(), Some(0.9375));
     }
 
     #[test]
     fn test_wal_memory_capacity_record_saturates_at_zero() {
         let mut series = ts();
-        record(&mut series, 200, 100);
+        // 200 used out of 100 capacity => clamped to 0.0
+        record(&mut series, 200);
         assert_eq!(series.current(), Some(0.0));
     }
 
@@ -303,9 +305,9 @@ mod tests {
     fn test_wal_memory_capacity_delta_growing() {
         let mut series = ts();
         // oldest: 60 of 100 used => 40% remaining
-        record(&mut series, 60, 100);
+        record(&mut series, 60);
         // current: 20 of 100 used => 80% remaining
-        record(&mut series, 20, 100);
+        record(&mut series, 20);
         // delta = 0.80 - 0.40 = 0.40
         assert_eq!(series.delta(), Some(0.40));
     }
@@ -314,9 +316,9 @@ mod tests {
     fn test_wal_memory_capacity_delta_shrinking() {
         let mut series = ts();
         // oldest: 20 of 100 used => 80% remaining
-        record(&mut series, 20, 100);
+        record(&mut series, 20);
         // current: 60 of 100 used => 40% remaining
-        record(&mut series, 60, 100);
+        record(&mut series, 60);
         // delta = 0.40 - 0.80 = -0.40
         assert_eq!(series.delta(), Some(-0.40));
     }
@@ -326,7 +328,7 @@ mod tests {
         // Node A: capacity draining — usage increases 10, 20, ..., 70 over 7 ticks.
         let mut node_a = ts();
         for used in (10..=70).step_by(10) {
-            record(&mut node_a, used, 100);
+            record(&mut node_a, used);
         }
         let a_remaining = node_a.current().unwrap();
         let a_delta = node_a.delta().unwrap();
@@ -335,7 +337,7 @@ mod tests {
         // Node B: steady at 50% usage over 7 ticks.
         let mut node_b = ts();
         for _ in 0..7 {
-            record(&mut node_b, 50, 100);
+            record(&mut node_b, 50);
         }
         let b_remaining = node_b.current().unwrap();
         let b_delta = node_b.delta().unwrap();
@@ -361,7 +363,7 @@ mod tests {
         let task = BroadcastIngesterCapacityScoreTask {
             cluster,
             weak_state,
-            wal_capacity_time_series: WalMemoryCapacityTimeSeries::new(),
+            wal_capacity_time_series: WalMemoryCapacityTimeSeries::new(ByteSize::mb(1)),
         };
         assert!(task.snapshot().await.is_err());
     }
@@ -388,14 +390,13 @@ mod tests {
         let open_shard_counts = state_guard.get_open_shard_counts();
         drop(state_guard);
 
-        // Simulate 500 of 1000 bytes used => 50% remaining, 0 delta => score = 6
+        // Simulate 500 of 1000 bytes capacity used => 50% remaining, 0 delta => score = 6
         let mut task = BroadcastIngesterCapacityScoreTask {
             cluster: cluster.clone(),
             weak_state: state.weak(),
-            wal_capacity_time_series: WalMemoryCapacityTimeSeries::new(),
+            wal_capacity_time_series: WalMemoryCapacityTimeSeries::new(ByteSize::b(1000)),
         };
-        task.wal_capacity_time_series
-            .record(ByteSize::b(500), ByteSize::b(1000));
+        task.wal_capacity_time_series.record(ByteSize::b(500));
 
         let remaining = task.wal_capacity_time_series.current().unwrap();
         let delta = task.wal_capacity_time_series.delta().unwrap();
@@ -440,16 +441,16 @@ mod tests {
 
         // Fill to exactly the lookback window length (6 readings), all same value.
         for _ in 0..WAL_CAPACITY_LOOKBACK_WINDOW_LEN {
-            record(&mut series, 50, 100);
+            record(&mut series, 50);
         }
         assert_eq!(series.delta(), Some(0.0));
 
         // 7th reading fills the ring buffer. Delta spans 6 intervals.
-        record(&mut series, 0, 100);
+        record(&mut series, 0);
         assert_eq!(series.delta(), Some(0.50));
 
         // 8th reading evicts the oldest 50-remaining. Delta still spans 6 intervals.
-        record(&mut series, 0, 100);
+        record(&mut series, 0);
         assert_eq!(series.delta(), Some(0.50));
     }
 }
diff --git a/quickwit/quickwit-ingest/src/ingest_v2/broadcast/mod.rs b/quickwit/quickwit-ingest/src/ingest_v2/broadcast/mod.rs
index d2184a0e392..18a00209de1 100644
--- a/quickwit/quickwit-ingest/src/ingest_v2/broadcast/mod.rs
+++ b/quickwit/quickwit-ingest/src/ingest_v2/broadcast/mod.rs
@@ -26,6 +26,10 @@ pub(in crate::ingest_v2) const BROADCAST_INTERVAL_PERIOD: Duration = if cfg!(tes
     Duration::from_secs(5)
 };
 
+pub use ingester_capacity_score::{
+    BroadcastIngesterCapacityScoreTask, IngesterCapacityScoreUpdate,
+    setup_ingester_capacity_update_listener,
+};
 pub use local_shards::{
     BroadcastLocalShardsTask, LocalShardsUpdate, ShardInfo, ShardInfos,
     setup_local_shards_update_listener,
diff --git a/quickwit/quickwit-ingest/src/ingest_v2/ingester.rs b/quickwit/quickwit-ingest/src/ingest_v2/ingester.rs
index 65c268881ac..fc1a44f19bb 100644
--- a/quickwit/quickwit-ingest/src/ingest_v2/ingester.rs
+++ b/quickwit/quickwit-ingest/src/ingest_v2/ingester.rs
@@ -59,7 +59,7 @@ use tokio::time::{sleep, timeout};
 use tracing::{debug, error, info, warn};
 
 use super::IngesterPool;
-use super::broadcast::BroadcastLocalShardsTask;
+use super::broadcast::{BroadcastIngesterCapacityScoreTask, BroadcastLocalShardsTask};
 use super::doc_mapper::validate_doc_batch;
 use super::fetch::FetchStreamTask;
 use super::idle::CloseIdleShardsTask;
@@ -144,7 +144,8 @@ impl Ingester {
         let state = IngesterState::load(wal_dir_path, rate_limiter_settings);
 
         let weak_state = state.weak();
-        BroadcastLocalShardsTask::spawn(cluster, weak_state.clone());
+        BroadcastLocalShardsTask::spawn(cluster.clone(), weak_state.clone());
+        BroadcastIngesterCapacityScoreTask::spawn(cluster, weak_state.clone(), memory_capacity);
         CloseIdleShardsTask::spawn(weak_state, idle_shard_timeout);
 
         let ingester = Self {
diff --git a/quickwit/quickwit-ingest/src/ingest_v2/mod.rs b/quickwit/quickwit-ingest/src/ingest_v2/mod.rs
index c8543faf793..3a801763feb 100644
--- a/quickwit/quickwit-ingest/src/ingest_v2/mod.rs
+++ b/quickwit/quickwit-ingest/src/ingest_v2/mod.rs
@@ -22,6 +22,8 @@ mod metrics;
 mod models;
 mod mrecord;
 mod mrecordlog_utils;
+#[allow(dead_code)]
+mod node_routing_table;
 mod publish_tracker;
 mod rate_meter;
 mod replication;
@@ -36,7 +38,10 @@ use std::ops::{Add, AddAssign};
 use std::time::Duration;
 use std::{env, fmt};
 
-pub use broadcast::{LocalShardsUpdate, ShardInfo, ShardInfos, setup_local_shards_update_listener};
+pub use broadcast::{
+    LocalShardsUpdate, ShardInfo, ShardInfos, setup_ingester_capacity_update_listener,
+    setup_local_shards_update_listener,
+};
 use bytes::buf::Writer;
 use bytes::{BufMut, BytesMut};
 use bytesize::ByteSize;
diff --git a/quickwit/quickwit-ingest/src/ingest_v2/node_routing_table.rs b/quickwit/quickwit-ingest/src/ingest_v2/node_routing_table.rs
new file mode 100644
index 00000000000..2e49e26b783
--- /dev/null
+++ b/quickwit/quickwit-ingest/src/ingest_v2/node_routing_table.rs
@@ -0,0 +1,417 @@
+// Copyright 2021-Present Datadog, Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+use std::collections::{HashMap, HashSet};
+
+use quickwit_proto::ingest::Shard;
+use quickwit_proto::types::{IndexId, IndexUid, NodeId, SourceId, SourceUid};
+use rand::rng;
+use rand::seq::IndexedRandom;
+
+use crate::IngesterPool;
+
+/// A single ingester node's routing-relevant data for a specific (index, source) pair.
+/// Each entry is self-describing: it carries its own node_id, index_uid, and source_id
+/// so it can always be attributed back to a specific source on a specific node.
+#[derive(Debug, Clone)]
+pub(super) struct IngesterNode {
+    pub node_id: NodeId,
+    pub index_uid: IndexUid,
+    pub source_id: SourceId,
+    /// Score from 0-10. Higher means more available capacity. Nodes scoring 0 are never picked.
+    pub capacity_score: usize,
+    /// Number of open shards on this node for this (index, source) pair. Nodes with no open shard
+    /// are never picked; otherwise this breaks capacity score ties in favor of more open shards.
+    pub open_shard_count: usize,
+}
+
+/// The ingester nodes known for one (index, source) pair, keyed by node ID.
+#[derive(Debug)]
+pub(super) struct RoutingEntry {
+    nodes: HashMap<NodeId, IngesterNode>,
+}
+
+/// Samples two of the candidates at random and returns the better one.
+/// Higher capacity_score wins; tiebreak on more open_shard_count (more landing spots).
+fn power_of_two_choices<'a>(candidates: &[&'a IngesterNode]) -> &'a IngesterNode {
+    debug_assert!(candidates.len() >= 2);
+    let rank = |node: &IngesterNode| (node.capacity_score, node.open_shard_count);
+    let mut sampled = candidates.choose_multiple(&mut rng(), 2);
+    let (&first, &second) = (sampled.next().unwrap(), sampled.next().unwrap());
+    if rank(first) >= rank(second) {
+        first
+    } else {
+        second
+    }
+}
+
+impl RoutingEntry {
+    /// Picks the ingester node to persist the request to.
+    ///
+    /// A node is eligible if it reports a non-zero capacity score and at least one open shard, is
+    /// in `ingester_pool`, and is not in `unavailable_leaders`. Returns `None` when no node is
+    /// eligible. With two or more eligible nodes, the pick uses power of two choices.
+    pub fn pick_node(
+        &self,
+        ingester_pool: &IngesterPool,
+        unavailable_leaders: &HashSet<NodeId>,
+    ) -> Option<&IngesterNode> {
+        let is_eligible = |node: &&IngesterNode| {
+            node.capacity_score > 0
+                && node.open_shard_count > 0
+                && ingester_pool.contains_key(&node.node_id)
+                && !unavailable_leaders.contains(&node.node_id)
+        };
+        let eligible: Vec<&IngesterNode> = self.nodes.values().filter(is_eligible).collect();
+        match eligible.as_slice() {
+            [] => None,
+            [only] => Some(*only),
+            candidates => Some(power_of_two_choices(candidates)),
+        }
+    }
+}
+
+/// Routing table holding, for each (index ID, source ID) pair, the ingester nodes known for it.
+#[derive(Debug, Default)]
+pub(super) struct NodeBasedRoutingTable {
+    table: HashMap<(IndexId, SourceId), RoutingEntry>,
+}
+
+impl NodeBasedRoutingTable {
+    /// Returns the routing entry of the given index and source, if the table has one.
+    pub fn find_entry(&self, index_id: &str, source_id: &str) -> Option<&RoutingEntry> {
+        let key = (index_id.to_string(), source_id.to_string());
+        self.table.get(&key)
+    }
+
+    /// Returns a JSON description of every node in the table, grouped by index ID.
+    pub fn debug_info(&self) -> HashMap<IndexId, Vec<serde_json::Value>> {
+        let mut per_index: HashMap<IndexId, Vec<serde_json::Value>> = HashMap::new();
+        for ((index_id, source_id), entry) in &self.table {
+            for (node_id, node) in &entry.nodes {
+                per_index
+                    .entry(index_id.clone())
+                    .or_default()
+                    .push(serde_json::json!({
+                        "source_id": source_id,
+                        "node_id": node_id,
+                        "capacity_score": node.capacity_score,
+                        "open_shard_count": node.open_shard_count,
+                    }));
+            }
+        }
+        per_index
+    }
+
+    /// Returns whether the given index and source have at least one node eligible for routing.
+    pub fn has_open_nodes(
+        &self,
+        index_id: &str,
+        source_id: &str,
+        ingester_pool: &IngesterPool,
+        unavailable_leaders: &HashSet<NodeId>,
+    ) -> bool {
+        let Some(entry) = self.find_entry(index_id, source_id) else {
+            return false;
+        };
+        entry.nodes.values().any(|node| {
+            node.capacity_score > 0
+                && node.open_shard_count > 0
+                && ingester_pool.contains_key(&node.node_id)
+                && !unavailable_leaders.contains(&node.node_id)
+        })
+    }
+
+    /// Applies a capacity update from the IngesterCapacityScoreUpdate broadcast, overwriting any
+    /// previous data for this node. This is the primary way the table learns about node capacity.
+    pub fn apply_capacity_update(
+        &mut self,
+        node_id: NodeId,
+        source_uid: SourceUid,
+        capacity_score: usize,
+        open_shard_count: usize,
+    ) {
+        let key = (
+            source_uid.index_uid.index_id.to_string(),
+            source_uid.source_id.clone(),
+        );
+
+        let entry = self.table.entry(key).or_insert_with(|| RoutingEntry {
+            nodes: HashMap::new(),
+        });
+
+        let ingester_node = IngesterNode {
+            node_id: node_id.clone(),
+            index_uid: source_uid.index_uid,
+            source_id: source_uid.source_id,
+            capacity_score,
+            open_shard_count,
+        };
+        entry.nodes.insert(node_id, ingester_node);
+    }
+
+    /// Merges nodes from a GetOrCreateOpenShards control plane response into the table. Each
+    /// leader of an open shard is added with its open shard count, unless the node is already
+    /// present: existing nodes keep the data they have, e.g. capacity scores from the broadcast.
+    /// TODO: New nodes get a default capacity_score of 5 until GetOrCreateOpenShards contains
+    /// capacity scores.
+    pub fn merge_from_shards(
+        &mut self,
+        index_uid: IndexUid,
+        source_id: SourceId,
+        shards: Vec<Shard>,
+    ) {
+        let key = (index_uid.index_id.to_string(), source_id.clone());
+
+        let mut per_leader_count: HashMap<NodeId, usize> = HashMap::new();
+        // `shards` is owned, so each leader ID is moved into its `NodeId` instead of cloned.
+        for shard in shards.into_iter().filter(|shard| shard.is_open()) {
+            *per_leader_count
+                .entry(NodeId::from(shard.leader_id))
+                .or_default() += 1;
+        }
+
+        let entry = self.table.entry(key).or_insert_with(|| RoutingEntry {
+            nodes: HashMap::new(),
+        });
+
+        for (node_id, open_shard_count) in per_leader_count {
+            // Single lookup: nodes already present are left untouched.
+            let slot = entry.nodes.entry(node_id);
+            slot.or_insert_with_key(|node_id| IngesterNode {
+                node_id: node_id.clone(),
+                index_uid: index_uid.clone(),
+                source_id: source_id.clone(),
+                capacity_score: 5,
+                open_shard_count,
+            });
+        }
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use quickwit_proto::ingest::ShardState;
+    use quickwit_proto::ingest::ingester::IngesterServiceClient;
+    use quickwit_proto::types::ShardId;
+
+    use super::*;
+
+    fn source_uid(index_id: &str, incarnation_id: u128, source_id: &str) -> SourceUid {
+        SourceUid {
+            index_uid: IndexUid::for_test(index_id, incarnation_id),
+            source_id: source_id.to_string(),
+        }
+    }
+
+    #[test]
+    fn test_apply_capacity_update() {
+        let mut table = NodeBasedRoutingTable::default();
+        let uid = source_uid("test-index", 0, "test-source");
+        let key = ("test-index".to_string(), "test-source".to_string());
+
+        // Insert first node.
+        table.apply_capacity_update("node-1".into(), uid.clone(), 8, 3);
+        let entry = table.table.get(&key).unwrap();
+        assert_eq!(entry.nodes.len(), 1);
+        assert_eq!(entry.nodes.get("node-1").unwrap().capacity_score, 8);
+
+        // Update existing node.
+        table.apply_capacity_update("node-1".into(), uid.clone(), 4, 5);
+        let node = table.table.get(&key).unwrap().nodes.get("node-1").unwrap();
+        assert_eq!(node.capacity_score, 4);
+        assert_eq!(node.open_shard_count, 5);
+
+        // Add second node.
+        table.apply_capacity_update("node-2".into(), uid.clone(), 6, 2);
+        assert_eq!(table.table.get(&key).unwrap().nodes.len(), 2);
+
+        // Zero shards: node stays in table but becomes ineligible for routing.
+        table.apply_capacity_update("node-1".into(), uid.clone(), 0, 0);
+        let entry = table.table.get(&key).unwrap();
+        assert_eq!(entry.nodes.len(), 2);
+        assert_eq!(entry.nodes.get("node-1").unwrap().open_shard_count, 0);
+        assert_eq!(entry.nodes.get("node-1").unwrap().capacity_score, 0);
+    }
+
+    #[test]
+    fn test_has_open_nodes() {
+        let mut table = NodeBasedRoutingTable::default();
+        let pool = IngesterPool::default();
+        let uid = source_uid("test-index", 0, "test-source");
+
+        // Empty table.
+        assert!(!table.has_open_nodes("test-index", "test-source", &pool, &HashSet::new()));
+
+        // Node exists but is not in pool.
+        table.apply_capacity_update("node-1".into(), uid.clone(), 8, 3);
+        assert!(!table.has_open_nodes("test-index", "test-source", &pool, &HashSet::new()));
+
+        // Node is in pool → true.
+        pool.insert("node-1".into(), IngesterServiceClient::mocked());
+        assert!(table.has_open_nodes("test-index", "test-source", &pool, &HashSet::new()));
+
+        // Node is unavailable → false.
+        let unavailable: HashSet<NodeId> = HashSet::from(["node-1".into()]);
+        assert!(!table.has_open_nodes("test-index", "test-source", &pool, &unavailable));
+
+        // Second node available → true despite first being unavailable.
+        table.apply_capacity_update("node-2".into(), uid.clone(), 6, 2);
+        pool.insert("node-2".into(), IngesterServiceClient::mocked());
+        assert!(table.has_open_nodes("test-index", "test-source", &pool, &unavailable));
+
+        // Node with capacity_score=0 is not eligible.
+        table.apply_capacity_update("node-2".into(), uid, 0, 2);
+        assert!(!table.has_open_nodes("test-index", "test-source", &pool, &unavailable));
+    }
+
+    #[test]
+    fn test_pick_node() {
+        let mut table = NodeBasedRoutingTable::default();
+        let pool = IngesterPool::default();
+        let uid = source_uid("test-index", 0, "test-source");
+        let key = ("test-index".to_string(), "test-source".to_string());
+
+        // Node exists but not in pool → None.
+        table.apply_capacity_update("node-1".into(), uid.clone(), 8, 3);
+        assert!(
+            table
+                .table
+                .get(&key)
+                .unwrap()
+                .pick_node(&pool, &HashSet::new())
+                .is_none()
+        );
+
+        // Single node in pool → picks it.
+        pool.insert("node-1".into(), IngesterServiceClient::mocked());
+        let picked = table
+            .table
+            .get(&key)
+            .unwrap()
+            .pick_node(&pool, &HashSet::new())
+            .unwrap();
+        assert_eq!(picked.node_id, NodeId::from("node-1"));
+
+        // Multiple nodes → something is returned.
+        table.apply_capacity_update("node-2".into(), uid.clone(), 2, 1);
+        pool.insert("node-2".into(), IngesterServiceClient::mocked());
+        assert!(
+            table
+                .table
+                .get(&key)
+                .unwrap()
+                .pick_node(&pool, &HashSet::new())
+                .is_some()
+        );
+
+        // Nodes with capacity_score=0 are skipped; with every node at 0 → None.
+        table.apply_capacity_update("node-1".into(), uid.clone(), 0, 3);
+        table.apply_capacity_update("node-2".into(), uid, 0, 1);
+        assert!(
+            table
+                .table
+                .get(&key)
+                .unwrap()
+                .pick_node(&pool, &HashSet::new())
+                .is_none()
+        );
+    }
+
+    #[test]
+    fn test_power_of_two_choices() {
+        // 3 candidates: best appears in the random pair 2/3 of the time and always
+        // wins when it does, so it should win ~67% of 1000 runs. Asserting > 550
+        // is ~7.8 standard deviations below the mean — effectively impossible to flake.
+        let high = IngesterNode {
+            node_id: "high".into(),
+            index_uid: IndexUid::for_test("idx", 0),
+            source_id: "src".into(),
+            capacity_score: 9,
+            open_shard_count: 2,
+        };
+        let mid = IngesterNode {
+            node_id: "mid".into(),
+            index_uid: IndexUid::for_test("idx", 0),
+            source_id: "src".into(),
+            capacity_score: 5,
+            open_shard_count: 2,
+        };
+        let low = IngesterNode {
+            node_id: "low".into(),
+            index_uid: IndexUid::for_test("idx", 0),
+            source_id: "src".into(),
+            capacity_score: 1,
+            open_shard_count: 2,
+        };
+        let candidates: Vec<&IngesterNode> = vec![&high, &mid, &low];
+
+        let mut high_wins = 0;
+        for _ in 0..1000 {
+            if power_of_two_choices(&candidates).node_id == "high" {
+                high_wins += 1;
+            }
+        }
+        assert!(high_wins > 550, "high won only {high_wins}/1000 times");
+    }
+
+    #[test]
+    fn test_merge_from_shards() {
+        let mut table = NodeBasedRoutingTable::default();
+        let index_uid = IndexUid::for_test("test-index", 0);
+        let key = ("test-index".to_string(), "test-source".to_string());
+
+        let make_shard = |id: u64, leader: &str, open: bool| Shard {
+            index_uid: Some(index_uid.clone()),
+            source_id: "test-source".to_string(),
+            shard_id: Some(ShardId::from(id)),
+            shard_state: if open {
+                ShardState::Open as i32
+            } else {
+                ShardState::Closed as i32
+            },
+            leader_id: leader.to_string(),
+            ..Default::default()
+        };
+
+        // Two open shards on node-1, one open on node-2, one closed (ignored).
+        let shards = vec![
+            make_shard(1, "node-1", true),
+            make_shard(2, "node-1", true),
+            make_shard(3, "node-2", true),
+            make_shard(4, "node-2", false),
+        ];
+        table.merge_from_shards(index_uid.clone(), "test-source".into(), shards);
+
+        let entry = table.table.get(&key).unwrap();
+        assert_eq!(entry.nodes.len(), 2);
+
+        let n1 = entry.nodes.get("node-1").unwrap();
+        assert_eq!(n1.open_shard_count, 2);
+        assert_eq!(n1.capacity_score, 5);
+
+        let n2 = entry.nodes.get("node-2").unwrap();
+        assert_eq!(n2.open_shard_count, 1);
+
+        // Merging again adds new nodes but preserves existing ones.
+        let shards = vec![make_shard(10, "node-3", true)];
+        table.merge_from_shards(index_uid, "test-source".into(), shards);
+
+        let entry = table.table.get(&key).unwrap();
+        assert_eq!(entry.nodes.len(), 3);
+        assert!(entry.nodes.contains_key("node-1"));
+        assert!(entry.nodes.contains_key("node-2"));
+        assert!(entry.nodes.contains_key("node-3"));
+    }
+}
diff --git a/quickwit/quickwit-ingest/src/ingest_v2/router.rs b/quickwit/quickwit-ingest/src/ingest_v2/router.rs
index 67ad31a2722..ccd00f0209c 100644
--- a/quickwit/quickwit-ingest/src/ingest_v2/router.rs
+++ b/quickwit/quickwit-ingest/src/ingest_v2/router.rs
@@ -43,12 +43,13 @@ use tokio::sync::{Mutex, Semaphore};
 use tokio::time::error::Elapsed;
 use tracing::{error, info};
 
-use super::broadcast::LocalShardsUpdate;
+use super::broadcast::{IngesterCapacityScoreUpdate, LocalShardsUpdate};
 use super::debouncing::{
     DebouncedGetOrCreateOpenShardsRequest, GetOrCreateOpenShardsRequestDebouncer,
 };
 use super::ingester::PERSIST_REQUEST_TIMEOUT;
 use super::metrics::IngestResultMetrics;
+use super::node_routing_table::NodeBasedRoutingTable;
 use super::routing_table::{NextOpenShardError, RoutingTable};
 use super::workbench::IngestWorkbench;
 use super::{IngesterPool, pending_subrequests};
@@ -105,6 +106,9 @@ struct RouterState {
     debouncer: GetOrCreateOpenShardsRequestDebouncer,
     // Holds the routing table mapping index and source IDs to shards.
     routing_table: RoutingTable,
+    // Node-based routing table, populated by capacity broadcasts.
+    // Not yet used for routing — will replace `routing_table` in a follow-up PR.
+    node_routing_table: NodeBasedRoutingTable,
 }
 
 impl fmt::Debug for IngestRouter {
@@ -130,6 +134,7 @@ impl IngestRouter {
                 self_node_id: self_node_id.clone(),
                 table: HashMap::default(),
             },
+            node_routing_table: NodeBasedRoutingTable::default(),
         }));
         let ingest_semaphore_permits = get_ingest_router_buffer_size().as_u64() as usize;
         let ingest_semaphore = Arc::new(Semaphore::new(ingest_semaphore_permits));
@@ -151,7 +156,10 @@ impl IngestRouter {
             .subscribe::<LocalShardsUpdate>(weak_router_state.clone())
             .forever();
         self.event_broker
-            .subscribe::<ShardPositionsUpdate>(weak_router_state)
+            .subscribe::<ShardPositionsUpdate>(weak_router_state.clone())
+            .forever();
+        self.event_broker
+            .subscribe::<IngesterCapacityScoreUpdate>(weak_router_state)
             .forever();
     }
 
@@ -694,6 +702,22 @@ impl EventSubscriber<ShardPositionsUpdate> for WeakRouterState {
     }
 }
 
+#[async_trait]
+impl EventSubscriber<IngesterCapacityScoreUpdate> for WeakRouterState {
+    async fn handle_event(&mut self, update: IngesterCapacityScoreUpdate) {
+        let Some(state) = self.0.upgrade() else {
+            return;
+        };
+        let mut state_guard = state.lock().await;
+        state_guard.node_routing_table.apply_capacity_update(
+            update.node_id,
+            update.source_uid,
+            update.capacity_score,
+            update.open_shard_count,
+        );
+    }
+}
+
 pub(super) struct PersistRequestSummary {
     pub leader_id: NodeId,
     pub subrequest_ids: Vec<SubrequestId>,
diff --git a/quickwit/quickwit-ingest/src/ingest_v2/state.rs b/quickwit/quickwit-ingest/src/ingest_v2/state.rs
index bf1c648c6cb..a14f4ae9a44 100644
--- a/quickwit/quickwit-ingest/src/ingest_v2/state.rs
+++ b/quickwit/quickwit-ingest/src/ingest_v2/state.rs
@@ -154,8 +154,10 @@ impl IngesterState {
     /// queues. Empty queues are deleted, while non-empty queues are recovered. However, the
     /// corresponding shards are closed and become read-only.
     pub async fn init(&self, wal_dir_path: &Path, rate_limiter_settings: RateLimiterSettings) {
-        let mut inner_guard = self.inner.lock().await;
+        // Acquire locks in the same order as `lock_fully` (mrecordlog first, then inner) to
+        // prevent ABBA deadlocks with the broadcast capacity task.
         let mut mrecordlog_guard = self.mrecordlog.write().await;
+        let mut inner_guard = self.inner.lock().await;
 
         let now = Instant::now();
 

From adb619a9a597dbf1bd105b67ee05cb20eb1b9108 Mon Sep 17 00:00:00 2001
From: nadav-govari <nadav.govari@datadoghq.com>
Date: Thu, 26 Feb 2026 13:56:54 -0500
Subject: [PATCH 3/9] Use new node based routing table for routing decisions
 (#6163)

---
 .../broadcast/ingester_capacity_score.rs      |   40 +-
 .../src/ingest_v2/debouncing.rs               |    4 +-
 .../quickwit-ingest/src/ingest_v2/ingester.rs |   48 +-
 quickwit/quickwit-ingest/src/ingest_v2/mod.rs |    1 +
 .../src/ingest_v2/node_routing_table.rs       |    2 +-
 .../quickwit-ingest/src/ingest_v2/router.rs   | 1247 +++++------------
 .../src/ingest_v2/workbench.rs                |   14 +-
 .../protos/quickwit/ingester.proto            |    5 +-
 .../quickwit/quickwit.ingest.ingester.rs      |   15 +-
 quickwit/quickwit-proto/src/ingest/mod.rs     |    9 +-
 quickwit/quickwit-serve/src/lib.rs            |    8 +-
 .../tag_fields/0002_negative_tags.yaml        |    8 +-
 .../scenarii/tag_fields/_setup.quickwit.yaml  |   10 +-
 .../tag_fields/_teardown.quickwit.yaml        |    2 +-
 14 files changed, 432 insertions(+), 981 deletions(-)

diff --git a/quickwit/quickwit-ingest/src/ingest_v2/broadcast/ingester_capacity_score.rs b/quickwit/quickwit-ingest/src/ingest_v2/broadcast/ingester_capacity_score.rs
index 1927eb788f7..9531db17deb 100644
--- a/quickwit/quickwit-ingest/src/ingest_v2/broadcast/ingester_capacity_score.rs
+++ b/quickwit/quickwit-ingest/src/ingest_v2/broadcast/ingester_capacity_score.rs
@@ -41,12 +41,12 @@ const WAL_CAPACITY_LOOKBACK_WINDOW_LEN: usize = 6;
 /// reading would be discarded when the next reading is inserted.
 const WAL_CAPACITY_READINGS_LEN: usize = WAL_CAPACITY_LOOKBACK_WINDOW_LEN + 1;
 
-struct WalMemoryCapacityTimeSeries {
+struct WalDiskCapacityTimeSeries {
     memory_capacity: ByteSize,
     readings: RingBuffer<f64, WAL_CAPACITY_READINGS_LEN>,
 }
 
-impl WalMemoryCapacityTimeSeries {
+impl WalDiskCapacityTimeSeries {
     fn new(memory_capacity: ByteSize) -> Self {
         #[cfg(not(test))]
         assert!(memory_capacity.as_u64() > 0);
@@ -123,19 +123,19 @@ pub struct IngesterCapacityScore {
 pub struct BroadcastIngesterCapacityScoreTask {
     cluster: Cluster,
     weak_state: WeakIngesterState,
-    wal_capacity_time_series: WalMemoryCapacityTimeSeries,
+    wal_capacity_time_series: WalDiskCapacityTimeSeries,
 }
 
 impl BroadcastIngesterCapacityScoreTask {
     pub fn spawn(
         cluster: Cluster,
         weak_state: WeakIngesterState,
-        memory_capacity: ByteSize,
+        disk_capacity: ByteSize,
     ) -> JoinHandle<()> {
         let mut broadcaster = Self {
             cluster,
             weak_state,
-            wal_capacity_time_series: WalMemoryCapacityTimeSeries::new(memory_capacity),
+            wal_capacity_time_series: WalDiskCapacityTimeSeries::new(disk_capacity),
         };
         tokio::spawn(async move { broadcaster.run().await })
     }
@@ -157,10 +157,10 @@ impl BroadcastIngesterCapacityScoreTask {
             .await
             .map_err(|_| anyhow::anyhow!("failed to acquire ingester state lock"))?;
         let usage = guard.mrecordlog.resource_usage();
-        let memory_used = ByteSize::b(usage.memory_used_bytes as u64);
+        let disk_used = ByteSize::b(usage.disk_used_bytes as u64);
         let open_shard_counts = guard.get_open_shard_counts();
 
-        Ok(Some((memory_used, open_shard_counts)))
+        Ok(Some((disk_used, open_shard_counts)))
     }
 
     async fn run(&mut self) {
@@ -170,7 +170,7 @@ impl BroadcastIngesterCapacityScoreTask {
         loop {
             interval.tick().await;
 
-            let (memory_used, open_shard_counts) = match self.snapshot().await {
+            let (disk_used, open_shard_counts) = match self.snapshot().await {
                 Ok(Some(snapshot)) => snapshot,
                 Ok(None) => continue,
                 Err(error) => {
@@ -179,7 +179,7 @@ impl BroadcastIngesterCapacityScoreTask {
                 }
             };
 
-            self.wal_capacity_time_series.record(memory_used);
+            self.wal_capacity_time_series.record(disk_used);
 
             let remaining_capacity = self.wal_capacity_time_series.current().unwrap_or(1.0);
             let capacity_delta = self.wal_capacity_time_series.delta().unwrap_or(0.0);
@@ -272,18 +272,18 @@ mod tests {
     use crate::ingest_v2::models::IngesterShard;
     use crate::ingest_v2::state::IngesterState;
 
-    fn ts() -> WalMemoryCapacityTimeSeries {
-        WalMemoryCapacityTimeSeries::new(ByteSize::b(100))
+    fn ts() -> WalDiskCapacityTimeSeries {
+        WalDiskCapacityTimeSeries::new(ByteSize::b(100))
     }
 
     /// Helper: record a reading with `used` bytes against the series' fixed capacity.
-    fn record(series: &mut WalMemoryCapacityTimeSeries, used: u64) {
+    fn record(series: &mut WalDiskCapacityTimeSeries, used: u64) {
         series.record(ByteSize::b(used));
     }
 
     #[test]
-    fn test_wal_memory_capacity_current_after_record() {
-        let mut series = WalMemoryCapacityTimeSeries::new(ByteSize::b(256));
+    fn test_wal_disk_capacity_current_after_record() {
+        let mut series = WalDiskCapacityTimeSeries::new(ByteSize::b(256));
         // 192 of 256 used => 25% remaining
         series.record(ByteSize::b(192));
         assert_eq!(series.current(), Some(0.25));
@@ -294,7 +294,7 @@ mod tests {
     }
 
     #[test]
-    fn test_wal_memory_capacity_record_saturates_at_zero() {
+    fn test_wal_disk_capacity_record_saturates_at_zero() {
         let mut series = ts();
         // 200 used out of 100 capacity => clamped to 0.0
         record(&mut series, 200);
@@ -302,7 +302,7 @@ mod tests {
     }
 
     #[test]
-    fn test_wal_memory_capacity_delta_growing() {
+    fn test_wal_disk_capacity_delta_growing() {
         let mut series = ts();
         // oldest: 60 of 100 used => 40% remaining
         record(&mut series, 60);
@@ -313,7 +313,7 @@ mod tests {
     }
 
     #[test]
-    fn test_wal_memory_capacity_delta_shrinking() {
+    fn test_wal_disk_capacity_delta_shrinking() {
         let mut series = ts();
         // oldest: 20 of 100 used => 80% remaining
         record(&mut series, 20);
@@ -363,7 +363,7 @@ mod tests {
         let task = BroadcastIngesterCapacityScoreTask {
             cluster,
             weak_state,
-            wal_capacity_time_series: WalMemoryCapacityTimeSeries::new(ByteSize::mb(1)),
+            wal_capacity_time_series: WalDiskCapacityTimeSeries::new(ByteSize::mb(1)),
         };
         assert!(task.snapshot().await.is_err());
     }
@@ -394,7 +394,7 @@ mod tests {
         let mut task = BroadcastIngesterCapacityScoreTask {
             cluster: cluster.clone(),
             weak_state: state.weak(),
-            wal_capacity_time_series: WalMemoryCapacityTimeSeries::new(ByteSize::b(1000)),
+            wal_capacity_time_series: WalDiskCapacityTimeSeries::new(ByteSize::b(1000)),
         };
         task.wal_capacity_time_series.record(ByteSize::b(500));
 
@@ -436,7 +436,7 @@ mod tests {
     }
 
     #[test]
-    fn test_wal_memory_capacity_delta_spans_lookback_window() {
+    fn test_wal_disk_capacity_delta_spans_lookback_window() {
         let mut series = ts();
 
         // Fill to exactly the lookback window length (6 readings), all same value.
diff --git a/quickwit/quickwit-ingest/src/ingest_v2/debouncing.rs b/quickwit/quickwit-ingest/src/ingest_v2/debouncing.rs
index 041f2928c45..19d6f5d691d 100644
--- a/quickwit/quickwit-ingest/src/ingest_v2/debouncing.rs
+++ b/quickwit/quickwit-ingest/src/ingest_v2/debouncing.rs
@@ -18,7 +18,6 @@ use std::sync::Arc;
 use quickwit_proto::control_plane::{
     GetOrCreateOpenShardsRequest, GetOrCreateOpenShardsSubrequest,
 };
-use quickwit_proto::ingest::ShardIds;
 use quickwit_proto::types::{IndexId, SourceId};
 use tokio::sync::{OwnedRwLockWriteGuard, RwLock};
 
@@ -69,7 +68,6 @@ impl GetOrCreateOpenShardsRequestDebouncer {
 #[derive(Default)]
 pub(super) struct DebouncedGetOrCreateOpenShardsRequest {
     subrequests: Vec<GetOrCreateOpenShardsSubrequest>,
-    pub closed_shards: Vec<ShardIds>,
     pub unavailable_leaders: Vec<String>,
     rendezvous: Rendezvous,
 }
@@ -85,8 +83,8 @@ impl DebouncedGetOrCreateOpenShardsRequest {
         }
         let request = GetOrCreateOpenShardsRequest {
             subrequests: self.subrequests,
-            closed_shards: self.closed_shards,
             unavailable_leaders: self.unavailable_leaders,
+            ..Default::default()
         };
         (Some(request), self.rendezvous)
     }
diff --git a/quickwit/quickwit-ingest/src/ingest_v2/ingester.rs b/quickwit/quickwit-ingest/src/ingest_v2/ingester.rs
index fc1a44f19bb..5cd231d5a4a 100644
--- a/quickwit/quickwit-ingest/src/ingest_v2/ingester.rs
+++ b/quickwit/quickwit-ingest/src/ingest_v2/ingester.rs
@@ -37,16 +37,7 @@ use quickwit_proto::control_plane::{
     AdviseResetShardsRequest, ControlPlaneService, ControlPlaneServiceClient,
 };
 use quickwit_proto::indexing::ShardPositionsUpdate;
-use quickwit_proto::ingest::ingester::{
-    AckReplicationMessage, CloseShardsRequest, CloseShardsResponse, DecommissionRequest,
-    DecommissionResponse, FetchMessage, IngesterService, IngesterServiceClient,
-    IngesterServiceStream, IngesterStatus, InitShardFailure, InitShardSuccess, InitShardsRequest,
-    InitShardsResponse, ObservationMessage, OpenFetchStreamRequest, OpenObservationStreamRequest,
-    OpenReplicationStreamRequest, OpenReplicationStreamResponse, PersistFailure,
-    PersistFailureReason, PersistRequest, PersistResponse, PersistSuccess, ReplicateFailureReason,
-    ReplicateSubrequest, RetainShardsForSource, RetainShardsRequest, RetainShardsResponse,
-    SynReplicationMessage, TruncateShardsRequest, TruncateShardsResponse,
-};
+use quickwit_proto::ingest::ingester::*;
 use quickwit_proto::ingest::{
     CommitTypeV2, DocBatchV2, IngestV2Error, IngestV2Result, ParseFailure, Shard, ShardIds,
 };
@@ -145,7 +136,7 @@ impl Ingester {
 
         let weak_state = state.weak();
         BroadcastLocalShardsTask::spawn(cluster.clone(), weak_state.clone());
-        BroadcastIngesterCapacityScoreTask::spawn(cluster, weak_state.clone(), memory_capacity);
+        BroadcastIngesterCapacityScoreTask::spawn(cluster, weak_state.clone(), disk_capacity);
         CloseIdleShardsTask::spawn(weak_state, idle_shard_timeout);
 
         let ingester = Self {
@@ -469,7 +460,7 @@ impl Ingester {
                     index_uid: subrequest.index_uid,
                     source_id: subrequest.source_id,
                     shard_id: subrequest.shard_id,
-                    reason: PersistFailureReason::ShardClosed as i32,
+                    reason: PersistFailureReason::NodeUnavailable as i32,
                 };
                 persist_failures.push(persist_failure);
             }
@@ -499,7 +490,7 @@ impl Ingester {
                         index_uid: subrequest.index_uid,
                         source_id: subrequest.source_id,
                         shard_id: subrequest.shard_id,
-                        reason: PersistFailureReason::ShardNotFound as i32,
+                        reason: PersistFailureReason::NoShardsAvailable as i32,
                     };
                     persist_failures.push(persist_failure);
                     continue;
@@ -558,7 +549,7 @@ impl Ingester {
                         index_uid: subrequest.index_uid,
                         source_id: subrequest.source_id,
                         shard_id: Some(shard_id),
-                        reason: PersistFailureReason::ShardRateLimited as i32,
+                        reason: PersistFailureReason::NoShardsAvailable as i32,
                     };
                     persist_failures.push(persist_failure);
                     continue;
@@ -673,7 +664,7 @@ impl Ingester {
                         // TODO: Handle replication error:
                         // 1. Close and evict all the shards hosted by the follower.
                         // 2. Close and evict the replication client.
-                        // 3. Return `PersistFailureReason::ShardClosed` to router.
+                        // 3. Return `PersistFailureReason::NodeUnavailable` to router.
                         continue;
                     }
                 };
@@ -689,14 +680,8 @@ impl Ingester {
                 for replicate_failure in replicate_response.failures {
                     // TODO: If the replica shard is closed, close the primary shard if it is not
                     // already.
-                    let persist_failure_reason = match replicate_failure.reason() {
-                        ReplicateFailureReason::Unspecified => PersistFailureReason::Unspecified,
-                        ReplicateFailureReason::ShardNotFound => {
-                            PersistFailureReason::ShardNotFound
-                        }
-                        ReplicateFailureReason::ShardClosed => PersistFailureReason::ShardClosed,
-                        ReplicateFailureReason::WalFull => PersistFailureReason::WalFull,
-                    };
+                    let persist_failure_reason: PersistFailureReason =
+                        replicate_failure.reason().into();
                     let persist_failure = PersistFailure {
                         subrequest_id: replicate_failure.subrequest_id,
                         index_uid: replicate_failure.index_uid,
@@ -736,7 +721,7 @@ impl Ingester {
                                     "failed to persist records to shard `{queue_id}`: {io_error}"
                                 );
                                 shards_to_close.insert(queue_id);
-                                PersistFailureReason::ShardClosed
+                                PersistFailureReason::NodeUnavailable
                             }
                             AppendDocBatchError::QueueNotFound(_) => {
                                 error!(
@@ -744,7 +729,7 @@ impl Ingester {
                                      not found"
                                 );
                                 shards_to_delete.insert(queue_id);
-                                PersistFailureReason::ShardNotFound
+                                PersistFailureReason::NodeUnavailable
                             }
                         };
                         let persist_failure = PersistFailure {
@@ -2159,7 +2144,7 @@ mod tests {
         let persist_failure = &persist_response.failures[0];
         assert_eq!(
             persist_failure.reason(),
-            PersistFailureReason::ShardRateLimited
+            PersistFailureReason::NoShardsAvailable
         );
     }
 
@@ -2222,7 +2207,10 @@ mod tests {
         assert_eq!(persist_failure.index_uid(), &index_uid);
         assert_eq!(persist_failure.source_id, "test-source");
         assert_eq!(persist_failure.shard_id(), ShardId::from(1));
-        assert_eq!(persist_failure.reason(), PersistFailureReason::ShardClosed,);
+        assert_eq!(
+            persist_failure.reason(),
+            PersistFailureReason::NodeUnavailable,
+        );
 
         let state_guard = ingester.state.lock_fully().await.unwrap();
         let shard = state_guard.shards.get(&queue_id).unwrap();
@@ -2274,7 +2262,7 @@ mod tests {
         assert_eq!(persist_failure.shard_id(), ShardId::from(1));
         assert_eq!(
             persist_failure.reason(),
-            PersistFailureReason::ShardNotFound
+            PersistFailureReason::NodeUnavailable
         );
 
         let state_guard = ingester.state.lock_fully().await.unwrap();
@@ -2704,7 +2692,7 @@ mod tests {
         assert_eq!(persist_failure.shard_id(), ShardId::from(1));
         assert_eq!(
             persist_failure.reason(),
-            PersistFailureReason::ShardNotFound
+            PersistFailureReason::NoShardsAvailable
         );
 
         let state_guard = ingester.state.lock_fully().await.unwrap();
@@ -2783,7 +2771,7 @@ mod tests {
         assert_eq!(persist_failure.shard_id(), ShardId::from(1));
         assert_eq!(
             persist_failure.reason(),
-            PersistFailureReason::ShardRateLimited
+            PersistFailureReason::NoShardsAvailable
         );
 
         let state_guard = ingester.state.lock_fully().await.unwrap();
diff --git a/quickwit/quickwit-ingest/src/ingest_v2/mod.rs b/quickwit/quickwit-ingest/src/ingest_v2/mod.rs
index 3a801763feb..0bb3d6b6138 100644
--- a/quickwit/quickwit-ingest/src/ingest_v2/mod.rs
+++ b/quickwit/quickwit-ingest/src/ingest_v2/mod.rs
@@ -28,6 +28,7 @@ mod publish_tracker;
 mod rate_meter;
 mod replication;
 mod router;
+#[allow(dead_code)]
 mod routing_table;
 mod state;
 mod workbench;
diff --git a/quickwit/quickwit-ingest/src/ingest_v2/node_routing_table.rs b/quickwit/quickwit-ingest/src/ingest_v2/node_routing_table.rs
index 2e49e26b783..f354011ede2 100644
--- a/quickwit/quickwit-ingest/src/ingest_v2/node_routing_table.rs
+++ b/quickwit/quickwit-ingest/src/ingest_v2/node_routing_table.rs
@@ -38,7 +38,7 @@ pub(super) struct IngesterNode {
 
 #[derive(Debug)]
 pub(super) struct RoutingEntry {
-    nodes: HashMap<NodeId, IngesterNode>,
+    pub nodes: HashMap<NodeId, IngesterNode>,
 }
 
 /// Given a slice of candidates, picks the better of two random choices.
diff --git a/quickwit/quickwit-ingest/src/ingest_v2/router.rs b/quickwit/quickwit-ingest/src/ingest_v2/router.rs
index ccd00f0209c..da3d989d93e 100644
--- a/quickwit/quickwit-ingest/src/ingest_v2/router.rs
+++ b/quickwit/quickwit-ingest/src/ingest_v2/router.rs
@@ -27,33 +27,29 @@ use quickwit_proto::control_plane::{
     ControlPlaneService, ControlPlaneServiceClient, GetOrCreateOpenShardsRequest,
     GetOrCreateOpenShardsSubrequest,
 };
-use quickwit_proto::indexing::ShardPositionsUpdate;
 use quickwit_proto::ingest::ingester::{
     IngesterService, PersistFailureReason, PersistRequest, PersistResponse, PersistSubrequest,
 };
 use quickwit_proto::ingest::router::{
     IngestFailureReason, IngestRequestV2, IngestResponseV2, IngestRouterService,
 };
-use quickwit_proto::ingest::{
-    CommitTypeV2, IngestV2Error, IngestV2Result, RateLimitingCause, ShardState,
-};
-use quickwit_proto::types::{IndexUid, NodeId, ShardId, SourceId, SubrequestId};
+use quickwit_proto::ingest::{CommitTypeV2, IngestV2Error, IngestV2Result, RateLimitingCause};
+use quickwit_proto::types::{NodeId, SubrequestId};
 use serde_json::{Value as JsonValue, json};
 use tokio::sync::{Mutex, Semaphore};
 use tokio::time::error::Elapsed;
 use tracing::{error, info};
 
-use super::broadcast::{IngesterCapacityScoreUpdate, LocalShardsUpdate};
+use super::broadcast::IngesterCapacityScoreUpdate;
 use super::debouncing::{
     DebouncedGetOrCreateOpenShardsRequest, GetOrCreateOpenShardsRequestDebouncer,
 };
 use super::ingester::PERSIST_REQUEST_TIMEOUT;
 use super::metrics::IngestResultMetrics;
 use super::node_routing_table::NodeBasedRoutingTable;
-use super::routing_table::{NextOpenShardError, RoutingTable};
 use super::workbench::IngestWorkbench;
 use super::{IngesterPool, pending_subrequests};
-use crate::{LeaderId, get_ingest_router_buffer_size};
+use crate::get_ingest_router_buffer_size;
 
 /// Duration after which ingest requests time out with [`IngestV2Error::Timeout`].
 fn ingest_request_timeout() -> Duration {
@@ -102,12 +98,7 @@ pub struct IngestRouter {
 }
 
 struct RouterState {
-    // Debounces `GetOrCreateOpenShardsRequest` requests to the control plane.
     debouncer: GetOrCreateOpenShardsRequestDebouncer,
-    // Holds the routing table mapping index and source IDs to shards.
-    routing_table: RoutingTable,
-    // Node-based routing table, populated by capacity broadcasts.
-    // Not yet used for routing — will replace `routing_table` in a follow-up PR.
     node_routing_table: NodeBasedRoutingTable,
 }
 
@@ -130,10 +121,6 @@ impl IngestRouter {
     ) -> Self {
         let state = Arc::new(Mutex::new(RouterState {
             debouncer: GetOrCreateOpenShardsRequestDebouncer::default(),
-            routing_table: RoutingTable {
-                self_node_id: self_node_id.clone(),
-                table: HashMap::default(),
-            },
             node_routing_table: NodeBasedRoutingTable::default(),
         }));
         let ingest_semaphore_permits = get_ingest_router_buffer_size().as_u64() as usize;
@@ -152,12 +139,6 @@ impl IngestRouter {
 
     pub fn subscribe(&self) {
         let weak_router_state = WeakRouterState(Arc::downgrade(&self.state));
-        self.event_broker
-            .subscribe::<LocalShardsUpdate>(weak_router_state.clone())
-            .forever();
-        self.event_broker
-            .subscribe::<ShardPositionsUpdate>(weak_router_state.clone())
-            .forever();
         self.event_broker
             .subscribe::<IngesterCapacityScoreUpdate>(weak_router_state)
             .forever();
@@ -171,22 +152,19 @@ impl IngestRouter {
         ingester_pool: &IngesterPool,
     ) -> DebouncedGetOrCreateOpenShardsRequest {
         let mut debounced_request = DebouncedGetOrCreateOpenShardsRequest::default();
-
-        // `closed_shards` and `unavailable_leaders` are populated by calls to `has_open_shards`
-        // as we're looking for open shards to route the subrequests to.
-        let unavailable_leaders: &mut HashSet<NodeId> = &mut workbench.unavailable_leaders;
+        let unavailable_leaders = &workbench.unavailable_leaders;
 
         let mut state_guard = self.state.lock().await;
 
         for subrequest in pending_subrequests(&workbench.subworkbenches) {
-            if !state_guard.routing_table.has_open_shards(
+            if !state_guard.node_routing_table.has_open_nodes(
                 &subrequest.index_id,
                 &subrequest.source_id,
                 ingester_pool,
-                &mut debounced_request.closed_shards,
                 unavailable_leaders,
             ) {
-                // No shard available! Let's attempt to create one.
+                // No known nodes with open shards for this source. Ask the control
+                // plane to create shards so we have somewhere to route to.
                 let acquire_result = state_guard
                     .debouncer
                     .acquire(&subrequest.index_id, &subrequest.source_id);
@@ -208,9 +186,6 @@ impl IngestRouter {
         }
         drop(state_guard);
 
-        if !debounced_request.is_empty() && !debounced_request.closed_shards.is_empty() {
-            info!(closed_shards=?debounced_request.closed_shards, "reporting closed shard(s) to control plane");
-        }
         if !debounced_request.is_empty() && !unavailable_leaders.is_empty() {
             info!(unavailable_leaders=?unavailable_leaders, "reporting unavailable leader(s) to control plane");
 
@@ -267,7 +242,7 @@ impl IngestRouter {
         let mut state_guard = self.state.lock().await;
 
         for success in response.successes {
-            state_guard.routing_table.replace_shards(
+            state_guard.node_routing_table.merge_from_shards(
                 success.index_uid().clone(),
                 success.source_id,
                 success.open_shards,
@@ -285,8 +260,7 @@ impl IngestRouter {
         workbench: &mut IngestWorkbench,
         mut persist_futures: FuturesUnordered<impl Future<Output = PersistResult>>,
     ) {
-        let mut closed_shards: HashMap<(IndexUid, SourceId), Vec<ShardId>> = HashMap::new();
-        let mut deleted_shards: HashMap<(IndexUid, SourceId), Vec<ShardId>> = HashMap::new();
+        let mut unavailable_leaders: HashSet<NodeId> = HashSet::new();
 
         while let Some((persist_summary, persist_result)) = persist_futures.next().await {
             match persist_result {
@@ -298,33 +272,12 @@ impl IngestRouter {
                         workbench.record_persist_failure(&persist_failure);
 
                         match persist_failure.reason() {
-                            PersistFailureReason::ShardClosed => {
-                                let shard_id = persist_failure.shard_id().clone();
-                                let index_uid: IndexUid = persist_failure.index_uid().clone();
-                                let source_id: SourceId = persist_failure.source_id;
-                                closed_shards
-                                    .entry((index_uid, source_id))
-                                    .or_default()
-                                    .push(shard_id);
-                            }
-                            PersistFailureReason::ShardNotFound => {
-                                let shard_id = persist_failure.shard_id().clone();
-                                let index_uid: IndexUid = persist_failure.index_uid().clone();
-                                let source_id: SourceId = persist_failure.source_id;
-                                deleted_shards
-                                    .entry((index_uid, source_id))
-                                    .or_default()
-                                    .push(shard_id);
-                            }
-                            PersistFailureReason::WalFull
-                            | PersistFailureReason::ShardRateLimited => {
-                                // Let's record that the shard is rate limited or that the ingester
-                                // that hosts has its wal full.
-                                //
-                                // That way we will avoid to retry the persist request on the very
-                                // same node.
-                                let shard_id = persist_failure.shard_id().clone();
-                                workbench.rate_limited_shards.insert(shard_id);
+                            PersistFailureReason::NoShardsAvailable => {}
+                            PersistFailureReason::NodeUnavailable
+                            | PersistFailureReason::WalFull
+                            | PersistFailureReason::Timeout => {
+                                unavailable_leaders
+                                    .insert(NodeId::from(persist_response.leader_id.clone()));
                             }
                             _ => {}
                         }
@@ -348,20 +301,7 @@ impl IngestRouter {
                 }
             };
         }
-        if !closed_shards.is_empty() || !deleted_shards.is_empty() {
-            let mut state_guard = self.state.lock().await;
-
-            for ((index_uid, source_id), shard_ids) in closed_shards {
-                state_guard
-                    .routing_table
-                    .close_shards(&index_uid, source_id, &shard_ids);
-            }
-            for ((index_uid, source_id), shard_ids) in deleted_shards {
-                state_guard
-                    .routing_table
-                    .delete_shards(&index_uid, source_id, &shard_ids);
-            }
-        }
+        workbench.unavailable_leaders.extend(unavailable_leaders);
     }
 
     async fn batch_persist(&self, workbench: &mut IngestWorkbench, commit_type: CommitTypeV2) {
@@ -373,47 +313,35 @@ impl IngestRouter {
         self.populate_routing_table_debounced(workbench, debounced_request)
             .await;
 
-        // Subrequests for which no shards are available to route the subrequests to.
+        let unavailable_leaders = &workbench.unavailable_leaders;
         let mut no_shards_available_subrequest_ids: Vec<SubrequestId> = Vec::new();
-        // Subrequests for which the shards are rate limited.
-        let mut rate_limited_subrequest_ids: Vec<SubrequestId> = Vec::new();
-
-        let mut per_leader_persist_subrequests: HashMap<&LeaderId, Vec<PersistSubrequest>> =
+        let mut per_leader_persist_subrequests: HashMap<&NodeId, Vec<PersistSubrequest>> =
             HashMap::new();
 
-        let rate_limited_shards: &HashSet<ShardId> = &workbench.rate_limited_shards;
         let state_guard = self.state.lock().await;
 
         for subrequest in pending_subrequests(&workbench.subworkbenches) {
-            let next_open_shard_res_opt = state_guard
-                .routing_table
+            let ingester_node = state_guard
+                .node_routing_table
                 .find_entry(&subrequest.index_id, &subrequest.source_id)
-                .map(|entry| {
-                    entry.next_open_shard_round_robin(&self.ingester_pool, rate_limited_shards)
-                });
-            let next_open_shard = match next_open_shard_res_opt {
-                Some(Ok(next_open_shard)) => next_open_shard,
-                Some(Err(NextOpenShardError::RateLimited)) => {
-                    rate_limited_subrequest_ids.push(subrequest.subrequest_id);
-                    continue;
-                }
-                Some(Err(NextOpenShardError::NoShardsAvailable)) | None => {
+                .and_then(|entry| entry.pick_node(&self.ingester_pool, unavailable_leaders));
+
+            let ingester_node = match ingester_node {
+                Some(node) => node,
+                None => {
                     no_shards_available_subrequest_ids.push(subrequest.subrequest_id);
                     continue;
                 }
             };
             let persist_subrequest = PersistSubrequest {
                 subrequest_id: subrequest.subrequest_id,
-                index_uid: next_open_shard.index_uid.clone().into(),
-                source_id: next_open_shard.source_id.clone(),
-                // We don't necessarily persist to this shard. We persist to the shard with the most
-                // capacity on that node.
-                // TODO: Clean this up.
-                shard_id: Some(next_open_shard.shard_id.clone()),
+                index_uid: Some(ingester_node.index_uid.clone()),
+                source_id: subrequest.source_id.clone(),
+                shard_id: None,
                 doc_batch: subrequest.doc_batch.clone(),
             };
             per_leader_persist_subrequests
-                .entry(&next_open_shard.leader_id)
+                .entry(&ingester_node.node_id)
                 .or_default()
                 .push(persist_subrequest);
         }
@@ -461,9 +389,6 @@ impl IngestRouter {
         for subrequest_id in no_shards_available_subrequest_ids {
             workbench.record_no_shards_available(subrequest_id);
         }
-        for subrequest_id in rate_limited_subrequest_ids {
-            workbench.record_rate_limited(subrequest_id);
-        }
         self.process_persist_results(workbench, persist_futures)
             .await;
     }
@@ -516,7 +441,7 @@ impl IngestRouter {
 
     pub async fn debug_info(&self) -> JsonValue {
         let state_guard = self.state.lock().await;
-        let routing_table_json = state_guard.routing_table.debug_info();
+        let routing_table_json = state_guard.node_routing_table.debug_info();
 
         json!({
             "routing_table": routing_table_json,
@@ -640,68 +565,6 @@ impl IngestRouterService for IngestRouter {
 #[derive(Clone)]
 struct WeakRouterState(Weak<Mutex<RouterState>>);
 
-#[async_trait]
-impl EventSubscriber<LocalShardsUpdate> for WeakRouterState {
-    async fn handle_event(&mut self, local_shards_update: LocalShardsUpdate) {
-        let Some(state) = self.0.upgrade() else {
-            return;
-        };
-        let leader_id = local_shards_update.leader_id;
-        let index_uid = local_shards_update.source_uid.index_uid;
-        let source_id = local_shards_update.source_uid.source_id;
-
-        let mut open_shard_ids: Vec<ShardId> = Vec::new();
-        let mut closed_shard_ids: Vec<ShardId> = Vec::new();
-
-        for shard_info in local_shards_update.shard_infos {
-            match shard_info.shard_state {
-                ShardState::Open => open_shard_ids.push(shard_info.shard_id),
-                ShardState::Closed => closed_shard_ids.push(shard_info.shard_id),
-                ShardState::Unavailable | ShardState::Unspecified => {
-                    // Ingesters never broadcast the `Unavailable`` state because, from their point
-                    // of view, they are never unavailable.
-                }
-            }
-        }
-        let mut state_guard = state.lock().await;
-
-        state_guard
-            .routing_table
-            .close_shards(&index_uid, &source_id, &closed_shard_ids);
-
-        state_guard.routing_table.insert_open_shards(
-            &leader_id,
-            index_uid,
-            source_id,
-            &open_shard_ids,
-        );
-    }
-}
-
-#[async_trait]
-impl EventSubscriber<ShardPositionsUpdate> for WeakRouterState {
-    async fn handle_event(&mut self, shard_positions_update: ShardPositionsUpdate) {
-        let Some(state) = self.0.upgrade() else {
-            return;
-        };
-        let mut deleted_shard_ids: Vec<ShardId> = Vec::new();
-
-        for (shard_id, shard_position) in shard_positions_update.updated_shard_positions {
-            if shard_position.is_eof() {
-                deleted_shard_ids.push(shard_id);
-            }
-        }
-        let mut state_guard = state.lock().await;
-
-        let index_uid = shard_positions_update.source_uid.index_uid;
-        let source_id = shard_positions_update.source_uid.source_id;
-
-        state_guard
-            .routing_table
-            .delete_shards(&index_uid, &source_id, &deleted_shard_ids);
-    }
-}
-
 #[async_trait]
 impl EventSubscriber<IngesterCapacityScoreUpdate> for WeakRouterState {
     async fn handle_event(&mut self, update: IngesterCapacityScoreUpdate) {
@@ -725,9 +588,6 @@ pub(super) struct PersistRequestSummary {
 
 #[cfg(test)]
 mod tests {
-    use std::collections::BTreeSet;
-
-    use mockall::Sequence;
     use quickwit_proto::control_plane::{
         GetOrCreateOpenShardsFailure, GetOrCreateOpenShardsFailureReason,
         GetOrCreateOpenShardsResponse, GetOrCreateOpenShardsSuccess, MockControlPlaneService,
@@ -737,15 +597,11 @@ mod tests {
     };
     use quickwit_proto::ingest::router::IngestSubrequest;
     use quickwit_proto::ingest::{
-        CommitTypeV2, DocBatchV2, ParseFailure, ParseFailureReason, Shard, ShardIds, ShardState,
+        CommitTypeV2, DocBatchV2, ParseFailure, ParseFailureReason, Shard, ShardState,
     };
-    use quickwit_proto::types::{DocUid, Position, SourceUid};
-    use tokio::task::yield_now;
+    use quickwit_proto::types::{DocUid, IndexUid, Position, ShardId, SourceUid};
 
     use super::*;
-    use crate::RateMibPerSec;
-    use crate::ingest_v2::broadcast::ShardInfo;
-    use crate::ingest_v2::routing_table::{RoutingEntry, RoutingTableEntry};
     use crate::ingest_v2::workbench::SubworkbenchFailure;
 
     #[tokio::test]
@@ -770,34 +626,18 @@ mod tests {
         assert!(get_or_create_open_shard_request_opt.is_none());
         assert!(rendezvous.is_empty());
 
-        let mut state_guard = router.state.lock().await;
-
-        let index_uid: IndexUid = IndexUid::for_test("test-index-0", 0);
-        state_guard.routing_table.table.insert(
-            ("test-index-0".into(), "test-source".into()),
-            RoutingTableEntry {
-                index_uid: index_uid.clone(),
-                source_id: "test-source".to_string(),
-                local_shards: vec![
-                    RoutingEntry {
-                        index_uid: index_uid.clone(),
-                        source_id: "test-source".to_string(),
-                        shard_id: ShardId::from(1),
-                        shard_state: ShardState::Closed,
-                        leader_id: "test-ingester-0".into(),
-                    },
-                    RoutingEntry {
-                        index_uid: index_uid.clone(),
-                        source_id: "test-source".to_string(),
-                        shard_id: ShardId::from(2),
-                        shard_state: ShardState::Open,
-                        leader_id: "test-ingester-0".into(),
-                    },
-                ],
-                ..Default::default()
-            },
-        );
-        drop(state_guard);
+        {
+            let mut state_guard = router.state.lock().await;
+            state_guard.node_routing_table.apply_capacity_update(
+                "test-ingester-0".into(),
+                SourceUid {
+                    index_uid: IndexUid::for_test("test-index-0", 0),
+                    source_id: "test-source".to_string(),
+                },
+                8,
+                1,
+            );
+        }
 
         let ingest_subrequests: Vec<IngestSubrequest> = vec![
             IngestSubrequest {
@@ -833,24 +673,12 @@ mod tests {
         assert_eq!(subrequest.index_id, "test-index-1");
         assert_eq!(subrequest.source_id, "test-source");
 
-        assert_eq!(get_or_create_open_shard_request.closed_shards.len(), 1);
-        assert_eq!(
-            get_or_create_open_shard_request.closed_shards[0],
-            ShardIds {
-                index_uid: Some(IndexUid::for_test("test-index-0", 0)),
-                source_id: "test-source".to_string(),
-                shard_ids: vec![ShardId::from(1)],
-            }
-        );
-        assert_eq!(
-            get_or_create_open_shard_request.unavailable_leaders.len(),
-            1
-        );
-        assert_eq!(
-            get_or_create_open_shard_request.unavailable_leaders[0],
-            "test-ingester-0"
+        assert!(
+            get_or_create_open_shard_request
+                .unavailable_leaders
+                .is_empty()
         );
-        assert_eq!(workbench.unavailable_leaders.len(), 1);
+        assert!(workbench.unavailable_leaders.is_empty());
 
         let (get_or_create_open_shard_request_opt, rendezvous_2) = router
             .make_get_or_create_open_shard_request(&mut workbench, &ingester_pool)
@@ -867,27 +695,26 @@ mod tests {
 
         ingester_pool.insert("test-ingester-0".into(), IngesterServiceClient::mocked());
         {
-            // Ingester-0 has been marked as unavailable due to the previous requests.
+            // Ingester-0 is in the pool and in the table, but marked unavailable on the
+            // workbench (simulating a prior transport error). `has_open_nodes` returns
+            // false, so both subrequests trigger a control plane request.
+            workbench
+                .unavailable_leaders
+                .insert("test-ingester-0".into());
             let (get_or_create_open_shard_request_opt, _rendezvous) = router
                 .make_get_or_create_open_shard_request(&mut workbench, &ingester_pool)
                 .await
                 .take();
             let get_or_create_open_shard_request = get_or_create_open_shard_request_opt.unwrap();
             assert_eq!(get_or_create_open_shard_request.subrequests.len(), 2);
-            assert_eq!(workbench.unavailable_leaders.len(), 1);
             assert_eq!(
-                workbench
-                    .unavailable_leaders
-                    .iter()
-                    .next()
-                    .unwrap()
-                    .to_string(),
-                "test-ingester-0"
+                get_or_create_open_shard_request.unavailable_leaders.len(),
+                1
             );
         }
         {
-            // With a fresh workbench, the ingester is not marked as unavailable, and present in the
-            // pool.
+            // Fresh workbench: ingester-0 is in the pool, in the table, and NOT unavailable.
+            // `has_open_nodes` returns true for index-0, so only index-1 triggers a request.
             let mut workbench = IngestWorkbench::new(ingest_subrequests, 3);
             let (get_or_create_open_shard_request_opt, _rendezvous) = router
                 .make_get_or_create_open_shard_request(&mut workbench, &ingester_pool)
@@ -900,9 +727,10 @@ mod tests {
             assert_eq!(subrequest.index_id, "test-index-1");
             assert_eq!(subrequest.source_id, "test-source");
 
-            assert_eq!(
-                get_or_create_open_shard_request.unavailable_leaders.len(),
-                0
+            assert!(
+                get_or_create_open_shard_request
+                    .unavailable_leaders
+                    .is_empty()
             );
         }
     }
@@ -947,6 +775,7 @@ mod tests {
                                 source_id: "test-source".to_string(),
                                 shard_id: Some(ShardId::from(1)),
                                 shard_state: ShardState::Open as i32,
+                                leader_id: "test-ingester-0".to_string(),
                                 ..Default::default()
                             }],
                         },
@@ -960,6 +789,7 @@ mod tests {
                                     source_id: "test-source".to_string(),
                                     shard_id: Some(ShardId::from(1)),
                                     shard_state: ShardState::Open as i32,
+                                    leader_id: "test-ingester-1".to_string(),
                                     ..Default::default()
                                 },
                                 Shard {
@@ -967,6 +797,7 @@ mod tests {
                                     source_id: "test-source".to_string(),
                                     shard_id: Some(ShardId::from(2)),
                                     shard_state: ShardState::Open as i32,
+                                    leader_id: "test-ingester-1".to_string(),
                                     ..Default::default()
                                 },
                             ],
@@ -1057,23 +888,6 @@ mod tests {
             .populate_routing_table(&mut workbench, get_or_create_open_shards_request)
             .await;
 
-        let state_guard = router.state.lock().await;
-        let routing_table = &state_guard.routing_table;
-        assert_eq!(routing_table.len(), 2);
-
-        let routing_entry_0 = routing_table
-            .find_entry("test-index-0", "test-source")
-            .unwrap();
-        assert_eq!(routing_entry_0.len(), 1);
-        assert_eq!(routing_entry_0.all_shards()[0].shard_id, ShardId::from(1));
-
-        let routing_entry_1 = routing_table
-            .find_entry("test-index-1", "test-source")
-            .unwrap();
-        assert_eq!(routing_entry_1.len(), 2);
-        assert_eq!(routing_entry_1.all_shards()[0].shard_id, ShardId::from(1));
-        assert_eq!(routing_entry_1.all_shards()[1].shard_id, ShardId::from(2));
-
         let subworkbench = workbench.subworkbenches.get(&2).unwrap();
         assert!(matches!(
             subworkbench.last_failure_opt,
@@ -1278,7 +1092,7 @@ mod tests {
                     index_uid: Some(index_uid.clone()),
                     source_id: "test-source".to_string(),
                     shard_id: Some(ShardId::from(1)),
-                    reason: PersistFailureReason::ShardRateLimited as i32,
+                    reason: PersistFailureReason::NoShardsAvailable as i32,
                 }],
             });
             (persist_summary, persist_result)
@@ -1294,89 +1108,6 @@ mod tests {
         ));
     }
 
-    #[tokio::test]
-    async fn test_router_process_persist_results_closes_and_deletes_shards() {
-        let self_node_id = "test-router".into();
-        let control_plane = ControlPlaneServiceClient::from_mock(MockControlPlaneService::new());
-        let ingester_pool = IngesterPool::default();
-        let replication_factor = 1;
-        let router = IngestRouter::new(
-            self_node_id,
-            control_plane,
-            ingester_pool.clone(),
-            replication_factor,
-            EventBroker::default(),
-        );
-        let index_uid: IndexUid = IndexUid::for_test("test-index-0", 0);
-        let mut state_guard = router.state.lock().await;
-        state_guard.routing_table.replace_shards(
-            index_uid.clone(),
-            "test-source",
-            vec![
-                Shard {
-                    index_uid: Some(index_uid.clone()),
-                    shard_id: Some(ShardId::from(1)),
-                    shard_state: ShardState::Open as i32,
-                    leader_id: "test-ingester-0".to_string(),
-                    ..Default::default()
-                },
-                Shard {
-                    index_uid: Some(index_uid.clone()),
-                    shard_id: Some(ShardId::from(2)),
-                    shard_state: ShardState::Open as i32,
-                    leader_id: "test-ingester-0".to_string(),
-                    ..Default::default()
-                },
-            ],
-        );
-        drop(state_guard);
-
-        let mut workbench = IngestWorkbench::new(Vec::new(), 2);
-        let persist_futures = FuturesUnordered::new();
-
-        persist_futures.push(async {
-            let persist_summary = PersistRequestSummary {
-                leader_id: "test-ingester-0".into(),
-                subrequest_ids: vec![0],
-            };
-            let persist_result = Ok::<_, IngestV2Error>(PersistResponse {
-                leader_id: "test-ingester-0".to_string(),
-                successes: Vec::new(),
-                failures: vec![
-                    PersistFailure {
-                        subrequest_id: 0,
-                        index_uid: Some(index_uid.clone()),
-                        source_id: "test-source".to_string(),
-                        shard_id: Some(ShardId::from(1)),
-                        reason: PersistFailureReason::ShardNotFound as i32,
-                    },
-                    PersistFailure {
-                        subrequest_id: 1,
-                        index_uid: Some(index_uid.clone()),
-                        source_id: "test-source".to_string(),
-                        shard_id: Some(ShardId::from(2)),
-                        reason: PersistFailureReason::ShardClosed as i32,
-                    },
-                ],
-            });
-            (persist_summary, persist_result)
-        });
-        router
-            .process_persist_results(&mut workbench, persist_futures)
-            .await;
-
-        let state_guard = router.state.lock().await;
-        let routing_table_entry = state_guard
-            .routing_table
-            .find_entry("test-index-0", "test-source")
-            .unwrap();
-        assert_eq!(routing_table_entry.len(), 1);
-
-        let shard = routing_table_entry.all_shards()[0];
-        assert_eq!(shard.shard_id, ShardId::from(2));
-        assert_eq!(shard.shard_state, ShardState::Closed);
-    }
-
     #[tokio::test]
     async fn test_router_process_persist_results_does_not_remove_unavailable_leaders() {
         let self_node_id = "test-router".into();
@@ -1470,151 +1201,76 @@ mod tests {
         let self_node_id = "test-router".into();
         let control_plane = ControlPlaneServiceClient::from_mock(MockControlPlaneService::new());
         let ingester_pool = IngesterPool::default();
-        let replication_factor = 1;
         let router = IngestRouter::new(
             self_node_id,
             control_plane,
             ingester_pool.clone(),
-            replication_factor,
+            1,
             EventBroker::default(),
         );
-        let index_uid: IndexUid = IndexUid::for_test("test-index-0", 0);
-        let index_uid2: IndexUid = IndexUid::for_test("test-index-1", 0);
-        let mut state_guard = router.state.lock().await;
-        state_guard.routing_table.replace_shards(
-            index_uid.clone(),
-            "test-source",
-            vec![Shard {
-                index_uid: Some(index_uid.clone()),
-                source_id: "test-source".to_string(),
-                shard_id: Some(ShardId::from(1)),
-                shard_state: ShardState::Open as i32,
-                leader_id: "test-ingester-0".to_string(),
-                ..Default::default()
-            }],
-        );
-        state_guard.routing_table.replace_shards(
-            index_uid2.clone(),
-            "test-source",
-            vec![
-                Shard {
-                    index_uid: Some(index_uid2.clone()),
+
+        let index_uid_0: IndexUid = IndexUid::for_test("test-index-0", 0);
+        let index_uid_1: IndexUid = IndexUid::for_test("test-index-1", 0);
+        {
+            let mut state_guard = router.state.lock().await;
+            state_guard.node_routing_table.merge_from_shards(
+                index_uid_0.clone(),
+                "test-source".to_string(),
+                vec![Shard {
+                    index_uid: Some(index_uid_0.clone()),
                     source_id: "test-source".to_string(),
                     shard_id: Some(ShardId::from(1)),
                     shard_state: ShardState::Open as i32,
                     leader_id: "test-ingester-0".to_string(),
-                    follower_id: Some("test-ingester-1".to_string()),
                     ..Default::default()
-                },
-                Shard {
-                    index_uid: Some(index_uid2.clone()),
+                }],
+            );
+            state_guard.node_routing_table.merge_from_shards(
+                index_uid_1.clone(),
+                "test-source".to_string(),
+                vec![Shard {
+                    index_uid: Some(index_uid_1.clone()),
                     source_id: "test-source".to_string(),
-                    shard_id: Some(ShardId::from(2)),
+                    shard_id: Some(ShardId::from(1)),
                     shard_state: ShardState::Open as i32,
                     leader_id: "test-ingester-1".to_string(),
-                    follower_id: Some("test-ingester-2".to_string()),
                     ..Default::default()
-                },
-            ],
-        );
-        drop(state_guard);
+                }],
+            );
+        }
 
+        let index_uid_0_clone = index_uid_0.clone();
         let mut mock_ingester_0 = MockIngesterService::new();
-        let index_uid_clone = index_uid.clone();
-        let index_uid2_clone = index_uid2.clone();
-        mock_ingester_0
-            .expect_persist()
-            .once()
-            .returning(move |request| {
-                assert_eq!(request.leader_id, "test-ingester-0");
-                assert_eq!(request.subrequests.len(), 2);
-                assert_eq!(request.commit_type(), CommitTypeV2::Auto);
-
-                let subrequest = &request.subrequests[0];
-                assert_eq!(subrequest.subrequest_id, 0);
-                assert_eq!(subrequest.index_uid(), &index_uid_clone);
-                assert_eq!(subrequest.source_id, "test-source");
-                assert_eq!(subrequest.shard_id(), ShardId::from(1));
-                assert_eq!(
-                    subrequest.doc_batch,
-                    Some(DocBatchV2::for_test(["", "test-doc-foo", "test-doc-bar"]))
-                );
-
-                let subrequest = &request.subrequests[1];
-                assert_eq!(subrequest.subrequest_id, 1);
-                assert_eq!(subrequest.index_uid(), &index_uid2_clone);
-                assert_eq!(subrequest.source_id, "test-source");
-                assert_eq!(subrequest.shard_id(), ShardId::from(1));
-                assert_eq!(
-                    subrequest.doc_batch,
-                    Some(DocBatchV2::for_test(["test-doc-qux"]))
-                );
-
-                let response = PersistResponse {
-                    leader_id: request.leader_id,
-                    successes: vec![
-                        PersistSuccess {
-                            subrequest_id: 0,
-                            index_uid: Some(index_uid_clone.clone()),
-                            source_id: "test-source".to_string(),
-                            shard_id: Some(ShardId::from(1)),
-                            replication_position_inclusive: Some(Position::offset(1u64)),
-                            num_persisted_docs: 2,
-                            parse_failures: vec![ParseFailure {
-                                doc_uid: Some(DocUid::for_test(0)),
-                                reason: ParseFailureReason::InvalidJson as i32,
-                                message: "invalid JSON".to_string(),
-                            }],
-                        },
-                        PersistSuccess {
-                            subrequest_id: 1,
-                            index_uid: Some(index_uid2_clone.clone()),
-                            source_id: "test-source".to_string(),
-                            shard_id: Some(ShardId::from(1)),
-                            replication_position_inclusive: Some(Position::offset(0u64)),
-                            num_persisted_docs: 1,
-                            parse_failures: Vec::new(),
-                        },
-                    ],
-                    failures: Vec::new(),
-                };
-                Ok(response)
-            });
         mock_ingester_0
             .expect_persist()
             .once()
             .returning(move |request| {
                 assert_eq!(request.leader_id, "test-ingester-0");
                 assert_eq!(request.subrequests.len(), 1);
-                assert_eq!(request.commit_type(), CommitTypeV2::Auto);
+                assert!(request.subrequests[0].shard_id.is_none());
 
-                let subrequest = &request.subrequests[0];
-                assert_eq!(subrequest.subrequest_id, 0);
-                assert_eq!(subrequest.index_uid(), &index_uid);
-                assert_eq!(subrequest.source_id, "test-source");
-                assert_eq!(subrequest.shard_id(), ShardId::from(1));
-                assert_eq!(
-                    subrequest.doc_batch,
-                    Some(DocBatchV2::for_test(["test-doc-moo", "test-doc-baz"]))
-                );
-
-                let response = PersistResponse {
+                Ok(PersistResponse {
                     leader_id: request.leader_id,
                     successes: vec![PersistSuccess {
                         subrequest_id: 0,
-                        index_uid: Some(index_uid.clone()),
+                        index_uid: Some(index_uid_0_clone.clone()),
                         source_id: "test-source".to_string(),
                         shard_id: Some(ShardId::from(1)),
-                        replication_position_inclusive: Some(Position::offset(3u64)),
-                        num_persisted_docs: 4,
-                        parse_failures: Vec::new(),
+                        replication_position_inclusive: Some(Position::offset(1u64)),
+                        num_persisted_docs: 2,
+                        parse_failures: vec![ParseFailure {
+                            doc_uid: Some(DocUid::for_test(0)),
+                            reason: ParseFailureReason::InvalidJson as i32,
+                            message: "invalid JSON".to_string(),
+                        }],
                     }],
                     failures: Vec::new(),
-                };
-                Ok(response)
+                })
             });
-        let ingester_0 = IngesterServiceClient::from_mock(mock_ingester_0);
-        ingester_pool.insert("test-ingester-0".into(), ingester_0.clone());
+        ingester_pool.insert(
+            "test-ingester-0".into(),
+            IngesterServiceClient::from_mock(mock_ingester_0),
+        );
 
         let mut mock_ingester_1 = MockIngesterService::new();
         mock_ingester_1
@@ -1623,136 +1279,94 @@ mod tests {
             .returning(move |request| {
                 assert_eq!(request.leader_id, "test-ingester-1");
                 assert_eq!(request.subrequests.len(), 1);
-                assert_eq!(request.commit_type(), CommitTypeV2::Auto);
-
-                let subrequest = &request.subrequests[0];
-                assert_eq!(subrequest.subrequest_id, 1);
-                assert_eq!(subrequest.index_uid(), &index_uid2);
-                assert_eq!(subrequest.source_id, "test-source");
-                assert_eq!(subrequest.shard_id(), ShardId::from(2));
-                assert_eq!(
-                    subrequest.doc_batch,
-                    Some(DocBatchV2::for_test(["test-doc-tux"]))
-                );
+                assert!(request.subrequests[0].shard_id.is_none());
 
-                let response = PersistResponse {
+                Ok(PersistResponse {
                     leader_id: request.leader_id,
                     successes: vec![PersistSuccess {
                         subrequest_id: 1,
-                        index_uid: Some(index_uid2.clone()),
+                        index_uid: Some(index_uid_1.clone()),
                         source_id: "test-source".to_string(),
-                        shard_id: Some(ShardId::from(2)),
+                        shard_id: Some(ShardId::from(1)),
                         replication_position_inclusive: Some(Position::offset(0u64)),
                         num_persisted_docs: 1,
                         parse_failures: Vec::new(),
                     }],
                     failures: Vec::new(),
-                };
-                Ok(response)
+                })
             });
-        let ingester_1 = IngesterServiceClient::from_mock(mock_ingester_1);
-        ingester_pool.insert("test-ingester-1".into(), ingester_1);
+        ingester_pool.insert(
+            "test-ingester-1".into(),
+            IngesterServiceClient::from_mock(mock_ingester_1),
+        );
+
+        let response = router
+            .ingest(IngestRequestV2 {
+                subrequests: vec![
+                    IngestSubrequest {
+                        subrequest_id: 0,
+                        index_id: "test-index-0".to_string(),
+                        source_id: "test-source".to_string(),
+                        doc_batch: Some(DocBatchV2::for_test(["", "test-doc-foo", "test-doc-bar"])),
+                    },
+                    IngestSubrequest {
+                        subrequest_id: 1,
+                        index_id: "test-index-1".to_string(),
+                        source_id: "test-source".to_string(),
+                        doc_batch: Some(DocBatchV2::for_test(["test-doc-qux"])),
+                    },
+                ],
+                commit_type: CommitTypeV2::Auto as i32,
+            })
+            .await
+            .unwrap();
 
-        let ingest_request = IngestRequestV2 {
-            subrequests: vec![
-                IngestSubrequest {
-                    subrequest_id: 0,
-                    index_id: "test-index-0".to_string(),
-                    source_id: "test-source".to_string(),
-                    doc_batch: Some(DocBatchV2::for_test(["", "test-doc-foo", "test-doc-bar"])),
-                },
-                IngestSubrequest {
-                    subrequest_id: 1,
-                    index_id: "test-index-1".to_string(),
-                    source_id: "test-source".to_string(),
-                    doc_batch: Some(DocBatchV2::for_test(["test-doc-qux"])),
-                },
-            ],
-            commit_type: CommitTypeV2::Auto as i32,
-        };
-        let response = router.ingest(ingest_request).await.unwrap();
         assert_eq!(response.successes.len(), 2);
         assert_eq!(response.failures.len(), 0);
 
         let parse_failures = &response.successes[0].parse_failures;
         assert_eq!(parse_failures.len(), 1);
-
-        let parse_failure = &parse_failures[0];
-        assert_eq!(parse_failure.doc_uid(), DocUid::for_test(0));
-        assert_eq!(parse_failure.reason(), ParseFailureReason::InvalidJson);
-
-        let ingest_request = IngestRequestV2 {
-            subrequests: vec![
-                IngestSubrequest {
-                    subrequest_id: 0,
-                    index_id: "test-index-0".to_string(),
-                    source_id: "test-source".to_string(),
-                    doc_batch: Some(DocBatchV2::for_test(["test-doc-moo", "test-doc-baz"])),
-                },
-                IngestSubrequest {
-                    subrequest_id: 1,
-                    index_id: "test-index-1".to_string(),
-                    source_id: "test-source".to_string(),
-                    doc_batch: Some(DocBatchV2::for_test(["test-doc-tux"])),
-                },
-            ],
-            commit_type: CommitTypeV2::Auto as i32,
-        };
-        let response = router.ingest(ingest_request).await.unwrap();
-        assert_eq!(response.successes.len(), 2);
-        assert_eq!(response.failures.len(), 0);
-    }
+        assert_eq!(parse_failures[0].doc_uid(), DocUid::for_test(0));
+        assert_eq!(parse_failures[0].reason(), ParseFailureReason::InvalidJson);
+    }
 
     #[tokio::test]
     async fn test_router_ingest_retry() {
         let self_node_id = "test-router".into();
         let control_plane = ControlPlaneServiceClient::from_mock(MockControlPlaneService::new());
         let ingester_pool = IngesterPool::default();
-        let replication_factor = 1;
         let router = IngestRouter::new(
             self_node_id,
             control_plane,
             ingester_pool.clone(),
-            replication_factor,
+            1,
             EventBroker::default(),
         );
-        let mut state_guard = router.state.lock().await;
         let index_uid: IndexUid = IndexUid::for_test("test-index-0", 0);
-        state_guard.routing_table.replace_shards(
-            index_uid.clone(),
-            "test-source",
-            vec![Shard {
-                index_uid: Some(index_uid.clone()),
-                source_id: "test-source".to_string(),
-                shard_id: Some(ShardId::from(1)),
-                shard_state: ShardState::Open as i32,
-                leader_id: "test-ingester-0".to_string(),
-                ..Default::default()
-            }],
-        );
-        drop(state_guard);
+        {
+            let mut state_guard = router.state.lock().await;
+            state_guard.node_routing_table.merge_from_shards(
+                index_uid.clone(),
+                "test-source".to_string(),
+                vec![Shard {
+                    index_uid: Some(index_uid.clone()),
+                    source_id: "test-source".to_string(),
+                    shard_id: Some(ShardId::from(1)),
+                    shard_state: ShardState::Open as i32,
+                    leader_id: "test-ingester-0".to_string(),
+                    ..Default::default()
+                }],
+            );
+        }
 
         let mut mock_ingester_0 = MockIngesterService::new();
         let index_uid_clone = index_uid.clone();
+        // First attempt: returns NoShardsAvailable (transient, doesn't mark leader unavailable).
         mock_ingester_0
             .expect_persist()
             .once()
             .returning(move |request| {
-                assert_eq!(request.leader_id, "test-ingester-0");
-                assert_eq!(request.subrequests.len(), 1);
-                assert_eq!(request.commit_type(), CommitTypeV2::Auto);
-
-                let subrequest = &request.subrequests[0];
-                assert_eq!(subrequest.subrequest_id, 0);
-                assert_eq!(subrequest.index_uid(), &index_uid_clone);
-                assert_eq!(subrequest.source_id, "test-source");
-                assert_eq!(subrequest.shard_id(), ShardId::from(1));
-                assert_eq!(
-                    subrequest.doc_batch,
-                    Some(DocBatchV2::for_test(["test-doc-foo"]))
-                );
-
-                let response = PersistResponse {
+                Ok(PersistResponse {
                     leader_id: request.leader_id,
                     successes: Vec::new(),
                     failures: vec![PersistFailure {
@@ -1760,30 +1374,16 @@ mod tests {
                         index_uid: Some(index_uid_clone.clone()),
                         source_id: "test-source".to_string(),
                         shard_id: Some(ShardId::from(1)),
-                        reason: PersistFailureReason::Timeout as i32,
+                        reason: PersistFailureReason::NoShardsAvailable as i32,
                     }],
-                };
-                Ok(response)
+                })
             });
+        // Second attempt: succeeds.
         mock_ingester_0
             .expect_persist()
             .once()
             .returning(move |request| {
-                assert_eq!(request.leader_id, "test-ingester-0");
-                assert_eq!(request.subrequests.len(), 1);
-                assert_eq!(request.commit_type(), CommitTypeV2::Auto);
-
-                let subrequest = &request.subrequests[0];
-                assert_eq!(subrequest.subrequest_id, 0);
-                assert_eq!(subrequest.index_uid(), &index_uid);
-                assert_eq!(subrequest.source_id, "test-source");
-                assert_eq!(subrequest.shard_id(), ShardId::from(1));
-                assert_eq!(
-                    subrequest.doc_batch,
-                    Some(DocBatchV2::for_test(["test-doc-foo"]))
-                );
-
-                let response = PersistResponse {
+                Ok(PersistResponse {
                     leader_id: request.leader_id,
                     successes: vec![PersistSuccess {
                         subrequest_id: 0,
@@ -1795,115 +1395,27 @@ mod tests {
                         parse_failures: Vec::new(),
                     }],
                     failures: Vec::new(),
-                };
-                Ok(response)
+                })
             });
-        let ingester_0 = IngesterServiceClient::from_mock(mock_ingester_0);
-        ingester_pool.insert("test-ingester-0".into(), ingester_0.clone());
-
-        let ingest_request = IngestRequestV2 {
-            subrequests: vec![IngestSubrequest {
-                subrequest_id: 0,
-                index_id: "test-index-0".to_string(),
-                source_id: "test-source".to_string(),
-                doc_batch: Some(DocBatchV2::for_test(["test-doc-foo"])),
-            }],
-            commit_type: CommitTypeV2::Auto as i32,
-        };
-        router.ingest(ingest_request).await.unwrap();
-    }
-
-    #[tokio::test]
-    async fn test_router_updates_routing_table_on_chitchat_events() {
-        let self_node_id = "test-router".into();
-        let control_plane = ControlPlaneServiceClient::from_mock(MockControlPlaneService::new());
-        let ingester_pool = IngesterPool::default();
-        let replication_factor = 1;
-        let event_broker = EventBroker::default();
-        let router = IngestRouter::new(
-            self_node_id,
-            control_plane,
-            ingester_pool.clone(),
-            replication_factor,
-            event_broker.clone(),
-        );
-        router.subscribe();
-        let index_uid: IndexUid = IndexUid::for_test("test-index-0", 0);
-
-        let mut state_guard = router.state.lock().await;
-        state_guard.routing_table.replace_shards(
-            index_uid.clone(),
-            "test-source",
-            vec![Shard {
-                index_uid: Some(index_uid.clone()),
-                shard_id: Some(ShardId::from(1)),
-                shard_state: ShardState::Open as i32,
-                leader_id: "test-ingester".to_string(),
-                ..Default::default()
-            }],
+        ingester_pool.insert(
+            "test-ingester-0".into(),
+            IngesterServiceClient::from_mock(mock_ingester_0),
         );
-        drop(state_guard);
-
-        let local_shards_update = LocalShardsUpdate {
-            leader_id: "test-ingester".into(),
-            source_uid: SourceUid {
-                index_uid: index_uid.clone(),
-                source_id: "test-source".to_string(),
-            },
-            shard_infos: BTreeSet::from_iter([
-                ShardInfo {
-                    shard_id: ShardId::from(1),
-                    shard_state: ShardState::Closed,
-                    short_term_ingestion_rate: RateMibPerSec(0),
-                    long_term_ingestion_rate: RateMibPerSec(0),
-                },
-                ShardInfo {
-                    shard_id: ShardId::from(2),
-                    shard_state: ShardState::Open,
-                    short_term_ingestion_rate: RateMibPerSec(0),
-                    long_term_ingestion_rate: RateMibPerSec(0),
-                },
-            ]),
-        };
-        event_broker.publish(local_shards_update);
-
-        // Yield so that the event is processed.
-        yield_now().await;
-
-        let state_guard = router.state.lock().await;
-        let shards = state_guard
-            .routing_table
-            .find_entry("test-index-0", "test-source")
-            .unwrap()
-            .all_shards();
-        assert_eq!(shards.len(), 2);
-        assert_eq!(shards[0].shard_id, ShardId::from(1));
-        assert_eq!(shards[0].shard_state, ShardState::Closed);
-        assert_eq!(shards[1].shard_id, ShardId::from(2));
-        assert_eq!(shards[1].shard_state, ShardState::Open);
-        drop(state_guard);
 
-        let shard_positions_update = ShardPositionsUpdate {
-            source_uid: SourceUid {
-                index_uid: index_uid.clone(),
-                source_id: "test-source".to_string(),
-            },
-            updated_shard_positions: vec![(ShardId::from(1), Position::eof(0u64))],
-        };
-        event_broker.publish(shard_positions_update);
-
-        // Yield so that the event is processed.
-        yield_now().await;
-
-        let state_guard = router.state.lock().await;
-        let shards = state_guard
-            .routing_table
-            .find_entry("test-index-0", "test-source")
-            .unwrap()
-            .all_shards();
-        assert_eq!(shards.len(), 1);
-        assert_eq!(shards[0].shard_id, ShardId::from(2));
-        drop(state_guard);
+        let response = router
+            .ingest(IngestRequestV2 {
+                subrequests: vec![IngestSubrequest {
+                    subrequest_id: 0,
+                    index_id: "test-index-0".to_string(),
+                    source_id: "test-source".to_string(),
+                    doc_batch: Some(DocBatchV2::for_test(["test-doc-foo"])),
+                }],
+                commit_type: CommitTypeV2::Auto as i32,
+            })
+            .await
+            .unwrap();
+        assert_eq!(response.successes.len(), 1);
+        assert_eq!(response.failures.len(), 0);
     }
 
     #[tokio::test]
@@ -1922,42 +1434,48 @@ mod tests {
         let index_uid_0: IndexUid = IndexUid::for_test("test-index-0", 0);
         let index_uid_1: IndexUid = IndexUid::for_test("test-index-1", 0);
 
-        let mut state_guard = router.state.lock().await;
-        state_guard.routing_table.replace_shards(
-            index_uid_0.clone(),
-            "test-source",
-            vec![Shard {
-                index_uid: Some(index_uid_0.clone()),
-                shard_id: Some(ShardId::from(1)),
-                shard_state: ShardState::Open as i32,
-                leader_id: "test-ingester".to_string(),
-                ..Default::default()
-            }],
-        );
-        state_guard.routing_table.replace_shards(
-            index_uid_1.clone(),
-            "test-source",
-            vec![Shard {
-                index_uid: Some(index_uid_1.clone()),
-                shard_id: Some(ShardId::from(2)),
-                shard_state: ShardState::Open as i32,
-                leader_id: "test-ingester".to_string(),
-                ..Default::default()
-            }],
-        );
-        drop(state_guard);
+        {
+            let mut state_guard = router.state.lock().await;
+            state_guard.node_routing_table.merge_from_shards(
+                index_uid_0.clone(),
+                "test-source".to_string(),
+                vec![Shard {
+                    index_uid: Some(index_uid_0.clone()),
+                    shard_id: Some(ShardId::from(1)),
+                    shard_state: ShardState::Open as i32,
+                    leader_id: "test-ingester-0".to_string(),
+                    ..Default::default()
+                }],
+            );
+            state_guard.node_routing_table.merge_from_shards(
+                index_uid_1.clone(),
+                "test-source".to_string(),
+                vec![Shard {
+                    index_uid: Some(index_uid_1.clone()),
+                    shard_id: Some(ShardId::from(2)),
+                    shard_state: ShardState::Open as i32,
+                    leader_id: "test-ingester-1".to_string(),
+                    ..Default::default()
+                }],
+            );
+        }
 
         let debug_info = router.debug_info().await;
         let routing_table = &debug_info["routing_table"];
         assert_eq!(routing_table.as_object().unwrap().len(), 2);
 
-        assert_eq!(routing_table["test-index-0"].as_array().unwrap().len(), 1);
-        assert_eq!(routing_table["test-index-1"].as_array().unwrap().len(), 1);
+        let index_0_entries = routing_table["test-index-0"].as_array().unwrap();
+        assert_eq!(index_0_entries.len(), 1);
+        assert_eq!(index_0_entries[0]["node_id"], "test-ingester-0");
+        assert_eq!(index_0_entries[0]["capacity_score"], 5);
+
+        let index_1_entries = routing_table["test-index-1"].as_array().unwrap();
+        assert_eq!(index_1_entries.len(), 1);
+        assert_eq!(index_1_entries[0]["node_id"], "test-ingester-1");
     }
 
     #[tokio::test]
-    async fn test_router_does_not_retry_rate_limited_shards() {
-        // We avoid retrying a shard limited shard at the scale of a workbench.
+    async fn test_router_returns_rate_limited_failure() {
         let self_node_id = "test-router".into();
         let control_plane = ControlPlaneServiceClient::from_mock(MockControlPlaneService::new());
         let ingester_pool = IngesterPool::default();
@@ -1969,138 +1487,51 @@ mod tests {
             replication_factor,
             EventBroker::default(),
         );
-        let mut state_guard = router.state.lock().await;
         let index_uid: IndexUid = IndexUid::for_test("test-index-0", 0);
-
-        state_guard.routing_table.replace_shards(
-            index_uid.clone(),
-            "test-source",
-            vec![
-                Shard {
+        {
+            let mut state_guard = router.state.lock().await;
+            state_guard.node_routing_table.merge_from_shards(
+                index_uid.clone(),
+                "test-source".to_string(),
+                vec![Shard {
                     index_uid: Some(index_uid.clone()),
                     source_id: "test-source".to_string(),
                     shard_id: Some(ShardId::from(1)),
                     shard_state: ShardState::Open as i32,
                     leader_id: "test-ingester-0".to_string(),
                     ..Default::default()
-                },
-                Shard {
-                    index_uid: Some(index_uid.clone()),
-                    source_id: "test-source".to_string(),
-                    shard_id: Some(ShardId::from(2)),
-                    shard_state: ShardState::Open as i32,
-                    leader_id: "test-ingester-0".to_string(),
-                    ..Default::default()
-                },
-            ],
-        );
-        drop(state_guard);
-
-        // We have two shards.
-        // - shard 1 is rate limited
-        // - shard 2 is timeout.
-        // We expect a retry on shard 2 that is then successful.
-        let mut seq = Sequence::new();
+                }],
+            );
+        }
 
         let mut mock_ingester_0 = MockIngesterService::new();
-        mock_ingester_0
-            .expect_persist()
-            .times(1)
-            .returning(move |request| {
-                assert_eq!(request.leader_id, "test-ingester-0");
-                assert_eq!(request.commit_type(), CommitTypeV2::Auto);
-                assert_eq!(request.subrequests.len(), 1);
-                let subrequest = &request.subrequests[0];
-                assert_eq!(subrequest.subrequest_id, 0);
-                let index_uid = subrequest.index_uid().clone();
-                assert_eq!(subrequest.source_id, "test-source");
-                assert_eq!(subrequest.shard_id(), ShardId::from(1));
-                assert_eq!(
-                    subrequest.doc_batch,
-                    Some(DocBatchV2::for_test(["test-doc-foo"]))
-                );
-
-                let response = PersistResponse {
-                    leader_id: request.leader_id,
-                    successes: Vec::new(),
-                    failures: vec![PersistFailure {
-                        subrequest_id: 0,
-                        index_uid: Some(index_uid),
-                        source_id: "test-source".to_string(),
-                        shard_id: Some(ShardId::from(1)),
-                        reason: PersistFailureReason::ShardRateLimited as i32,
-                    }],
-                };
-                Ok(response)
-            })
-            .in_sequence(&mut seq);
-
-        mock_ingester_0
-            .expect_persist()
-            .times(1)
-            .returning(move |request| {
-                assert_eq!(request.leader_id, "test-ingester-0");
-                assert_eq!(request.commit_type(), CommitTypeV2::Auto);
-                assert_eq!(request.subrequests.len(), 1);
-                let subrequest = &request.subrequests[0];
-                assert_eq!(subrequest.subrequest_id, 0);
-                let index_uid = subrequest.index_uid().clone();
-                assert_eq!(subrequest.source_id, "test-source");
-                assert_eq!(subrequest.shard_id(), ShardId::from(2));
-                assert_eq!(
-                    subrequest.doc_batch,
-                    Some(DocBatchV2::for_test(["test-doc-foo"]))
-                );
-
-                let response = PersistResponse {
-                    leader_id: request.leader_id,
-                    successes: Vec::new(),
-                    failures: vec![PersistFailure {
-                        subrequest_id: 0,
-                        index_uid: Some(index_uid),
-                        source_id: "test-source".to_string(),
-                        shard_id: Some(ShardId::from(1)),
-                        reason: PersistFailureReason::Timeout as i32,
-                    }],
-                };
-                Ok(response)
-            })
-            .in_sequence(&mut seq);
-
-        mock_ingester_0
-            .expect_persist()
-            .times(1)
-            .returning(move |request| {
-                assert_eq!(request.leader_id, "test-ingester-0");
-                assert_eq!(request.commit_type(), CommitTypeV2::Auto);
-                assert_eq!(request.subrequests.len(), 1);
-                let subrequest = &request.subrequests[0];
-                assert_eq!(subrequest.subrequest_id, 0);
-                let index_uid = subrequest.index_uid().clone();
-                assert_eq!(subrequest.source_id, "test-source");
-                assert_eq!(subrequest.shard_id(), ShardId::from(2));
-                assert_eq!(
-                    subrequest.doc_batch,
-                    Some(DocBatchV2::for_test(["test-doc-foo"]))
-                );
-
-                let response = PersistResponse {
-                    leader_id: request.leader_id,
-                    successes: vec![PersistSuccess {
-                        subrequest_id: 0,
-                        index_uid: Some(index_uid),
-                        source_id: "test-source".to_string(),
-                        shard_id: Some(ShardId::from(1)),
-                        num_persisted_docs: 1,
-                        replication_position_inclusive: Some(Position::offset(0u64)),
-                        parse_failures: Vec::new(),
-                    }],
-                    failures: Vec::new(),
-                };
-                Ok(response)
-            })
-            .in_sequence(&mut seq);
+        mock_ingester_0.expect_persist().returning(move |request| {
+            assert_eq!(request.leader_id, "test-ingester-0");
+            assert_eq!(request.commit_type(), CommitTypeV2::Auto);
+            assert_eq!(request.subrequests.len(), 1);
+            let subrequest = &request.subrequests[0];
+            assert_eq!(subrequest.subrequest_id, 0);
+            let index_uid = subrequest.index_uid().clone();
+            assert_eq!(subrequest.source_id, "test-source");
+            assert!(subrequest.shard_id.is_none());
+            assert_eq!(
+                subrequest.doc_batch,
+                Some(DocBatchV2::for_test(["test-doc-foo"]))
+            );
 
+            let response = PersistResponse {
+                leader_id: request.leader_id,
+                successes: Vec::new(),
+                failures: vec![PersistFailure {
+                    subrequest_id: 0,
+                    index_uid: Some(index_uid),
+                    source_id: "test-source".to_string(),
+                    shard_id: Some(ShardId::from(1)),
+                    reason: PersistFailureReason::NoShardsAvailable as i32,
+                }],
+            };
+            Ok(response)
+        });
         let ingester_0 = IngesterServiceClient::from_mock(mock_ingester_0);
         ingester_pool.insert("test-ingester-0".into(), ingester_0.clone());
 
@@ -2113,88 +1544,130 @@ mod tests {
             }],
             commit_type: CommitTypeV2::Auto as i32,
         };
-        router.ingest(ingest_request).await.unwrap();
+        let ingest_response = router.ingest(ingest_request).await.unwrap();
+        assert_eq!(ingest_response.successes.len(), 0);
+        assert_eq!(ingest_response.failures.len(), 1);
+        assert_eq!(
+            ingest_response.failures[0].reason(),
+            IngestFailureReason::NoShardsAvailable
+        );
     }
 
     #[tokio::test]
-    async fn test_router_returns_rate_limited_failure() {
-        let self_node_id = "test-router".into();
-        let control_plane = ControlPlaneServiceClient::from_mock(MockControlPlaneService::new());
-        let ingester_pool = IngesterPool::default();
-        let replication_factor = 1;
+    async fn test_router_updates_node_routing_table_on_capacity_update() {
+        let event_broker = EventBroker::default();
         let router = IngestRouter::new(
-            self_node_id,
-            control_plane,
-            ingester_pool.clone(),
-            replication_factor,
-            EventBroker::default(),
+            "test-router".into(),
+            ControlPlaneServiceClient::from_mock(MockControlPlaneService::new()),
+            IngesterPool::default(),
+            1,
+            event_broker.clone(),
         );
-        let mut state_guard = router.state.lock().await;
-        let index_uid: IndexUid = IndexUid::for_test("test-index-0", 0);
+        router.subscribe();
 
-        state_guard.routing_table.replace_shards(
-            index_uid.clone(),
-            "test-source",
-            vec![Shard {
-                index_uid: Some(index_uid.clone()),
+        event_broker.publish(IngesterCapacityScoreUpdate {
+            node_id: "test-ingester-0".into(),
+            source_uid: SourceUid {
+                index_uid: IndexUid::for_test("test-index", 0),
                 source_id: "test-source".to_string(),
-                shard_id: Some(ShardId::from(1)),
-                shard_state: ShardState::Open as i32,
-                leader_id: "test-ingester-0".to_string(),
-                ..Default::default()
-            }],
-        );
-        drop(state_guard);
-
-        let mut mock_ingester_0 = MockIngesterService::new();
-        mock_ingester_0
-            .expect_persist()
-            .times(1)
-            .returning(move |request| {
-                assert_eq!(request.leader_id, "test-ingester-0");
-                assert_eq!(request.commit_type(), CommitTypeV2::Auto);
-                assert_eq!(request.subrequests.len(), 1);
-                let subrequest = &request.subrequests[0];
-                assert_eq!(subrequest.subrequest_id, 0);
-                let index_uid = subrequest.index_uid().clone();
-                assert_eq!(subrequest.source_id, "test-source");
-                assert_eq!(subrequest.shard_id(), ShardId::from(1));
-                assert_eq!(
-                    subrequest.doc_batch,
-                    Some(DocBatchV2::for_test(["test-doc-foo"]))
-                );
+            },
+            capacity_score: 7,
+            open_shard_count: 3,
+        });
+        // Give the async subscriber a moment to process.
+        tokio::time::sleep(Duration::from_millis(10)).await;
 
-                let response = PersistResponse {
-                    leader_id: request.leader_id,
-                    successes: Vec::new(),
-                    failures: vec![PersistFailure {
-                        subrequest_id: 0,
-                        index_uid: Some(index_uid),
-                        source_id: "test-source".to_string(),
-                        shard_id: Some(ShardId::from(1)),
-                        reason: PersistFailureReason::ShardRateLimited as i32,
-                    }],
-                };
-                Ok(response)
-            });
-        let ingester_0 = IngesterServiceClient::from_mock(mock_ingester_0);
-        ingester_pool.insert("test-ingester-0".into(), ingester_0.clone());
+        let state_guard = router.state.lock().await;
+        let entry = state_guard
+            .node_routing_table
+            .find_entry("test-index", "test-source")
+            .unwrap();
+        let node = entry.nodes.get("test-ingester-0").unwrap();
+        assert_eq!(node.capacity_score, 7);
+        assert_eq!(node.open_shard_count, 3);
+    }
 
-        let ingest_request = IngestRequestV2 {
-            subrequests: vec![IngestSubrequest {
+    #[tokio::test]
+    async fn test_router_process_persist_results_marks_unavailable_on_persist_failure() {
+        let router = IngestRouter::new(
+            "test-router".into(),
+            ControlPlaneServiceClient::from_mock(MockControlPlaneService::new()),
+            IngesterPool::default(),
+            1,
+            EventBroker::default(),
+        );
+        let ingest_subrequests = vec![
+            IngestSubrequest {
                 subrequest_id: 0,
                 index_id: "test-index-0".to_string(),
                 source_id: "test-source".to_string(),
-                doc_batch: Some(DocBatchV2::for_test(["test-doc-foo"])),
-            }],
-            commit_type: CommitTypeV2::Auto as i32,
-        };
-        let ingest_response = router.ingest(ingest_request).await.unwrap();
-        assert_eq!(ingest_response.successes.len(), 0);
-        assert_eq!(ingest_response.failures.len(), 1);
-        assert_eq!(
-            ingest_response.failures[0].reason(),
-            IngestFailureReason::ShardRateLimited
+                ..Default::default()
+            },
+            IngestSubrequest {
+                subrequest_id: 1,
+                index_id: "test-index-1".to_string(),
+                source_id: "test-source".to_string(),
+                ..Default::default()
+            },
+        ];
+        let mut workbench = IngestWorkbench::new(ingest_subrequests, 2);
+
+        // NoShardsAvailable does NOT mark the leader as unavailable.
+        let persist_futures = FuturesUnordered::new();
+        persist_futures.push(async {
+            let summary = PersistRequestSummary {
+                leader_id: "test-ingester-0".into(),
+                subrequest_ids: vec![0],
+            };
+            let result = Ok::<_, IngestV2Error>(PersistResponse {
+                leader_id: "test-ingester-0".to_string(),
+                successes: Vec::new(),
+                failures: vec![PersistFailure {
+                    subrequest_id: 0,
+                    index_uid: Some(IndexUid::for_test("test-index-0", 0)),
+                    source_id: "test-source".to_string(),
+                    shard_id: Some(ShardId::from(1)),
+                    reason: PersistFailureReason::NoShardsAvailable as i32,
+                }],
+            });
+            (summary, result)
+        });
+        router
+            .process_persist_results(&mut workbench, persist_futures)
+            .await;
+        assert!(
+            !workbench
+                .unavailable_leaders
+                .contains(&NodeId::from("test-ingester-0"))
+        );
+
+        // NodeUnavailable DOES mark the leader as unavailable.
+        let persist_futures = FuturesUnordered::new();
+        persist_futures.push(async {
+            let summary = PersistRequestSummary {
+                leader_id: "test-ingester-1".into(),
+                subrequest_ids: vec![1],
+            };
+            let result = Ok::<_, IngestV2Error>(PersistResponse {
+                leader_id: "test-ingester-1".to_string(),
+                successes: Vec::new(),
+                failures: vec![PersistFailure {
+                    subrequest_id: 1,
+                    index_uid: Some(IndexUid::for_test("test-index-1", 0)),
+                    source_id: "test-source".to_string(),
+                    shard_id: Some(ShardId::from(1)),
+                    reason: PersistFailureReason::NodeUnavailable as i32,
+                }],
+            });
+            (summary, result)
+        });
+        router
+            .process_persist_results(&mut workbench, persist_futures)
+            .await;
+        assert!(
+            workbench
+                .unavailable_leaders
+                .contains(&NodeId::from("test-ingester-1"))
         );
     }
 }
diff --git a/quickwit/quickwit-ingest/src/ingest_v2/workbench.rs b/quickwit/quickwit-ingest/src/ingest_v2/workbench.rs
index 311aef138d8..3e7b22969e8 100644
--- a/quickwit/quickwit-ingest/src/ingest_v2/workbench.rs
+++ b/quickwit/quickwit-ingest/src/ingest_v2/workbench.rs
@@ -24,7 +24,7 @@ use quickwit_proto::ingest::router::{
     IngestFailure, IngestFailureReason, IngestResponseV2, IngestSubrequest, IngestSuccess,
 };
 use quickwit_proto::ingest::{IngestV2Error, RateLimitingCause};
-use quickwit_proto::types::{NodeId, ShardId, SubrequestId};
+use quickwit_proto::types::{NodeId, SubrequestId};
 use tracing::warn;
 
 use super::publish_tracker::PublishTracker;
@@ -35,7 +35,6 @@ use super::router::PersistRequestSummary;
 #[derive(Default)]
 pub(super) struct IngestWorkbench {
     pub subworkbenches: BTreeMap<SubrequestId, IngestSubworkbench>,
-    pub rate_limited_shards: HashSet<ShardId>,
     pub num_successes: usize,
     /// The number of batch persist attempts. This is not sum of the number of attempts for each
     /// subrequest.
@@ -228,13 +227,6 @@ impl IngestWorkbench {
         self.record_failure(subrequest_id, SubworkbenchFailure::NoShardsAvailable);
     }
 
-    pub fn record_rate_limited(&mut self, subrequest_id: SubrequestId) {
-        self.record_failure(
-            subrequest_id,
-            SubworkbenchFailure::RateLimited(RateLimitingCause::ShardRateLimiting),
-        );
-    }
-
     /// Marks a node as unavailable for the span of the workbench.
     ///
     /// Remaining attempts will treat the node as if it was not in the ingester pool.
@@ -433,7 +425,7 @@ mod tests {
         assert!(!subworkbench.last_failure_is_transient());
 
         subworkbench.last_failure_opt = Some(SubworkbenchFailure::Persist(
-            PersistFailureReason::ShardRateLimited,
+            PersistFailureReason::NoShardsAvailable,
         ));
         assert!(subworkbench.is_pending());
         assert!(subworkbench.last_failure_is_transient());
@@ -807,7 +799,7 @@ mod tests {
 
         let persist_failure = PersistFailure {
             subrequest_id: 42,
-            reason: PersistFailureReason::ShardRateLimited as i32,
+            reason: PersistFailureReason::NoShardsAvailable as i32,
             ..Default::default()
         };
         workbench.record_persist_failure(&persist_failure);
diff --git a/quickwit/quickwit-proto/protos/quickwit/ingester.proto b/quickwit/quickwit-proto/protos/quickwit/ingester.proto
index a5b651d94d8..25a4705d58a 100644
--- a/quickwit/quickwit-proto/protos/quickwit/ingester.proto
+++ b/quickwit/quickwit-proto/protos/quickwit/ingester.proto
@@ -96,11 +96,10 @@ message PersistSuccess {
 
 enum PersistFailureReason {
   PERSIST_FAILURE_REASON_UNSPECIFIED = 0;
-  PERSIST_FAILURE_REASON_SHARD_NOT_FOUND = 1;
-  PERSIST_FAILURE_REASON_SHARD_CLOSED = 2;
-  PERSIST_FAILURE_REASON_SHARD_RATE_LIMITED = 3;
   PERSIST_FAILURE_REASON_WAL_FULL = 4;
   PERSIST_FAILURE_REASON_TIMEOUT = 5;
+  PERSIST_FAILURE_REASON_NO_SHARDS_AVAILABLE = 6;
+  PERSIST_FAILURE_REASON_NODE_UNAVAILABLE = 7;
 }
 
 message PersistFailure {
diff --git a/quickwit/quickwit-proto/src/codegen/quickwit/quickwit.ingest.ingester.rs b/quickwit/quickwit-proto/src/codegen/quickwit/quickwit.ingest.ingester.rs
index 018e19a39a9..07b8d5b64a1 100644
--- a/quickwit/quickwit-proto/src/codegen/quickwit/quickwit.ingest.ingester.rs
+++ b/quickwit/quickwit-proto/src/codegen/quickwit/quickwit.ingest.ingester.rs
@@ -397,11 +397,10 @@ pub struct ObservationMessage {
 #[repr(i32)]
 pub enum PersistFailureReason {
     Unspecified = 0,
-    ShardNotFound = 1,
-    ShardClosed = 2,
-    ShardRateLimited = 3,
     WalFull = 4,
     Timeout = 5,
+    NoShardsAvailable = 6,
+    NodeUnavailable = 7,
 }
 impl PersistFailureReason {
     /// String value of the enum field names used in the ProtoBuf definition.
@@ -411,22 +410,20 @@ impl PersistFailureReason {
     pub fn as_str_name(&self) -> &'static str {
         match self {
             Self::Unspecified => "PERSIST_FAILURE_REASON_UNSPECIFIED",
-            Self::ShardNotFound => "PERSIST_FAILURE_REASON_SHARD_NOT_FOUND",
-            Self::ShardClosed => "PERSIST_FAILURE_REASON_SHARD_CLOSED",
-            Self::ShardRateLimited => "PERSIST_FAILURE_REASON_SHARD_RATE_LIMITED",
             Self::WalFull => "PERSIST_FAILURE_REASON_WAL_FULL",
             Self::Timeout => "PERSIST_FAILURE_REASON_TIMEOUT",
+            Self::NoShardsAvailable => "PERSIST_FAILURE_REASON_NO_SHARDS_AVAILABLE",
+            Self::NodeUnavailable => "PERSIST_FAILURE_REASON_NODE_UNAVAILABLE",
         }
     }
     /// Creates an enum from field names used in the ProtoBuf definition.
     pub fn from_str_name(value: &str) -> ::core::option::Option<Self> {
         match value {
             "PERSIST_FAILURE_REASON_UNSPECIFIED" => Some(Self::Unspecified),
-            "PERSIST_FAILURE_REASON_SHARD_NOT_FOUND" => Some(Self::ShardNotFound),
-            "PERSIST_FAILURE_REASON_SHARD_CLOSED" => Some(Self::ShardClosed),
-            "PERSIST_FAILURE_REASON_SHARD_RATE_LIMITED" => Some(Self::ShardRateLimited),
             "PERSIST_FAILURE_REASON_WAL_FULL" => Some(Self::WalFull),
             "PERSIST_FAILURE_REASON_TIMEOUT" => Some(Self::Timeout),
+            "PERSIST_FAILURE_REASON_NO_SHARDS_AVAILABLE" => Some(Self::NoShardsAvailable),
+            "PERSIST_FAILURE_REASON_NODE_UNAVAILABLE" => Some(Self::NodeUnavailable),
             _ => None,
         }
     }
diff --git a/quickwit/quickwit-proto/src/ingest/mod.rs b/quickwit/quickwit-proto/src/ingest/mod.rs
index fda347d7931..6d529d79fbd 100644
--- a/quickwit/quickwit-proto/src/ingest/mod.rs
+++ b/quickwit/quickwit-proto/src/ingest/mod.rs
@@ -313,11 +313,10 @@ impl From<PersistFailureReason> for IngestFailureReason {
     fn from(reason: PersistFailureReason) -> Self {
         match reason {
             PersistFailureReason::Unspecified => IngestFailureReason::Unspecified,
-            PersistFailureReason::ShardNotFound => IngestFailureReason::NoShardsAvailable,
-            PersistFailureReason::ShardClosed => IngestFailureReason::NoShardsAvailable,
+            PersistFailureReason::NoShardsAvailable => IngestFailureReason::NoShardsAvailable,
             PersistFailureReason::WalFull => IngestFailureReason::WalFull,
-            PersistFailureReason::ShardRateLimited => IngestFailureReason::ShardRateLimited,
             PersistFailureReason::Timeout => IngestFailureReason::Timeout,
+            PersistFailureReason::NodeUnavailable => IngestFailureReason::NoShardsAvailable,
         }
     }
 }
@@ -326,8 +325,8 @@ impl From<ReplicateFailureReason> for PersistFailureReason {
     fn from(reason: ReplicateFailureReason) -> Self {
         match reason {
             ReplicateFailureReason::Unspecified => PersistFailureReason::Unspecified,
-            ReplicateFailureReason::ShardNotFound => PersistFailureReason::ShardNotFound,
-            ReplicateFailureReason::ShardClosed => PersistFailureReason::ShardClosed,
+            ReplicateFailureReason::ShardNotFound => PersistFailureReason::NoShardsAvailable,
+            ReplicateFailureReason::ShardClosed => PersistFailureReason::NoShardsAvailable,
             ReplicateFailureReason::WalFull => PersistFailureReason::WalFull,
         }
     }
diff --git a/quickwit/quickwit-serve/src/lib.rs b/quickwit/quickwit-serve/src/lib.rs
index ca4520ff0ce..60515bc819f 100644
--- a/quickwit/quickwit-serve/src/lib.rs
+++ b/quickwit/quickwit-serve/src/lib.rs
@@ -82,8 +82,9 @@ use quickwit_indexing::models::ShardPositionsService;
 use quickwit_indexing::start_indexing_service;
 use quickwit_ingest::{
     GetMemoryCapacity, IngestRequest, IngestRouter, IngestServiceClient, Ingester, IngesterPool,
-    LocalShardsUpdate, get_idle_shard_timeout, setup_local_shards_update_listener,
-    start_ingest_api_service, wait_for_ingester_decommission, wait_for_ingester_status,
+    LocalShardsUpdate, get_idle_shard_timeout, setup_ingester_capacity_update_listener,
+    setup_local_shards_update_listener, start_ingest_api_service, wait_for_ingester_decommission,
+    wait_for_ingester_status,
 };
 use quickwit_jaeger::JaegerService;
 use quickwit_janitor::{JanitorService, start_janitor_service};
@@ -906,6 +907,9 @@ async fn setup_ingest_v2(
         event_broker.clone(),
     );
     ingest_router.subscribe();
+    setup_ingester_capacity_update_listener(cluster.clone(), event_broker.clone())
+        .await
+        .forever();
 
     let ingest_router_service = IngestRouterServiceClient::tower()
         .stack_layer(INGEST_GRPC_SERVER_METRICS_LAYER.clone())
diff --git a/quickwit/rest-api-tests/scenarii/tag_fields/0002_negative_tags.yaml b/quickwit/rest-api-tests/scenarii/tag_fields/0002_negative_tags.yaml
index f1f900c342c..99f0e5ed285 100644
--- a/quickwit/rest-api-tests/scenarii/tag_fields/0002_negative_tags.yaml
+++ b/quickwit/rest-api-tests/scenarii/tag_fields/0002_negative_tags.yaml
@@ -1,23 +1,23 @@
 # regression test for https://github.com/quickwit-oss/quickwit/issues/4698
-endpoint: simple/search
+endpoint: tag-simple/search
 params:
   query: "tag:1"
 expected:
   num_hits: 3
 ---
-endpoint: simple/search
+endpoint: tag-simple/search
 params:
   query: "-tag:2"
 expected:
   num_hits: 4
 ---
-endpoint: simple/search
+endpoint: tag-simple/search
 params:
   query: "tag:2"
 expected:
   num_hits: 1
 ---
-endpoint: simple/search
+endpoint: tag-simple/search
 params:
   query: "-tag:1"
 expected:
diff --git a/quickwit/rest-api-tests/scenarii/tag_fields/_setup.quickwit.yaml b/quickwit/rest-api-tests/scenarii/tag_fields/_setup.quickwit.yaml
index 1208ca48343..4ae0b2eb465 100644
--- a/quickwit/rest-api-tests/scenarii/tag_fields/_setup.quickwit.yaml
+++ b/quickwit/rest-api-tests/scenarii/tag_fields/_setup.quickwit.yaml
@@ -4,14 +4,14 @@ endpoint: indexes/allowedtypes
 status_code: null
 ---
 method: DELETE
-endpoint: indexes/simple
+endpoint: indexes/tag-simple
 status_code: null
 ---
 method: POST
 endpoint: indexes/
 json:
   version: "0.7"
-  index_id: simple
+  index_id: tag-simple
   doc_mapping:
     field_mappings:
       - name: seq
@@ -21,7 +21,7 @@ json:
     tag_fields: ["tag"]
 ---
 method: POST
-endpoint: simple/ingest
+endpoint: tag-simple/ingest
 params:
   commit: force
 ndjson:
@@ -29,7 +29,7 @@ ndjson:
   - {"seq": 2, "tag": 2}
 ---
 method: POST
-endpoint: simple/ingest
+endpoint: tag-simple/ingest
 params:
   commit: force
 ndjson:
@@ -37,7 +37,7 @@ ndjson:
   - {"seq": 3, "tag": null}
 ---
 method: POST
-endpoint: simple/ingest
+endpoint: tag-simple/ingest
 params:
   commit: force
 ndjson:
diff --git a/quickwit/rest-api-tests/scenarii/tag_fields/_teardown.quickwit.yaml b/quickwit/rest-api-tests/scenarii/tag_fields/_teardown.quickwit.yaml
index fa0ca391b51..0c3ac8fd6a4 100644
--- a/quickwit/rest-api-tests/scenarii/tag_fields/_teardown.quickwit.yaml
+++ b/quickwit/rest-api-tests/scenarii/tag_fields/_teardown.quickwit.yaml
@@ -3,4 +3,4 @@ endpoint: indexes/allowedtypes
 status_code: null
 ---
 method: DELETE
-endpoint: indexes/simple
+endpoint: indexes/tag-simple

From 006f951c25379fc664bab2220ccc69583d2347ba Mon Sep 17 00:00:00 2001
From: nadav-govari <nadav.govari@datadoghq.com>
Date: Tue, 3 Mar 2026 13:28:39 -0500
Subject: [PATCH 4/9] Piggyback routing update on persist response (#6173)

---
 .github/workflows/ci.yml                      |  28 ++-
 .../broadcast/ingester_capacity_score.rs      | 233 ++----------------
 .../src/ingest_v2/debouncing.rs               |   4 +-
 .../quickwit-ingest/src/ingest_v2/ingester.rs |  28 ++-
 quickwit/quickwit-ingest/src/ingest_v2/mod.rs |   7 +-
 .../src/ingest_v2/node_routing_table.rs       | 190 +++++++++-----
 .../quickwit-ingest/src/ingest_v2/router.rs   | 162 +++++++++++-
 .../quickwit-ingest/src/ingest_v2/state.rs    | 158 ++++++++----
 .../src/ingest_v2/wal_capacity_timeseries.rs  | 214 ++++++++++++++++
 .../src/ingest_v2/workbench.rs                |   3 +-
 .../protos/quickwit/ingester.proto            |  13 +
 .../quickwit/quickwit.ingest.ingester.rs      |  22 ++
 quickwit/quickwit-proto/src/getters.rs        |   1 +
 13 files changed, 701 insertions(+), 362 deletions(-)
 create mode 100644 quickwit/quickwit-ingest/src/ingest_v2/wal_capacity_timeseries.rs

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index e4a12d69521..ada6218f295 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -55,16 +55,24 @@ jobs:
       - name: Cleanup Disk Space
         run: |
           df -h
-          sudo rm -rf /usr/share/dotnet
-          sudo rm -rf /usr/local/lib/android
-          sudo rm -rf /usr/share/swift
-          sudo rm -rf /usr/local/.ghcup
-          sudo rm -rf /opt/hostedtoolcache/CodeQL
-          df -h
+        
+          if [ "$(df -BG / | awk 'NR==2 {gsub("G","",$4); print $4}')" -lt 30 ]; then
+            echo "Less than 30GiB available. Running cleanup..."
+            sudo rm -rf /usr/share/dotnet
+            sudo rm -rf /usr/local/lib/android
+            sudo rm -rf /usr/share/swift
+            sudo rm -rf /usr/local/.ghcup
+            sudo rm -rf /opt/hostedtoolcache/CodeQL
+            df -h
+          else
+            echo "30GiB or more available. Skipping cleanup."
+          fi
 
       - uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6.0.1
       - name: Install Ubuntu packages
-        run: sudo apt-get -y install protobuf-compiler
+        run: |
+          sudo apt-get update
+          sudo apt-get -y install protobuf-compiler
       - uses: actions/setup-python@83679a892e2d95755f2dac6acb0bfd1e9ac5d548 # v.6.1.0
         with:
           python-version: '3.11'
@@ -90,7 +98,7 @@ jobs:
           workspaces: "./quickwit -> target"
       - name: Install nextest
         if: always() && steps.modified.outputs.rust_src == 'true'
-        uses: taiki-e/cache-cargo-install-action@34ce5120836e5f9f1508d8713d7fdea0e8facd6f # v3.0.1
+        uses: taiki-e/install-action@aba36d755ec7ca22d38b12111787c26115943952
         with:
           tool: cargo-nextest
       - name: cargo build
@@ -132,7 +140,9 @@ jobs:
               - .github/workflows/ci.yml
       - name: Install Ubuntu packages
         if: always() && steps.modified.outputs.rust_src == 'true'
-        run: sudo apt-get -y install protobuf-compiler
+        run: |
+          sudo apt-get update
+          sudo apt-get -y install protobuf-compiler
       - name: Setup nightly Rust Toolchain (for rustfmt)
         if: steps.modified.outputs.rust_src == 'true'
         uses: dtolnay/rust-toolchain@f7ccc83f9ed1e5b9c81d8a67d7ad1a747e22a561 # master
diff --git a/quickwit/quickwit-ingest/src/ingest_v2/broadcast/ingester_capacity_score.rs b/quickwit/quickwit-ingest/src/ingest_v2/broadcast/ingester_capacity_score.rs
index 9531db17deb..482f5f58886 100644
--- a/quickwit/quickwit-ingest/src/ingest_v2/broadcast/ingester_capacity_score.rs
+++ b/quickwit/quickwit-ingest/src/ingest_v2/broadcast/ingester_capacity_score.rs
@@ -18,100 +18,17 @@ use anyhow::{Context, Result};
 use bytesize::ByteSize;
 use quickwit_cluster::{Cluster, ListenerHandle};
 use quickwit_common::pubsub::{Event, EventBroker};
-use quickwit_common::ring_buffer::RingBuffer;
 use quickwit_common::shared_consts::INGESTER_CAPACITY_SCORE_PREFIX;
 use quickwit_proto::ingest::ingester::IngesterStatus;
-use quickwit_proto::types::{IndexUid, NodeId, SourceId, SourceUid};
+use quickwit_proto::types::{NodeId, SourceUid};
 use serde::{Deserialize, Serialize};
 use tokio::task::JoinHandle;
 use tracing::{info, warn};
 
 use super::{BROADCAST_INTERVAL_PERIOD, make_key, parse_key};
+use crate::OpenShardCounts;
 use crate::ingest_v2::state::WeakIngesterState;
 
-pub type OpenShardCounts = Vec<(IndexUid, SourceId, usize)>;
-
-/// The lookback window length is meant to capture readings far enough back in time to give
-/// a rough rate of change estimate. At size 6, with broadcast interval of 5 seconds, this would be
-/// 30 seconds of readings.
-const WAL_CAPACITY_LOOKBACK_WINDOW_LEN: usize = 6;
-
-/// The ring buffer stores one extra element so that `delta()` can compare the newest reading
-/// with the one that is exactly `WAL_CAPACITY_LOOKBACK_WINDOW_LEN` steps ago. Otherwise, that
-/// reading would be discarded when the next reading is inserted.
-const WAL_CAPACITY_READINGS_LEN: usize = WAL_CAPACITY_LOOKBACK_WINDOW_LEN + 1;
-
-struct WalDiskCapacityTimeSeries {
-    memory_capacity: ByteSize,
-    readings: RingBuffer<f64, WAL_CAPACITY_READINGS_LEN>,
-}
-
-impl WalDiskCapacityTimeSeries {
-    fn new(memory_capacity: ByteSize) -> Self {
-        #[cfg(not(test))]
-        assert!(memory_capacity.as_u64() > 0);
-        Self {
-            memory_capacity,
-            readings: RingBuffer::default(),
-        }
-    }
-
-    fn record(&mut self, memory_used: ByteSize) {
-        let remaining = 1.0 - (memory_used.as_u64() as f64 / self.memory_capacity.as_u64() as f64);
-        self.readings.push_back(remaining.clamp(0.0, 1.0));
-    }
-
-    fn current(&self) -> Option<f64> {
-        self.readings.last()
-    }
-
-    /// How much remaining capacity changed between the oldest and newest readings.
-    /// Positive = improving, negative = draining.
-    fn delta(&self) -> Option<f64> {
-        let current = self.readings.last()?;
-        let oldest = self.readings.front()?;
-        Some(current - oldest)
-    }
-}
-
-/// Computes a capacity score from 0 to 10 using a PD controller.
-///
-/// The score has two components:
-///
-/// - **P (proportional):** How much WAL capacity remains right now. An ingester with 100% free
-///   capacity gets `PROPORTIONAL_WEIGHT` points; 50% gets half; and so on. If remaining capacity
-///   drops to `MIN_PERMISSIBLE_CAPACITY` or below, the score is immediately 0.
-///
-/// - **D (derivative):** Up to `DERIVATIVE_WEIGHT` bonus points based on how fast remaining
-///   capacity is changing over the lookback window. A higher drain rate is worse, so we invert it:
-///   `drain / MAX_DRAIN_RATE` normalizes the drain to a 0–1 penalty, and subtracting from 1
-///   converts it into a 0–1 bonus. Multiplied by `DERIVATIVE_WEIGHT`, a stable node gets the full
-///   bonus and a node draining at `MAX_DRAIN_RATE` or faster gets nothing.
-///
-/// Putting it together: a completely idle ingester scores 10 (8 + 2).
-/// One that is full but stable scores ~2. One that is draining rapidly scores less.
-/// A score of 0 means the ingester is at or below minimum permissible capacity.
-///
-/// Below this remaining capacity fraction, the score is immediately 0.
-const MIN_PERMISSIBLE_CAPACITY: f64 = 0.05;
-/// Weight of the proportional term (max points from P).
-const PROPORTIONAL_WEIGHT: f64 = 8.0;
-/// Weight of the derivative term (max points from D).
-const DERIVATIVE_WEIGHT: f64 = 2.0;
-/// The drain rate (as a fraction of total capacity over the lookback window) at which the
-/// derivative penalty is fully applied. Drain rates beyond this are clamped.
-const MAX_DRAIN_RATE: f64 = 0.10;
-
-fn compute_capacity_score(remaining_capacity: f64, capacity_delta: f64) -> usize {
-    if remaining_capacity <= MIN_PERMISSIBLE_CAPACITY {
-        return 0;
-    }
-    let p = PROPORTIONAL_WEIGHT * remaining_capacity;
-    let drain = (-capacity_delta).clamp(0.0, MAX_DRAIN_RATE);
-    let d = DERIVATIVE_WEIGHT * (1.0 - drain / MAX_DRAIN_RATE);
-    (p + d).clamp(0.0, 10.0) as usize
-}
-
 #[derive(Debug, Clone, Default, Serialize, Deserialize)]
 pub struct IngesterCapacityScore {
     pub capacity_score: usize,
@@ -123,24 +40,18 @@ pub struct IngesterCapacityScore {
 pub struct BroadcastIngesterCapacityScoreTask {
     cluster: Cluster,
     weak_state: WeakIngesterState,
-    wal_capacity_time_series: WalDiskCapacityTimeSeries,
 }
 
 impl BroadcastIngesterCapacityScoreTask {
-    pub fn spawn(
-        cluster: Cluster,
-        weak_state: WeakIngesterState,
-        disk_capacity: ByteSize,
-    ) -> JoinHandle<()> {
+    pub fn spawn(cluster: Cluster, weak_state: WeakIngesterState) -> JoinHandle<()> {
         let mut broadcaster = Self {
             cluster,
             weak_state,
-            wal_capacity_time_series: WalDiskCapacityTimeSeries::new(disk_capacity),
         };
         tokio::spawn(async move { broadcaster.run().await })
     }
 
-    async fn snapshot(&self) -> Result<Option<(ByteSize, OpenShardCounts)>> {
+    async fn snapshot(&self) -> Result<Option<(usize, OpenShardCounts)>> {
         let state = self
             .weak_state
             .upgrade()
@@ -152,15 +63,16 @@ impl BroadcastIngesterCapacityScoreTask {
             return Ok(None);
         }
 
-        let guard = state
+        let mut guard = state
             .lock_fully()
             .await
             .map_err(|_| anyhow::anyhow!("failed to acquire ingester state lock"))?;
         let usage = guard.mrecordlog.resource_usage();
         let disk_used = ByteSize::b(usage.disk_used_bytes as u64);
-        let open_shard_counts = guard.get_open_shard_counts();
+        let capacity_score = guard.wal_capacity_time_series.record_and_score(disk_used);
+        let (open_shard_counts, _) = guard.get_shard_snapshot();
 
-        Ok(Some((disk_used, open_shard_counts)))
+        Ok(Some((capacity_score, open_shard_counts)))
     }
 
     async fn run(&mut self) {
@@ -170,7 +82,7 @@ impl BroadcastIngesterCapacityScoreTask {
         loop {
             interval.tick().await;
 
-            let (disk_used, open_shard_counts) = match self.snapshot().await {
+            let (capacity_score, open_shard_counts) = match self.snapshot().await {
                 Ok(Some(snapshot)) => snapshot,
                 Ok(None) => continue,
                 Err(error) => {
@@ -179,12 +91,6 @@ impl BroadcastIngesterCapacityScoreTask {
                 }
             };
 
-            self.wal_capacity_time_series.record(disk_used);
-
-            let remaining_capacity = self.wal_capacity_time_series.current().unwrap_or(1.0);
-            let capacity_delta = self.wal_capacity_time_series.delta().unwrap_or(0.0);
-            let capacity_score = compute_capacity_score(remaining_capacity, capacity_delta);
-
             previous_sources = self
                 .broadcast_capacity(capacity_score, &open_shard_counts, &previous_sources)
                 .await;
@@ -266,90 +172,12 @@ mod tests {
     use std::sync::atomic::{AtomicUsize, Ordering};
 
     use quickwit_cluster::{ChannelTransport, create_cluster_for_test};
-    use quickwit_proto::types::ShardId;
+    use quickwit_proto::types::{IndexUid, ShardId, SourceId};
 
     use super::*;
     use crate::ingest_v2::models::IngesterShard;
     use crate::ingest_v2::state::IngesterState;
 
-    fn ts() -> WalDiskCapacityTimeSeries {
-        WalDiskCapacityTimeSeries::new(ByteSize::b(100))
-    }
-
-    /// Helper: record a reading with `used` bytes against the series' fixed capacity.
-    fn record(series: &mut WalDiskCapacityTimeSeries, used: u64) {
-        series.record(ByteSize::b(used));
-    }
-
-    #[test]
-    fn test_wal_disk_capacity_current_after_record() {
-        let mut series = WalDiskCapacityTimeSeries::new(ByteSize::b(256));
-        // 192 of 256 used => 25% remaining
-        series.record(ByteSize::b(192));
-        assert_eq!(series.current(), Some(0.25));
-
-        // 16 of 256 used => 93.75% remaining
-        series.record(ByteSize::b(16));
-        assert_eq!(series.current(), Some(0.9375));
-    }
-
-    #[test]
-    fn test_wal_disk_capacity_record_saturates_at_zero() {
-        let mut series = ts();
-        // 200 used out of 100 capacity => clamped to 0.0
-        record(&mut series, 200);
-        assert_eq!(series.current(), Some(0.0));
-    }
-
-    #[test]
-    fn test_wal_disk_capacity_delta_growing() {
-        let mut series = ts();
-        // oldest: 60 of 100 used => 40% remaining
-        record(&mut series, 60);
-        // current: 20 of 100 used => 80% remaining
-        record(&mut series, 20);
-        // delta = 0.80 - 0.40 = 0.40
-        assert_eq!(series.delta(), Some(0.40));
-    }
-
-    #[test]
-    fn test_wal_disk_capacity_delta_shrinking() {
-        let mut series = ts();
-        // oldest: 20 of 100 used => 80% remaining
-        record(&mut series, 20);
-        // current: 60 of 100 used => 40% remaining
-        record(&mut series, 60);
-        // delta = 0.40 - 0.80 = -0.40
-        assert_eq!(series.delta(), Some(-0.40));
-    }
-
-    #[test]
-    fn test_capacity_score_draining_vs_stable() {
-        // Node A: capacity draining — usage increases 10, 20, ..., 70 over 7 ticks.
-        let mut node_a = ts();
-        for used in (10..=70).step_by(10) {
-            record(&mut node_a, used);
-        }
-        let a_remaining = node_a.current().unwrap();
-        let a_delta = node_a.delta().unwrap();
-        let a_score = compute_capacity_score(a_remaining, a_delta);
-
-        // Node B: steady at 50% usage over 7 ticks.
-        let mut node_b = ts();
-        for _ in 0..7 {
-            record(&mut node_b, 50);
-        }
-        let b_remaining = node_b.current().unwrap();
-        let b_delta = node_b.delta().unwrap();
-        let b_score = compute_capacity_score(b_remaining, b_delta);
-
-        // p=2.4, d=0 (max drain) => 2
-        assert_eq!(a_score, 2);
-        // p=4, d=2 (stable) => 6
-        assert_eq!(b_score, 6);
-        assert!(b_score > a_score);
-    }
-
     #[tokio::test]
     async fn test_snapshot_state_dropped() {
         let transport = ChannelTransport::default();
@@ -363,7 +191,6 @@ mod tests {
         let task = BroadcastIngesterCapacityScoreTask {
             cluster,
             weak_state,
-            wal_capacity_time_series: WalDiskCapacityTimeSeries::new(ByteSize::mb(1)),
         };
         assert!(task.snapshot().await.is_err());
     }
@@ -376,7 +203,9 @@ mod tests {
             .unwrap();
         let event_broker = EventBroker::default();
 
-        let (_temp_dir, state) = IngesterState::for_test().await;
+        // Use 1000 bytes disk capacity so 500 used => 50% remaining, 0 delta => score = 6
+        let (_temp_dir, state) =
+            IngesterState::for_test_with_disk_capacity(ByteSize::b(1000)).await;
         let index_uid = IndexUid::for_test("test-index", 0);
         let mut state_guard = state.lock_partially().await.unwrap();
         let shard = IngesterShard::new_solo(
@@ -387,21 +216,18 @@ mod tests {
         .advertisable()
         .build();
         state_guard.shards.insert(shard.queue_id(), shard);
-        let open_shard_counts = state_guard.get_open_shard_counts();
+        let (open_shard_counts, _) = state_guard.get_shard_snapshot();
+        let capacity_score = state_guard
+            .wal_capacity_time_series
+            .record_and_score(ByteSize::b(500));
         drop(state_guard);
 
-        // Simulate 500 of 1000 bytes capacity used => 50% remaining, 0 delta => score = 6
-        let mut task = BroadcastIngesterCapacityScoreTask {
+        assert_eq!(capacity_score, 6);
+
+        let task = BroadcastIngesterCapacityScoreTask {
             cluster: cluster.clone(),
             weak_state: state.weak(),
-            wal_capacity_time_series: WalDiskCapacityTimeSeries::new(ByteSize::b(1000)),
         };
-        task.wal_capacity_time_series.record(ByteSize::b(500));
-
-        let remaining = task.wal_capacity_time_series.current().unwrap();
-        let delta = task.wal_capacity_time_series.delta().unwrap();
-        let capacity_score = compute_capacity_score(remaining, delta);
-        assert_eq!(capacity_score, 6);
 
         let update_counter = Arc::new(AtomicUsize::new(0));
         let update_counter_clone = update_counter.clone();
@@ -434,23 +260,4 @@ mod tests {
         assert_eq!(deserialized.capacity_score, 6);
         assert_eq!(deserialized.open_shard_count, 1);
     }
-
-    #[test]
-    fn test_wal_disk_capacity_delta_spans_lookback_window() {
-        let mut series = ts();
-
-        // Fill to exactly the lookback window length (6 readings), all same value.
-        for _ in 0..WAL_CAPACITY_LOOKBACK_WINDOW_LEN {
-            record(&mut series, 50);
-        }
-        assert_eq!(series.delta(), Some(0.0));
-
-        // 7th reading fills the ring buffer. Delta spans 6 intervals.
-        record(&mut series, 0);
-        assert_eq!(series.delta(), Some(0.50));
-
-        // 8th reading evicts the oldest 50-remaining. Delta still spans 6 intervals.
-        record(&mut series, 0);
-        assert_eq!(series.delta(), Some(0.50));
-    }
 }
diff --git a/quickwit/quickwit-ingest/src/ingest_v2/debouncing.rs b/quickwit/quickwit-ingest/src/ingest_v2/debouncing.rs
index 19d6f5d691d..041f2928c45 100644
--- a/quickwit/quickwit-ingest/src/ingest_v2/debouncing.rs
+++ b/quickwit/quickwit-ingest/src/ingest_v2/debouncing.rs
@@ -18,6 +18,7 @@ use std::sync::Arc;
 use quickwit_proto::control_plane::{
     GetOrCreateOpenShardsRequest, GetOrCreateOpenShardsSubrequest,
 };
+use quickwit_proto::ingest::ShardIds;
 use quickwit_proto::types::{IndexId, SourceId};
 use tokio::sync::{OwnedRwLockWriteGuard, RwLock};
 
@@ -68,6 +69,7 @@ impl GetOrCreateOpenShardsRequestDebouncer {
 #[derive(Default)]
 pub(super) struct DebouncedGetOrCreateOpenShardsRequest {
     subrequests: Vec<GetOrCreateOpenShardsSubrequest>,
+    pub closed_shards: Vec<ShardIds>,
     pub unavailable_leaders: Vec<String>,
     rendezvous: Rendezvous,
 }
@@ -83,8 +85,8 @@ impl DebouncedGetOrCreateOpenShardsRequest {
         }
         let request = GetOrCreateOpenShardsRequest {
             subrequests: self.subrequests,
+            closed_shards: self.closed_shards,
             unavailable_leaders: self.unavailable_leaders,
-            ..Default::default()
         };
         (Some(request), self.rendezvous)
     }
diff --git a/quickwit/quickwit-ingest/src/ingest_v2/ingester.rs b/quickwit/quickwit-ingest/src/ingest_v2/ingester.rs
index 5cd231d5a4a..0eafd689b66 100644
--- a/quickwit/quickwit-ingest/src/ingest_v2/ingester.rs
+++ b/quickwit/quickwit-ingest/src/ingest_v2/ingester.rs
@@ -132,11 +132,11 @@ impl Ingester {
         idle_shard_timeout: Duration,
     ) -> IngestV2Result<Self> {
         let self_node_id: NodeId = cluster.self_node_id().into();
-        let state = IngesterState::load(wal_dir_path, rate_limiter_settings);
+        let state = IngesterState::load(wal_dir_path, disk_capacity, rate_limiter_settings);
 
         let weak_state = state.weak();
         BroadcastLocalShardsTask::spawn(cluster.clone(), weak_state.clone());
-        BroadcastIngesterCapacityScoreTask::spawn(cluster, weak_state.clone(), disk_capacity);
+        BroadcastIngesterCapacityScoreTask::spawn(cluster, weak_state.clone());
         CloseIdleShardsTask::spawn(weak_state, idle_shard_timeout);
 
         let ingester = Self {
@@ -468,6 +468,7 @@ impl Ingester {
                 leader_id: leader_id.into(),
                 successes: Vec::new(),
                 failures: persist_failures,
+                routing_update: None,
             };
             return Ok(persist_response);
         }
@@ -788,15 +789,33 @@ impl Ingester {
             }
         }
         let wal_usage = state_guard.mrecordlog.resource_usage();
-        drop(state_guard);
-
         let disk_used = wal_usage.disk_used_bytes as u64;
+        let (open_shard_counts, closed_shards) = state_guard.get_shard_snapshot();
+        let capacity_score = state_guard
+            .wal_capacity_time_series
+            .score(ByteSize::b(disk_used)) as u32;
+        drop(state_guard);
 
         if disk_used >= self.disk_capacity.as_u64() * 90 / 100 {
             self.background_reset_shards();
         }
         report_wal_usage(wal_usage);
 
+        let source_shard_updates = open_shard_counts
+            .into_iter()
+            .map(|(index_uid, source_id, count)| SourceShardUpdate {
+                index_uid: Some(index_uid),
+                source_id,
+                open_shard_count: count as u32,
+            })
+            .collect();
+
+        let routing_update = RoutingUpdate {
+            capacity_score,
+            source_shard_updates,
+            closed_shards,
+        };
+
         #[cfg(test)]
         {
             persist_successes.sort_by_key(|success| success.subrequest_id);
@@ -807,6 +826,7 @@ impl Ingester {
             leader_id,
             successes: persist_successes,
             failures: persist_failures,
+            routing_update: Some(routing_update),
         };
         Ok(persist_response)
     }
diff --git a/quickwit/quickwit-ingest/src/ingest_v2/mod.rs b/quickwit/quickwit-ingest/src/ingest_v2/mod.rs
index 0bb3d6b6138..f6c1cd42732 100644
--- a/quickwit/quickwit-ingest/src/ingest_v2/mod.rs
+++ b/quickwit/quickwit-ingest/src/ingest_v2/mod.rs
@@ -31,6 +31,7 @@ mod router;
 #[allow(dead_code)]
 mod routing_table;
 mod state;
+mod wal_capacity_timeseries;
 mod workbench;
 
 use std::collections::HashMap;
@@ -50,7 +51,9 @@ use quickwit_common::tower::Pool;
 use quickwit_proto::ingest::ingester::IngesterServiceClient;
 use quickwit_proto::ingest::router::{IngestRequestV2, IngestSubrequest};
 use quickwit_proto::ingest::{CommitTypeV2, DocBatchV2};
-use quickwit_proto::types::{DocUid, DocUidGenerator, IndexId, NodeId, SubrequestId};
+use quickwit_proto::types::{
+    DocUid, DocUidGenerator, IndexId, IndexUid, NodeId, SourceId, SubrequestId,
+};
 use serde::Serialize;
 use tracing::{error, info};
 use workbench::pending_subrequests;
@@ -70,6 +73,8 @@ pub type LeaderId = NodeId;
 
 pub type FollowerId = NodeId;
 
+pub type OpenShardCounts = Vec<(IndexUid, SourceId, usize)>;
+
 const IDLE_SHARD_TIMEOUT_ENV_KEY: &str = "QW_IDLE_SHARD_TIMEOUT_SECS";
 
 const DEFAULT_IDLE_SHARD_TIMEOUT: Duration = Duration::from_secs(15 * 60); // 15 minutes
diff --git a/quickwit/quickwit-ingest/src/ingest_v2/node_routing_table.rs b/quickwit/quickwit-ingest/src/ingest_v2/node_routing_table.rs
index f354011ede2..6b23fcd1aca 100644
--- a/quickwit/quickwit-ingest/src/ingest_v2/node_routing_table.rs
+++ b/quickwit/quickwit-ingest/src/ingest_v2/node_routing_table.rs
@@ -14,8 +14,9 @@
 
 use std::collections::{HashMap, HashSet};
 
+use itertools::Itertools;
 use quickwit_proto::ingest::Shard;
-use quickwit_proto::types::{IndexId, IndexUid, NodeId, SourceId, SourceUid};
+use quickwit_proto::types::{IndexId, IndexUid, NodeId, SourceId};
 use rand::rng;
 use rand::seq::IndexedRandom;
 
@@ -36,7 +37,7 @@ pub(super) struct IngesterNode {
     pub open_shard_count: usize,
 }
 
-#[derive(Debug)]
+#[derive(Debug, Default)]
 pub(super) struct RoutingEntry {
     pub nodes: HashMap<NodeId, IngesterNode>,
 }
@@ -135,67 +136,59 @@ impl NodeBasedRoutingTable {
     pub fn apply_capacity_update(
         &mut self,
         node_id: NodeId,
-        source_uid: SourceUid,
+        index_uid: IndexUid,
+        source_id: SourceId,
         capacity_score: usize,
         open_shard_count: usize,
     ) {
-        let key = (
-            source_uid.index_uid.index_id.to_string(),
-            source_uid.source_id.clone(),
-        );
-
-        let entry = self.table.entry(key).or_insert_with(|| RoutingEntry {
-            nodes: HashMap::new(),
-        });
+        let key = (index_uid.index_id.to_string(), source_id.clone());
 
+        let entry = self.table.entry(key).or_default();
         let ingester_node = IngesterNode {
             node_id: node_id.clone(),
-            index_uid: source_uid.index_uid,
-            source_id: source_uid.source_id,
+            index_uid,
+            source_id,
             capacity_score,
             open_shard_count,
         };
         entry.nodes.insert(node_id, ingester_node);
     }
 
-    /// Merges nodes from a GetOrCreateOpenShards control plane response into the
-    /// table. Only adds nodes that aren't already present — existing nodes keep
-    /// their real capacity scores from the broadcast.
-    /// TODO: New nodes get a default capacity_score of 5 until GetOrCreateOpenShards contains
-    /// capacity scores.
+    /// Merges the shards returned by a GetOrCreateOpenShards control plane response into the
+    /// table. Leaders already in the table get their open shard count overwritten (possibly with
+    /// 0) and keep their capacity score; nodes leading none of the shards are left untouched.
+    /// New nodes get a default capacity_score of 5.
     pub fn merge_from_shards(
         &mut self,
         index_uid: IndexUid,
         source_id: SourceId,
         shards: Vec<Shard>,
     ) {
-        let key = (index_uid.index_id.to_string(), source_id.clone());
-
-        let mut per_leader_count: HashMap<NodeId, usize> = HashMap::new();
-        for shard in &shards {
-            if shard.is_open() {
-                *per_leader_count
-                    .entry(NodeId::from(shard.leader_id.clone()))
-                    .or_default() += 1;
-            }
-        }
+        let per_leader_count: HashMap<NodeId, usize> = shards
+            .iter()
+            .map(|shard| {
+                let num_open_shards = shard.is_open() as usize;
+                let leader_id = NodeId::from(shard.leader_id.clone());
+                (leader_id, num_open_shards)
+            })
+            .into_grouping_map()
+            .sum();
 
-        let entry = self.table.entry(key).or_insert_with(|| RoutingEntry {
-            nodes: HashMap::new(),
-        });
+        let key = (index_uid.index_id.to_string(), source_id.clone());
+        let entry = self.table.entry(key).or_default();
 
         for (node_id, open_shard_count) in per_leader_count {
-            if entry.nodes.contains_key(&node_id) {
-                continue;
-            }
-            let ingester_node = IngesterNode {
-                node_id: node_id.clone(),
-                index_uid: index_uid.clone(),
-                source_id: source_id.clone(),
-                capacity_score: 5,
-                open_shard_count,
-            };
-            entry.nodes.insert(node_id, ingester_node);
+            entry
+                .nodes
+                .entry(node_id.clone())
+                .and_modify(|node| node.open_shard_count = open_shard_count)
+                .or_insert_with(|| IngesterNode {
+                    node_id,
+                    index_uid: index_uid.clone(),
+                    source_id: source_id.clone(),
+                    capacity_score: 5,
+                    open_shard_count,
+                });
         }
     }
 }
@@ -208,37 +201,53 @@ mod tests {
 
     use super::*;
 
-    fn source_uid(index_id: &str, incarnation_id: u128, source_id: &str) -> SourceUid {
-        SourceUid {
-            index_uid: IndexUid::for_test(index_id, incarnation_id),
-            source_id: source_id.to_string(),
-        }
-    }
-
     #[test]
     fn test_apply_capacity_update() {
         let mut table = NodeBasedRoutingTable::default();
-        let uid = source_uid("test-index", 0, "test-source");
-        let key = ("test-index".to_string(), "test-source".to_string());
+        let key = ("test-index".to_string(), "test-source".into());
 
         // Insert first node.
-        table.apply_capacity_update("node-1".into(), uid.clone(), 8, 3);
+        table.apply_capacity_update(
+            "node-1".into(),
+            IndexUid::for_test("test-index", 0),
+            "test-source".into(),
+            8,
+            3,
+        );
         let entry = table.table.get(&key).unwrap();
         assert_eq!(entry.nodes.len(), 1);
         assert_eq!(entry.nodes.get("node-1").unwrap().capacity_score, 8);
 
         // Update existing node.
-        table.apply_capacity_update("node-1".into(), uid.clone(), 4, 5);
+        table.apply_capacity_update(
+            "node-1".into(),
+            IndexUid::for_test("test-index", 0),
+            "test-source".into(),
+            4,
+            5,
+        );
         let node = table.table.get(&key).unwrap().nodes.get("node-1").unwrap();
         assert_eq!(node.capacity_score, 4);
         assert_eq!(node.open_shard_count, 5);
 
         // Add second node.
-        table.apply_capacity_update("node-2".into(), uid.clone(), 6, 2);
+        table.apply_capacity_update(
+            "node-2".into(),
+            IndexUid::for_test("test-index", 0),
+            "test-source".into(),
+            6,
+            2,
+        );
         assert_eq!(table.table.get(&key).unwrap().nodes.len(), 2);
 
         // Zero shards: node stays in table but becomes ineligible for routing.
-        table.apply_capacity_update("node-1".into(), uid.clone(), 0, 0);
+        table.apply_capacity_update(
+            "node-1".into(),
+            IndexUid::for_test("test-index", 0),
+            "test-source".into(),
+            0,
+            0,
+        );
         let entry = table.table.get(&key).unwrap();
         assert_eq!(entry.nodes.len(), 2);
         assert_eq!(entry.nodes.get("node-1").unwrap().open_shard_count, 0);
@@ -249,13 +258,18 @@ mod tests {
     fn test_has_open_nodes() {
         let mut table = NodeBasedRoutingTable::default();
         let pool = IngesterPool::default();
-        let uid = source_uid("test-index", 0, "test-source");
 
         // Empty table.
         assert!(!table.has_open_nodes("test-index", "test-source", &pool, &HashSet::new()));
 
         // Node exists but is not in pool.
-        table.apply_capacity_update("node-1".into(), uid.clone(), 8, 3);
+        table.apply_capacity_update(
+            "node-1".into(),
+            IndexUid::for_test("test-index", 0),
+            "test-source".into(),
+            8,
+            3,
+        );
         assert!(!table.has_open_nodes("test-index", "test-source", &pool, &HashSet::new()));
 
         // Node is in pool → true.
@@ -267,12 +281,24 @@ mod tests {
         assert!(!table.has_open_nodes("test-index", "test-source", &pool, &unavailable));
 
         // Second node available → true despite first being unavailable.
-        table.apply_capacity_update("node-2".into(), uid.clone(), 6, 2);
+        table.apply_capacity_update(
+            "node-2".into(),
+            IndexUid::for_test("test-index", 0),
+            "test-source".into(),
+            6,
+            2,
+        );
         pool.insert("node-2".into(), IngesterServiceClient::mocked());
         assert!(table.has_open_nodes("test-index", "test-source", &pool, &unavailable));
 
         // Node with capacity_score=0 is not eligible.
-        table.apply_capacity_update("node-2".into(), uid, 0, 2);
+        table.apply_capacity_update(
+            "node-2".into(),
+            IndexUid::for_test("test-index", 0),
+            "test-source".into(),
+            0,
+            2,
+        );
         assert!(!table.has_open_nodes("test-index", "test-source", &pool, &unavailable));
     }
 
@@ -280,11 +306,16 @@ mod tests {
     fn test_pick_node() {
         let mut table = NodeBasedRoutingTable::default();
         let pool = IngesterPool::default();
-        let uid = source_uid("test-index", 0, "test-source");
         let key = ("test-index".to_string(), "test-source".to_string());
 
         // Node exists but not in pool → None.
-        table.apply_capacity_update("node-1".into(), uid.clone(), 8, 3);
+        table.apply_capacity_update(
+            "node-1".into(),
+            IndexUid::for_test("test-index", 0),
+            "test-source".into(),
+            8,
+            3,
+        );
         assert!(
             table
                 .table
@@ -305,7 +336,13 @@ mod tests {
         assert_eq!(picked.node_id, NodeId::from("node-1"));
 
         // Multiple nodes → something is returned.
-        table.apply_capacity_update("node-2".into(), uid.clone(), 2, 1);
+        table.apply_capacity_update(
+            "node-2".into(),
+            IndexUid::for_test("test-index", 0),
+            "test-source".into(),
+            2,
+            1,
+        );
         pool.insert("node-2".into(), IngesterServiceClient::mocked());
         assert!(
             table
@@ -317,8 +354,20 @@ mod tests {
         );
 
         // Node with capacity_score=0 is skipped.
-        table.apply_capacity_update("node-1".into(), uid.clone(), 0, 3);
-        table.apply_capacity_update("node-2".into(), uid, 0, 1);
+        table.apply_capacity_update(
+            "node-1".into(),
+            IndexUid::for_test("test-index", 0),
+            "test-source".into(),
+            0,
+            3,
+        );
+        table.apply_capacity_update(
+            "node-2".into(),
+            IndexUid::for_test("test-index", 0),
+            "test-source".into(),
+            0,
+            1,
+        );
         assert!(
             table
                 .table
@@ -385,17 +434,18 @@ mod tests {
             ..Default::default()
         };
 
-        // Two open shards on node-1, one open on node-2, one closed (ignored).
+        // Two open shards on node-1, one open + one closed on node-2, only closed on node-3.
         let shards = vec![
             make_shard(1, "node-1", true),
             make_shard(2, "node-1", true),
             make_shard(3, "node-2", true),
             make_shard(4, "node-2", false),
+            make_shard(5, "node-3", false),
         ];
         table.merge_from_shards(index_uid.clone(), "test-source".into(), shards);
 
         let entry = table.table.get(&key).unwrap();
-        assert_eq!(entry.nodes.len(), 2);
+        assert_eq!(entry.nodes.len(), 3);
 
         let n1 = entry.nodes.get("node-1").unwrap();
         assert_eq!(n1.open_shard_count, 2);
@@ -404,14 +454,18 @@ mod tests {
         let n2 = entry.nodes.get("node-2").unwrap();
         assert_eq!(n2.open_shard_count, 1);
 
+        let n3 = entry.nodes.get("node-3").unwrap();
+        assert_eq!(n3.open_shard_count, 0);
+
         // Merging again adds new nodes but preserves existing ones.
-        let shards = vec![make_shard(10, "node-3", true)];
+        let shards = vec![make_shard(10, "node-4", true)];
         table.merge_from_shards(index_uid, "test-source".into(), shards);
 
         let entry = table.table.get(&key).unwrap();
-        assert_eq!(entry.nodes.len(), 3);
+        assert_eq!(entry.nodes.len(), 4);
         assert!(entry.nodes.contains_key("node-1"));
         assert!(entry.nodes.contains_key("node-2"));
         assert!(entry.nodes.contains_key("node-3"));
+        assert!(entry.nodes.contains_key("node-4"));
     }
 }
diff --git a/quickwit/quickwit-ingest/src/ingest_v2/router.rs b/quickwit/quickwit-ingest/src/ingest_v2/router.rs
index da3d989d93e..1f5ca26865e 100644
--- a/quickwit/quickwit-ingest/src/ingest_v2/router.rs
+++ b/quickwit/quickwit-ingest/src/ingest_v2/router.rs
@@ -98,7 +98,9 @@ pub struct IngestRouter {
 }
 
 struct RouterState {
+    // Debounces the `GetOrCreateOpenShardsRequest`s sent to the control plane.
     debouncer: GetOrCreateOpenShardsRequestDebouncer,
+    // Routing table of nodes, their WAL capacity, and the number of open shards per source.
     node_routing_table: NodeBasedRoutingTable,
 }
 
@@ -152,7 +154,7 @@ impl IngestRouter {
         ingester_pool: &IngesterPool,
     ) -> DebouncedGetOrCreateOpenShardsRequest {
         let mut debounced_request = DebouncedGetOrCreateOpenShardsRequest::default();
-        let unavailable_leaders = &workbench.unavailable_leaders;
+        let unavailable_leaders: &HashSet<NodeId> = &workbench.unavailable_leaders;
 
         let mut state_guard = self.state.lock().await;
 
@@ -186,6 +188,12 @@ impl IngestRouter {
         }
         drop(state_guard);
 
+        if !debounced_request.is_empty() && !workbench.closed_shards.is_empty() {
+            info!(closed_shards=?workbench.closed_shards, "reporting closed shard(s) to control plane");
+            debounced_request
+                .closed_shards
+                .append(&mut workbench.closed_shards);
+        }
         if !debounced_request.is_empty() && !unavailable_leaders.is_empty() {
             info!(unavailable_leaders=?unavailable_leaders, "reporting unavailable leader(s) to control plane");
 
@@ -265,6 +273,8 @@ impl IngestRouter {
         while let Some((persist_summary, persist_result)) = persist_futures.next().await {
             match persist_result {
                 Ok(persist_response) => {
+                    let leader_id = NodeId::from(persist_response.leader_id.clone());
+
                     for persist_success in persist_response.successes {
                         workbench.record_persist_success(persist_success);
                     }
@@ -272,16 +282,37 @@ impl IngestRouter {
                         workbench.record_persist_failure(&persist_failure);
 
                         match persist_failure.reason() {
-                            PersistFailureReason::NoShardsAvailable => {}
+                            PersistFailureReason::NoShardsAvailable => {
+                                // For non-critical failures, we don't mark the nodes unavailable;
+                                // a routing update is piggybacked on PersistResponses, so shard
+                                // counts and capacity scores will be fresh on the next try.
+                            }
                             PersistFailureReason::NodeUnavailable
                             | PersistFailureReason::WalFull
                             | PersistFailureReason::Timeout => {
-                                unavailable_leaders
-                                    .insert(NodeId::from(persist_response.leader_id.clone()));
+                                unavailable_leaders.insert(leader_id.clone());
                             }
                             _ => {}
                         }
                     }
+
+                    if let Some(routing_update) = persist_response.routing_update {
+                        // The leader piggybacks its capacity score and per-source open shard
+                        // counts on the persist response: refresh the routing table with them.
+                        let mut state_guard = self.state.lock().await;
+                        for shard_update in routing_update.source_shard_updates {
+                            state_guard.node_routing_table.apply_capacity_update(
+                                leader_id.clone(),
+                                shard_update.index_uid().clone(),
+                                shard_update.source_id,
+                                routing_update.capacity_score as usize,
+                                shard_update.open_shard_count as usize,
+                            );
+                        }
+                        drop(state_guard);
+
+                        workbench.closed_shards.extend(routing_update.closed_shards);
+                    }
                 }
                 Err(persist_error) => {
                     if workbench.is_last_attempt() {
@@ -574,7 +605,8 @@ impl EventSubscriber<IngesterCapacityScoreUpdate> for WeakRouterState {
         let mut state_guard = state.lock().await;
         state_guard.node_routing_table.apply_capacity_update(
             update.node_id,
-            update.source_uid,
+            update.source_uid.index_uid,
+            update.source_uid.source_id,
             update.capacity_score,
             update.open_shard_count,
         );
@@ -593,7 +625,8 @@ mod tests {
         GetOrCreateOpenShardsResponse, GetOrCreateOpenShardsSuccess, MockControlPlaneService,
     };
     use quickwit_proto::ingest::ingester::{
-        IngesterServiceClient, MockIngesterService, PersistFailure, PersistResponse, PersistSuccess,
+        IngesterServiceClient, MockIngesterService, PersistFailure, PersistResponse,
+        PersistSuccess, RoutingUpdate, SourceShardUpdate,
     };
     use quickwit_proto::ingest::router::IngestSubrequest;
     use quickwit_proto::ingest::{
@@ -630,10 +663,8 @@ mod tests {
             let mut state_guard = router.state.lock().await;
             state_guard.node_routing_table.apply_capacity_update(
                 "test-ingester-0".into(),
-                SourceUid {
-                    index_uid: IndexUid::for_test("test-index-0", 0),
-                    source_id: "test-source".to_string(),
-                },
+                IndexUid::for_test("test-index-0", 0),
+                "test-source".to_string(),
                 8,
                 1,
             );
@@ -1042,6 +1073,11 @@ mod tests {
                     ..Default::default()
                 }],
                 failures: Vec::new(),
+                routing_update: Some(RoutingUpdate {
+                    capacity_score: 6,
+                    source_shard_updates: Vec::new(),
+                    ..Default::default()
+                }),
             });
             (persist_summary, persist_result)
         });
@@ -1094,6 +1130,11 @@ mod tests {
                     shard_id: Some(ShardId::from(1)),
                     reason: PersistFailureReason::NoShardsAvailable as i32,
                 }],
+                routing_update: Some(RoutingUpdate {
+                    capacity_score: 6,
+                    source_shard_updates: Vec::new(),
+                    ..Default::default()
+                }),
             });
             (persist_summary, persist_result)
         });
@@ -1265,6 +1306,11 @@ mod tests {
                         }],
                     }],
                     failures: Vec::new(),
+                    routing_update: Some(RoutingUpdate {
+                        capacity_score: 6,
+                        source_shard_updates: Vec::new(),
+                        ..Default::default()
+                    }),
                 })
             });
         ingester_pool.insert(
@@ -1293,6 +1339,11 @@ mod tests {
                         parse_failures: Vec::new(),
                     }],
                     failures: Vec::new(),
+                    routing_update: Some(RoutingUpdate {
+                        capacity_score: 6,
+                        source_shard_updates: Vec::new(),
+                        ..Default::default()
+                    }),
                 })
             });
         ingester_pool.insert(
@@ -1360,8 +1411,9 @@ mod tests {
         }
 
         let mut mock_ingester_0 = MockIngesterService::new();
-        let index_uid_clone = index_uid.clone();
         // First attempt: returns NoShardsAvailable (transient, doesn't mark leader unavailable).
+        // The response still reports capacity_score=6 and 1 open shard so the node stays routable.
+        let index_uid_clone = index_uid.clone();
         mock_ingester_0
             .expect_persist()
             .once()
@@ -1376,6 +1428,15 @@ mod tests {
                         shard_id: Some(ShardId::from(1)),
                         reason: PersistFailureReason::NoShardsAvailable as i32,
                     }],
+                    routing_update: Some(RoutingUpdate {
+                        capacity_score: 6,
+                        source_shard_updates: vec![SourceShardUpdate {
+                            index_uid: Some(index_uid_clone.clone()),
+                            source_id: "test-source".to_string(),
+                            open_shard_count: 1,
+                        }],
+                        ..Default::default()
+                    }),
                 })
             });
         // Second attempt: succeeds.
@@ -1395,6 +1456,11 @@ mod tests {
                         parse_failures: Vec::new(),
                     }],
                     failures: Vec::new(),
+                    routing_update: Some(RoutingUpdate {
+                        capacity_score: 6,
+                        source_shard_updates: Vec::new(),
+                        ..Default::default()
+                    }),
                 })
             });
         ingester_pool.insert(
@@ -1524,11 +1590,20 @@ mod tests {
                 successes: Vec::new(),
                 failures: vec![PersistFailure {
                     subrequest_id: 0,
-                    index_uid: Some(index_uid),
+                    index_uid: Some(index_uid.clone()),
                     source_id: "test-source".to_string(),
                     shard_id: Some(ShardId::from(1)),
                     reason: PersistFailureReason::NoShardsAvailable as i32,
                 }],
+                routing_update: Some(RoutingUpdate {
+                    capacity_score: 6,
+                    source_shard_updates: vec![SourceShardUpdate {
+                        index_uid: Some(index_uid),
+                        source_id: "test-source".to_string(),
+                        open_shard_count: 1,
+                    }],
+                    ..Default::default()
+                }),
             };
             Ok(response)
         });
@@ -1629,6 +1704,11 @@ mod tests {
                     shard_id: Some(ShardId::from(1)),
                     reason: PersistFailureReason::NoShardsAvailable as i32,
                 }],
+                routing_update: Some(RoutingUpdate {
+                    capacity_score: 6,
+                    source_shard_updates: Vec::new(),
+                    ..Default::default()
+                }),
             });
             (summary, result)
         });
@@ -1658,6 +1738,11 @@ mod tests {
                     shard_id: Some(ShardId::from(1)),
                     reason: PersistFailureReason::NodeUnavailable as i32,
                 }],
+                routing_update: Some(RoutingUpdate {
+                    capacity_score: 6,
+                    source_shard_updates: Vec::new(),
+                    ..Default::default()
+                }),
             });
             (summary, result)
         });
@@ -1670,4 +1755,57 @@ mod tests {
                 .contains(&NodeId::from("test-ingester-1"))
         );
     }
+
+    #[tokio::test]
+    async fn test_router_process_persist_results_applies_piggybacked_routing_updates() {
+        let router = IngestRouter::new(
+            "test-router".into(),
+            ControlPlaneServiceClient::from_mock(MockControlPlaneService::new()),
+            IngesterPool::default(),
+            1,
+            EventBroker::default(),
+        );
+        let ingest_subrequests = vec![IngestSubrequest {
+            subrequest_id: 0,
+            index_id: "test-index".to_string(),
+            source_id: "test-source".to_string(),
+            ..Default::default()
+        }];
+        let mut workbench = IngestWorkbench::new(ingest_subrequests, 2);
+
+        let persist_futures = FuturesUnordered::new();
+        persist_futures.push(async {
+            let summary = PersistRequestSummary {
+                leader_id: "test-ingester-0".into(),
+                subrequest_ids: vec![0],
+            };
+            let result = Ok::<_, IngestV2Error>(PersistResponse {
+                leader_id: "test-ingester-0".to_string(),
+                successes: Vec::new(),
+                failures: Vec::new(),
+                routing_update: Some(RoutingUpdate {
+                    capacity_score: 3,
+                    source_shard_updates: vec![SourceShardUpdate {
+                        index_uid: Some(IndexUid::for_test("test-index", 0)),
+                        source_id: "test-source".to_string(),
+                        open_shard_count: 2,
+                    }],
+                    ..Default::default()
+                }),
+            });
+            (summary, result)
+        });
+        router
+            .process_persist_results(&mut workbench, persist_futures)
+            .await;
+
+        let state_guard = router.state.lock().await;
+        let entry = state_guard
+            .node_routing_table
+            .find_entry("test-index", "test-source")
+            .unwrap();
+        let node = entry.nodes.get("test-ingester-0").unwrap();
+        assert_eq!(node.capacity_score, 3);
+        assert_eq!(node.open_shard_count, 2);
+    }
 }
diff --git a/quickwit/quickwit-ingest/src/ingest_v2/state.rs b/quickwit/quickwit-ingest/src/ingest_v2/state.rs
index a14f4ae9a44..e158bce7c58 100644
--- a/quickwit/quickwit-ingest/src/ingest_v2/state.rs
+++ b/quickwit/quickwit-ingest/src/ingest_v2/state.rs
@@ -19,6 +19,7 @@ use std::path::Path;
 use std::sync::{Arc, Weak};
 use std::time::{Duration, Instant};
 
+use bytesize::ByteSize;
 use itertools::Itertools;
 use mrecordlog::error::{DeleteQueueError, TruncateError};
 use quickwit_common::pretty::PrettyDisplay;
@@ -26,7 +27,7 @@ use quickwit_common::rate_limiter::{RateLimiter, RateLimiterSettings};
 use quickwit_doc_mapper::DocMapper;
 use quickwit_proto::control_plane::AdviseResetShardsResponse;
 use quickwit_proto::ingest::ingester::IngesterStatus;
-use quickwit_proto::ingest::{IngestV2Error, IngestV2Result, ShardState};
+use quickwit_proto::ingest::{IngestV2Error, IngestV2Result, ShardIds, ShardState};
 use quickwit_proto::types::{DocMappingUid, IndexUid, Position, QueueId, SourceId, split_queue_id};
 use tokio::sync::{Mutex, MutexGuard, RwLock, RwLockMappedWriteGuard, RwLockWriteGuard, watch};
 use tracing::{error, info};
@@ -34,9 +35,10 @@ use tracing::{error, info};
 use super::models::IngesterShard;
 use super::rate_meter::RateMeter;
 use super::replication::{ReplicationStreamTaskHandle, ReplicationTaskHandle};
+use super::wal_capacity_timeseries::WalDiskCapacityTimeSeries;
 use crate::ingest_v2::mrecordlog_utils::{force_delete_queue, queue_position_range};
 use crate::mrecordlog_async::MultiRecordLogAsync;
-use crate::{FollowerId, LeaderId};
+use crate::{FollowerId, LeaderId, OpenShardCounts};
 
 /// Stores the state of the ingester and attempts to prevent deadlocks by exposing an API that
 /// guarantees that the internal data structures are always locked in the same order.
@@ -59,6 +61,7 @@ pub(super) struct InnerIngesterState {
     pub replication_streams: HashMap<FollowerId, ReplicationStreamTaskHandle>,
     // Replication tasks running for each replication stream opened with leaders.
     pub replication_tasks: HashMap<LeaderId, ReplicationTaskHandle>,
+    pub wal_capacity_time_series: WalDiskCapacityTimeSeries,
     status: IngesterStatus,
     status_tx: watch::Sender<IngesterStatus>,
 }
@@ -89,20 +92,45 @@ impl InnerIngesterState {
             .map(|(_, shard)| shard)
     }
 
-    pub fn get_open_shard_counts(&self) -> Vec<(IndexUid, SourceId, usize)> {
-        self.shards
+    /// Returns, for advertisable non-replica shards only, the open shard count of each source
+    /// (possibly 0) and the IDs of the closed shards, grouped by source.
+    pub fn get_shard_snapshot(&self) -> (OpenShardCounts, Vec<ShardIds>) {
+        let grouped = self
+            .shards
             .values()
-            .filter(|shard| shard.is_advertisable && !shard.is_replica() && shard.is_open())
-            .map(|shard| (shard.index_uid.clone(), shard.source_id.clone()))
-            .counts()
-            .into_iter()
-            .map(|((index_uid, source_id), count)| (index_uid, source_id, count))
-            .collect()
+            .filter(|shard| shard.is_advertisable && !shard.is_replica())
+            .map(|shard| ((shard.index_uid.clone(), shard.source_id.clone()), shard))
+            .into_group_map();
+
+        let mut open_counts = Vec::new();
+        let mut closed_shards = Vec::new();
+
+        for ((index_uid, source_id), shards) in grouped {
+            let mut open_count = 0;
+            let mut closed_ids = Vec::new();
+
+            for shard in shards {
+                if shard.is_open() {
+                    open_count += 1;
+                } else if shard.is_closed() {
+                    closed_ids.push(shard.shard_id.clone());
+                }
+            }
+            open_counts.push((index_uid.clone(), source_id.clone(), open_count));
+            if !closed_ids.is_empty() {
+                closed_shards.push(ShardIds {
+                    index_uid: Some(index_uid),
+                    source_id,
+                    shard_ids: closed_ids,
+                });
+            }
+        }
+        (open_counts, closed_shards)
     }
 }
 
 impl IngesterState {
-    fn new() -> Self {
+    fn new(disk_capacity: ByteSize) -> Self {
         let status = IngesterStatus::Initializing;
         let (status_tx, status_rx) = watch::channel(status);
         let inner = InnerIngesterState {
@@ -110,6 +138,7 @@ impl IngesterState {
             doc_mappers: Default::default(),
             replication_streams: Default::default(),
             replication_tasks: Default::default(),
+            wal_capacity_time_series: WalDiskCapacityTimeSeries::new(disk_capacity),
             status,
             status_tx,
         };
@@ -123,8 +152,12 @@ impl IngesterState {
         }
     }
 
-    pub fn load(wal_dir_path: &Path, rate_limiter_settings: RateLimiterSettings) -> Self {
-        let state = Self::new();
+    pub fn load(
+        wal_dir_path: &Path,
+        disk_capacity: ByteSize,
+        rate_limiter_settings: RateLimiterSettings,
+    ) -> Self {
+        let state = Self::new(disk_capacity);
         let state_clone = state.clone();
         let wal_dir_path = wal_dir_path.to_path_buf();
 
@@ -138,8 +171,17 @@ impl IngesterState {
 
     #[cfg(test)]
     pub async fn for_test() -> (tempfile::TempDir, Self) {
+        Self::for_test_with_disk_capacity(ByteSize::mb(256)).await
+    }
+
+    #[cfg(test)]
+    pub async fn for_test_with_disk_capacity(disk_capacity: ByteSize) -> (tempfile::TempDir, Self) {
         let temp_dir = tempfile::tempdir().unwrap();
-        let mut state = IngesterState::load(temp_dir.path(), RateLimiterSettings::default());
+        let mut state = IngesterState::load(
+            temp_dir.path(),
+            disk_capacity,
+            RateLimiterSettings::default(),
+        );
 
         state
             .status_rx
@@ -488,7 +530,7 @@ mod tests {
 
     #[tokio::test]
     async fn test_ingester_state_does_not_lock_while_initializing() {
-        let state = IngesterState::new();
+        let state = IngesterState::new(ByteSize::mb(256));
         let inner_guard = state.inner.lock().await;
 
         assert_eq!(inner_guard.status(), IngesterStatus::Initializing);
@@ -503,7 +545,7 @@ mod tests {
 
     #[tokio::test]
     async fn test_ingester_state_failed() {
-        let state = IngesterState::new();
+        let state = IngesterState::new(ByteSize::mb(256));
 
         state.inner.lock().await.set_status(IngesterStatus::Failed);
 
@@ -516,7 +558,7 @@ mod tests {
 
     #[tokio::test]
     async fn test_ingester_state_init() {
-        let mut state = IngesterState::new();
+        let mut state = IngesterState::new(ByteSize::mb(256));
         let temp_dir = tempfile::tempdir().unwrap();
 
         state
@@ -672,61 +714,71 @@ mod tests {
     }
 
     #[tokio::test]
-    async fn test_get_open_shard_counts() {
+    async fn test_get_shard_snapshot() {
         let (_temp_dir, state) = IngesterState::for_test().await;
         let mut state_guard = state.lock_partially().await.unwrap();
 
-        let index_a = IndexUid::for_test("index-a", 0);
-        let index_b = IndexUid::for_test("index-b", 0);
-        let index_c = IndexUid::for_test("index-c", 0);
+        let index_uid = IndexUid::for_test("test-index", 0);
 
-        // (index-a, source-a): 1 open solo shard.
+        // source-a: 2 open shards + 1 closed shard + 1 replica (ignored).
         let s = open_shard(
-            index_a.clone(),
-            SourceId::from("source-a"),
+            index_uid.clone(),
+            "source-a".into(),
             ShardId::from(1),
             false,
         );
         state_guard.shards.insert(s.queue_id(), s);
-
-        // (index-b, source-b): 1 open solo + 1 replica. Only the solo should be counted.
         let s = open_shard(
-            index_b.clone(),
-            SourceId::from("source-b"),
+            index_uid.clone(),
+            "source-a".into(),
             ShardId::from(2),
             false,
         );
         state_guard.shards.insert(s.queue_id(), s);
-        let s = open_shard(
-            index_b.clone(),
-            SourceId::from("source-b"),
-            ShardId::from(3),
-            true,
-        );
+        let s = IngesterShard::new_solo(index_uid.clone(), "source-a".into(), ShardId::from(3))
+            .with_state(ShardState::Closed)
+            .advertisable()
+            .build();
+        state_guard.shards.insert(s.queue_id(), s);
+        let s = open_shard(index_uid.clone(), "source-a".into(), ShardId::from(4), true);
         state_guard.shards.insert(s.queue_id(), s);
 
-        // (index-c, source-c): 2 open solo shards.
-        let s = open_shard(
-            index_c.clone(),
-            SourceId::from("source-c"),
-            ShardId::from(4),
-            false,
-        );
+        // source-b: 2 closed shards, no open shards.
+        let s = IngesterShard::new_solo(index_uid.clone(), "source-b".into(), ShardId::from(5))
+            .with_state(ShardState::Closed)
+            .advertisable()
+            .build();
         state_guard.shards.insert(s.queue_id(), s);
-        let s = open_shard(
-            index_c.clone(),
-            SourceId::from("source-c"),
-            ShardId::from(5),
-            false,
-        );
+        let s = IngesterShard::new_solo(index_uid.clone(), "source-b".into(), ShardId::from(6))
+            .with_state(ShardState::Closed)
+            .advertisable()
+            .build();
         state_guard.shards.insert(s.queue_id(), s);
 
-        let mut counts = state_guard.get_open_shard_counts();
-        counts.sort_by(|a, b| a.0.cmp(&b.0));
+        let (mut open_counts, mut closed_shards) = state_guard.get_shard_snapshot();
+
+        // Open counts: source-a has 2, source-b has 0.
+        open_counts.sort_by(|a, b| a.1.cmp(&b.1));
+        assert_eq!(open_counts.len(), 2);
+        assert_eq!(
+            open_counts[0],
+            (index_uid.clone(), SourceId::from("source-a"), 2)
+        );
+        assert_eq!(
+            open_counts[1],
+            (index_uid.clone(), SourceId::from("source-b"), 0)
+        );
+
+        // Closed shards: source-a has shard 3, source-b has shards 5 and 6.
+        closed_shards.sort_by(|a, b| a.source_id.cmp(&b.source_id));
+        assert_eq!(closed_shards.len(), 2);
+
+        assert_eq!(closed_shards[0].source_id, "source-a");
+        assert_eq!(closed_shards[0].shard_ids, vec![ShardId::from(3)]);
 
-        assert_eq!(counts.len(), 3);
-        assert_eq!(counts[0], (index_a, SourceId::from("source-a"), 1));
-        assert_eq!(counts[1], (index_b, SourceId::from("source-b"), 1));
-        assert_eq!(counts[2], (index_c, SourceId::from("source-c"), 2));
+        assert_eq!(closed_shards[1].source_id, "source-b");
+        let mut source_b_ids = closed_shards[1].shard_ids.clone();
+        source_b_ids.sort();
+        assert_eq!(source_b_ids, vec![ShardId::from(5), ShardId::from(6)]);
     }
 }
diff --git a/quickwit/quickwit-ingest/src/ingest_v2/wal_capacity_timeseries.rs b/quickwit/quickwit-ingest/src/ingest_v2/wal_capacity_timeseries.rs
new file mode 100644
index 00000000000..58f030cbf74
--- /dev/null
+++ b/quickwit/quickwit-ingest/src/ingest_v2/wal_capacity_timeseries.rs
@@ -0,0 +1,214 @@
+// Copyright 2021-Present Datadog, Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+use bytesize::ByteSize;
+use quickwit_common::ring_buffer::RingBuffer;
+
+/// The lookback window length is meant to capture readings far enough back in time to give
+/// a rough rate of change estimate. At size 6, with broadcast interval of 5 seconds, this would be
+/// 30 seconds of readings.
+const WAL_CAPACITY_LOOKBACK_WINDOW_LEN: usize = 6;
+
+/// The ring buffer stores one extra element so that `delta()` can compare the newest reading
+/// with the one that is exactly `WAL_CAPACITY_LOOKBACK_WINDOW_LEN` steps ago. Otherwise, that
+/// reading would be discarded when the next reading is inserted.
+const WAL_CAPACITY_READINGS_LEN: usize = WAL_CAPACITY_LOOKBACK_WINDOW_LEN + 1;
+
+pub struct WalDiskCapacityTimeSeries {
+    disk_capacity: ByteSize,
+    readings: RingBuffer<f64, WAL_CAPACITY_READINGS_LEN>,
+}
+
+impl WalDiskCapacityTimeSeries {
+    pub fn new(disk_capacity: ByteSize) -> Self {
+        #[cfg(not(test))]
+        assert!(disk_capacity.as_u64() > 0); // capacity is the divisor in `record`/`score`
+        Self {
+            disk_capacity,
+            readings: RingBuffer::default(),
+        }
+    }
+
+    /// Records a disk usage reading and returns the resulting capacity score.
+    pub fn record_and_score(&mut self, disk_used: ByteSize) -> usize {
+        self.record(disk_used);
+        let remaining = self.current().unwrap_or(1.0); // no reading => treat as fully free
+        let delta = self.delta().unwrap_or(0.0); // no history => treat as stable
+        compute_capacity_score(remaining, delta)
+    }
+
+    /// Computes a capacity score for the given disk usage without recording it.
+    pub fn score(&self, disk_used: ByteSize) -> usize {
+        let remaining = 1.0 - (disk_used.as_u64() as f64 / self.disk_capacity.as_u64() as f64);
+        let delta = self.delta().unwrap_or(0.0); // delta of the readings recorded so far
+        compute_capacity_score(remaining, delta)
+    }
+
+    fn record(&mut self, disk_used: ByteSize) {
+        let remaining = 1.0 - (disk_used.as_u64() as f64 / self.disk_capacity.as_u64() as f64);
+        self.readings.push_back(remaining.clamp(0.0, 1.0)); // over-capacity usage => 0.0
+    }
+
+    fn current(&self) -> Option<f64> {
+        self.readings.last() // most recently recorded remaining fraction, if any
+    }
+
+    /// Newest reading minus the oldest reading still buffered; `None` if either is missing.
+    /// Positive = improving, negative = draining.
+    fn delta(&self) -> Option<f64> {
+        let current = self.readings.last()?;
+        let oldest = self.readings.front()?;
+        Some(current - oldest)
+    }
+}
+
+/// Below this remaining capacity fraction, the score is immediately 0.
+const MIN_PERMISSIBLE_CAPACITY: f64 = 0.05;
+/// Weight of the proportional term (max points from P).
+const PROPORTIONAL_WEIGHT: f64 = 8.0;
+/// Weight of the derivative term (max points from D).
+const DERIVATIVE_WEIGHT: f64 = 2.0;
+/// The drain rate (as a fraction of total capacity over the lookback window) at which the
+/// derivative penalty is fully applied. Drain rates beyond this are clamped.
+const MAX_DRAIN_RATE: f64 = 0.10;
+
+/// Computes a capacity score from 0 to 10 using a PD controller.
+///
+/// The score has two components:
+///
+/// - **P (proportional):** How much WAL capacity remains right now. An ingester with 100% free
+///   capacity gets `PROPORTIONAL_WEIGHT` points; 50% gets half; and so on. If remaining capacity
+///   drops to `MIN_PERMISSIBLE_CAPACITY` or below, the score is immediately 0.
+///
+/// - **D (derivative):** Up to `DERIVATIVE_WEIGHT` bonus points based on how fast remaining
+///   capacity is changing over the lookback window. A higher drain rate is worse, so we invert it:
+///   `drain / MAX_DRAIN_RATE` normalizes the drain to a 0–1 penalty, and subtracting from 1
+///   converts it into a 0–1 bonus. Multiplied by `DERIVATIVE_WEIGHT`, a stable node gets the full
+///   bonus and a node draining at `MAX_DRAIN_RATE` or faster gets nothing.
+///
+/// Putting it together: a completely idle ingester scores 10 (8 + 2).
+/// One that is nearly full but stable scores ~2. One that is draining rapidly scores less.
+/// The sum is truncated to an integer, so a score of 0 means the ingester is at or below the
+/// minimum permissible capacity, or only slightly above it while draining fast.
+fn compute_capacity_score(remaining_capacity: f64, capacity_delta: f64) -> usize {
+    if remaining_capacity <= MIN_PERMISSIBLE_CAPACITY {
+        return 0;
+    }
+    let p = PROPORTIONAL_WEIGHT * remaining_capacity;
+    let drain = (-capacity_delta).clamp(0.0, MAX_DRAIN_RATE);
+    let d = DERIVATIVE_WEIGHT * (1.0 - drain / MAX_DRAIN_RATE);
+    (p + d).clamp(0.0, 10.0) as usize
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    fn ts() -> WalDiskCapacityTimeSeries {
+        WalDiskCapacityTimeSeries::new(ByteSize::b(100))
+    }
+
+    /// Helper: record a reading with `used` bytes against the series' fixed capacity.
+    fn record(series: &mut WalDiskCapacityTimeSeries, used: u64) {
+        series.record(ByteSize::b(used));
+    }
+
+    #[test]
+    fn test_wal_disk_capacity_current_after_record() {
+        let mut series = WalDiskCapacityTimeSeries::new(ByteSize::b(256));
+        // 192 of 256 used => 25% remaining
+        series.record(ByteSize::b(192));
+        assert_eq!(series.current(), Some(0.25));
+
+        // 16 of 256 used => 93.75% remaining
+        series.record(ByteSize::b(16));
+        assert_eq!(series.current(), Some(0.9375));
+    }
+
+    #[test]
+    fn test_wal_disk_capacity_record_saturates_at_zero() {
+        let mut series = ts();
+        // 200 used out of 100 capacity => clamped to 0.0
+        record(&mut series, 200);
+        assert_eq!(series.current(), Some(0.0));
+    }
+
+    #[test]
+    fn test_wal_disk_capacity_delta_growing() {
+        let mut series = ts();
+        // oldest: 60 of 100 used => 40% remaining
+        record(&mut series, 60);
+        // current: 20 of 100 used => 80% remaining
+        record(&mut series, 20);
+        // delta = 0.80 - 0.40 = 0.40
+        assert_eq!(series.delta(), Some(0.40));
+    }
+
+    #[test]
+    fn test_wal_disk_capacity_delta_shrinking() {
+        let mut series = ts();
+        // oldest: 20 of 100 used => 80% remaining
+        record(&mut series, 20);
+        // current: 60 of 100 used => 40% remaining
+        record(&mut series, 60);
+        // delta = 0.40 - 0.80 = -0.40
+        assert_eq!(series.delta(), Some(-0.40));
+    }
+
+    #[test]
+    fn test_capacity_score_draining_vs_stable() {
+        // Node A: capacity draining — usage increases 10, 20, ..., 70 over 7 ticks.
+        let mut node_a = ts();
+        for used in (10..=70).step_by(10) {
+            record(&mut node_a, used);
+        }
+        let a_remaining = node_a.current().unwrap();
+        let a_delta = node_a.delta().unwrap();
+        let a_score = compute_capacity_score(a_remaining, a_delta);
+
+        // Node B: steady at 50% usage over 7 ticks.
+        let mut node_b = ts();
+        for _ in 0..7 {
+            record(&mut node_b, 50);
+        }
+        let b_remaining = node_b.current().unwrap();
+        let b_delta = node_b.delta().unwrap();
+        let b_score = compute_capacity_score(b_remaining, b_delta);
+
+        // p=2.4, d=0 (max drain) => 2
+        assert_eq!(a_score, 2);
+        // p=4, d=2 (stable) => 6
+        assert_eq!(b_score, 6);
+        assert!(b_score > a_score);
+    }
+
+    #[test]
+    fn test_wal_disk_capacity_delta_spans_lookback_window() {
+        let mut series = ts();
+
+        // Fill to exactly the lookback window length (6 readings), all same value.
+        for _ in 0..WAL_CAPACITY_LOOKBACK_WINDOW_LEN {
+            record(&mut series, 50);
+        }
+        assert_eq!(series.delta(), Some(0.0));
+
+        // 7th reading fills the ring buffer. Delta spans 6 intervals.
+        record(&mut series, 0);
+        assert_eq!(series.delta(), Some(0.50));
+
+        // 8th reading evicts the oldest 50-remaining. Delta still spans 6 intervals.
+        record(&mut series, 0);
+        assert_eq!(series.delta(), Some(0.50));
+    }
+}
diff --git a/quickwit/quickwit-ingest/src/ingest_v2/workbench.rs b/quickwit/quickwit-ingest/src/ingest_v2/workbench.rs
index 3e7b22969e8..fb52d8e8139 100644
--- a/quickwit/quickwit-ingest/src/ingest_v2/workbench.rs
+++ b/quickwit/quickwit-ingest/src/ingest_v2/workbench.rs
@@ -23,7 +23,7 @@ use quickwit_proto::ingest::ingester::{PersistFailure, PersistFailureReason, Per
 use quickwit_proto::ingest::router::{
     IngestFailure, IngestFailureReason, IngestResponseV2, IngestSubrequest, IngestSuccess,
 };
-use quickwit_proto::ingest::{IngestV2Error, RateLimitingCause};
+use quickwit_proto::ingest::{IngestV2Error, RateLimitingCause, ShardIds};
 use quickwit_proto::types::{NodeId, SubrequestId};
 use tracing::warn;
 
@@ -47,6 +47,7 @@ pub(super) struct IngestWorkbench {
     /// (The point here is to make sure we do not wait for the failure detection to kick the node
     /// out of the ingest node.)
     pub unavailable_leaders: HashSet<NodeId>,
+    pub closed_shards: Vec<ShardIds>,
     publish_tracker: Option<PublishTracker>,
 }
 
diff --git a/quickwit/quickwit-proto/protos/quickwit/ingester.proto b/quickwit/quickwit-proto/protos/quickwit/ingester.proto
index 25a4705d58a..23ff6e6825d 100644
--- a/quickwit/quickwit-proto/protos/quickwit/ingester.proto
+++ b/quickwit/quickwit-proto/protos/quickwit/ingester.proto
@@ -81,6 +81,19 @@ message PersistResponse {
   string leader_id = 1;
   repeated PersistSuccess successes = 2;
   repeated PersistFailure failures = 3;
+  RoutingUpdate routing_update = 4;
+}
+
+message RoutingUpdate {
+  uint32 capacity_score = 1;
+  repeated SourceShardUpdate source_shard_updates = 2;
+  repeated quickwit.ingest.ShardIds closed_shards = 3;
+}
+
+message SourceShardUpdate {
+  quickwit.common.IndexUid index_uid = 1;
+  string source_id = 2;
+  uint32 open_shard_count = 3;
 }
 
 message PersistSuccess {
diff --git a/quickwit/quickwit-proto/src/codegen/quickwit/quickwit.ingest.ingester.rs b/quickwit/quickwit-proto/src/codegen/quickwit/quickwit.ingest.ingester.rs
index 07b8d5b64a1..606dce48e2e 100644
--- a/quickwit/quickwit-proto/src/codegen/quickwit/quickwit.ingest.ingester.rs
+++ b/quickwit/quickwit-proto/src/codegen/quickwit/quickwit.ingest.ingester.rs
@@ -51,6 +51,28 @@ pub struct PersistResponse {
     pub successes: ::prost::alloc::vec::Vec<PersistSuccess>,
     #[prost(message, repeated, tag = "3")]
     pub failures: ::prost::alloc::vec::Vec<PersistFailure>,
+    #[prost(message, optional, tag = "4")]
+    pub routing_update: ::core::option::Option<RoutingUpdate>,
+}
+#[derive(serde::Serialize, serde::Deserialize, utoipa::ToSchema)]
+#[derive(Clone, PartialEq, ::prost::Message)]
+pub struct RoutingUpdate {
+    #[prost(uint32, tag = "1")]
+    pub capacity_score: u32,
+    #[prost(message, repeated, tag = "2")]
+    pub source_shard_updates: ::prost::alloc::vec::Vec<SourceShardUpdate>,
+    #[prost(message, repeated, tag = "3")]
+    pub closed_shards: ::prost::alloc::vec::Vec<super::ShardIds>,
+}
+#[derive(serde::Serialize, serde::Deserialize, utoipa::ToSchema)]
+#[derive(Clone, PartialEq, Eq, Hash, ::prost::Message)]
+pub struct SourceShardUpdate {
+    #[prost(message, optional, tag = "1")]
+    pub index_uid: ::core::option::Option<crate::types::IndexUid>,
+    #[prost(string, tag = "2")]
+    pub source_id: ::prost::alloc::string::String,
+    #[prost(uint32, tag = "3")]
+    pub open_shard_count: u32,
 }
 #[derive(serde::Serialize, serde::Deserialize, utoipa::ToSchema)]
 #[derive(Clone, PartialEq, ::prost::Message)]
diff --git a/quickwit/quickwit-proto/src/getters.rs b/quickwit/quickwit-proto/src/getters.rs
index a327c1717a7..d33901bde47 100644
--- a/quickwit/quickwit-proto/src/getters.rs
+++ b/quickwit/quickwit-proto/src/getters.rs
@@ -111,6 +111,7 @@ generate_getters! {
     ShardIds,
     ShardPKey,
     TruncateShardsSubrequest,
+    SourceShardUpdate,
 
     // Metastore API
     AcquireShardsRequest,

From 115553b65086a831c9656968cedeefc2027b4957 Mon Sep 17 00:00:00 2001
From: nadav-govari <nadav.govari@datadoghq.com>
Date: Tue, 3 Mar 2026 14:10:38 -0500
Subject: [PATCH 5/9] Remove unused shard_ids in persist protos (#6169)

---
 .../quickwit-ingest/src/ingest_v2/ingester.rs | 30 -------------------
 quickwit/quickwit-ingest/src/ingest_v2/mod.rs |  1 -
 .../src/ingest_v2/node_routing_table.rs       |  1 +
 .../quickwit-ingest/src/ingest_v2/router.rs   |  9 ------
 .../src/ingest_v2/workbench.rs                |  2 --
 .../protos/quickwit/ingester.proto            |  2 --
 .../quickwit/quickwit.ingest.ingester.rs      |  4 ---
 quickwit/quickwit-proto/src/getters.rs        |  2 --
 .../quickwit-proto/src/ingest/ingester.rs     |  6 ----
 9 files changed, 1 insertion(+), 56 deletions(-)

diff --git a/quickwit/quickwit-ingest/src/ingest_v2/ingester.rs b/quickwit/quickwit-ingest/src/ingest_v2/ingester.rs
index 0eafd689b66..a131e2c289a 100644
--- a/quickwit/quickwit-ingest/src/ingest_v2/ingester.rs
+++ b/quickwit/quickwit-ingest/src/ingest_v2/ingester.rs
@@ -459,7 +459,6 @@ impl Ingester {
                     subrequest_id: subrequest.subrequest_id,
                     index_uid: subrequest.index_uid,
                     source_id: subrequest.source_id,
-                    shard_id: subrequest.shard_id,
                     reason: PersistFailureReason::NodeUnavailable as i32,
                 };
                 persist_failures.push(persist_failure);
@@ -490,7 +489,6 @@ impl Ingester {
                         subrequest_id: subrequest.subrequest_id,
                         index_uid: subrequest.index_uid,
                         source_id: subrequest.source_id,
-                        shard_id: subrequest.shard_id,
                         reason: PersistFailureReason::NoShardsAvailable as i32,
                     };
                     persist_failures.push(persist_failure);
@@ -531,7 +529,6 @@ impl Ingester {
                         subrequest_id: subrequest.subrequest_id,
                         index_uid: subrequest.index_uid,
                         source_id: subrequest.source_id,
-                        shard_id: Some(shard_id),
                         reason: PersistFailureReason::WalFull as i32,
                     };
                     persist_failures.push(persist_failure);
@@ -549,7 +546,6 @@ impl Ingester {
                         subrequest_id: subrequest.subrequest_id,
                         index_uid: subrequest.index_uid,
                         source_id: subrequest.source_id,
-                        shard_id: Some(shard_id),
                         reason: PersistFailureReason::NoShardsAvailable as i32,
                     };
                     persist_failures.push(persist_failure);
@@ -687,7 +683,6 @@ impl Ingester {
                         subrequest_id: replicate_failure.subrequest_id,
                         index_uid: replicate_failure.index_uid,
                         source_id: replicate_failure.source_id,
-                        shard_id: replicate_failure.shard_id,
                         reason: persist_failure_reason as i32,
                     };
                     persist_failures.push(persist_failure);
@@ -737,7 +732,6 @@ impl Ingester {
                             subrequest_id: subrequest.subrequest_id,
                             index_uid: subrequest.index_uid,
                             source_id: subrequest.source_id,
-                            shard_id: subrequest.shard_id,
                             reason: reason as i32,
                         };
                         persist_failures.push(persist_failure);
@@ -1771,14 +1765,12 @@ mod tests {
                     subrequest_id: 0,
                     index_uid: Some(index_uid.clone()),
                     source_id: "test-source".to_string(),
-                    shard_id: Some(ShardId::from(1)),
                     doc_batch: Some(DocBatchV2::for_test([r#"{"doc": "test-doc-010"}"#])),
                 },
                 PersistSubrequest {
                     subrequest_id: 1,
                     index_uid: Some(index_uid2.clone()),
                     source_id: "test-source".to_string(),
-                    shard_id: Some(ShardId::from(1)),
                     doc_batch: Some(DocBatchV2::for_test([
                         r#"{"doc": "test-doc-110"}"#,
                         r#"{"doc": "test-doc-111"}"#,
@@ -1795,7 +1787,6 @@ mod tests {
         assert_eq!(persist_success_0.subrequest_id, 0);
         assert_eq!(persist_success_0.index_uid(), &index_uid);
         assert_eq!(persist_success_0.source_id, "test-source");
-        assert_eq!(persist_success_0.shard_id(), ShardId::from(1));
         assert_eq!(
             persist_success_0.replication_position_inclusive,
             Some(Position::offset(1u64))
@@ -1805,7 +1796,6 @@ mod tests {
         assert_eq!(persist_success_1.subrequest_id, 1);
         assert_eq!(persist_success_1.index_uid(), &index_uid2);
         assert_eq!(persist_success_1.source_id, "test-source");
-        assert_eq!(persist_success_1.shard_id(), ShardId::from(1));
         assert_eq!(
             persist_success_1.replication_position_inclusive,
             Some(Position::offset(2u64))
@@ -1892,7 +1882,6 @@ mod tests {
                 subrequest_id: 0,
                 index_uid: Some(index_uid.clone()),
                 source_id: "test-source".to_string(),
-                shard_id: Some(ShardId::from(0)),
                 doc_batch: None,
             }],
         };
@@ -1905,7 +1894,6 @@ mod tests {
         assert_eq!(persist_success.subrequest_id, 0);
         assert_eq!(persist_success.index_uid(), &index_uid);
         assert_eq!(persist_success.source_id, "test-source");
-        assert_eq!(persist_success.shard_id(), ShardId::from(0));
         assert_eq!(
             persist_success.replication_position_inclusive,
             Some(Position::Beginning)
@@ -1953,7 +1941,6 @@ mod tests {
                 subrequest_id: 0,
                 index_uid: Some(index_uid.clone()),
                 source_id: "test-source".to_string(),
-                shard_id: Some(ShardId::from(0)),
                 doc_batch: Some(DocBatchV2::for_test([
                     "",                           // invalid
                     "[]",                         // invalid
@@ -2028,7 +2015,6 @@ mod tests {
                 subrequest_id: 0,
                 index_uid: Some(index_uid.clone()),
                 source_id: "test-source".to_string(),
-                shard_id: Some(ShardId::from(0)),
                 doc_batch: Some(DocBatchV2::for_test([
                     "",                           // invalid
                     "[]",                         // invalid
@@ -2091,7 +2077,6 @@ mod tests {
                 subrequest_id: 0,
                 index_uid: Some(index_uid.clone()),
                 source_id: "test-source".to_string(),
-                shard_id: Some(ShardId::from(0)),
                 doc_batch: Some(DocBatchV2::for_test(["", "[]", r#"{"foo": "bar"}"#])),
             }],
         };
@@ -2152,7 +2137,6 @@ mod tests {
                 subrequest_id: 0,
                 index_uid: Some(index_uid.clone()),
                 source_id: "test-source".to_string(),
-                shard_id: Some(ShardId::from(0)),
                 doc_batch: Some(DocBatchV2::for_test(["", "[]", r#"{"foo": "bar"}"#])),
             }],
         };
@@ -2213,7 +2197,6 @@ mod tests {
                 subrequest_id: 0,
                 index_uid: Some(index_uid.clone()),
                 source_id: "test-source".to_string(),
-                shard_id: Some(ShardId::from(1)),
                 doc_batch: Some(DocBatchV2::for_test([r#"test-doc-foo"#])),
             }],
         };
@@ -2226,7 +2209,6 @@ mod tests {
         assert_eq!(persist_failure.subrequest_id, 0);
         assert_eq!(persist_failure.index_uid(), &index_uid);
         assert_eq!(persist_failure.source_id, "test-source");
-        assert_eq!(persist_failure.shard_id(), ShardId::from(1));
         assert_eq!(
             persist_failure.reason(),
             PersistFailureReason::NodeUnavailable,
@@ -2266,7 +2248,6 @@ mod tests {
                 subrequest_id: 0,
                 index_uid: Some(index_uid.clone()),
                 source_id: "test-source".to_string(),
-                shard_id: Some(ShardId::from(1)),
                 doc_batch: Some(DocBatchV2::for_test([r#"{"doc": "test-doc-foo"}"#])),
             }],
         };
@@ -2279,7 +2260,6 @@ mod tests {
         assert_eq!(persist_failure.subrequest_id, 0);
         assert_eq!(persist_failure.index_uid(), &index_uid);
         assert_eq!(persist_failure.source_id, "test-source");
-        assert_eq!(persist_failure.shard_id(), ShardId::from(1));
         assert_eq!(
             persist_failure.reason(),
             PersistFailureReason::NodeUnavailable
@@ -2362,14 +2342,12 @@ mod tests {
                     subrequest_id: 0,
                     index_uid: Some(index_uid.clone()),
                     source_id: "test-source".to_string(),
-                    shard_id: Some(ShardId::from(1)),
                     doc_batch: Some(DocBatchV2::for_test([r#"{"doc": "test-doc-010"}"#])),
                 },
                 PersistSubrequest {
                     subrequest_id: 1,
                     index_uid: Some(index_uid2.clone()),
                     source_id: "test-source".to_string(),
-                    shard_id: Some(ShardId::from(1)),
                     doc_batch: Some(DocBatchV2::for_test([
                         r#"{"doc": "test-doc-110"}"#,
                         r#"{"doc": "test-doc-111"}"#,
@@ -2570,14 +2548,12 @@ mod tests {
                     subrequest_id: 0,
                     index_uid: Some(index_uid.clone()),
                     source_id: "test-source".to_string(),
-                    shard_id: Some(ShardId::from(1)),
                     doc_batch: Some(DocBatchV2::for_test([r#"{"doc": "test-doc-010"}"#])),
                 },
                 PersistSubrequest {
                     subrequest_id: 1,
                     index_uid: Some(index_uid2.clone()),
                     source_id: "test-source".to_string(),
-                    shard_id: Some(ShardId::from(1)),
                     doc_batch: Some(DocBatchV2::for_test([
                         r#"{"doc": "test-doc-110"}"#,
                         r#"{"doc": "test-doc-111"}"#,
@@ -2696,7 +2672,6 @@ mod tests {
                 subrequest_id: 0,
                 index_uid: Some(index_uid.clone()),
                 source_id: "test-source".to_string(),
-                shard_id: Some(ShardId::from(1)),
                 doc_batch: Some(DocBatchV2::for_test([r#"{"doc": "test-doc-010"}"#])),
             }],
         };
@@ -2709,7 +2684,6 @@ mod tests {
         assert_eq!(persist_failure.subrequest_id, 0);
         assert_eq!(persist_failure.index_uid(), &index_uid);
         assert_eq!(persist_failure.source_id, "test-source");
-        assert_eq!(persist_failure.shard_id(), ShardId::from(1));
         assert_eq!(
             persist_failure.reason(),
             PersistFailureReason::NoShardsAvailable
@@ -2775,7 +2749,6 @@ mod tests {
                 subrequest_id: 0,
                 index_uid: Some(index_uid.clone()),
                 source_id: "test-source".to_string(),
-                shard_id: Some(ShardId::from(1)),
                 doc_batch: Some(DocBatchV2::for_test([r#"{"doc": "test-doc-010"}"#])),
             }],
         };
@@ -2788,7 +2761,6 @@ mod tests {
         assert_eq!(persist_failure.subrequest_id, 0);
         assert_eq!(persist_failure.index_uid(), &index_uid);
         assert_eq!(persist_failure.source_id, "test-source");
-        assert_eq!(persist_failure.shard_id(), ShardId::from(1));
         assert_eq!(
             persist_failure.reason(),
             PersistFailureReason::NoShardsAvailable
@@ -2856,7 +2828,6 @@ mod tests {
                 subrequest_id: 0,
                 index_uid: Some(index_uid.clone()),
                 source_id: "test-source".to_string(),
-                shard_id: Some(ShardId::from(1)),
                 doc_batch: Some(DocBatchV2::for_test([r#"{"doc": "test-doc-010"}"#])),
             }],
         };
@@ -2869,7 +2840,6 @@ mod tests {
         assert_eq!(persist_failure.subrequest_id, 0);
         assert_eq!(persist_failure.index_uid(), &index_uid);
         assert_eq!(persist_failure.source_id, "test-source");
-        assert_eq!(persist_failure.shard_id(), ShardId::from(1));
         assert_eq!(persist_failure.reason(), PersistFailureReason::WalFull);
 
         let state_guard = ingester.state.lock_fully().await.unwrap();
diff --git a/quickwit/quickwit-ingest/src/ingest_v2/mod.rs b/quickwit/quickwit-ingest/src/ingest_v2/mod.rs
index f6c1cd42732..34051e62cd0 100644
--- a/quickwit/quickwit-ingest/src/ingest_v2/mod.rs
+++ b/quickwit/quickwit-ingest/src/ingest_v2/mod.rs
@@ -22,7 +22,6 @@ mod metrics;
 mod models;
 mod mrecord;
 mod mrecordlog_utils;
-#[allow(dead_code)]
 mod node_routing_table;
 mod publish_tracker;
 mod rate_meter;
diff --git a/quickwit/quickwit-ingest/src/ingest_v2/node_routing_table.rs b/quickwit/quickwit-ingest/src/ingest_v2/node_routing_table.rs
index 6b23fcd1aca..72ee2ded5ec 100644
--- a/quickwit/quickwit-ingest/src/ingest_v2/node_routing_table.rs
+++ b/quickwit/quickwit-ingest/src/ingest_v2/node_routing_table.rs
@@ -29,6 +29,7 @@ use crate::IngesterPool;
 pub(super) struct IngesterNode {
     pub node_id: NodeId,
     pub index_uid: IndexUid,
+    #[allow(unused)]
     pub source_id: SourceId,
     /// Score from 0-10. Higher means more available capacity.
     pub capacity_score: usize,
diff --git a/quickwit/quickwit-ingest/src/ingest_v2/router.rs b/quickwit/quickwit-ingest/src/ingest_v2/router.rs
index 1f5ca26865e..5a473a1adb1 100644
--- a/quickwit/quickwit-ingest/src/ingest_v2/router.rs
+++ b/quickwit/quickwit-ingest/src/ingest_v2/router.rs
@@ -368,7 +368,6 @@ impl IngestRouter {
                 subrequest_id: subrequest.subrequest_id,
                 index_uid: Some(ingester_node.index_uid.clone()),
                 source_id: subrequest.source_id.clone(),
-                shard_id: None,
                 doc_batch: subrequest.doc_batch.clone(),
             };
             per_leader_persist_subrequests
@@ -1127,7 +1126,6 @@ mod tests {
                     subrequest_id: 0,
                     index_uid: Some(index_uid.clone()),
                     source_id: "test-source".to_string(),
-                    shard_id: Some(ShardId::from(1)),
                     reason: PersistFailureReason::NoShardsAvailable as i32,
                 }],
                 routing_update: Some(RoutingUpdate {
@@ -1288,7 +1286,6 @@ mod tests {
             .returning(move |request| {
                 assert_eq!(request.leader_id, "test-ingester-0");
                 assert_eq!(request.subrequests.len(), 1);
-                assert!(request.subrequests[0].shard_id.is_none());
 
                 Ok(PersistResponse {
                     leader_id: request.leader_id,
@@ -1325,7 +1322,6 @@ mod tests {
             .returning(move |request| {
                 assert_eq!(request.leader_id, "test-ingester-1");
                 assert_eq!(request.subrequests.len(), 1);
-                assert!(request.subrequests[0].shard_id.is_none());
 
                 Ok(PersistResponse {
                     leader_id: request.leader_id,
@@ -1425,7 +1421,6 @@ mod tests {
                         subrequest_id: 0,
                         index_uid: Some(index_uid_clone.clone()),
                         source_id: "test-source".to_string(),
-                        shard_id: Some(ShardId::from(1)),
                         reason: PersistFailureReason::NoShardsAvailable as i32,
                     }],
                     routing_update: Some(RoutingUpdate {
@@ -1579,7 +1574,6 @@ mod tests {
             assert_eq!(subrequest.subrequest_id, 0);
             let index_uid = subrequest.index_uid().clone();
             assert_eq!(subrequest.source_id, "test-source");
-            assert!(subrequest.shard_id.is_none());
             assert_eq!(
                 subrequest.doc_batch,
                 Some(DocBatchV2::for_test(["test-doc-foo"]))
@@ -1592,7 +1586,6 @@ mod tests {
                     subrequest_id: 0,
                     index_uid: Some(index_uid.clone()),
                     source_id: "test-source".to_string(),
-                    shard_id: Some(ShardId::from(1)),
                     reason: PersistFailureReason::NoShardsAvailable as i32,
                 }],
                 routing_update: Some(RoutingUpdate {
@@ -1701,7 +1694,6 @@ mod tests {
                     subrequest_id: 0,
                     index_uid: Some(IndexUid::for_test("test-index-0", 0)),
                     source_id: "test-source".to_string(),
-                    shard_id: Some(ShardId::from(1)),
                     reason: PersistFailureReason::NoShardsAvailable as i32,
                 }],
                 routing_update: Some(RoutingUpdate {
@@ -1735,7 +1727,6 @@ mod tests {
                     subrequest_id: 1,
                     index_uid: Some(IndexUid::for_test("test-index-1", 0)),
                     source_id: "test-source".to_string(),
-                    shard_id: Some(ShardId::from(1)),
                     reason: PersistFailureReason::NodeUnavailable as i32,
                 }],
                 routing_update: Some(RoutingUpdate {
diff --git a/quickwit/quickwit-ingest/src/ingest_v2/workbench.rs b/quickwit/quickwit-ingest/src/ingest_v2/workbench.rs
index fb52d8e8139..50f92654efb 100644
--- a/quickwit/quickwit-ingest/src/ingest_v2/workbench.rs
+++ b/quickwit/quickwit-ingest/src/ingest_v2/workbench.rs
@@ -560,7 +560,6 @@ mod tests {
 
         let persist_failure = PersistFailure {
             subrequest_id: 1,
-            shard_id: Some(shard_id_2.clone()),
             ..Default::default()
         };
         workbench.record_persist_failure(&persist_failure);
@@ -807,7 +806,6 @@ mod tests {
 
         let persist_failure = PersistFailure {
             subrequest_id: 0,
-            shard_id: Some(ShardId::from(1)),
             reason: PersistFailureReason::WalFull as i32,
             ..Default::default()
         };
diff --git a/quickwit/quickwit-proto/protos/quickwit/ingester.proto b/quickwit/quickwit-proto/protos/quickwit/ingester.proto
index 23ff6e6825d..04b649a16ed 100644
--- a/quickwit/quickwit-proto/protos/quickwit/ingester.proto
+++ b/quickwit/quickwit-proto/protos/quickwit/ingester.proto
@@ -73,7 +73,7 @@ message PersistSubrequest {
   uint32 subrequest_id = 1;
   quickwit.common.IndexUid index_uid = 2;
   string source_id = 3;
-  quickwit.ingest.ShardId shard_id = 4;
+  reserved 4; // Was `shard_id`; the tag must not be reused.
   quickwit.ingest.DocBatchV2 doc_batch = 5;
 }
 
@@ -119,7 +119,7 @@ message PersistFailure {
   uint32 subrequest_id = 1;
   quickwit.common.IndexUid index_uid = 2;
   string source_id = 3;
-  quickwit.ingest.ShardId shard_id = 4;
+  reserved 4; // Was `shard_id`; the tag must not be reused.
   PersistFailureReason reason = 5;
 }
 
diff --git a/quickwit/quickwit-proto/src/codegen/quickwit/quickwit.ingest.ingester.rs b/quickwit/quickwit-proto/src/codegen/quickwit/quickwit.ingest.ingester.rs
index 606dce48e2e..1b5fcfd15c6 100644
--- a/quickwit/quickwit-proto/src/codegen/quickwit/quickwit.ingest.ingester.rs
+++ b/quickwit/quickwit-proto/src/codegen/quickwit/quickwit.ingest.ingester.rs
@@ -37,8 +37,6 @@ pub struct PersistSubrequest {
     pub index_uid: ::core::option::Option<crate::types::IndexUid>,
     #[prost(string, tag = "3")]
     pub source_id: ::prost::alloc::string::String,
-    #[prost(message, optional, tag = "4")]
-    pub shard_id: ::core::option::Option<crate::types::ShardId>,
     #[prost(message, optional, tag = "5")]
     pub doc_batch: ::core::option::Option<super::DocBatchV2>,
 }
@@ -101,8 +99,6 @@ pub struct PersistFailure {
     pub index_uid: ::core::option::Option<crate::types::IndexUid>,
     #[prost(string, tag = "3")]
     pub source_id: ::prost::alloc::string::String,
-    #[prost(message, optional, tag = "4")]
-    pub shard_id: ::core::option::Option<crate::types::ShardId>,
     #[prost(enumeration = "PersistFailureReason", tag = "5")]
     pub reason: i32,
 }
diff --git a/quickwit/quickwit-proto/src/getters.rs b/quickwit/quickwit-proto/src/getters.rs
index d33901bde47..4d8e6f058e6 100644
--- a/quickwit/quickwit-proto/src/getters.rs
+++ b/quickwit/quickwit-proto/src/getters.rs
@@ -210,8 +210,6 @@ generate_getters! {
     InitShardFailure,
     OpenFetchStreamRequest,
     OpenShardSubrequest,
-    PersistFailure,
-    PersistSubrequest,
     PersistSuccess,
     ReplicateFailure,
     ReplicateSubrequest,
diff --git a/quickwit/quickwit-proto/src/ingest/ingester.rs b/quickwit/quickwit-proto/src/ingest/ingester.rs
index d2da3f8d9bd..3d6bb896e37 100644
--- a/quickwit/quickwit-proto/src/ingest/ingester.rs
+++ b/quickwit/quickwit-proto/src/ingest/ingester.rs
@@ -85,12 +85,6 @@ impl OpenFetchStreamRequest {
     }
 }
 
-impl PersistSubrequest {
-    pub fn queue_id(&self) -> QueueId {
-        queue_id(self.index_uid(), &self.source_id, self.shard_id())
-    }
-}
-
 impl PersistSuccess {
     pub fn queue_id(&self) -> QueueId {
         queue_id(self.index_uid(), &self.source_id, self.shard_id())

From 1d0e18e177eb36cedadda816d5c03f8317eba208 Mon Sep 17 00:00:00 2001
From: nadav-govari <nadav.govari@datadoghq.com>
Date: Mon, 9 Mar 2026 14:47:24 -0400
Subject: [PATCH 6/9] Add availability zone awareness to node based routing
 (#6189)

---
 quickwit/quickwit-cluster/src/node.rs         |   7 +
 .../src/rate_limited_tracing.rs               |   9 +-
 .../src/control_plane.rs                      |  15 +-
 .../src/ingest/ingest_controller.rs           |  78 ++++++---
 .../src/source/ingest/mod.rs                  |  24 ++-
 .../quickwit-ingest/src/ingest_v2/fetch.rs    |  29 +++-
 .../quickwit-ingest/src/ingest_v2/ingester.rs |  25 ++-
 quickwit/quickwit-ingest/src/ingest_v2/mod.rs |   9 +-
 .../src/ingest_v2/node_routing_table.rs       | 163 +++++++++++-------
 .../quickwit-ingest/src/ingest_v2/router.rs   |  94 +++++++---
 .../src/ingest_v2/routing_table.rs            |  40 ++++-
 quickwit/quickwit-proto/src/lib.rs            |   3 +-
 quickwit/quickwit-serve/src/lib.rs            |  27 +--
 13 files changed, 360 insertions(+), 163 deletions(-)

diff --git a/quickwit/quickwit-cluster/src/node.rs b/quickwit/quickwit-cluster/src/node.rs
index 4a8b11dbafc..52029348eb1 100644
--- a/quickwit/quickwit-cluster/src/node.rs
+++ b/quickwit/quickwit-cluster/src/node.rs
@@ -48,6 +48,7 @@ impl ClusterNode {
             indexing_capacity: member.indexing_cpu_capacity,
             is_ready: member.is_ready,
             is_self_node,
+            availability_zone: member.availability_zone,
         };
         let node = ClusterNode {
             inner: Arc::new(inner),
@@ -132,6 +133,10 @@ impl ClusterNode {
     pub fn is_self_node(&self) -> bool {
         self.inner.is_self_node
     }
+
+    pub fn availability_zone(&self) -> Option<&str> {
+        self.inner.availability_zone.as_deref()
+    }
 }
 
 impl Debug for ClusterNode {
@@ -153,6 +158,7 @@ impl PartialEq for ClusterNode {
             && self.inner.indexing_tasks == other.inner.indexing_tasks
             && self.inner.is_ready == other.inner.is_ready
             && self.inner.is_self_node == other.inner.is_self_node
+            && self.inner.availability_zone == other.inner.availability_zone
     }
 }
 
@@ -165,4 +171,5 @@ struct InnerNode {
     indexing_capacity: CpuCapacity,
     is_ready: bool,
     is_self_node: bool,
+    availability_zone: Option<String>,
 }
diff --git a/quickwit/quickwit-common/src/rate_limited_tracing.rs b/quickwit/quickwit-common/src/rate_limited_tracing.rs
index c9a323f9ec2..198c2bf8bdd 100644
--- a/quickwit/quickwit-common/src/rate_limited_tracing.rs
+++ b/quickwit/quickwit-common/src/rate_limited_tracing.rs
@@ -179,12 +179,13 @@ fn _check_macro_works() {
 
 #[doc(hidden)]
 pub use coarsetime::Instant as CoarsetimeInstant;
+pub use rate_limited_debug;
+pub use rate_limited_error;
+pub use rate_limited_info;
+pub use rate_limited_trace;
 #[doc(hidden)]
 pub use rate_limited_tracing;
-pub use {
-    rate_limited_debug, rate_limited_error, rate_limited_info, rate_limited_trace,
-    rate_limited_warn,
-};
+pub use rate_limited_warn;
 
 #[cfg(test)]
 mod tests {
diff --git a/quickwit/quickwit-control-plane/src/control_plane.rs b/quickwit/quickwit-control-plane/src/control_plane.rs
index e4c6995d639..33b0e00ddaa 100644
--- a/quickwit/quickwit-control-plane/src/control_plane.rs
+++ b/quickwit/quickwit-control-plane/src/control_plane.rs
@@ -1176,6 +1176,7 @@ mod tests {
         CLI_SOURCE_ID, INGEST_V2_SOURCE_ID, IndexConfig, KafkaSourceParams, SourceParams,
     };
     use quickwit_indexing::IndexingService;
+    use quickwit_ingest::IngesterPoolEntry;
     use quickwit_metastore::{
         CreateIndexRequestExt, IndexMetadata, ListIndexesMetadataResponseExt,
     };
@@ -1203,6 +1204,14 @@ mod tests {
     use super::*;
     use crate::IndexerNodeInfo;
 
+    /// Wraps `client` in a pool entry with no availability zone set.
+    fn ingester_pool_entry(client: IngesterServiceClient) -> IngesterPoolEntry {
+        IngesterPoolEntry {
+            client,
+            availability_zone: None,
+        }
+    }
+
     #[tokio::test]
     async fn test_control_plane_create_index() {
         let universe = Universe::with_accelerated_time();
@@ -2210,7 +2217,7 @@ mod tests {
                 assert!(&retain_shards_for_source.shard_ids.is_empty());
                 Ok(RetainShardsResponse {})
             });
-        let ingester = IngesterServiceClient::from_mock(mock_ingester);
+        let ingester = ingester_pool_entry(IngesterServiceClient::from_mock(mock_ingester));
         ingester_pool.insert("node1".into(), ingester);
 
         let cluster_config = ClusterConfig::for_test();
@@ -2256,7 +2263,7 @@ mod tests {
                 );
                 Ok(RetainShardsResponse {})
             });
-        let ingester = IngesterServiceClient::from_mock(mock_ingester);
+        let ingester = ingester_pool_entry(IngesterServiceClient::from_mock(mock_ingester));
         ingester_pool.insert("node1".into(), ingester);
 
         let mut index_0 = IndexMetadata::for_test("test-index-0", "ram:///test-index-0");
@@ -2552,7 +2559,7 @@ mod tests {
             };
             Ok(response)
         });
-        let ingester = IngesterServiceClient::from_mock(mock_ingester);
+        let ingester = ingester_pool_entry(IngesterServiceClient::from_mock(mock_ingester));
         ingester_pool.insert(ingester_id, ingester);
 
         let mut mock_metastore = MockMetastoreService::new();
@@ -2706,7 +2713,7 @@ mod tests {
             };
             Ok(response)
         });
-        let ingester = IngesterServiceClient::from_mock(mock_ingester);
+        let ingester = ingester_pool_entry(IngesterServiceClient::from_mock(mock_ingester));
         ingester_pool.insert(ingester_id, ingester);
 
         let mut mock_metastore = MockMetastoreService::new();
diff --git a/quickwit/quickwit-control-plane/src/ingest/ingest_controller.rs b/quickwit/quickwit-control-plane/src/ingest/ingest_controller.rs
index 63295d61eca..e5e7ec8da0e 100644
--- a/quickwit/quickwit-control-plane/src/ingest/ingest_controller.rs
+++ b/quickwit/quickwit-control-plane/src/ingest/ingest_controller.rs
@@ -329,7 +329,7 @@ impl IngestController {
     fn sync_with_ingester(&self, ingester: &NodeId, model: &ControlPlaneModel) -> WaitHandle {
         info!(ingester = %ingester, "sync_with_ingester");
         let (wait_drop_guard, wait_handle) = WaitHandle::new();
-        let Some(ingester_client) = self.ingester_pool.get(ingester) else {
+        let Some(ingester_client) = self.ingester_pool.get(ingester).map(|h| h.client) else {
             // TODO: (Maybe) We should mark the ingester as unavailable, and stop advertise its
             // shard to routers.
             warn!("failed to sync with ingester `{ingester}`: not available");
@@ -639,7 +639,7 @@ impl IngestController {
                     }
                 })
                 .collect();
-            let Some(leader) = self.ingester_pool.get(&leader_id) else {
+            let Some(leader) = self.ingester_pool.get(&leader_id).map(|h| h.client) else {
                 warn!("failed to init shards: ingester `{leader_id}` is unavailable");
                 failures.extend(init_shard_failures);
                 continue;
@@ -898,7 +898,7 @@ impl IngestController {
             return Ok(());
         };
         info!("scaling down shard {shard_id} from {leader_id}");
-        let Some(ingester) = self.ingester_pool.get(&leader_id) else {
+        let Some(ingester) = self.ingester_pool.get(&leader_id).map(|h| h.client) else {
             model.release_scaling_permits(&source_uid, ScalingMode::Down);
             return Ok(());
         };
@@ -1211,7 +1211,7 @@ impl IngestController {
         let mut close_shards_futures = FuturesUnordered::new();
 
         for (leader_id, shard_pkeys) in per_leader_shards_to_close {
-            let Some(ingester) = self.ingester_pool.get(&leader_id) else {
+            let Some(ingester) = self.ingester_pool.get(&leader_id).map(|h| h.client) else {
                 warn!("failed to close shards: ingester `{leader_id}` is unavailable");
                 continue;
             };
@@ -1313,7 +1313,7 @@ mod tests {
     use quickwit_common::shared_consts::DEFAULT_SHARD_THROUGHPUT_LIMIT;
     use quickwit_common::tower::DelayLayer;
     use quickwit_config::{DocMapping, INGEST_V2_SOURCE_ID, SourceConfig};
-    use quickwit_ingest::{RateMibPerSec, ShardInfo};
+    use quickwit_ingest::{IngesterPoolEntry, RateMibPerSec, ShardInfo};
     use quickwit_metastore::IndexMetadata;
     use quickwit_proto::control_plane::GetOrCreateOpenShardsSubrequest;
     use quickwit_proto::ingest::ingester::{
@@ -1328,6 +1328,13 @@ mod tests {
 
     use super::*;
 
+    fn ingester_pool_entry(client: IngesterServiceClient) -> IngesterPoolEntry {
+        IngesterPoolEntry {
+            client,
+            availability_zone: None,
+        }
+    }
+
     const TEST_SHARD_THROUGHPUT_LIMIT_MIB: f32 =
         DEFAULT_SHARD_THROUGHPUT_LIMIT.as_u64() as f32 / quickwit_common::shared_consts::MIB as f32;
 
@@ -1385,7 +1392,10 @@ mod tests {
         let ingester = IngesterServiceClient::from_mock(mock_ingester);
 
         let ingester_pool = IngesterPool::default();
-        ingester_pool.insert(NodeId::from("test-ingester-1"), ingester.clone());
+        ingester_pool.insert(
+            NodeId::from("test-ingester-1"),
+            ingester_pool_entry(ingester.clone()),
+        );
 
         let mut mock_ingester = MockIngesterService::new();
         let index_uid_1_clone = index_uid_1.clone();
@@ -1413,7 +1423,10 @@ mod tests {
                 Ok(response)
             });
         let ingester = IngesterServiceClient::from_mock(mock_ingester);
-        ingester_pool.insert(NodeId::from("test-ingester-2"), ingester.clone());
+        ingester_pool.insert(
+            NodeId::from("test-ingester-2"),
+            ingester_pool_entry(ingester.clone()),
+        );
 
         let replication_factor = 2;
         let mut controller = IngestController::new(
@@ -1599,7 +1612,10 @@ mod tests {
         let ingester = IngesterServiceClient::from_mock(mock_ingester);
 
         let ingester_pool = IngesterPool::default();
-        ingester_pool.insert(NodeId::from("test-ingester-1"), ingester.clone());
+        ingester_pool.insert(
+            NodeId::from("test-ingester-1"),
+            ingester_pool_entry(ingester.clone()),
+        );
 
         let replication_factor = 1;
         let mut controller = IngestController::new(
@@ -1712,7 +1728,7 @@ mod tests {
 
         ingester_pool.insert(
             NodeId::from("test-ingester-1"),
-            IngesterServiceClient::mocked(),
+            ingester_pool_entry(IngesterServiceClient::mocked()),
         );
 
         let leader_follower_pairs_opt =
@@ -1722,7 +1738,10 @@ mod tests {
         // find any solution.
         assert!(leader_follower_pairs_opt.is_none());
 
-        ingester_pool.insert("test-ingester-2".into(), IngesterServiceClient::mocked());
+        ingester_pool.insert(
+            "test-ingester-2".into(),
+            ingester_pool_entry(IngesterServiceClient::mocked()),
+        );
 
         let leader_follower_pairs = controller
             .allocate_shards(0, &FnvHashSet::default(), &model)
@@ -1841,7 +1860,10 @@ mod tests {
             Some(NodeId::from("test-ingester-1"))
         );
 
-        ingester_pool.insert("test-ingester-3".into(), IngesterServiceClient::mocked());
+        ingester_pool.insert(
+            "test-ingester-3".into(),
+            ingester_pool_entry(IngesterServiceClient::mocked()),
+        );
         let unavailable_leaders = FnvHashSet::from_iter([NodeId::from("test-ingester-2")]);
         let leader_follower_pairs = controller
             .allocate_shards(4, &unavailable_leaders, &model)
@@ -1934,7 +1956,7 @@ mod tests {
                 Ok(response)
             });
         let ingester_0 = IngesterServiceClient::from_mock(mock_ingester_0);
-        ingester_pool.insert(ingester_id_0, ingester_0);
+        ingester_pool.insert(ingester_id_0, ingester_pool_entry(ingester_0));
 
         let ingester_id_1 = NodeId::from("test-ingester-1");
         let mut mock_ingester_1 = MockIngesterService::new();
@@ -1955,7 +1977,7 @@ mod tests {
 
                 Err(IngestV2Error::Internal("internal error".to_string()))
             });
-        let ingester_1 = IngesterServiceClient::from_mock(mock_ingester_1);
+        let ingester_1 = ingester_pool_entry(IngesterServiceClient::from_mock(mock_ingester_1));
         ingester_pool.insert(ingester_id_1, ingester_1);
 
         let ingester_id_2 = NodeId::from("test-ingester-2");
@@ -1965,7 +1987,7 @@ mod tests {
         let ingester_2 = IngesterServiceClient::tower()
             .stack_init_shards_layer(DelayLayer::new(INIT_SHARDS_REQUEST_TIMEOUT * 2))
             .build_from_mock(mock_ingester_2);
-        ingester_pool.insert(ingester_id_2, ingester_2);
+        ingester_pool.insert(ingester_id_2, ingester_pool_entry(ingester_2));
 
         let init_shards_response = controller
             .init_shards(Vec::new(), &Progress::default())
@@ -2173,7 +2195,7 @@ mod tests {
 
         ingester_pool.insert(
             NodeId::from("test-ingester-1"),
-            IngesterServiceClient::from_mock(mock_ingester),
+            ingester_pool_entry(IngesterServiceClient::from_mock(mock_ingester)),
         );
         let source_uids: HashMap<SourceUid, usize> = HashMap::from_iter([(source_uid.clone(), 1)]);
         let unavailable_leaders = FnvHashSet::default();
@@ -2343,7 +2365,7 @@ mod tests {
                     "failed to close shards".to_string(),
                 ))
             });
-        let ingester = IngesterServiceClient::from_mock(mock_ingester);
+        let ingester = ingester_pool_entry(IngesterServiceClient::from_mock(mock_ingester));
         ingester_pool.insert("test-ingester".into(), ingester);
 
         let shard_infos = BTreeSet::from_iter([
@@ -2496,7 +2518,7 @@ mod tests {
             },
         );
 
-        let ingester = IngesterServiceClient::from_mock(mock_ingester);
+        let ingester = ingester_pool_entry(IngesterServiceClient::from_mock(mock_ingester));
         ingester_pool.insert("test-ingester".into(), ingester);
 
         let shard_infos = BTreeSet::from_iter([ShardInfo {
@@ -2642,7 +2664,7 @@ mod tests {
                 };
                 Ok(response)
             });
-        let ingester = IngesterServiceClient::from_mock(mock_ingester);
+        let ingester = ingester_pool_entry(IngesterServiceClient::from_mock(mock_ingester));
         ingester_pool.insert("test-ingester".into(), ingester);
 
         // Test failed to open shards.
@@ -2764,7 +2786,7 @@ mod tests {
                 };
                 Ok(response)
             });
-        let ingester = IngesterServiceClient::from_mock(mock_ingester);
+        let ingester = ingester_pool_entry(IngesterServiceClient::from_mock(mock_ingester));
         ingester_pool.insert("test-ingester".into(), ingester);
 
         // Test failed to close shard.
@@ -2996,15 +3018,15 @@ mod tests {
             });
         ingester_pool.insert(
             "node-1".into(),
-            IngesterServiceClient::from_mock(mock_ingester_1),
+            ingester_pool_entry(IngesterServiceClient::from_mock(mock_ingester_1)),
         );
         ingester_pool.insert(
             "node-2".into(),
-            IngesterServiceClient::from_mock(mock_ingester_2),
+            ingester_pool_entry(IngesterServiceClient::from_mock(mock_ingester_2)),
         );
         ingester_pool.insert(
             "node-3".into(),
-            IngesterServiceClient::from_mock(mock_ingester_3),
+            ingester_pool_entry(IngesterServiceClient::from_mock(mock_ingester_3)),
         );
         let node_id = "node-1".into();
         let wait_handle = controller.sync_with_ingester(&node_id, &model);
@@ -3134,7 +3156,7 @@ mod tests {
                 Ok(response)
             });
         let ingester_0 = IngesterServiceClient::from_mock(mock_ingester_0);
-        ingester_pool.insert(ingester_id_0.clone(), ingester_0);
+        ingester_pool.insert(ingester_id_0.clone(), ingester_pool_entry(ingester_0));
 
         let ingester_id_1 = NodeId::from("test-ingester-1");
         let mut mock_ingester_1 = MockIngesterService::new();
@@ -3152,7 +3174,7 @@ mod tests {
                 Err(IngestV2Error::Internal("internal error".to_string()))
             });
         let ingester_1 = IngesterServiceClient::from_mock(mock_ingester_1);
-        ingester_pool.insert(ingester_id_1.clone(), ingester_1);
+        ingester_pool.insert(ingester_id_1.clone(), ingester_pool_entry(ingester_1));
 
         let ingester_id_2 = NodeId::from("test-ingester-2");
         let mut mock_ingester_2 = MockIngesterService::new();
@@ -3161,7 +3183,7 @@ mod tests {
         let ingester_2 = IngesterServiceClient::tower()
             .stack_close_shards_layer(DelayLayer::new(CLOSE_SHARDS_REQUEST_TIMEOUT * 2))
             .build_from_mock(mock_ingester_2);
-        ingester_pool.insert(ingester_id_2.clone(), ingester_2);
+        ingester_pool.insert(ingester_id_2.clone(), ingester_pool_entry(ingester_2));
 
         // In this test:
         // - ingester 0 will close shard 0 successfully and fail to close shard 1;
@@ -3341,7 +3363,7 @@ mod tests {
                 Ok(response)
             });
         let ingester_0 = IngesterServiceClient::from_mock(mock_ingester_0);
-        ingester_pool.insert(ingester_id_0.clone(), ingester_0);
+        ingester_pool.insert(ingester_id_0.clone(), ingester_pool_entry(ingester_0));
 
         let ingester_id_1 = NodeId::from("test-ingester-1");
         let mut mock_ingester_1 = MockIngesterService::new();
@@ -3382,7 +3404,7 @@ mod tests {
             };
             Ok(response)
         });
-        let ingester_1 = IngesterServiceClient::from_mock(mock_ingester_1);
+        let ingester_1 = ingester_pool_entry(IngesterServiceClient::from_mock(mock_ingester_1));
         ingester_pool.insert(ingester_id_1.clone(), ingester_1);
 
         let close_shards_task = controller
@@ -3561,7 +3583,7 @@ mod tests {
 
         let ingester_pool = IngesterPool::default();
         let mock_ingester = MockIngesterService::new();
-        let ingester_client = IngesterServiceClient::from_mock(mock_ingester);
+        let ingester_client = ingester_pool_entry(IngesterServiceClient::from_mock(mock_ingester));
 
         let active_ids: Vec<String> = (0..available_ingester_shards.len())
             .map(|i| format!("active-ingester-{}", i))
diff --git a/quickwit/quickwit-indexing/src/source/ingest/mod.rs b/quickwit/quickwit-indexing/src/source/ingest/mod.rs
index 63c746aabe0..6f9551ee406 100644
--- a/quickwit/quickwit-indexing/src/source/ingest/mod.rs
+++ b/quickwit/quickwit-indexing/src/source/ingest/mod.rs
@@ -345,7 +345,7 @@ impl IngestSource {
                 .push(truncate_shards_subrequest);
         }
         for (ingester_id, truncate_subrequests) in per_ingester_truncate_subrequests {
-            let Some(ingester) = self.ingester_pool.get(ingester_id) else {
+            let Some(ingester) = self.ingester_pool.get(ingester_id).map(|h| h.client) else {
                 warn!("failed to truncate shard(s): ingester `{ingester_id}` is unavailable");
                 continue;
             };
@@ -672,6 +672,7 @@ mod tests {
     use quickwit_common::metrics::MEMORY_METRICS;
     use quickwit_common::stream_utils::InFlightValue;
     use quickwit_config::{IndexingSettings, SourceConfig, SourceParams};
+    use quickwit_ingest::IngesterPoolEntry;
     use quickwit_proto::indexing::IndexingPipelineId;
     use quickwit_proto::ingest::ingester::{
         FetchMessage, IngesterServiceClient, MockIngesterService, TruncateShardsResponse,
@@ -687,6 +688,13 @@ mod tests {
     use crate::models::RawDocBatch;
     use crate::source::SourceActor;
 
+    fn ingester_pool_entry(client: IngesterServiceClient) -> IngesterPoolEntry {
+        IngesterPoolEntry {
+            client,
+            availability_zone: None,
+        }
+    }
+
     // In this test, we simulate a source to which we sequentially assign the following set of
     // shards []
     // [1] (triggers a reset, and the creation of a publish lock)
@@ -929,7 +937,7 @@ mod tests {
                 Ok(response)
             });
 
-        let ingester_0 = IngesterServiceClient::from_mock(mock_ingester_0);
+        let ingester_0 = ingester_pool_entry(IngesterServiceClient::from_mock(mock_ingester_0));
         ingester_pool.insert("test-ingester-0".into(), ingester_0.clone());
 
         let event_broker = EventBroker::default();
@@ -1126,7 +1134,7 @@ mod tests {
                 Ok(response)
             });
 
-        let ingester_0 = IngesterServiceClient::from_mock(mock_ingester_0);
+        let ingester_0 = ingester_pool_entry(IngesterServiceClient::from_mock(mock_ingester_0));
         ingester_pool.insert("test-ingester-0".into(), ingester_0.clone());
 
         let event_broker = EventBroker::default();
@@ -1291,7 +1299,7 @@ mod tests {
                 Ok(response)
             });
 
-        let ingester_0 = IngesterServiceClient::from_mock(mock_ingester_0);
+        let ingester_0 = ingester_pool_entry(IngesterServiceClient::from_mock(mock_ingester_0));
         ingester_pool.insert("test-ingester-0".into(), ingester_0.clone());
 
         let event_broker = EventBroker::default();
@@ -1599,7 +1607,7 @@ mod tests {
                 })
             });
 
-        let ingester_0 = IngesterServiceClient::from_mock(mock_ingester_0);
+        let ingester_0 = ingester_pool_entry(IngesterServiceClient::from_mock(mock_ingester_0));
         ingester_pool.insert("test-ingester-0".into(), ingester_0.clone());
 
         let event_broker = EventBroker::default();
@@ -1699,7 +1707,7 @@ mod tests {
 
                 Ok(TruncateShardsResponse {})
             });
-        let ingester_0 = IngesterServiceClient::from_mock(mock_ingester_0);
+        let ingester_0 = ingester_pool_entry(IngesterServiceClient::from_mock(mock_ingester_0));
         ingester_pool.insert("test-ingester-0".into(), ingester_0.clone());
 
         let mut mock_ingester_1 = MockIngesterService::new();
@@ -1726,7 +1734,7 @@ mod tests {
 
                 Ok(TruncateShardsResponse {})
             });
-        let ingester_1 = IngesterServiceClient::from_mock(mock_ingester_1);
+        let ingester_1 = ingester_pool_entry(IngesterServiceClient::from_mock(mock_ingester_1));
         ingester_pool.insert("test-ingester-1".into(), ingester_1.clone());
 
         let mut mock_ingester_3 = MockIngesterService::new();
@@ -1746,7 +1754,7 @@ mod tests {
 
                 Ok(TruncateShardsResponse {})
             });
-        let ingester_3 = IngesterServiceClient::from_mock(mock_ingester_3);
+        let ingester_3 = ingester_pool_entry(IngesterServiceClient::from_mock(mock_ingester_3));
         ingester_pool.insert("test-ingester-3".into(), ingester_3.clone());
 
         let event_broker = EventBroker::default();
diff --git a/quickwit/quickwit-ingest/src/ingest_v2/fetch.rs b/quickwit/quickwit-ingest/src/ingest_v2/fetch.rs
index 6e8d085e35d..f6f0145d175 100644
--- a/quickwit/quickwit-ingest/src/ingest_v2/fetch.rs
+++ b/quickwit/quickwit-ingest/src/ingest_v2/fetch.rs
@@ -495,7 +495,11 @@ async fn fault_tolerant_fetch_stream(
             shard_id: Some(shard_id.clone()),
             from_position_exclusive: Some(from_position_exclusive.clone()),
         };
-        let mut fetch_stream = match ingester.open_fetch_stream(open_fetch_stream_request).await {
+        let mut fetch_stream = match ingester
+            .client
+            .open_fetch_stream(open_fetch_stream_request)
+            .await
+        {
             Ok(fetch_stream) => fetch_stream,
             Err(not_found_error @ IngestV2Error::ShardNotFound { .. }) => {
                 error!(
@@ -628,7 +632,7 @@ pub(super) mod tests {
     use tokio::time::timeout;
 
     use super::*;
-    use crate::MRecord;
+    use crate::{IngesterPoolEntry, MRecord};
 
     pub fn into_fetch_payload(fetch_message: FetchMessage) -> FetchPayload {
         match fetch_message.message.unwrap() {
@@ -644,6 +648,13 @@ pub(super) mod tests {
         }
     }
 
+    fn ingester_pool_entry(client: IngesterServiceClient) -> IngesterPoolEntry {
+        IngesterPoolEntry {
+            client,
+            availability_zone: None,
+        }
+    }
+
     #[tokio::test]
     async fn test_fetch_task_happy_path() {
         let tempdir = tempfile::tempdir().unwrap();
@@ -1325,7 +1336,7 @@ pub(super) mod tests {
 
                 Ok(service_stream_1)
             });
-        let ingester_1 = IngesterServiceClient::from_mock(mock_ingester_1);
+        let ingester_1 = ingester_pool_entry(IngesterServiceClient::from_mock(mock_ingester_1));
 
         ingester_pool.insert("test-ingester-1".into(), ingester_1);
 
@@ -1425,7 +1436,7 @@ pub(super) mod tests {
                     "open fetch stream error".to_string(),
                 ))
             });
-        let ingester_0 = IngesterServiceClient::from_mock(mock_ingester_0);
+        let ingester_0 = ingester_pool_entry(IngesterServiceClient::from_mock(mock_ingester_0));
 
         let mut mock_ingester_1 = MockIngesterService::new();
         let index_uid_clone = index_uid.clone();
@@ -1440,7 +1451,7 @@ pub(super) mod tests {
 
                 Ok(service_stream_1)
             });
-        let ingester_1 = IngesterServiceClient::from_mock(mock_ingester_1);
+        let ingester_1 = ingester_pool_entry(IngesterServiceClient::from_mock(mock_ingester_1));
 
         ingester_pool.insert("test-ingester-0".into(), ingester_0);
         ingester_pool.insert("test-ingester-1".into(), ingester_1);
@@ -1540,7 +1551,7 @@ pub(super) mod tests {
 
                 Ok(service_stream_0)
             });
-        let ingester_0 = IngesterServiceClient::from_mock(mock_ingester_0);
+        let ingester_0 = ingester_pool_entry(IngesterServiceClient::from_mock(mock_ingester_0));
 
         let mut mock_ingester_1 = MockIngesterService::new();
         let index_uid_clone = index_uid.clone();
@@ -1555,7 +1566,7 @@ pub(super) mod tests {
 
                 Ok(service_stream_1)
             });
-        let ingester_1 = IngesterServiceClient::from_mock(mock_ingester_1);
+        let ingester_1 = ingester_pool_entry(IngesterServiceClient::from_mock(mock_ingester_1));
 
         ingester_pool.insert("test-ingester-0".into(), ingester_0);
         ingester_pool.insert("test-ingester-1".into(), ingester_1);
@@ -1658,7 +1669,7 @@ pub(super) mod tests {
                     shard_id: ShardId::from(1),
                 })
             });
-        let ingester_0 = IngesterServiceClient::from_mock(mock_ingester_0);
+        let ingester_0 = ingester_pool_entry(IngesterServiceClient::from_mock(mock_ingester_0));
         ingester_pool.insert("test-ingester-0".into(), ingester_0);
 
         fault_tolerant_fetch_stream(
@@ -1746,7 +1757,7 @@ pub(super) mod tests {
 
                 Ok(service_stream_2)
             });
-        let ingester = IngesterServiceClient::from_mock(mock_ingester);
+        let ingester = ingester_pool_entry(IngesterServiceClient::from_mock(mock_ingester));
 
         ingester_pool.insert("test-ingester".into(), ingester);
 
diff --git a/quickwit/quickwit-ingest/src/ingest_v2/ingester.rs b/quickwit/quickwit-ingest/src/ingest_v2/ingester.rs
index a131e2c289a..df51758a4ca 100644
--- a/quickwit/quickwit-ingest/src/ingest_v2/ingester.rs
+++ b/quickwit/quickwit-ingest/src/ingest_v2/ingester.rs
@@ -31,7 +31,6 @@ use quickwit_common::metrics::{GaugeGuard, MEMORY_METRICS};
 use quickwit_common::pretty::PrettyDisplay;
 use quickwit_common::pubsub::{EventBroker, EventSubscriber};
 use quickwit_common::rate_limiter::{RateLimiter, RateLimiterSettings};
-use quickwit_common::tower::Pool;
 use quickwit_common::{ServiceStream, rate_limited_error, rate_limited_warn};
 use quickwit_proto::control_plane::{
     AdviseResetShardsRequest, ControlPlaneService, ControlPlaneServiceClient,
@@ -123,7 +122,7 @@ impl Ingester {
     pub async fn try_new(
         cluster: Cluster,
         control_plane: ControlPlaneServiceClient,
-        ingester_pool: Pool<NodeId, IngesterServiceClient>,
+        ingester_pool: IngesterPool,
         wal_dir_path: &Path,
         disk_capacity: ByteSize,
         memory_capacity: ByteSize,
@@ -390,6 +389,7 @@ impl Ingester {
             IngestV2Error::Unavailable(message)
         })?;
         let mut ack_replication_stream = ingester
+            .client
             .open_replication_stream(syn_replication_stream)
             .await?;
         ack_replication_stream
@@ -1311,11 +1311,11 @@ mod tests {
     use tonic::transport::{Endpoint, Server};
 
     use super::*;
-    use crate::MRecord;
     use crate::ingest_v2::DEFAULT_IDLE_SHARD_TIMEOUT;
     use crate::ingest_v2::broadcast::ShardInfos;
     use crate::ingest_v2::doc_mapper::try_build_doc_mapper;
     use crate::ingest_v2::fetch::tests::{into_fetch_eof, into_fetch_payload};
+    use crate::{IngesterPoolEntry, MRecord};
 
     const MAX_GRPC_MESSAGE_SIZE: ByteSize = ByteSize::mib(1);
 
@@ -2284,10 +2284,14 @@ mod tests {
             .build()
             .await;
 
-        leader_ctx.ingester_pool.insert(
-            follower_ctx.node_id.clone(),
-            IngesterServiceClient::new(follower.clone()),
-        );
+        let ingester_pool_entry = IngesterPoolEntry {
+            client: IngesterServiceClient::new(follower.clone()),
+            availability_zone: None,
+        };
+
+        leader_ctx
+            .ingester_pool
+            .insert(follower_ctx.node_id.clone(), ingester_pool_entry);
 
         let index_uid = IndexUid::for_test("test-index", 0);
         let index_uid2: IndexUid = IndexUid::for_test("test-index", 1);
@@ -2491,9 +2495,14 @@ mod tests {
             None,
         );
 
+        let ingester_pool_entry = IngesterPoolEntry {
+            client: follower_grpc_client,
+            availability_zone: None,
+        };
+
         leader_ctx
             .ingester_pool
-            .insert(follower_ctx.node_id.clone(), follower_grpc_client);
+            .insert(follower_ctx.node_id.clone(), ingester_pool_entry);
 
         let index_uid = IndexUid::for_test("test-index", 0);
         let index_uid2: IndexUid = IndexUid::for_test("test-index", 1);
diff --git a/quickwit/quickwit-ingest/src/ingest_v2/mod.rs b/quickwit/quickwit-ingest/src/ingest_v2/mod.rs
index 34051e62cd0..d5432936e58 100644
--- a/quickwit/quickwit-ingest/src/ingest_v2/mod.rs
+++ b/quickwit/quickwit-ingest/src/ingest_v2/mod.rs
@@ -63,7 +63,14 @@ use self::mrecord::MRECORD_HEADER_LEN;
 pub use self::mrecord::{MRecord, decoded_mrecords};
 pub use self::router::IngestRouter;
 
-pub type IngesterPool = Pool<NodeId, IngesterServiceClient>;
+/// An ingester as represented in the pool, bundling the gRPC client with node metadata.
+#[derive(Debug, Clone)]
+pub struct IngesterPoolEntry {
+    pub client: IngesterServiceClient,
+    pub availability_zone: Option<String>,
+}
+
+pub type IngesterPool = Pool<NodeId, IngesterPoolEntry>;
 
 /// Identifies an ingester client, typically a source, for logging and debugging purposes.
 pub type ClientId = String;
diff --git a/quickwit/quickwit-ingest/src/ingest_v2/node_routing_table.rs b/quickwit/quickwit-ingest/src/ingest_v2/node_routing_table.rs
index 72ee2ded5ec..68299358f77 100644
--- a/quickwit/quickwit-ingest/src/ingest_v2/node_routing_table.rs
+++ b/quickwit/quickwit-ingest/src/ingest_v2/node_routing_table.rs
@@ -57,15 +57,25 @@ fn power_of_two_choices<'a>(candidates: &[&'a IngesterNode]) -> &'a IngesterNode
     }
 }
 
+fn pick_from(candidates: Vec<&IngesterNode>) -> Option<&IngesterNode> {
+    match candidates.len() {
+        0 => None,
+        1 => Some(candidates[0]),
+        _ => Some(power_of_two_choices(&candidates)),
+    }
+}
+
 impl RoutingEntry {
     /// Pick an ingester node to persist the request to. Uses power of two choices based on reported
-    /// ingester capacity, if more than one eligible node exists.
-    pub fn pick_node(
+    /// ingester capacity, if more than one eligible node exists. Prefers nodes in the same
+    /// availability zone, falling back to remote nodes.
+    fn pick_node(
         &self,
         ingester_pool: &IngesterPool,
         unavailable_leaders: &HashSet<NodeId>,
+        self_availability_zone: &Option<String>,
     ) -> Option<&IngesterNode> {
-        let eligible: Vec<&IngesterNode> = self
+        let (local_ingesters, remote_ingesters): (Vec<&IngesterNode>, Vec<&IngesterNode>) = self
             .nodes
             .values()
             .filter(|node| {
@@ -74,31 +84,55 @@ impl RoutingEntry {
                     && ingester_pool.contains_key(&node.node_id)
                     && !unavailable_leaders.contains(&node.node_id)
             })
-            .collect();
-
-        match eligible.len() {
-            0 => None,
-            1 => Some(eligible[0]),
-            _ => Some(power_of_two_choices(&eligible)),
-        }
+            .partition(|node| {
+                let node_az = ingester_pool
+                    .get(&node.node_id)
+                    .and_then(|h| h.availability_zone);
+                node_az == *self_availability_zone
+            });
+
+        pick_from(local_ingesters).or_else(|| pick_from(remote_ingesters))
     }
 }
 
 #[derive(Debug, Default)]
 pub(super) struct NodeBasedRoutingTable {
     table: HashMap<(IndexId, SourceId), RoutingEntry>,
+    self_availability_zone: Option<String>,
 }
 
 impl NodeBasedRoutingTable {
-    pub fn find_entry(&self, index_id: &str, source_id: &str) -> Option<&RoutingEntry> {
+    pub fn new(self_availability_zone: Option<String>) -> Self {
+        Self {
+            self_availability_zone,
+            ..Default::default()
+        }
+    }
+
+    pub fn pick_node(
+        &self,
+        index_id: &str,
+        source_id: &str,
+        ingester_pool: &IngesterPool,
+        unavailable_leaders: &HashSet<NodeId>,
+    ) -> Option<&IngesterNode> {
         let key = (index_id.to_string(), source_id.to_string());
-        self.table.get(&key)
+        let entry = self.table.get(&key)?;
+        entry.pick_node(
+            ingester_pool,
+            unavailable_leaders,
+            &self.self_availability_zone,
+        )
     }
 
-    pub fn debug_info(&self) -> HashMap<IndexId, Vec<serde_json::Value>> {
+    pub fn debug_info(
+        &self,
+        ingester_pool: &IngesterPool,
+    ) -> HashMap<IndexId, Vec<serde_json::Value>> {
         let mut per_index: HashMap<IndexId, Vec<serde_json::Value>> = HashMap::new();
         for ((index_id, source_id), entry) in &self.table {
             for (node_id, node) in &entry.nodes {
+                let az = ingester_pool.get(node_id).and_then(|h| h.availability_zone);
                 per_index
                     .entry(index_id.clone())
                     .or_default()
@@ -107,6 +141,7 @@ impl NodeBasedRoutingTable {
                         "node_id": node_id,
                         "capacity_score": node.capacity_score,
                         "open_shard_count": node.open_shard_count,
+                        "availability_zone": az,
                     }));
             }
         }
@@ -201,6 +236,14 @@ mod tests {
     use quickwit_proto::types::ShardId;
 
     use super::*;
+    use crate::IngesterPoolEntry;
+
+    fn mocked_ingester(availability_zone: Option<&str>) -> IngesterPoolEntry {
+        IngesterPoolEntry {
+            client: IngesterServiceClient::mocked(),
+            availability_zone: availability_zone.map(|s| s.to_string()),
+        }
+    }
 
     #[test]
     fn test_apply_capacity_update() {
@@ -274,7 +317,7 @@ mod tests {
         assert!(!table.has_open_nodes("test-index", "test-source", &pool, &HashSet::new()));
 
         // Node is in pool → true.
-        pool.insert("node-1".into(), IngesterServiceClient::mocked());
+        pool.insert("node-1".into(), mocked_ingester(None));
         assert!(table.has_open_nodes("test-index", "test-source", &pool, &HashSet::new()));
 
         // Node is unavailable → false.
@@ -289,7 +332,7 @@ mod tests {
             6,
             2,
         );
-        pool.insert("node-2".into(), IngesterServiceClient::mocked());
+        pool.insert("node-2".into(), mocked_ingester(None));
         assert!(table.has_open_nodes("test-index", "test-source", &pool, &unavailable));
 
         // Node with capacity_score=0 is not eligible.
@@ -304,77 +347,81 @@ mod tests {
     }
 
     #[test]
-    fn test_pick_node() {
-        let mut table = NodeBasedRoutingTable::default();
+    fn test_pick_node_prefers_same_az() {
+        let mut table = NodeBasedRoutingTable::new(Some("az-1".to_string()));
         let pool = IngesterPool::default();
-        let key = ("test-index".to_string(), "test-source".to_string());
 
-        // Node exists but not in pool → None.
         table.apply_capacity_update(
             "node-1".into(),
             IndexUid::for_test("test-index", 0),
             "test-source".into(),
-            8,
-            3,
+            5,
+            1,
         );
-        assert!(
-            table
-                .table
-                .get(&key)
-                .unwrap()
-                .pick_node(&pool, &HashSet::new())
-                .is_none()
+        table.apply_capacity_update(
+            "node-2".into(),
+            IndexUid::for_test("test-index", 0),
+            "test-source".into(),
+            5,
+            1,
         );
+        pool.insert("node-1".into(), mocked_ingester(Some("az-1")));
+        pool.insert("node-2".into(), mocked_ingester(Some("az-2")));
 
-        // Single node in pool → picks it.
-        pool.insert("node-1".into(), IngesterServiceClient::mocked());
         let picked = table
-            .table
-            .get(&key)
-            .unwrap()
-            .pick_node(&pool, &HashSet::new())
+            .pick_node("test-index", "test-source", &pool, &HashSet::new())
             .unwrap();
         assert_eq!(picked.node_id, NodeId::from("node-1"));
+    }
+
+    #[test]
+    fn test_pick_node_falls_back_to_cross_az() {
+        let mut table = NodeBasedRoutingTable::new(Some("az-1".to_string()));
+        let pool = IngesterPool::default();
 
-        // Multiple nodes → something is returned.
         table.apply_capacity_update(
             "node-2".into(),
             IndexUid::for_test("test-index", 0),
             "test-source".into(),
-            2,
+            5,
             1,
         );
-        pool.insert("node-2".into(), IngesterServiceClient::mocked());
-        assert!(
-            table
-                .table
-                .get(&key)
-                .unwrap()
-                .pick_node(&pool, &HashSet::new())
-                .is_some()
-        );
+        pool.insert("node-2".into(), mocked_ingester(Some("az-2")));
+
+        let picked = table
+            .pick_node("test-index", "test-source", &pool, &HashSet::new())
+            .unwrap();
+        assert_eq!(picked.node_id, NodeId::from("node-2"));
+    }
+
+    #[test]
+    fn test_pick_node_no_az_awareness() {
+        let mut table = NodeBasedRoutingTable::default();
+        let pool = IngesterPool::default();
 
-        // Node with capacity_score=0 is skipped.
         table.apply_capacity_update(
             "node-1".into(),
             IndexUid::for_test("test-index", 0),
             "test-source".into(),
-            0,
-            3,
-        );
-        table.apply_capacity_update(
-            "node-2".into(),
-            IndexUid::for_test("test-index", 0),
-            "test-source".into(),
-            0,
+            5,
             1,
         );
+        pool.insert("node-1".into(), mocked_ingester(Some("az-1")));
+
+        let picked = table
+            .pick_node("test-index", "test-source", &pool, &HashSet::new())
+            .unwrap();
+        assert_eq!(picked.node_id, NodeId::from("node-1"));
+    }
+
+    #[test]
+    fn test_pick_node_missing_entry() {
+        let table = NodeBasedRoutingTable::new(Some("az-1".to_string()));
+        let pool = IngesterPool::default();
+
         assert!(
             table
-                .table
-                .get(&key)
-                .unwrap()
-                .pick_node(&pool, &HashSet::new())
+                .pick_node("nonexistent", "source", &pool, &HashSet::new())
                 .is_none()
         );
     }
diff --git a/quickwit/quickwit-ingest/src/ingest_v2/router.rs b/quickwit/quickwit-ingest/src/ingest_v2/router.rs
index 5a473a1adb1..a1f5bf86302 100644
--- a/quickwit/quickwit-ingest/src/ingest_v2/router.rs
+++ b/quickwit/quickwit-ingest/src/ingest_v2/router.rs
@@ -120,10 +120,11 @@ impl IngestRouter {
         ingester_pool: IngesterPool,
         replication_factor: usize,
         event_broker: EventBroker,
+        self_availability_zone: Option<String>,
     ) -> Self {
         let state = Arc::new(Mutex::new(RouterState {
             debouncer: GetOrCreateOpenShardsRequestDebouncer::default(),
-            node_routing_table: NodeBasedRoutingTable::default(),
+            node_routing_table: NodeBasedRoutingTable::new(self_availability_zone),
         }));
         let ingest_semaphore_permits = get_ingest_router_buffer_size().as_u64() as usize;
         let ingest_semaphore = Arc::new(Semaphore::new(ingest_semaphore_permits));
@@ -352,10 +353,12 @@ impl IngestRouter {
         let state_guard = self.state.lock().await;
 
         for subrequest in pending_subrequests(&workbench.subworkbenches) {
-            let ingester_node = state_guard
-                .node_routing_table
-                .find_entry(&subrequest.index_id, &subrequest.source_id)
-                .and_then(|entry| entry.pick_node(&self.ingester_pool, unavailable_leaders));
+            let ingester_node = state_guard.node_routing_table.pick_node(
+                &subrequest.index_id,
+                &subrequest.source_id,
+                &self.ingester_pool,
+                unavailable_leaders,
+            );
 
             let ingester_node = match ingester_node {
                 Some(node) => node,
@@ -383,7 +386,7 @@ impl IngestRouter {
                 .iter()
                 .map(|subrequest| subrequest.subrequest_id)
                 .collect();
-            let Some(ingester) = self.ingester_pool.get(&leader_id) else {
+            let Some(ingester) = self.ingester_pool.get(&leader_id).map(|h| h.client) else {
                 no_shards_available_subrequest_ids.extend(subrequest_ids);
                 continue;
             };
@@ -471,7 +474,9 @@ impl IngestRouter {
 
     pub async fn debug_info(&self) -> JsonValue {
         let state_guard = self.state.lock().await;
-        let routing_table_json = state_guard.node_routing_table.debug_info();
+        let routing_table_json = state_guard
+            .node_routing_table
+            .debug_info(&self.ingester_pool);
 
         json!({
             "routing_table": routing_table_json,
@@ -634,8 +639,16 @@ mod tests {
     use quickwit_proto::types::{DocUid, IndexUid, Position, ShardId, SourceUid};
 
     use super::*;
+    use crate::IngesterPoolEntry;
     use crate::ingest_v2::workbench::SubworkbenchFailure;
 
+    fn mocked_ingester() -> IngesterPoolEntry {
+        IngesterPoolEntry {
+            client: IngesterServiceClient::mocked(),
+            availability_zone: None,
+        }
+    }
+
     #[tokio::test]
     async fn test_router_make_get_or_create_open_shard_request() {
         let self_node_id = "test-router".into();
@@ -649,6 +662,7 @@ mod tests {
             ingester_pool.clone(),
             replication_factor,
             EventBroker::default(),
+            Some("test-az".to_string()),
         );
         let mut workbench = IngestWorkbench::default();
         let (get_or_create_open_shard_request_opt, rendezvous) = router
@@ -723,7 +737,7 @@ mod tests {
         drop(rendezvous_1);
         drop(rendezvous_2);
 
-        ingester_pool.insert("test-ingester-0".into(), IngesterServiceClient::mocked());
+        ingester_pool.insert("test-ingester-0".into(), mocked_ingester());
         {
             // Ingester-0 is in pool and in table, but marked unavailable on the workbench
             // (simulating a prior transport error). has_open_nodes returns false → both
@@ -859,6 +873,7 @@ mod tests {
             ingester_pool.clone(),
             replication_factor,
             EventBroker::default(),
+            Some("test-az".to_string()),
         );
         let ingest_subrequests = vec![
             IngestSubrequest {
@@ -957,6 +972,7 @@ mod tests {
             ingester_pool.clone(),
             replication_factor,
             EventBroker::default(),
+            Some("test-az".to_string()),
         );
         let ingest_subrequests = vec![IngestSubrequest {
             subrequest_id: 0,
@@ -1016,6 +1032,7 @@ mod tests {
             ingester_pool.clone(),
             replication_factor,
             EventBroker::default(),
+            Some("test-az".to_string()),
         );
         let ingest_subrequests = vec![IngestSubrequest {
             subrequest_id: 0,
@@ -1046,6 +1063,7 @@ mod tests {
             ingester_pool.clone(),
             replication_factor,
             EventBroker::default(),
+            Some("test-az".to_string()),
         );
         let ingest_subrequests = vec![IngestSubrequest {
             subrequest_id: 0,
@@ -1103,6 +1121,7 @@ mod tests {
             ingester_pool.clone(),
             replication_factor,
             EventBroker::default(),
+            Some("test-az".to_string()),
         );
         let ingest_subrequests = vec![IngestSubrequest {
             subrequest_id: 0,
@@ -1153,8 +1172,8 @@ mod tests {
         let control_plane = ControlPlaneServiceClient::from_mock(MockControlPlaneService::new());
 
         let ingester_pool = IngesterPool::default();
-        ingester_pool.insert("test-ingester-0".into(), IngesterServiceClient::mocked());
-        ingester_pool.insert("test-ingester-1".into(), IngesterServiceClient::mocked());
+        ingester_pool.insert("test-ingester-0".into(), mocked_ingester());
+        ingester_pool.insert("test-ingester-1".into(), mocked_ingester());
 
         let replication_factor = 1;
         let router = IngestRouter::new(
@@ -1163,6 +1182,7 @@ mod tests {
             ingester_pool.clone(),
             replication_factor,
             EventBroker::default(),
+            Some("test-az".to_string()),
         );
         let ingest_subrequests = vec![
             IngestSubrequest {
@@ -1246,6 +1266,7 @@ mod tests {
             ingester_pool.clone(),
             1,
             EventBroker::default(),
+            Some("test-az".to_string()),
         );
 
         let index_uid_0: IndexUid = IndexUid::for_test("test-index-0", 0);
@@ -1312,7 +1333,10 @@ mod tests {
             });
         ingester_pool.insert(
             "test-ingester-0".into(),
-            IngesterServiceClient::from_mock(mock_ingester_0),
+            IngesterPoolEntry {
+                client: IngesterServiceClient::from_mock(mock_ingester_0),
+                availability_zone: None,
+            },
         );
 
         let mut mock_ingester_1 = MockIngesterService::new();
@@ -1344,7 +1368,10 @@ mod tests {
             });
         ingester_pool.insert(
             "test-ingester-1".into(),
-            IngesterServiceClient::from_mock(mock_ingester_1),
+            IngesterPoolEntry {
+                client: IngesterServiceClient::from_mock(mock_ingester_1),
+                availability_zone: None,
+            },
         );
 
         let response = router
@@ -1388,6 +1415,7 @@ mod tests {
             ingester_pool.clone(),
             1,
             EventBroker::default(),
+            Some("test-az".to_string()),
         );
         let index_uid: IndexUid = IndexUid::for_test("test-index-0", 0);
         {
@@ -1460,7 +1488,10 @@ mod tests {
             });
         ingester_pool.insert(
             "test-ingester-0".into(),
-            IngesterServiceClient::from_mock(mock_ingester_0),
+            IngesterPoolEntry {
+                client: IngesterServiceClient::from_mock(mock_ingester_0),
+                availability_zone: None,
+            },
         );
 
         let response = router
@@ -1491,6 +1522,7 @@ mod tests {
             ingester_pool.clone(),
             replication_factor,
             EventBroker::default(),
+            Some("test-az".to_string()),
         );
         let index_uid_0: IndexUid = IndexUid::for_test("test-index-0", 0);
         let index_uid_1: IndexUid = IndexUid::for_test("test-index-1", 0);
@@ -1547,6 +1579,7 @@ mod tests {
             ingester_pool.clone(),
             replication_factor,
             EventBroker::default(),
+            Some("test-az".to_string()),
         );
         let index_uid: IndexUid = IndexUid::for_test("test-index-0", 0);
         {
@@ -1601,7 +1634,13 @@ mod tests {
             Ok(response)
         });
         let ingester_0 = IngesterServiceClient::from_mock(mock_ingester_0);
-        ingester_pool.insert("test-ingester-0".into(), ingester_0.clone());
+        ingester_pool.insert(
+            "test-ingester-0".into(),
+            IngesterPoolEntry {
+                client: ingester_0.clone(),
+                availability_zone: None,
+            },
+        );
 
         let ingest_request = IngestRequestV2 {
             subrequests: vec![IngestSubrequest {
@@ -1624,12 +1663,14 @@ mod tests {
     #[tokio::test]
     async fn test_router_updates_node_routing_table_on_capacity_update() {
         let event_broker = EventBroker::default();
+        let ingester_pool = IngesterPool::default();
         let router = IngestRouter::new(
             "test-router".into(),
             ControlPlaneServiceClient::from_mock(MockControlPlaneService::new()),
-            IngesterPool::default(),
+            ingester_pool.clone(),
             1,
             event_broker.clone(),
+            Some("test-az".to_string()),
         );
         router.subscribe();
 
@@ -1645,14 +1686,16 @@ mod tests {
         // Give the async subscriber a moment to process.
         tokio::time::sleep(Duration::from_millis(10)).await;
 
+        ingester_pool.insert("test-ingester-0".into(), mocked_ingester());
         let state_guard = router.state.lock().await;
-        let entry = state_guard
+        let node = state_guard
             .node_routing_table
-            .find_entry("test-index", "test-source")
+            .pick_node("test-index", "test-source", &ingester_pool, &HashSet::new())
             .unwrap();
-        let node = entry.nodes.get("test-ingester-0").unwrap();
+        // The routed node must carry the values from the update, not merely the right ID.
         assert_eq!(node.capacity_score, 7);
         assert_eq!(node.open_shard_count, 3);
+        assert_eq!(node.node_id, NodeId::from("test-ingester-0"));
     }
 
     #[tokio::test]
@@ -1663,6 +1703,7 @@ mod tests {
             IngesterPool::default(),
             1,
             EventBroker::default(),
+            Some("test-az".to_string()),
         );
         let ingest_subrequests = vec![
             IngestSubrequest {
@@ -1749,12 +1790,14 @@ mod tests {
 
     #[tokio::test]
     async fn test_router_process_persist_results_applies_piggybacked_routing_updates() {
+        let ingester_pool = IngesterPool::default();
         let router = IngestRouter::new(
             "test-router".into(),
             ControlPlaneServiceClient::from_mock(MockControlPlaneService::new()),
-            IngesterPool::default(),
+            ingester_pool.clone(),
             1,
             EventBroker::default(),
+            Some("test-az".to_string()),
         );
         let ingest_subrequests = vec![IngestSubrequest {
             subrequest_id: 0,
@@ -1790,13 +1833,15 @@ mod tests {
             .process_persist_results(&mut workbench, persist_futures)
             .await;
 
+        ingester_pool.insert("test-ingester-0".into(), mocked_ingester());
         let state_guard = router.state.lock().await;
-        let entry = state_guard
+        let node = state_guard
             .node_routing_table
-            .find_entry("test-index", "test-source")
+            .pick_node("test-index", "test-source", &ingester_pool, &HashSet::new())
             .unwrap();
-        let node = entry.nodes.get("test-ingester-0").unwrap();
+        // The routed node must carry the piggybacked values, not merely the right ID.
         assert_eq!(node.capacity_score, 3);
         assert_eq!(node.open_shard_count, 2);
+        assert_eq!(node.node_id, NodeId::from("test-ingester-0"));
     }
 }
diff --git a/quickwit/quickwit-ingest/src/ingest_v2/routing_table.rs b/quickwit/quickwit-ingest/src/ingest_v2/routing_table.rs
index 987d754ed69..4b4150d6e98 100644
--- a/quickwit/quickwit-ingest/src/ingest_v2/routing_table.rs
+++ b/quickwit/quickwit-ingest/src/ingest_v2/routing_table.rs
@@ -584,8 +584,20 @@ mod tests {
         assert!(closed_shard_ids.is_empty());
         assert!(unavailable_leaders.is_empty());
 
-        ingester_pool.insert("test-ingester-0".into(), IngesterServiceClient::mocked());
-        ingester_pool.insert("test-ingester-1".into(), IngesterServiceClient::mocked());
+        ingester_pool.insert(
+            "test-ingester-0".into(),
+            crate::IngesterPoolEntry {
+                client: IngesterServiceClient::mocked(),
+                availability_zone: None,
+            },
+        );
+        ingester_pool.insert(
+            "test-ingester-1".into(),
+            crate::IngesterPoolEntry {
+                client: IngesterServiceClient::mocked(),
+                availability_zone: None,
+            },
+        );
 
         let table_entry = RoutingTableEntry {
             index_uid: index_uid.clone(),
@@ -675,8 +687,20 @@ mod tests {
             .unwrap_err();
         assert_eq!(error, NextOpenShardError::NoShardsAvailable);
 
-        ingester_pool.insert("test-ingester-0".into(), IngesterServiceClient::mocked());
-        ingester_pool.insert("test-ingester-1".into(), IngesterServiceClient::mocked());
+        ingester_pool.insert(
+            "test-ingester-0".into(),
+            crate::IngesterPoolEntry {
+                client: IngesterServiceClient::mocked(),
+                availability_zone: None,
+            },
+        );
+        ingester_pool.insert(
+            "test-ingester-1".into(),
+            crate::IngesterPoolEntry {
+                client: IngesterServiceClient::mocked(),
+                availability_zone: None,
+            },
+        );
 
         let table_entry = RoutingTableEntry {
             index_uid: index_uid.clone(),
@@ -795,7 +819,13 @@ mod tests {
         let source_id: SourceId = "test-source".into();
 
         let ingester_pool = IngesterPool::default();
-        ingester_pool.insert("test-ingester-0".into(), IngesterServiceClient::mocked());
+        ingester_pool.insert(
+            "test-ingester-0".into(),
+            crate::IngesterPoolEntry {
+                client: IngesterServiceClient::mocked(),
+                availability_zone: None,
+            },
+        );
 
         let rate_limited_shards = HashSet::from_iter([ShardId::from(1)]);
 
diff --git a/quickwit/quickwit-proto/src/lib.rs b/quickwit/quickwit-proto/src/lib.rs
index f4ddb734d2a..f89fdb97687 100644
--- a/quickwit/quickwit-proto/src/lib.rs
+++ b/quickwit/quickwit-proto/src/lib.rs
@@ -28,7 +28,8 @@ use tracing_opentelemetry::OpenTelemetrySpanExt;
 
 pub mod cluster;
 pub mod control_plane;
-pub use {bytes, tonic};
+pub use bytes;
+pub use tonic;
 pub mod developer;
 pub mod error;
 mod getters;
diff --git a/quickwit/quickwit-serve/src/lib.rs b/quickwit/quickwit-serve/src/lib.rs
index 60515bc819f..78bdfee57da 100644
--- a/quickwit/quickwit-serve/src/lib.rs
+++ b/quickwit/quickwit-serve/src/lib.rs
@@ -82,9 +82,9 @@ use quickwit_indexing::models::ShardPositionsService;
 use quickwit_indexing::start_indexing_service;
 use quickwit_ingest::{
     GetMemoryCapacity, IngestRequest, IngestRouter, IngestServiceClient, Ingester, IngesterPool,
-    LocalShardsUpdate, get_idle_shard_timeout, setup_ingester_capacity_update_listener,
-    setup_local_shards_update_listener, start_ingest_api_service, wait_for_ingester_decommission,
-    wait_for_ingester_status,
+    IngesterPoolEntry, LocalShardsUpdate, get_idle_shard_timeout,
+    setup_ingester_capacity_update_listener, setup_local_shards_update_listener,
+    start_ingest_api_service, wait_for_ingester_decommission, wait_for_ingester_status,
 };
 use quickwit_jaeger::JaegerService;
 use quickwit_janitor::{JanitorService, start_janitor_service};
@@ -905,6 +905,7 @@ async fn setup_ingest_v2(
         ingester_pool.clone(),
         replication_factor,
         event_broker.clone(),
+        node_config.availability_zone.clone(),
     );
     ingest_router.subscribe();
     setup_ingester_capacity_update_listener(cluster.clone(), event_broker.clone())
@@ -968,21 +969,21 @@ async fn setup_ingest_v2(
                         chitchat_id.node_id,
                     );
                     let node_id: NodeId = node.node_id().into();
+                    let availability_zone = node.availability_zone().map(|az| az.to_string());
 
-                    if node.is_self_node() {
+                    let client = if node.is_self_node() {
                         // Here, since the service is available locally, we bypass the network stack
                         // and use the instance directly. However, we still want client-side
                         // metrics, so we use both metrics layers.
                         let ingester = ingester_opt_clone_clone
                             .expect("ingester service should be initialized");
-                        let ingester_service = ingester_service_layer_stack(
+                        ingester_service_layer_stack(
                             IngesterServiceClient::tower()
                                 .stack_layer(INGEST_GRPC_CLIENT_METRICS_LAYER.clone()),
                         )
-                        .build(ingester);
-                        Some(Change::Insert(node_id, ingester_service))
+                        .build(ingester)
                     } else {
-                        let ingester_service = IngesterServiceClient::tower()
+                        IngesterServiceClient::tower()
                             .stack_layer(INGEST_GRPC_CLIENT_METRICS_LAYER.clone())
                             .stack_layer(TimeoutLayer::new(GRPC_INGESTER_SERVICE_TIMEOUT))
                             .build_from_channel(
@@ -990,9 +991,13 @@ async fn setup_ingest_v2(
                                 node.channel(),
                                 max_message_size,
                                 grpc_compression_encoding_opt,
-                            );
-                        Some(Change::Insert(node_id, ingester_service))
-                    }
+                            )
+                    };
+                    let ingester_pool_entry = IngesterPoolEntry {
+                        client,
+                        availability_zone,
+                    };
+                    Some(Change::Insert(node_id, ingester_pool_entry))
                 }
                 ClusterChange::Remove(node) if node.is_indexer() => {
                     let chitchat_id = node.chitchat_id();

From 4c35484a788b1ec42437c92599efd8d5d7fe37b0 Mon Sep 17 00:00:00 2001
From: nadav-govari <nadav.govari@datadoghq.com>
Date: Mon, 9 Mar 2026 14:57:42 -0400
Subject: [PATCH 7/9] Remove old routing table; Take both disk and memory WAL
 readings (#6193)

* Remove old routing table; Take both disk and memory WAL readings

* Add az-aware ingest attempts metric (#6194)
---
 ...er_capacity_score.rs => capacity_score.rs} |    9 +-
 .../src/ingest_v2/broadcast/mod.rs            |    4 +-
 .../quickwit-ingest/src/ingest_v2/ingester.rs |   13 +-
 .../quickwit-ingest/src/ingest_v2/metrics.rs  |    8 +
 quickwit/quickwit-ingest/src/ingest_v2/mod.rs |    4 +-
 .../src/ingest_v2/node_routing_table.rs       |  519 ------
 .../quickwit-ingest/src/ingest_v2/router.rs   |   49 +-
 .../src/ingest_v2/routing_table.rs            | 1440 +++++------------
 .../quickwit-ingest/src/ingest_v2/state.rs    |   18 +-
 ..._timeseries.rs => wal_capacity_tracker.rs} |   75 +-
 10 files changed, 561 insertions(+), 1578 deletions(-)
 rename quickwit/quickwit-ingest/src/ingest_v2/broadcast/{ingester_capacity_score.rs => capacity_score.rs} (97%)
 delete mode 100644 quickwit/quickwit-ingest/src/ingest_v2/node_routing_table.rs
 rename quickwit/quickwit-ingest/src/ingest_v2/{wal_capacity_timeseries.rs => wal_capacity_tracker.rs} (76%)

diff --git a/quickwit/quickwit-ingest/src/ingest_v2/broadcast/ingester_capacity_score.rs b/quickwit/quickwit-ingest/src/ingest_v2/broadcast/capacity_score.rs
similarity index 97%
rename from quickwit/quickwit-ingest/src/ingest_v2/broadcast/ingester_capacity_score.rs
rename to quickwit/quickwit-ingest/src/ingest_v2/broadcast/capacity_score.rs
index 482f5f58886..d9f456b7201 100644
--- a/quickwit/quickwit-ingest/src/ingest_v2/broadcast/ingester_capacity_score.rs
+++ b/quickwit/quickwit-ingest/src/ingest_v2/broadcast/capacity_score.rs
@@ -69,7 +69,10 @@ impl BroadcastIngesterCapacityScoreTask {
             .map_err(|_| anyhow::anyhow!("failed to acquire ingester state lock"))?;
         let usage = guard.mrecordlog.resource_usage();
         let disk_used = ByteSize::b(usage.disk_used_bytes as u64);
-        let capacity_score = guard.wal_capacity_time_series.record_and_score(disk_used);
+        let memory_used = ByteSize::b(usage.memory_used_bytes as u64);
+        let capacity_score = guard
+            .wal_capacity_tracker
+            .record_and_score(disk_used, memory_used);
         let (open_shard_counts, _) = guard.get_shard_snapshot();
 
         Ok(Some((capacity_score, open_shard_counts)))
@@ -218,8 +221,8 @@ mod tests {
         state_guard.shards.insert(shard.queue_id(), shard);
         let (open_shard_counts, _) = state_guard.get_shard_snapshot();
         let capacity_score = state_guard
-            .wal_capacity_time_series
-            .record_and_score(ByteSize::b(500));
+            .wal_capacity_tracker
+            .record_and_score(ByteSize::b(500), ByteSize::b(0));
         drop(state_guard);
 
         assert_eq!(capacity_score, 6);
diff --git a/quickwit/quickwit-ingest/src/ingest_v2/broadcast/mod.rs b/quickwit/quickwit-ingest/src/ingest_v2/broadcast/mod.rs
index 18a00209de1..b579382af78 100644
--- a/quickwit/quickwit-ingest/src/ingest_v2/broadcast/mod.rs
+++ b/quickwit/quickwit-ingest/src/ingest_v2/broadcast/mod.rs
@@ -13,7 +13,7 @@
 // limitations under the License.
 
 #[allow(dead_code)]
-mod ingester_capacity_score;
+mod capacity_score;
 mod local_shards;
 
 use std::time::Duration;
@@ -26,7 +26,7 @@ pub(in crate::ingest_v2) const BROADCAST_INTERVAL_PERIOD: Duration = if cfg!(tes
     Duration::from_secs(5)
 };
 
-pub use ingester_capacity_score::{
+pub use capacity_score::{
     BroadcastIngesterCapacityScoreTask, IngesterCapacityScoreUpdate,
     setup_ingester_capacity_update_listener,
 };
diff --git a/quickwit/quickwit-ingest/src/ingest_v2/ingester.rs b/quickwit/quickwit-ingest/src/ingest_v2/ingester.rs
index df51758a4ca..b77cefa7f38 100644
--- a/quickwit/quickwit-ingest/src/ingest_v2/ingester.rs
+++ b/quickwit/quickwit-ingest/src/ingest_v2/ingester.rs
@@ -131,7 +131,12 @@ impl Ingester {
         idle_shard_timeout: Duration,
     ) -> IngestV2Result<Self> {
         let self_node_id: NodeId = cluster.self_node_id().into();
-        let state = IngesterState::load(wal_dir_path, disk_capacity, rate_limiter_settings);
+        let state = IngesterState::load(
+            wal_dir_path,
+            disk_capacity,
+            memory_capacity,
+            rate_limiter_settings,
+        );
 
         let weak_state = state.weak();
         BroadcastLocalShardsTask::spawn(cluster.clone(), weak_state.clone());
@@ -784,10 +789,12 @@ impl Ingester {
         }
         let wal_usage = state_guard.mrecordlog.resource_usage();
         let disk_used = wal_usage.disk_used_bytes as u64;
+        let memory_used = wal_usage.memory_used_bytes as u64;
         let (open_shard_counts, closed_shards) = state_guard.get_shard_snapshot();
         let capacity_score = state_guard
-            .wal_capacity_time_series
-            .score(ByteSize::b(disk_used)) as u32;
+            .wal_capacity_tracker
+            .score(ByteSize::b(disk_used), ByteSize::b(memory_used))
+            as u32;
         drop(state_guard);
 
         if disk_used >= self.disk_capacity.as_u64() * 90 / 100 {
diff --git a/quickwit/quickwit-ingest/src/ingest_v2/metrics.rs b/quickwit/quickwit-ingest/src/ingest_v2/metrics.rs
index 1fb32c0b2fd..87975a3c462 100644
--- a/quickwit/quickwit-ingest/src/ingest_v2/metrics.rs
+++ b/quickwit/quickwit-ingest/src/ingest_v2/metrics.rs
@@ -82,12 +82,20 @@ pub(super) struct IngestV2Metrics {
     pub wal_disk_used_bytes: IntGauge,
     pub wal_memory_used_bytes: IntGauge,
     pub ingest_results: IngestResultMetrics,
+    pub ingest_attempts: IntCounterVec<1>,
 }
 
 impl Default for IngestV2Metrics {
     fn default() -> Self {
         Self {
             ingest_results: IngestResultMetrics::default(),
+            ingest_attempts: new_counter_vec::<1>(
+                "ingest_attempts",
+                "Number of routing attempts by AZ locality",
+                "ingest",
+                &[],
+                ["az_routing"],
+            ),
             reset_shards_operations_total: new_counter_vec(
                 "reset_shards_operations_total",
                 "Total number of reset shards operations performed.",
diff --git a/quickwit/quickwit-ingest/src/ingest_v2/mod.rs b/quickwit/quickwit-ingest/src/ingest_v2/mod.rs
index d5432936e58..151bf219fb7 100644
--- a/quickwit/quickwit-ingest/src/ingest_v2/mod.rs
+++ b/quickwit/quickwit-ingest/src/ingest_v2/mod.rs
@@ -22,15 +22,13 @@ mod metrics;
 mod models;
 mod mrecord;
 mod mrecordlog_utils;
-mod node_routing_table;
 mod publish_tracker;
 mod rate_meter;
 mod replication;
 mod router;
-#[allow(dead_code)]
 mod routing_table;
 mod state;
-mod wal_capacity_timeseries;
+mod wal_capacity_tracker;
 mod workbench;
 
 use std::collections::HashMap;
diff --git a/quickwit/quickwit-ingest/src/ingest_v2/node_routing_table.rs b/quickwit/quickwit-ingest/src/ingest_v2/node_routing_table.rs
deleted file mode 100644
index 68299358f77..00000000000
--- a/quickwit/quickwit-ingest/src/ingest_v2/node_routing_table.rs
+++ /dev/null
@@ -1,519 +0,0 @@
-// Copyright 2021-Present Datadog, Inc.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-use std::collections::{HashMap, HashSet};
-
-use itertools::Itertools;
-use quickwit_proto::ingest::Shard;
-use quickwit_proto::types::{IndexId, IndexUid, NodeId, SourceId};
-use rand::rng;
-use rand::seq::IndexedRandom;
-
-use crate::IngesterPool;
-
-/// A single ingester node's routing-relevant data for a specific (index, source) pair.
-/// Each entry is self-describing: it carries its own node_id, index_uid, and source_id
-/// so it can always be attributed back to a specific source on a specific node.
-#[derive(Debug, Clone)]
-pub(super) struct IngesterNode {
-    pub node_id: NodeId,
-    pub index_uid: IndexUid,
-    #[allow(unused)]
-    pub source_id: SourceId,
-    /// Score from 0-10. Higher means more available capacity.
-    pub capacity_score: usize,
-    /// Number of open shards on this node for this (index, source) pair. Tiebreaker for power of
-    /// two choices comparison - we favor a node with more open shards.
-    pub open_shard_count: usize,
-}
-
-#[derive(Debug, Default)]
-pub(super) struct RoutingEntry {
-    pub nodes: HashMap<NodeId, IngesterNode>,
-}
-
-/// Given a slice of candidates, picks the better of two random choices.
-/// Higher capacity_score wins; tiebreak on more open_shard_count (more landing spots).
-fn power_of_two_choices<'a>(candidates: &[&'a IngesterNode]) -> &'a IngesterNode {
-    debug_assert!(candidates.len() >= 2);
-    let mut iter = candidates.choose_multiple(&mut rng(), 2);
-    let (&a, &b) = (iter.next().unwrap(), iter.next().unwrap());
-
-    if (a.capacity_score, a.open_shard_count) >= (b.capacity_score, b.open_shard_count) {
-        a
-    } else {
-        b
-    }
-}
-
-fn pick_from(candidates: Vec<&IngesterNode>) -> Option<&IngesterNode> {
-    match candidates.len() {
-        0 => None,
-        1 => Some(candidates[0]),
-        _ => Some(power_of_two_choices(&candidates)),
-    }
-}
-
-impl RoutingEntry {
-    /// Pick an ingester node to persist the request to. Uses power of two choices based on reported
-    /// ingester capacity, if more than one eligible node exists. Prefers nodes in the same
-    /// availability zone, falling back to remote nodes.
-    fn pick_node(
-        &self,
-        ingester_pool: &IngesterPool,
-        unavailable_leaders: &HashSet<NodeId>,
-        self_availability_zone: &Option<String>,
-    ) -> Option<&IngesterNode> {
-        let (local_ingesters, remote_ingesters): (Vec<&IngesterNode>, Vec<&IngesterNode>) = self
-            .nodes
-            .values()
-            .filter(|node| {
-                node.capacity_score > 0
-                    && node.open_shard_count > 0
-                    && ingester_pool.contains_key(&node.node_id)
-                    && !unavailable_leaders.contains(&node.node_id)
-            })
-            .partition(|node| {
-                let node_az = ingester_pool
-                    .get(&node.node_id)
-                    .and_then(|h| h.availability_zone);
-                node_az == *self_availability_zone
-            });
-
-        pick_from(local_ingesters).or_else(|| pick_from(remote_ingesters))
-    }
-}
-
-#[derive(Debug, Default)]
-pub(super) struct NodeBasedRoutingTable {
-    table: HashMap<(IndexId, SourceId), RoutingEntry>,
-    self_availability_zone: Option<String>,
-}
-
-impl NodeBasedRoutingTable {
-    pub fn new(self_availability_zone: Option<String>) -> Self {
-        Self {
-            self_availability_zone,
-            ..Default::default()
-        }
-    }
-
-    pub fn pick_node(
-        &self,
-        index_id: &str,
-        source_id: &str,
-        ingester_pool: &IngesterPool,
-        unavailable_leaders: &HashSet<NodeId>,
-    ) -> Option<&IngesterNode> {
-        let key = (index_id.to_string(), source_id.to_string());
-        let entry = self.table.get(&key)?;
-        entry.pick_node(
-            ingester_pool,
-            unavailable_leaders,
-            &self.self_availability_zone,
-        )
-    }
-
-    pub fn debug_info(
-        &self,
-        ingester_pool: &IngesterPool,
-    ) -> HashMap<IndexId, Vec<serde_json::Value>> {
-        let mut per_index: HashMap<IndexId, Vec<serde_json::Value>> = HashMap::new();
-        for ((index_id, source_id), entry) in &self.table {
-            for (node_id, node) in &entry.nodes {
-                let az = ingester_pool.get(node_id).and_then(|h| h.availability_zone);
-                per_index
-                    .entry(index_id.clone())
-                    .or_default()
-                    .push(serde_json::json!({
-                        "source_id": source_id,
-                        "node_id": node_id,
-                        "capacity_score": node.capacity_score,
-                        "open_shard_count": node.open_shard_count,
-                        "availability_zone": az,
-                    }));
-            }
-        }
-        per_index
-    }
-
-    pub fn has_open_nodes(
-        &self,
-        index_id: &str,
-        source_id: &str,
-        ingester_pool: &IngesterPool,
-        unavailable_leaders: &HashSet<NodeId>,
-    ) -> bool {
-        let key = (index_id.to_string(), source_id.to_string());
-        let Some(entry) = self.table.get(&key) else {
-            return false;
-        };
-        entry.nodes.values().any(|node| {
-            node.capacity_score > 0
-                && node.open_shard_count > 0
-                && ingester_pool.contains_key(&node.node_id)
-                && !unavailable_leaders.contains(&node.node_id)
-        })
-    }
-
-    /// Applies a capacity update from the IngesterCapacityScoreUpdate broadcast. This is the
-    /// primary way the table learns about node availability and capacity.
-    pub fn apply_capacity_update(
-        &mut self,
-        node_id: NodeId,
-        index_uid: IndexUid,
-        source_id: SourceId,
-        capacity_score: usize,
-        open_shard_count: usize,
-    ) {
-        let key = (index_uid.index_id.to_string(), source_id.clone());
-
-        let entry = self.table.entry(key).or_default();
-        let ingester_node = IngesterNode {
-            node_id: node_id.clone(),
-            index_uid,
-            source_id,
-            capacity_score,
-            open_shard_count,
-        };
-        entry.nodes.insert(node_id, ingester_node);
-    }
-
-    /// Merges routing updates from a GetOrCreateOpenShards control plane response into the
-    /// table. For existing nodes, updates their open shard count, including if the count is 0, from
-    /// the CP response while preserving capacity scores if they already exist.
-    /// New nodes get a default capacity_score of 5.
-    pub fn merge_from_shards(
-        &mut self,
-        index_uid: IndexUid,
-        source_id: SourceId,
-        shards: Vec<Shard>,
-    ) {
-        let per_leader_count: HashMap<NodeId, usize> = shards
-            .iter()
-            .map(|shard| {
-                let num_open_shards = shard.is_open() as usize;
-                let leader_id = NodeId::from(shard.leader_id.clone());
-                (leader_id, num_open_shards)
-            })
-            .into_grouping_map()
-            .sum();
-
-        let key = (index_uid.index_id.to_string(), source_id.clone());
-        let entry = self.table.entry(key).or_default();
-
-        for (node_id, open_shard_count) in per_leader_count {
-            entry
-                .nodes
-                .entry(node_id.clone())
-                .and_modify(|node| node.open_shard_count = open_shard_count)
-                .or_insert_with(|| IngesterNode {
-                    node_id,
-                    index_uid: index_uid.clone(),
-                    source_id: source_id.clone(),
-                    capacity_score: 5,
-                    open_shard_count,
-                });
-        }
-    }
-}
-
-#[cfg(test)]
-mod tests {
-    use quickwit_proto::ingest::ShardState;
-    use quickwit_proto::ingest::ingester::IngesterServiceClient;
-    use quickwit_proto::types::ShardId;
-
-    use super::*;
-    use crate::IngesterPoolEntry;
-
-    fn mocked_ingester(availability_zone: Option<&str>) -> IngesterPoolEntry {
-        IngesterPoolEntry {
-            client: IngesterServiceClient::mocked(),
-            availability_zone: availability_zone.map(|s| s.to_string()),
-        }
-    }
-
-    #[test]
-    fn test_apply_capacity_update() {
-        let mut table = NodeBasedRoutingTable::default();
-        let key = ("test-index".to_string(), "test-source".into());
-
-        // Insert first node.
-        table.apply_capacity_update(
-            "node-1".into(),
-            IndexUid::for_test("test-index", 0),
-            "test-source".into(),
-            8,
-            3,
-        );
-        let entry = table.table.get(&key).unwrap();
-        assert_eq!(entry.nodes.len(), 1);
-        assert_eq!(entry.nodes.get("node-1").unwrap().capacity_score, 8);
-
-        // Update existing node.
-        table.apply_capacity_update(
-            "node-1".into(),
-            IndexUid::for_test("test-index", 0),
-            "test-source".into(),
-            4,
-            5,
-        );
-        let node = table.table.get(&key).unwrap().nodes.get("node-1").unwrap();
-        assert_eq!(node.capacity_score, 4);
-        assert_eq!(node.open_shard_count, 5);
-
-        // Add second node.
-        table.apply_capacity_update(
-            "node-2".into(),
-            IndexUid::for_test("test-index", 0),
-            "test-source".into(),
-            6,
-            2,
-        );
-        assert_eq!(table.table.get(&key).unwrap().nodes.len(), 2);
-
-        // Zero shards: node stays in table but becomes ineligible for routing.
-        table.apply_capacity_update(
-            "node-1".into(),
-            IndexUid::for_test("test-index", 0),
-            "test-source".into(),
-            0,
-            0,
-        );
-        let entry = table.table.get(&key).unwrap();
-        assert_eq!(entry.nodes.len(), 2);
-        assert_eq!(entry.nodes.get("node-1").unwrap().open_shard_count, 0);
-        assert_eq!(entry.nodes.get("node-1").unwrap().capacity_score, 0);
-    }
-
-    #[test]
-    fn test_has_open_nodes() {
-        let mut table = NodeBasedRoutingTable::default();
-        let pool = IngesterPool::default();
-
-        // Empty table.
-        assert!(!table.has_open_nodes("test-index", "test-source", &pool, &HashSet::new()));
-
-        // Node exists but is not in pool.
-        table.apply_capacity_update(
-            "node-1".into(),
-            IndexUid::for_test("test-index", 0),
-            "test-source".into(),
-            8,
-            3,
-        );
-        assert!(!table.has_open_nodes("test-index", "test-source", &pool, &HashSet::new()));
-
-        // Node is in pool → true.
-        pool.insert("node-1".into(), mocked_ingester(None));
-        assert!(table.has_open_nodes("test-index", "test-source", &pool, &HashSet::new()));
-
-        // Node is unavailable → false.
-        let unavailable: HashSet<NodeId> = HashSet::from(["node-1".into()]);
-        assert!(!table.has_open_nodes("test-index", "test-source", &pool, &unavailable));
-
-        // Second node available → true despite first being unavailable.
-        table.apply_capacity_update(
-            "node-2".into(),
-            IndexUid::for_test("test-index", 0),
-            "test-source".into(),
-            6,
-            2,
-        );
-        pool.insert("node-2".into(), mocked_ingester(None));
-        assert!(table.has_open_nodes("test-index", "test-source", &pool, &unavailable));
-
-        // Node with capacity_score=0 is not eligible.
-        table.apply_capacity_update(
-            "node-2".into(),
-            IndexUid::for_test("test-index", 0),
-            "test-source".into(),
-            0,
-            2,
-        );
-        assert!(!table.has_open_nodes("test-index", "test-source", &pool, &unavailable));
-    }
-
-    #[test]
-    fn test_pick_node_prefers_same_az() {
-        let mut table = NodeBasedRoutingTable::new(Some("az-1".to_string()));
-        let pool = IngesterPool::default();
-
-        table.apply_capacity_update(
-            "node-1".into(),
-            IndexUid::for_test("test-index", 0),
-            "test-source".into(),
-            5,
-            1,
-        );
-        table.apply_capacity_update(
-            "node-2".into(),
-            IndexUid::for_test("test-index", 0),
-            "test-source".into(),
-            5,
-            1,
-        );
-        pool.insert("node-1".into(), mocked_ingester(Some("az-1")));
-        pool.insert("node-2".into(), mocked_ingester(Some("az-2")));
-
-        let picked = table
-            .pick_node("test-index", "test-source", &pool, &HashSet::new())
-            .unwrap();
-        assert_eq!(picked.node_id, NodeId::from("node-1"));
-    }
-
-    #[test]
-    fn test_pick_node_falls_back_to_cross_az() {
-        let mut table = NodeBasedRoutingTable::new(Some("az-1".to_string()));
-        let pool = IngesterPool::default();
-
-        table.apply_capacity_update(
-            "node-2".into(),
-            IndexUid::for_test("test-index", 0),
-            "test-source".into(),
-            5,
-            1,
-        );
-        pool.insert("node-2".into(), mocked_ingester(Some("az-2")));
-
-        let picked = table
-            .pick_node("test-index", "test-source", &pool, &HashSet::new())
-            .unwrap();
-        assert_eq!(picked.node_id, NodeId::from("node-2"));
-    }
-
-    #[test]
-    fn test_pick_node_no_az_awareness() {
-        let mut table = NodeBasedRoutingTable::default();
-        let pool = IngesterPool::default();
-
-        table.apply_capacity_update(
-            "node-1".into(),
-            IndexUid::for_test("test-index", 0),
-            "test-source".into(),
-            5,
-            1,
-        );
-        pool.insert("node-1".into(), mocked_ingester(Some("az-1")));
-
-        let picked = table
-            .pick_node("test-index", "test-source", &pool, &HashSet::new())
-            .unwrap();
-        assert_eq!(picked.node_id, NodeId::from("node-1"));
-    }
-
-    #[test]
-    fn test_pick_node_missing_entry() {
-        let table = NodeBasedRoutingTable::new(Some("az-1".to_string()));
-        let pool = IngesterPool::default();
-
-        assert!(
-            table
-                .pick_node("nonexistent", "source", &pool, &HashSet::new())
-                .is_none()
-        );
-    }
-
-    #[test]
-    fn test_power_of_two_choices() {
-        // 3 candidates: best appears in the random pair 2/3 of the time and always
-        // wins when it does, so it should win ~67% of 1000 runs. Asserting > 550
-        // is ~7.5 standard deviations from the mean — effectively impossible to flake.
-        let high = IngesterNode {
-            node_id: "high".into(),
-            index_uid: IndexUid::for_test("idx", 0),
-            source_id: "src".into(),
-            capacity_score: 9,
-            open_shard_count: 2,
-        };
-        let mid = IngesterNode {
-            node_id: "mid".into(),
-            index_uid: IndexUid::for_test("idx", 0),
-            source_id: "src".into(),
-            capacity_score: 5,
-            open_shard_count: 2,
-        };
-        let low = IngesterNode {
-            node_id: "low".into(),
-            index_uid: IndexUid::for_test("idx", 0),
-            source_id: "src".into(),
-            capacity_score: 1,
-            open_shard_count: 2,
-        };
-        let candidates: Vec<&IngesterNode> = vec![&high, &mid, &low];
-
-        let mut high_wins = 0;
-        for _ in 0..1000 {
-            if power_of_two_choices(&candidates).node_id == "high" {
-                high_wins += 1;
-            }
-        }
-        assert!(high_wins > 550, "high won only {high_wins}/1000 times");
-    }
-
-    #[test]
-    fn test_merge_from_shards() {
-        let mut table = NodeBasedRoutingTable::default();
-        let index_uid = IndexUid::for_test("test-index", 0);
-        let key = ("test-index".to_string(), "test-source".to_string());
-
-        let make_shard = |id: u64, leader: &str, open: bool| Shard {
-            index_uid: Some(index_uid.clone()),
-            source_id: "test-source".to_string(),
-            shard_id: Some(ShardId::from(id)),
-            shard_state: if open {
-                ShardState::Open as i32
-            } else {
-                ShardState::Closed as i32
-            },
-            leader_id: leader.to_string(),
-            ..Default::default()
-        };
-
-        // Two open shards on node-1, one open + one closed on node-2, only closed on node-3.
-        let shards = vec![
-            make_shard(1, "node-1", true),
-            make_shard(2, "node-1", true),
-            make_shard(3, "node-2", true),
-            make_shard(4, "node-2", false),
-            make_shard(5, "node-3", false),
-        ];
-        table.merge_from_shards(index_uid.clone(), "test-source".into(), shards);
-
-        let entry = table.table.get(&key).unwrap();
-        assert_eq!(entry.nodes.len(), 3);
-
-        let n1 = entry.nodes.get("node-1").unwrap();
-        assert_eq!(n1.open_shard_count, 2);
-        assert_eq!(n1.capacity_score, 5);
-
-        let n2 = entry.nodes.get("node-2").unwrap();
-        assert_eq!(n2.open_shard_count, 1);
-
-        let n3 = entry.nodes.get("node-3").unwrap();
-        assert_eq!(n3.open_shard_count, 0);
-
-        // Merging again adds new nodes but preserves existing ones.
-        let shards = vec![make_shard(10, "node-4", true)];
-        table.merge_from_shards(index_uid, "test-source".into(), shards);
-
-        let entry = table.table.get(&key).unwrap();
-        assert_eq!(entry.nodes.len(), 4);
-        assert!(entry.nodes.contains_key("node-1"));
-        assert!(entry.nodes.contains_key("node-2"));
-        assert!(entry.nodes.contains_key("node-3"));
-        assert!(entry.nodes.contains_key("node-4"));
-    }
-}
diff --git a/quickwit/quickwit-ingest/src/ingest_v2/router.rs b/quickwit/quickwit-ingest/src/ingest_v2/router.rs
index a1f5bf86302..46a476431f9 100644
--- a/quickwit/quickwit-ingest/src/ingest_v2/router.rs
+++ b/quickwit/quickwit-ingest/src/ingest_v2/router.rs
@@ -46,10 +46,11 @@ use super::debouncing::{
 };
 use super::ingester::PERSIST_REQUEST_TIMEOUT;
 use super::metrics::IngestResultMetrics;
-use super::node_routing_table::NodeBasedRoutingTable;
+use super::routing_table::RoutingTable;
 use super::workbench::IngestWorkbench;
 use super::{IngesterPool, pending_subrequests};
 use crate::get_ingest_router_buffer_size;
+use crate::ingest_v2::metrics::INGEST_V2_METRICS;
 
 /// Duration after which ingest requests time out with [`IngestV2Error::Timeout`].
 fn ingest_request_timeout() -> Duration {
@@ -101,7 +102,7 @@ struct RouterState {
     // Debounces `GetOrCreateOpenShardsRequest` requests to the control plane.
     debouncer: GetOrCreateOpenShardsRequestDebouncer,
     // Routing table of nodes, their WAL capacity, and the number of open shards per source.
-    node_routing_table: NodeBasedRoutingTable,
+    routing_table: RoutingTable,
 }
 
 impl fmt::Debug for IngestRouter {
@@ -124,7 +125,7 @@ impl IngestRouter {
     ) -> Self {
         let state = Arc::new(Mutex::new(RouterState {
             debouncer: GetOrCreateOpenShardsRequestDebouncer::default(),
-            node_routing_table: NodeBasedRoutingTable::new(self_availability_zone),
+            routing_table: RoutingTable::new(self_availability_zone),
         }));
         let ingest_semaphore_permits = get_ingest_router_buffer_size().as_u64() as usize;
         let ingest_semaphore = Arc::new(Semaphore::new(ingest_semaphore_permits));
@@ -160,7 +161,7 @@ impl IngestRouter {
         let mut state_guard = self.state.lock().await;
 
         for subrequest in pending_subrequests(&workbench.subworkbenches) {
-            if !state_guard.node_routing_table.has_open_nodes(
+            if !state_guard.routing_table.has_open_nodes(
                 &subrequest.index_id,
                 &subrequest.source_id,
                 ingester_pool,
@@ -251,7 +252,7 @@ impl IngestRouter {
         let mut state_guard = self.state.lock().await;
 
         for success in response.successes {
-            state_guard.node_routing_table.merge_from_shards(
+            state_guard.routing_table.merge_from_shards(
                 success.index_uid().clone(),
                 success.source_id,
                 success.open_shards,
@@ -302,7 +303,7 @@ impl IngestRouter {
                         // opportunity to get a fresh routing update.
                         let mut state_guard = self.state.lock().await;
                         for shard_update in routing_update.source_shard_updates {
-                            state_guard.node_routing_table.apply_capacity_update(
+                            state_guard.routing_table.apply_capacity_update(
                                 leader_id.clone(),
                                 shard_update.index_uid().clone(),
                                 shard_update.source_id,
@@ -353,7 +354,7 @@ impl IngestRouter {
         let state_guard = self.state.lock().await;
 
         for subrequest in pending_subrequests(&workbench.subworkbenches) {
-            let ingester_node = state_guard.node_routing_table.pick_node(
+            let ingester_node = state_guard.routing_table.pick_node(
                 &subrequest.index_id,
                 &subrequest.source_id,
                 &self.ingester_pool,
@@ -367,6 +368,13 @@ impl IngestRouter {
                     continue;
                 }
             };
+            let az_locality = state_guard
+                .routing_table
+                .classify_az_locality(&ingester_node.node_id, &self.ingester_pool);
+            INGEST_V2_METRICS
+                .ingest_attempts
+                .with_label_values([az_locality])
+                .inc();
             let persist_subrequest = PersistSubrequest {
                 subrequest_id: subrequest.subrequest_id,
                 index_uid: Some(ingester_node.index_uid.clone()),
@@ -474,9 +482,7 @@ impl IngestRouter {
 
     pub async fn debug_info(&self) -> JsonValue {
         let state_guard = self.state.lock().await;
-        let routing_table_json = state_guard
-            .node_routing_table
-            .debug_info(&self.ingester_pool);
+        let routing_table_json = state_guard.routing_table.debug_info(&self.ingester_pool);
 
         json!({
             "routing_table": routing_table_json,
@@ -486,8 +492,7 @@ impl IngestRouter {
 
 fn update_ingest_metrics(ingest_result: &IngestV2Result<IngestResponseV2>, num_subrequests: usize) {
     let num_subrequests = num_subrequests as u64;
-    let ingest_results_metrics: &IngestResultMetrics =
-        &crate::ingest_v2::metrics::INGEST_V2_METRICS.ingest_results;
+    let ingest_results_metrics: &IngestResultMetrics = &INGEST_V2_METRICS.ingest_results;
     match ingest_result {
         Ok(ingest_response) => {
             ingest_results_metrics
@@ -607,7 +612,7 @@ impl EventSubscriber<IngesterCapacityScoreUpdate> for WeakRouterState {
             return;
         };
         let mut state_guard = state.lock().await;
-        state_guard.node_routing_table.apply_capacity_update(
+        state_guard.routing_table.apply_capacity_update(
             update.node_id,
             update.source_uid.index_uid,
             update.source_uid.source_id,
@@ -674,7 +679,7 @@ mod tests {
 
         {
             let mut state_guard = router.state.lock().await;
-            state_guard.node_routing_table.apply_capacity_update(
+            state_guard.routing_table.apply_capacity_update(
                 "test-ingester-0".into(),
                 IndexUid::for_test("test-index-0", 0),
                 "test-source".to_string(),
@@ -1273,7 +1278,7 @@ mod tests {
         let index_uid_1: IndexUid = IndexUid::for_test("test-index-1", 0);
         {
             let mut state_guard = router.state.lock().await;
-            state_guard.node_routing_table.merge_from_shards(
+            state_guard.routing_table.merge_from_shards(
                 index_uid_0.clone(),
                 "test-source".to_string(),
                 vec![Shard {
@@ -1285,7 +1290,7 @@ mod tests {
                     ..Default::default()
                 }],
             );
-            state_guard.node_routing_table.merge_from_shards(
+            state_guard.routing_table.merge_from_shards(
                 index_uid_1.clone(),
                 "test-source".to_string(),
                 vec![Shard {
@@ -1420,7 +1425,7 @@ mod tests {
         let index_uid: IndexUid = IndexUid::for_test("test-index-0", 0);
         {
             let mut state_guard = router.state.lock().await;
-            state_guard.node_routing_table.merge_from_shards(
+            state_guard.routing_table.merge_from_shards(
                 index_uid.clone(),
                 "test-source".to_string(),
                 vec![Shard {
@@ -1529,7 +1534,7 @@ mod tests {
 
         {
             let mut state_guard = router.state.lock().await;
-            state_guard.node_routing_table.merge_from_shards(
+            state_guard.routing_table.merge_from_shards(
                 index_uid_0.clone(),
                 "test-source".to_string(),
                 vec![Shard {
@@ -1540,7 +1545,7 @@ mod tests {
                     ..Default::default()
                 }],
             );
-            state_guard.node_routing_table.merge_from_shards(
+            state_guard.routing_table.merge_from_shards(
                 index_uid_1.clone(),
                 "test-source".to_string(),
                 vec![Shard {
@@ -1584,7 +1589,7 @@ mod tests {
         let index_uid: IndexUid = IndexUid::for_test("test-index-0", 0);
         {
             let mut state_guard = router.state.lock().await;
-            state_guard.node_routing_table.merge_from_shards(
+            state_guard.routing_table.merge_from_shards(
                 index_uid.clone(),
                 "test-source".to_string(),
                 vec![Shard {
@@ -1689,7 +1694,7 @@ mod tests {
         ingester_pool.insert("test-ingester-0".into(), mocked_ingester());
         let state_guard = router.state.lock().await;
         let node = state_guard
-            .node_routing_table
+            .routing_table
             .pick_node("test-index", "test-source", &ingester_pool, &HashSet::new())
             .unwrap();
         assert_eq!(node.node_id, NodeId::from("test-ingester-0"));
@@ -1836,7 +1841,7 @@ mod tests {
         ingester_pool.insert("test-ingester-0".into(), mocked_ingester());
         let state_guard = router.state.lock().await;
         let node = state_guard
-            .node_routing_table
+            .routing_table
             .pick_node("test-index", "test-source", &ingester_pool, &HashSet::new())
             .unwrap();
         assert_eq!(node.node_id, NodeId::from("test-ingester-0"));
diff --git a/quickwit/quickwit-ingest/src/ingest_v2/routing_table.rs b/quickwit/quickwit-ingest/src/ingest_v2/routing_table.rs
index 4b4150d6e98..670822a6ae3 100644
--- a/quickwit/quickwit-ingest/src/ingest_v2/routing_table.rs
+++ b/quickwit/quickwit-ingest/src/ingest_v2/routing_table.rs
@@ -12,485 +12,238 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-use std::collections::hash_map::Entry;
 use std::collections::{HashMap, HashSet};
-use std::sync::atomic::{AtomicUsize, Ordering};
 
-use quickwit_proto::ingest::{Shard, ShardIds, ShardState};
-use quickwit_proto::types::{IndexId, IndexUid, NodeId, ShardId, SourceId};
-use serde_json::{Value as JsonValue, json};
-use tracing::{info, warn};
+use itertools::Itertools;
+use quickwit_proto::ingest::Shard;
+use quickwit_proto::types::{IndexId, IndexUid, NodeId, SourceId};
+use rand::rng;
+use rand::seq::IndexedRandom;
 
 use crate::IngesterPool;
 
-#[derive(Debug)]
-pub(super) struct RoutingEntry {
+/// A single ingester node's routing-relevant data for a specific (index, source) pair.
+/// Each entry is self-describing: it carries its own node_id, index_uid, and source_id
+/// so it can always be attributed back to a specific source on a specific node.
+#[derive(Debug, Clone)]
+pub(super) struct IngesterNode {
+    pub node_id: NodeId,
     pub index_uid: IndexUid,
+    #[allow(unused)]
     pub source_id: SourceId,
-    pub shard_id: ShardId,
-    pub shard_state: ShardState,
-    pub leader_id: NodeId,
+    /// Score from 0-10. Higher means more available capacity.
+    pub capacity_score: usize,
+    /// Number of open shards on this node for this (index, source) pair. Tiebreaker for power of
+    /// two choices comparison - we favor a node with more open shards.
+    pub open_shard_count: usize,
 }
 
-impl From<Shard> for RoutingEntry {
-    fn from(shard: Shard) -> Self {
-        let shard_id = shard.shard_id().clone();
-        let shard_state = shard.shard_state();
-        Self {
-            index_uid: shard.index_uid().clone(),
-            source_id: shard.source_id,
-            shard_id,
-            shard_state,
-            leader_id: shard.leader_id.into(),
-        }
-    }
+#[derive(Debug, Default)]
+pub(super) struct RoutingEntry {
+    pub nodes: HashMap<NodeId, IngesterNode>,
 }
 
-/// The set of shards the router is aware of for the given index and source.
-#[derive(Debug, Default)]
-pub(super) struct RoutingTableEntry {
-    /// Index UID of the shards.
-    pub index_uid: IndexUid,
-    /// Source ID of the shards.
-    pub source_id: SourceId,
-    /// Shards located on this node.
-    pub local_shards: Vec<RoutingEntry>,
-    pub local_round_robin_idx: AtomicUsize,
-    /// Shards located on remote nodes.
-    pub remote_shards: Vec<RoutingEntry>,
-    pub remote_round_robin_idx: AtomicUsize,
+/// Given a slice of candidates, picks the better of two random choices.
+/// Higher capacity_score wins; tiebreak on more open_shard_count (more landing spots).
+fn power_of_two_choices<'a>(candidates: &[&'a IngesterNode]) -> &'a IngesterNode {
+    debug_assert!(candidates.len() >= 2);
+    let mut iter = candidates.choose_multiple(&mut rng(), 2);
+    let (&a, &b) = (iter.next().unwrap(), iter.next().unwrap());
+
+    if (a.capacity_score, a.open_shard_count) >= (b.capacity_score, b.open_shard_count) {
+        a
+    } else {
+        b
+    }
 }
 
-impl RoutingTableEntry {
-    /// Creates a new entry and ensures that the shards are open, unique, and sorted by shard ID.
-    fn new(
-        self_node_id: &NodeId,
-        index_uid: IndexUid,
-        source_id: SourceId,
-        mut shards: Vec<Shard>,
-    ) -> Self {
-        let num_shards = shards.len();
+fn pick_from(candidates: Vec<&IngesterNode>) -> Option<&IngesterNode> {
+    match candidates.len() {
+        0 => None,
+        1 => Some(candidates[0]),
+        _ => Some(power_of_two_choices(&candidates)),
+    }
+}
 
-        shards.sort_unstable_by(|left, right| left.shard_id.cmp(&right.shard_id));
-        shards.dedup_by(|left, right| left.shard_id == right.shard_id);
+impl RoutingEntry {
+    /// Pick an ingester node to persist the request to. Uses power of two choices based on reported
+    /// ingester capacity, if more than one eligible node exists. Prefers nodes in the same
+    /// availability zone, falling back to remote nodes.
+    fn pick_node(
+        &self,
+        ingester_pool: &IngesterPool,
+        unavailable_leaders: &HashSet<NodeId>,
+        self_availability_zone: &Option<String>,
+    ) -> Option<&IngesterNode> {
+        let (local_ingesters, remote_ingesters): (Vec<&IngesterNode>, Vec<&IngesterNode>) = self
+            .nodes
+            .values()
+            .filter(|node| {
+                node.capacity_score > 0
+                    && node.open_shard_count > 0
+                    && ingester_pool.contains_key(&node.node_id)
+                    && !unavailable_leaders.contains(&node.node_id)
+            })
+            .partition(|node| {
+                let node_az = ingester_pool
+                    .get(&node.node_id)
+                    .and_then(|h| h.availability_zone);
+                node_az == *self_availability_zone
+            });
 
-        let (local_shards, remote_shards): (Vec<_>, Vec<_>) = shards
-            .into_iter()
-            .filter(|shard| shard.is_open())
-            .map(RoutingEntry::from)
-            .partition(|shard| *self_node_id == shard.leader_id);
+        pick_from(local_ingesters).or_else(|| pick_from(remote_ingesters))
+    }
+}
 
-        if num_shards > local_shards.len() + remote_shards.len() {
-            warn!("input shards should not contain closed shards or duplicates");
-        }
+#[derive(Debug, Default)]
+pub(super) struct RoutingTable {
+    table: HashMap<(IndexId, SourceId), RoutingEntry>,
+    self_availability_zone: Option<String>,
+}
 
+impl RoutingTable {
+    pub fn new(self_availability_zone: Option<String>) -> Self {
         Self {
-            index_uid,
-            source_id,
-            local_shards,
-            remote_shards,
+            self_availability_zone,
             ..Default::default()
         }
     }
 
-    fn empty(index_uid: IndexUid, source_id: SourceId) -> Self {
-        Self {
-            index_uid,
-            source_id,
-            ..Default::default()
-        }
-    }
-
-    /// Returns `true` if at least one shard in the table entry is open and has a leader available.
-    /// As it goes through the list of shards in the entry, it populates `closed_shard_ids` and
-    /// `unavailable_leaders` with the shard IDs of the closed shards and the node ID of the
-    /// unavailable ingesters encountered along the way.
-    pub fn has_open_shards(
+    pub fn pick_node(
         &self,
+        index_id: &str,
+        source_id: &str,
         ingester_pool: &IngesterPool,
-        closed_shard_ids: &mut Vec<ShardId>,
-        unavailable_leaders: &mut HashSet<NodeId>,
-    ) -> bool {
-        let shards = self.local_shards.iter().chain(self.remote_shards.iter());
-
-        for shard in shards {
-            match shard.shard_state {
-                ShardState::Closed => {
-                    closed_shard_ids.push(shard.shard_id.clone());
-                    continue;
-                }
-                ShardState::Unavailable | ShardState::Unspecified => {
-                    continue;
-                }
-                ShardState::Open => {
-                    if unavailable_leaders.contains(&shard.leader_id) {
-                        continue;
-                    }
-                    if ingester_pool.contains_key(&shard.leader_id) {
-                        return true;
-                    } else {
-                        let leader_id: NodeId = shard.leader_id.clone();
-                        unavailable_leaders.insert(leader_id);
-                    }
-                }
-            }
-        }
-        false
+        unavailable_leaders: &HashSet<NodeId>,
+    ) -> Option<&IngesterNode> {
+        let key = (index_id.to_string(), source_id.to_string());
+        let entry = self.table.get(&key)?;
+        entry.pick_node(
+            ingester_pool,
+            unavailable_leaders,
+            &self.self_availability_zone,
+        )
     }
 
-    /// Returns the next open and available shard in the table entry in a round-robin fashion.
-    pub fn next_open_shard_round_robin(
+    pub fn classify_az_locality(
         &self,
+        target_node_id: &NodeId,
         ingester_pool: &IngesterPool,
-        rate_limited_shards: &HashSet<ShardId>,
-    ) -> Result<&RoutingEntry, NextOpenShardError> {
-        let mut error = NextOpenShardError::NoShardsAvailable;
-
-        for (shards, round_robin_idx) in [
-            (&self.local_shards, &self.local_round_robin_idx),
-            (&self.remote_shards, &self.remote_round_robin_idx),
-        ] {
-            if shards.is_empty() {
-                continue;
-            }
-            for _attempt in 0..shards.len() {
-                let shard_idx = round_robin_idx.fetch_add(1, Ordering::Relaxed);
-                let shard_routing_entry: &RoutingEntry = &shards[shard_idx % shards.len()];
-
-                if !shard_routing_entry.shard_state.is_open() {
-                    continue;
-                }
-                if rate_limited_shards.contains(&shard_routing_entry.shard_id) {
-                    error = NextOpenShardError::RateLimited;
-                    continue;
-                }
-                if ingester_pool.contains_key(&shard_routing_entry.leader_id) {
-                    return Ok(shard_routing_entry);
-                }
-            }
-        }
-        Err(error)
-    }
-
-    /// Inserts the open shards the routing table is not aware of.
-    fn insert_open_shards(
-        &mut self,
-        self_node_id: &NodeId,
-        leader_id: &NodeId,
-        index_uid: &IndexUid,
-        shard_ids: &[ShardId],
-    ) {
-        match self.index_uid.cmp(index_uid) {
-            // If we receive an update for a new incarnation of the index, then we clear the entry
-            // and insert all the shards.
-            std::cmp::Ordering::Less => {
-                self.index_uid = index_uid.clone();
-                self.clear_shards();
-            }
-            // If we receive an update for a previous incarnation of the index, then we ignore it.
-            std::cmp::Ordering::Greater => {
-                return;
-            }
-            std::cmp::Ordering::Equal => {}
+    ) -> &'static str {
+        let Some(self_az) = &self.self_availability_zone else {
+            return "az_unaware";
         };
-        let target_shards = if self_node_id == leader_id {
-            &mut self.local_shards
-        } else {
-            &mut self.remote_shards
-        };
-        let mut num_inserted_shards = 0;
-        let num_target_shards = target_shards.len();
-
-        if num_target_shards == 0 {
-            target_shards.reserve(num_target_shards);
-            target_shards.extend(shard_ids.iter().map(|shard_id| RoutingEntry {
-                index_uid: self.index_uid.clone(),
-                source_id: self.source_id.clone(),
-                shard_id: shard_id.clone(),
-                shard_state: ShardState::Open,
-                leader_id: leader_id.clone(),
-            }));
-            num_inserted_shards = target_shards.len();
-        } else {
-            let shard_ids_range = target_shards[0].shard_id.clone()
-                ..=target_shards[num_target_shards - 1].shard_id.clone();
-
-            for shard_id in shard_ids {
-                // If we can't find the shard, then we insert it.
-                if shard_ids_range.contains(shard_id) {
-                    continue;
-                }
-                if target_shards[..num_target_shards]
-                    .binary_search_by(|shard| shard.shard_id.cmp(shard_id))
-                    .is_err()
-                {
-                    target_shards.push(RoutingEntry {
-                        index_uid: self.index_uid.clone(),
-                        source_id: self.source_id.clone(),
-                        shard_id: shard_id.clone(),
-                        shard_state: ShardState::Open,
-                        leader_id: leader_id.clone(),
-                    });
-                    num_inserted_shards += 1;
-                }
-            }
-        }
-        if num_inserted_shards > 0 {
-            target_shards.sort_unstable_by(|left, right| left.shard_id.cmp(&right.shard_id));
-
-            info!(
-                index_uid=%self.index_uid,
-                source_id=%self.source_id,
-                "inserted {num_inserted_shards} shards into routing table"
-            );
+        let target_az = ingester_pool
+            .get(target_node_id)
+            .and_then(|entry| entry.availability_zone);
+        match target_az {
+            Some(ref az) if az == self_az => "same_az",
+            Some(_) => "cross_az",
+            None => "az_unaware",
         }
     }
 
-    /// Clears local and remote shards.
-    fn clear_shards(&mut self) {
-        self.local_shards.clear();
-        self.local_round_robin_idx = AtomicUsize::default();
-        self.remote_shards.clear();
-        self.remote_round_robin_idx = AtomicUsize::default();
-    }
-
-    /// Closes the shards identified by their shard IDs.
-    fn close_shards(&mut self, index_uid: &IndexUid, shard_ids: &[ShardId]) {
-        // If the shard table was just recently updated with shards for a new index UID, then we can
-        // safely discard this request.
-        if self.index_uid != *index_uid {
-            return;
-        }
-        for shards in [&mut self.local_shards, &mut self.remote_shards] {
-            if shards.is_empty() {
-                continue;
-            }
-            let num_shards = shards.len();
-            let shard_ids_range =
-                shards[0].shard_id.clone()..=shards[num_shards - 1].shard_id.clone();
-
-            for shard_id in shard_ids {
-                if !shard_ids_range.contains(shard_id) {
-                    continue;
-                }
-                if let Ok(shard_idx) = shards.binary_search_by(|shard| shard.shard_id.cmp(shard_id))
-                {
-                    shards[shard_idx].shard_state = ShardState::Closed;
-                }
-            }
-        }
-    }
-
-    /// Shards the shards identified by their shard IDs.
-    fn delete_shards(&mut self, index_uid: &IndexUid, shard_ids: &[ShardId]) {
-        // If the shard table was just recently updated with shards for a new index UID, then we can
-        // safely discard this request.
-        if self.index_uid != *index_uid {
-            return;
-        }
-        for shards in [&mut self.local_shards, &mut self.remote_shards] {
-            if shards.is_empty() {
-                continue;
-            }
-            let num_shards = shards.len();
-            let shard_ids_range =
-                shards[0].shard_id.clone()..=shards[num_shards - 1].shard_id.clone();
-            let mut deleted_any = false;
-
-            for shard_id in shard_ids {
-                if !shard_ids_range.contains(shard_id) {
-                    continue;
-                }
-                if let Ok(shard_idx) = shards.binary_search_by(|shard| shard.shard_id.cmp(shard_id))
-                {
-                    // We use `Unspecified` as a tombstone.
-                    shards[shard_idx].shard_state = ShardState::Unspecified;
-                    deleted_any = true;
-                }
-            }
-            if deleted_any {
-                shards.retain(|shard| shard.shard_state != ShardState::Unspecified);
+    pub fn debug_info(
+        &self,
+        ingester_pool: &IngesterPool,
+    ) -> HashMap<IndexId, Vec<serde_json::Value>> {
+        let mut per_index: HashMap<IndexId, Vec<serde_json::Value>> = HashMap::new();
+        for ((index_id, source_id), entry) in &self.table {
+            for (node_id, node) in &entry.nodes {
+                let az = ingester_pool.get(node_id).and_then(|h| h.availability_zone);
+                per_index
+                    .entry(index_id.clone())
+                    .or_default()
+                    .push(serde_json::json!({
+                        "source_id": source_id,
+                        "node_id": node_id,
+                        "capacity_score": node.capacity_score,
+                        "open_shard_count": node.open_shard_count,
+                        "availability_zone": az,
+                    }));
             }
         }
+        per_index
     }
 
-    #[cfg(test)]
-    pub fn len(&self) -> usize {
-        self.local_shards.len() + self.remote_shards.len()
-    }
-
-    #[cfg(test)]
-    pub fn all_shards(&self) -> Vec<&RoutingEntry> {
-        let mut shards = Vec::with_capacity(self.len());
-        shards.extend(&self.local_shards);
-        shards.extend(&self.remote_shards);
-        shards
-    }
-}
-
-#[derive(Debug, PartialEq, Eq)]
-pub(super) enum NextOpenShardError {
-    NoShardsAvailable,
-    RateLimited,
-}
-
-/// Stores the list of shards the router is aware of for each index and source. The resolution from
-/// index and source to shards is performed using index ID (not index UID) and source ID.
-#[derive(Debug)]
-pub(super) struct RoutingTable {
-    pub self_node_id: NodeId,
-    pub table: HashMap<(IndexId, SourceId), RoutingTableEntry>,
-}
-
-impl RoutingTable {
-    pub fn find_entry(
+    pub fn has_open_nodes(
         &self,
-        index_id: impl Into<IndexId>,
-        source_id: impl Into<SourceId>,
-    ) -> Option<&RoutingTableEntry> {
-        let key = (index_id.into(), source_id.into());
-        self.table.get(&key)
-    }
-
-    /// Returns `true` if the router already knows about a shard for a given source that has
-    /// an available `leader`.
-    ///
-    /// If this function returns false, it populates the set of unavailable leaders and closed
-    /// shards. These will be joined to the GetOrCreate shard request emitted to the control
-    /// plane.
-    pub fn has_open_shards(
-        &self,
-        index_id: impl Into<IndexId>,
-        source_id: impl Into<SourceId>,
+        index_id: &str,
+        source_id: &str,
         ingester_pool: &IngesterPool,
-        closed_shards: &mut Vec<ShardIds>,
-        unavailable_leaders: &mut HashSet<NodeId>,
+        unavailable_leaders: &HashSet<NodeId>,
     ) -> bool {
-        let Some(entry) = self.find_entry(index_id, source_id) else {
+        let key = (index_id.to_string(), source_id.to_string());
+        let Some(entry) = self.table.get(&key) else {
             return false;
         };
-        let mut closed_shard_ids: Vec<ShardId> = Vec::new();
-
-        let result =
-            entry.has_open_shards(ingester_pool, &mut closed_shard_ids, unavailable_leaders);
-
-        if !closed_shard_ids.is_empty() {
-            closed_shards.push(ShardIds {
-                index_uid: entry.index_uid.clone().into(),
-                source_id: entry.source_id.clone(),
-                shard_ids: closed_shard_ids,
-            });
-        }
-        result
+        entry.nodes.values().any(|node| {
+            node.capacity_score > 0
+                && node.open_shard_count > 0
+                && ingester_pool.contains_key(&node.node_id)
+                && !unavailable_leaders.contains(&node.node_id)
+        })
     }
 
-    /// Replaces the routing table entry for the source with the provided shards.
-    pub fn replace_shards(
+    /// Applies a capacity update from the IngesterCapacityScoreUpdate broadcast. This is the
+    /// primary way the table learns about node availability and capacity.
+    pub fn apply_capacity_update(
         &mut self,
+        node_id: NodeId,
         index_uid: IndexUid,
-        source_id: impl Into<SourceId>,
-        shards: Vec<Shard>,
+        source_id: SourceId,
+        capacity_score: usize,
+        open_shard_count: usize,
     ) {
-        let index_id: IndexId = index_uid.index_id.to_string();
-        let source_id: SourceId = source_id.into();
-        let key = (index_id, source_id.clone());
-
-        match self.table.entry(key) {
-            Entry::Vacant(entry) => {
-                entry.insert(RoutingTableEntry::new(
-                    &self.self_node_id,
-                    index_uid,
-                    source_id,
-                    shards,
-                ));
-            }
-            Entry::Occupied(mut entry) => {
-                assert!(
-                    entry.get().index_uid <= index_uid,
-                    "new index incarnation should be greater or equal"
-                );
+        let key = (index_uid.index_id.to_string(), source_id.clone());
 
-                entry.insert(RoutingTableEntry::new(
-                    &self.self_node_id,
-                    index_uid,
-                    source_id,
-                    shards,
-                ));
-            }
+        let entry = self.table.entry(key).or_default();
+        let ingester_node = IngesterNode {
+            node_id: node_id.clone(),
+            index_uid,
+            source_id,
+            capacity_score,
+            open_shard_count,
         };
+        entry.nodes.insert(node_id, ingester_node);
     }
 
-    /// Inserts the shards the routing table is not aware of.
-    pub fn insert_open_shards(
+    /// Merges the shards of a GetOrCreateOpenShards control plane response into the table.
+    /// For each leader listed in `shards`, an existing node gets its open shard count
+    /// overwritten (possibly with 0) and keeps its capacity score; a leader not yet in the
+    /// table is inserted with a default capacity_score of 5. Other nodes are left untouched.
+    pub fn merge_from_shards(
         &mut self,
-        leader_id: &NodeId,
         index_uid: IndexUid,
-        source_id: impl Into<SourceId>,
-        shard_ids: &[ShardId],
-    ) {
-        let index_id: IndexId = index_uid.index_id.to_string();
-        let source_id: SourceId = source_id.into();
-        let key = (index_id, source_id.clone());
-
-        self.table
-            .entry(key.clone())
-            .or_insert_with(|| RoutingTableEntry::empty(index_uid.clone(), source_id))
-            .insert_open_shards(&self.self_node_id, leader_id, &index_uid, shard_ids);
-    }
-
-    /// Closes the targeted shards.
-    pub fn close_shards(
-        &mut self,
-        index_uid: &IndexUid,
-        source_id: impl Into<SourceId>,
-        shard_ids: &[ShardId],
-    ) {
-        let key = (index_uid.index_id.clone(), source_id.into());
-        if let Some(entry) = self.table.get_mut(&key) {
-            entry.close_shards(index_uid, shard_ids);
-        }
-    }
-
-    /// Deletes the targeted shards.
-    pub fn delete_shards(
-        &mut self,
-        index_uid: &IndexUid,
-        source_id: impl Into<SourceId>,
-        shard_ids: &[ShardId],
+        source_id: SourceId,
+        shards: Vec<Shard>,
     ) {
-        let key = (index_uid.index_id.clone(), source_id.into());
-        if let Some(entry) = self.table.get_mut(&key) {
-            entry.delete_shards(index_uid, shard_ids);
-        }
-    }
-
-    pub fn debug_info(&self) -> HashMap<IndexId, Vec<JsonValue>> {
-        let mut per_index_shards_json: HashMap<IndexId, Vec<JsonValue>> = HashMap::new();
-
-        for ((index_id, source_id), entry) in &self.table {
-            for (shards, is_local) in &[(&entry.local_shards, true), (&entry.remote_shards, false)]
-            {
-                let shards_json = shards.iter().map(|shard| {
-                    json!({
-                        "index_uid": shard.index_uid,
-                        "source_id": source_id,
-                        "shard_id": shard.shard_id,
-                        "shard_state": shard.shard_state.as_json_str_name(),
-                        "is_local": is_local,
-                    })
+        let per_leader_count: HashMap<NodeId, usize> = shards
+            .iter()
+            .map(|shard| {
+                let num_open_shards = shard.is_open() as usize;
+                let leader_id = NodeId::from(shard.leader_id.clone());
+                (leader_id, num_open_shards)
+            })
+            .into_grouping_map()
+            .sum();
+
+        let key = (index_uid.index_id.to_string(), source_id.clone());
+        let entry = self.table.entry(key).or_default();
+
+        for (node_id, open_shard_count) in per_leader_count {
+            entry
+                .nodes
+                .entry(node_id.clone())
+                .and_modify(|node| node.open_shard_count = open_shard_count)
+                .or_insert_with(|| IngesterNode {
+                    node_id,
+                    index_uid: index_uid.clone(),
+                    source_id: source_id.clone(),
+                    capacity_score: 5,
+                    open_shard_count,
                 });
-                per_index_shards_json
-                    .entry(index_id.clone())
-                    .or_default()
-                    .extend(shards_json);
-            }
         }
-        per_index_shards_json
-    }
-
-    #[cfg(test)]
-    pub fn len(&self) -> usize {
-        self.table.len()
     }
 }
 
@@ -498,624 +251,315 @@ impl RoutingTable {
 mod tests {
     use quickwit_proto::ingest::ShardState;
     use quickwit_proto::ingest::ingester::IngesterServiceClient;
+    use quickwit_proto::types::ShardId;
 
     use super::*;
+    use crate::IngesterPoolEntry;
+    /// Test helper: builds a pool entry with a mocked client and the given availability zone.
+    fn mocked_ingester(availability_zone: Option<&str>) -> IngesterPoolEntry {
+        IngesterPoolEntry {
+            client: IngesterServiceClient::mocked(),
+            availability_zone: availability_zone.map(|s| s.to_string()),
+        }
+    }
 
     #[test]
-    fn test_routing_table_entry_new() {
-        let self_node_id: NodeId = "test-node-0".into();
-        let index_uid = IndexUid::for_test("test-index", 0);
-        let source_id: SourceId = "test-source".into();
-        let table_entry = RoutingTableEntry::new(
-            &self_node_id,
-            index_uid.clone(),
-            source_id.clone(),
-            Vec::new(),
+    fn test_apply_capacity_update() {
+        let mut table = RoutingTable::default();
+        let key = ("test-index".to_string(), "test-source".into());
+
+        // Insert first node. The last two arguments are `capacity_score` and `open_shard_count`.
+        table.apply_capacity_update(
+            "node-1".into(),
+            IndexUid::for_test("test-index", 0),
+            "test-source".into(),
+            8,
+            3,
         );
-        assert_eq!(table_entry.len(), 0);
-
-        let index_uid: IndexUid = IndexUid::for_test("test-index", 0);
-        let shards = vec![
-            Shard {
-                index_uid: Some(index_uid.clone()),
-                source_id: "test-source".to_string(),
-                shard_id: Some(ShardId::from(3)),
-                shard_state: ShardState::Open as i32,
-                leader_id: "test-node-0".to_string(),
-                ..Default::default()
-            },
-            Shard {
-                index_uid: Some(index_uid.clone()),
-                source_id: "test-source".to_string(),
-                shard_id: Some(ShardId::from(1)),
-                shard_state: ShardState::Open as i32,
-                leader_id: "test-node-0".to_string(),
-                ..Default::default()
-            },
-            Shard {
-                index_uid: Some(index_uid.clone()),
-                source_id: "test-source".to_string(),
-                shard_id: Some(ShardId::from(2)),
-                shard_state: ShardState::Open as i32,
-                leader_id: "test-node-1".to_string(),
-                ..Default::default()
-            },
-            Shard {
-                index_uid: Some(index_uid.clone()),
-                source_id: "test-source".to_string(),
-                shard_id: Some(ShardId::from(1)),
-                shard_state: ShardState::Open as i32,
-                leader_id: "test-node-0".to_string(),
-                ..Default::default()
-            },
-            Shard {
-                index_uid: Some(index_uid.clone()),
-                source_id: "test-source".to_string(),
-                shard_id: Some(ShardId::from(4)),
-                shard_state: ShardState::Closed as i32,
-                leader_id: "test-node-0".to_string(),
-                ..Default::default()
-            },
-        ];
-        let table_entry = RoutingTableEntry::new(&self_node_id, index_uid, source_id, shards);
-        assert_eq!(table_entry.local_shards.len(), 2);
-        assert_eq!(table_entry.local_shards[0].shard_id, ShardId::from(1));
-        assert_eq!(table_entry.local_shards[1].shard_id, ShardId::from(3));
-
-        assert_eq!(table_entry.remote_shards.len(), 1);
-        assert_eq!(table_entry.remote_shards[0].shard_id, ShardId::from(2));
+        let entry = table.table.get(&key).unwrap();
+        assert_eq!(entry.nodes.len(), 1);
+        assert_eq!(entry.nodes.get("node-1").unwrap().capacity_score, 8);
+
+        // Update existing node: both the score and the shard count are overwritten.
+        table.apply_capacity_update(
+            "node-1".into(),
+            IndexUid::for_test("test-index", 0),
+            "test-source".into(),
+            4,
+            5,
+        );
+        let node = table.table.get(&key).unwrap().nodes.get("node-1").unwrap();
+        assert_eq!(node.capacity_score, 4);
+        assert_eq!(node.open_shard_count, 5);
+
+        // Add second node under the same (index, source) key.
+        table.apply_capacity_update(
+            "node-2".into(),
+            IndexUid::for_test("test-index", 0),
+            "test-source".into(),
+            6,
+            2,
+        );
+        assert_eq!(table.table.get(&key).unwrap().nodes.len(), 2);
+
+        // Zero score and zero shards: node stays in table but becomes ineligible for routing.
+        table.apply_capacity_update(
+            "node-1".into(),
+            IndexUid::for_test("test-index", 0),
+            "test-source".into(),
+            0,
+            0,
+        );
+        let entry = table.table.get(&key).unwrap();
+        assert_eq!(entry.nodes.len(), 2);
+        assert_eq!(entry.nodes.get("node-1").unwrap().open_shard_count, 0);
+        assert_eq!(entry.nodes.get("node-1").unwrap().capacity_score, 0);
+    }
 
     #[test]
-    fn test_routing_table_entry_has_open_shards() {
-        let index_uid = IndexUid::for_test("test-index", 0);
-        let source_id: SourceId = "test-source".into();
-        let table_entry = RoutingTableEntry::empty(index_uid.clone(), source_id.clone());
-
-        let mut closed_shard_ids = Vec::new();
-        let ingester_pool = IngesterPool::default();
-        let mut unavailable_leaders = HashSet::new();
-
-        assert!(!table_entry.has_open_shards(
-            &ingester_pool,
-            &mut closed_shard_ids,
-            &mut unavailable_leaders
-        ));
-        assert!(closed_shard_ids.is_empty());
-        assert!(unavailable_leaders.is_empty());
-
-        ingester_pool.insert(
-            "test-ingester-0".into(),
-            crate::IngesterPoolEntry {
-                client: IngesterServiceClient::mocked(),
-                availability_zone: None,
-            },
+    fn test_has_open_nodes() {
+        let mut table = RoutingTable::default();
+        let pool = IngesterPool::default();
+
+        // Empty table → false.
+        assert!(!table.has_open_nodes("test-index", "test-source", &pool, &HashSet::new()));
+
+        // Node is in the table but not in the pool → false.
+        table.apply_capacity_update(
+            "node-1".into(),
+            IndexUid::for_test("test-index", 0),
+            "test-source".into(),
+            8,
+            3,
         );
-        ingester_pool.insert(
-            "test-ingester-1".into(),
-            crate::IngesterPoolEntry {
-                client: IngesterServiceClient::mocked(),
-                availability_zone: None,
-            },
+        assert!(!table.has_open_nodes("test-index", "test-source", &pool, &HashSet::new()));
+
+        // Node is in the table and in the pool → true.
+        pool.insert("node-1".into(), mocked_ingester(None));
+        assert!(table.has_open_nodes("test-index", "test-source", &pool, &HashSet::new()));
+
+        // Node is listed in the unavailable set → false.
+        let unavailable: HashSet<NodeId> = HashSet::from(["node-1".into()]);
+        assert!(!table.has_open_nodes("test-index", "test-source", &pool, &unavailable));
+
+        // Second node is in the pool and not unavailable → true despite first being unavailable.
+        table.apply_capacity_update(
+            "node-2".into(),
+            IndexUid::for_test("test-index", 0),
+            "test-source".into(),
+            6,
+            2,
         );
-
-        let table_entry = RoutingTableEntry {
-            index_uid: index_uid.clone(),
-            source_id: source_id.clone(),
-            local_shards: vec![
-                RoutingEntry {
-                    index_uid: index_uid.clone(),
-                    source_id: "test-source".to_string(),
-                    shard_id: ShardId::from(1),
-                    shard_state: ShardState::Closed,
-                    leader_id: "test-ingester-0".into(),
-                },
-                RoutingEntry {
-                    index_uid: index_uid.clone(),
-                    source_id: "test-source".to_string(),
-                    shard_id: ShardId::from(2),
-                    shard_state: ShardState::Open,
-                    leader_id: "test-ingester-0".into(),
-                },
-            ],
-            local_round_robin_idx: AtomicUsize::default(),
-            remote_shards: Vec::new(),
-            remote_round_robin_idx: AtomicUsize::default(),
-        };
-        assert!(table_entry.has_open_shards(
-            &ingester_pool,
-            &mut closed_shard_ids,
-            &mut unavailable_leaders
-        ));
-        assert_eq!(closed_shard_ids.len(), 1);
-        assert_eq!(closed_shard_ids[0], ShardId::from(1));
-        assert!(unavailable_leaders.is_empty());
-
-        closed_shard_ids.clear();
-
-        let table_entry = RoutingTableEntry {
-            index_uid: index_uid.clone(),
-            source_id,
-            local_shards: Vec::new(),
-            local_round_robin_idx: AtomicUsize::default(),
-            remote_shards: vec![
-                RoutingEntry {
-                    index_uid: index_uid.clone(),
-                    source_id: "test-source".to_string(),
-                    shard_id: ShardId::from(1),
-                    shard_state: ShardState::Closed,
-                    leader_id: "test-ingester-1".into(),
-                },
-                RoutingEntry {
-                    index_uid: index_uid.clone(),
-                    source_id: "test-source".to_string(),
-                    shard_id: ShardId::from(2),
-                    shard_state: ShardState::Open,
-                    leader_id: "test-ingester-2".into(),
-                },
-                RoutingEntry {
-                    index_uid: index_uid.clone(),
-                    source_id: "test-source".to_string(),
-                    shard_id: ShardId::from(3),
-                    shard_state: ShardState::Open,
-                    leader_id: "test-ingester-1".into(),
-                },
-            ],
-            remote_round_robin_idx: AtomicUsize::default(),
-        };
-        assert!(table_entry.has_open_shards(
-            &ingester_pool,
-            &mut closed_shard_ids,
-            &mut unavailable_leaders
-        ));
-        assert_eq!(closed_shard_ids.len(), 1);
-        assert_eq!(closed_shard_ids[0], ShardId::from(1));
-        assert_eq!(unavailable_leaders.len(), 1);
-        assert!(unavailable_leaders.contains("test-ingester-2"));
+        pool.insert("node-2".into(), mocked_ingester(None));
+        assert!(table.has_open_nodes("test-index", "test-source", &pool, &unavailable));
+
+        // node-2 drops to capacity_score=0 while keeping 2 open shards → no eligible node left.
+        table.apply_capacity_update(
+            "node-2".into(),
+            IndexUid::for_test("test-index", 0),
+            "test-source".into(),
+            0,
+            2,
+        );
+        assert!(!table.has_open_nodes("test-index", "test-source", &pool, &unavailable));
     }
 
     #[test]
-    fn test_routing_table_entry_next_open_shard_round_robin() {
-        let index_uid = IndexUid::for_test("test-index", 0);
-        let source_id: SourceId = "test-source".into();
-        let table_entry = RoutingTableEntry::empty(index_uid.clone(), source_id.clone());
-        let ingester_pool = IngesterPool::default();
-        let mut rate_limited_shards = HashSet::new();
-
-        let error = table_entry
-            .next_open_shard_round_robin(&ingester_pool, &rate_limited_shards)
-            .unwrap_err();
-        assert_eq!(error, NextOpenShardError::NoShardsAvailable);
-
-        ingester_pool.insert(
-            "test-ingester-0".into(),
-            crate::IngesterPoolEntry {
-                client: IngesterServiceClient::mocked(),
-                availability_zone: None,
-            },
+    fn test_pick_node_prefers_same_az() {
+        let mut table = RoutingTable::new(Some("az-1".to_string()));
+        let pool = IngesterPool::default();
+        // Same score and shard count for both nodes; only node-1 shares the table's AZ (az-1).
+        table.apply_capacity_update(
+            "node-1".into(),
+            IndexUid::for_test("test-index", 0),
+            "test-source".into(),
+            5,
+            1,
         );
-        ingester_pool.insert(
-            "test-ingester-1".into(),
-            crate::IngesterPoolEntry {
-                client: IngesterServiceClient::mocked(),
-                availability_zone: None,
-            },
+        table.apply_capacity_update(
+            "node-2".into(),
+            IndexUid::for_test("test-index", 0),
+            "test-source".into(),
+            5,
+            1,
         );
+        pool.insert("node-1".into(), mocked_ingester(Some("az-1")));
+        pool.insert("node-2".into(), mocked_ingester(Some("az-2")));
 
-        let table_entry = RoutingTableEntry {
-            index_uid: index_uid.clone(),
-            source_id: source_id.clone(),
-            local_shards: vec![
-                RoutingEntry {
-                    index_uid: index_uid.clone(),
-                    source_id: "test-source".to_string(),
-                    shard_id: ShardId::from(1),
-                    shard_state: ShardState::Closed,
-                    leader_id: "test-ingester-0".into(),
-                },
-                RoutingEntry {
-                    index_uid: index_uid.clone(),
-                    source_id: "test-source".to_string(),
-                    shard_id: ShardId::from(2),
-                    shard_state: ShardState::Open,
-                    leader_id: "test-ingester-0".into(),
-                },
-                RoutingEntry {
-                    index_uid: index_uid.clone(),
-                    source_id: "test-source".to_string(),
-                    shard_id: ShardId::from(3),
-                    shard_state: ShardState::Open,
-                    leader_id: "test-ingester-0".into(),
-                },
-            ],
-            local_round_robin_idx: AtomicUsize::default(),
-            remote_shards: Vec::new(),
-            remote_round_robin_idx: AtomicUsize::default(),
-        };
-        let shard = table_entry
-            .next_open_shard_round_robin(&ingester_pool, &rate_limited_shards)
-            .unwrap();
-        assert_eq!(shard.shard_id, ShardId::from(2));
-
-        let shard = table_entry
-            .next_open_shard_round_robin(&ingester_pool, &rate_limited_shards)
-            .unwrap();
-        assert_eq!(shard.shard_id, ShardId::from(3));
-
-        let shard = table_entry
-            .next_open_shard_round_robin(&ingester_pool, &rate_limited_shards)
-            .unwrap();
-        assert_eq!(shard.shard_id, ShardId::from(2));
-
-        let table_entry = RoutingTableEntry {
-            index_uid: index_uid.clone(),
-            source_id: source_id.clone(),
-            local_shards: vec![RoutingEntry {
-                index_uid: index_uid.clone(),
-                source_id: "test-source".to_string(),
-                shard_id: ShardId::from(1),
-                shard_state: ShardState::Closed,
-                leader_id: "test-ingester-0".into(),
-            }],
-            local_round_robin_idx: AtomicUsize::default(),
-            remote_shards: vec![
-                RoutingEntry {
-                    index_uid: index_uid.clone(),
-                    source_id: "test-source".to_string(),
-                    shard_id: ShardId::from(2),
-                    shard_state: ShardState::Open,
-                    leader_id: "test-ingester-1".into(),
-                },
-                RoutingEntry {
-                    index_uid: index_uid.clone(),
-                    source_id: "test-source".to_string(),
-                    shard_id: ShardId::from(3),
-                    shard_state: ShardState::Closed,
-                    leader_id: "test-ingester-1".into(),
-                },
-                RoutingEntry {
-                    index_uid: index_uid.clone(),
-                    source_id: "test-source".to_string(),
-                    shard_id: ShardId::from(4),
-                    shard_state: ShardState::Open,
-                    leader_id: "test-ingester-2".into(),
-                },
-                RoutingEntry {
-                    index_uid: index_uid.clone(),
-                    source_id: "test-source".to_string(),
-                    shard_id: ShardId::from(5),
-                    shard_state: ShardState::Open,
-                    leader_id: "test-ingester-1".into(),
-                },
-            ],
-            remote_round_robin_idx: AtomicUsize::default(),
-        };
-        let shard = table_entry
-            .next_open_shard_round_robin(&ingester_pool, &rate_limited_shards)
-            .unwrap();
-        assert_eq!(shard.shard_id, ShardId::from(2));
-
-        let shard = table_entry
-            .next_open_shard_round_robin(&ingester_pool, &rate_limited_shards)
-            .unwrap();
-        assert_eq!(shard.shard_id, ShardId::from(5));
-
-        let shard = table_entry
-            .next_open_shard_round_robin(&ingester_pool, &rate_limited_shards)
-            .unwrap();
-        assert_eq!(shard.shard_id, ShardId::from(2));
-
-        rate_limited_shards.insert(ShardId::from(5));
-
-        let shard = table_entry
-            .next_open_shard_round_robin(&ingester_pool, &rate_limited_shards)
+        let picked = table
+            .pick_node("test-index", "test-source", &pool, &HashSet::new())
             .unwrap();
-        assert_eq!(shard.shard_id, ShardId::from(2));
+        assert_eq!(picked.node_id, NodeId::from("node-1"));
     }
 
     #[test]
-    fn test_routing_table_entry_next_open_shard_round_robin_rate_limited_error() {
-        let index_uid = IndexUid::for_test("test-index", 0);
-        let source_id: SourceId = "test-source".into();
-
-        let ingester_pool = IngesterPool::default();
-        ingester_pool.insert(
-            "test-ingester-0".into(),
-            crate::IngesterPoolEntry {
-                client: IngesterServiceClient::mocked(),
-                availability_zone: None,
-            },
+    fn test_pick_node_falls_back_to_cross_az() {
+        let mut table = RoutingTable::new(Some("az-1".to_string()));
+        let pool = IngesterPool::default();
+        // The only registered node is in az-2, while the table was created with az-1.
+        table.apply_capacity_update(
+            "node-2".into(),
+            IndexUid::for_test("test-index", 0),
+            "test-source".into(),
+            5,
+            1,
         );
+        pool.insert("node-2".into(), mocked_ingester(Some("az-2")));
 
-        let rate_limited_shards = HashSet::from_iter([ShardId::from(1)]);
-
-        let table_entry = RoutingTableEntry {
-            index_uid: index_uid.clone(),
-            source_id: source_id.clone(),
-            local_shards: vec![RoutingEntry {
-                index_uid: index_uid.clone(),
-                source_id: "test-source".to_string(),
-                shard_id: ShardId::from(1),
-                shard_state: ShardState::Open,
-                leader_id: "test-ingester-0".into(),
-            }],
-            local_round_robin_idx: AtomicUsize::default(),
-            remote_shards: Vec::new(),
-            remote_round_robin_idx: AtomicUsize::default(),
-        };
-        let error = table_entry
-            .next_open_shard_round_robin(&ingester_pool, &rate_limited_shards)
-            .unwrap_err();
-        assert_eq!(error, NextOpenShardError::RateLimited);
+        let picked = table
+            .pick_node("test-index", "test-source", &pool, &HashSet::new())
+            .unwrap();
+        assert_eq!(picked.node_id, NodeId::from("node-2"));
     }
 
     #[test]
-    fn test_routing_table_entry_insert_open_shards() {
-        let index_uid_0 = IndexUid::for_test("test-index", 0);
-        let source_id: SourceId = "test-source".into();
-        let mut table_entry = RoutingTableEntry::empty(index_uid_0.clone(), source_id.clone());
-
-        let local_node_id: NodeId = "test-ingester-0".into();
-        let remote_node_id: NodeId = "test-ingester-1".into();
-        table_entry.insert_open_shards(&local_node_id, &local_node_id, &index_uid_0, &[]);
-
-        assert_eq!(table_entry.local_shards.len(), 0);
-        assert_eq!(table_entry.remote_shards.len(), 0);
-
-        table_entry.insert_open_shards(
-            &local_node_id,
-            &local_node_id,
-            &index_uid_0,
-            &[ShardId::from(2)],
+    fn test_pick_node_no_az_awareness() {
+        let mut table = RoutingTable::default();
+        let pool = IngesterPool::default();
+        // Table built via `default()`, not `new(Some(az))`; the az-1 node is still picked.
+        table.apply_capacity_update(
+            "node-1".into(),
+            IndexUid::for_test("test-index", 0),
+            "test-source".into(),
+            5,
+            1,
         );
+        pool.insert("node-1".into(), mocked_ingester(Some("az-1")));
 
-        assert_eq!(table_entry.local_shards.len(), 1);
-        assert_eq!(table_entry.remote_shards.len(), 0);
-
-        assert_eq!(table_entry.local_shards[0].index_uid, index_uid_0);
-        assert_eq!(table_entry.local_shards[0].source_id, source_id);
-        assert_eq!(table_entry.local_shards[0].shard_id, ShardId::from(2));
-        assert_eq!(table_entry.local_shards[0].shard_state, ShardState::Open);
-        assert_eq!(table_entry.local_shards[0].leader_id, local_node_id);
+        let picked = table
+            .pick_node("test-index", "test-source", &pool, &HashSet::new())
+            .unwrap();
+        assert_eq!(picked.node_id, NodeId::from("node-1"));
+    }
 
-        table_entry.local_shards[0].shard_state = ShardState::Closed;
-        table_entry.insert_open_shards(
-            &local_node_id,
-            &local_node_id,
-            &index_uid_0,
-            &[ShardId::from(1), ShardId::from(2)],
+    #[test]
+    fn test_pick_node_missing_entry() {
+        let table = RoutingTable::new(Some("az-1".to_string()));
+        let pool = IngesterPool::default();
+        // Nothing was registered in the table, so the (index, source) key has no entry.
+        assert!(
+            table
+                .pick_node("nonexistent", "source", &pool, &HashSet::new())
+                .is_none()
         );
+    }
 
-        assert_eq!(table_entry.local_shards.len(), 2);
-        assert_eq!(table_entry.remote_shards.len(), 0);
-
-        assert_eq!(table_entry.local_shards[0].shard_id, ShardId::from(1));
-        assert_eq!(table_entry.local_shards[0].shard_state, ShardState::Open);
-        assert_eq!(table_entry.local_shards[1].shard_id, ShardId::from(2));
-        assert_eq!(table_entry.local_shards[1].shard_state, ShardState::Closed);
-
-        table_entry.local_shards.clear();
-        table_entry.insert_open_shards(
-            &local_node_id,
-            &remote_node_id,
-            &index_uid_0,
-            &[ShardId::from(2)],
-        );
+    #[test]
+    fn test_power_of_two_choices() {
+        // Assuming the pair is 2 distinct candidates drawn uniformly out of 3, the best one is
+        // drawn 2/3 of the time and wins whenever drawn: ~667 wins per 1000 runs, std dev ~14.9.
+        // Asserting > 550 leaves a margin of ~7.8 standard deviations, so flakes are negligible.
+        let high = IngesterNode {
+            node_id: "high".into(),
+            index_uid: IndexUid::for_test("idx", 0),
+            source_id: "src".into(),
+            capacity_score: 9,
+            open_shard_count: 2,
+        };
+        let mid = IngesterNode {
+            node_id: "mid".into(),
+            index_uid: IndexUid::for_test("idx", 0),
+            source_id: "src".into(),
+            capacity_score: 5,
+            open_shard_count: 2,
+        };
+        let low = IngesterNode {
+            node_id: "low".into(),
+            index_uid: IndexUid::for_test("idx", 0),
+            source_id: "src".into(),
+            capacity_score: 1,
+            open_shard_count: 2,
+        };
+        let candidates: Vec<&IngesterNode> = vec![&high, &mid, &low];
 
-        assert_eq!(table_entry.local_shards.len(), 0);
-        assert_eq!(table_entry.remote_shards.len(), 1);
+        let mut high_wins = 0;
+        for _ in 0..1000 {
+            if power_of_two_choices(&candidates).node_id == "high" {
+                high_wins += 1;
+            }
+        }
+        assert!(high_wins > 550, "high won only {high_wins}/1000 times");
+    }
 
-        assert_eq!(table_entry.remote_shards[0].index_uid, index_uid_0);
-        assert_eq!(table_entry.remote_shards[0].source_id, source_id);
-        assert_eq!(table_entry.remote_shards[0].shard_id, ShardId::from(2));
-        assert_eq!(table_entry.remote_shards[0].shard_state, ShardState::Open);
-        assert_eq!(table_entry.remote_shards[0].leader_id, remote_node_id);
+    #[test]
+    fn test_merge_from_shards() {
+        let mut table = RoutingTable::default();
+        let index_uid = IndexUid::for_test("test-index", 0);
+        let key = ("test-index".to_string(), "test-source".to_string());
+        // Shard factory: fixed index/source; the caller picks id, leader and open/closed state.
+        let make_shard = |id: u64, leader: &str, open: bool| Shard {
+            index_uid: Some(index_uid.clone()),
+            source_id: "test-source".to_string(),
+            shard_id: Some(ShardId::from(id)),
+            shard_state: if open {
+                ShardState::Open as i32
+            } else {
+                ShardState::Closed as i32
+            },
+            leader_id: leader.to_string(),
+            ..Default::default()
+        };
 
-        table_entry.remote_shards[0].shard_state = ShardState::Closed;
-        table_entry.insert_open_shards(
-            &local_node_id,
-            &remote_node_id,
-            &index_uid_0,
-            &[ShardId::from(1), ShardId::from(2)],
-        );
+        // Two open shards on node-1, one open + one closed on node-2, only closed on node-3.
+        let shards = vec![
+            make_shard(1, "node-1", true),
+            make_shard(2, "node-1", true),
+            make_shard(3, "node-2", true),
+            make_shard(4, "node-2", false),
+            make_shard(5, "node-3", false),
+        ];
+        table.merge_from_shards(index_uid.clone(), "test-source".into(), shards);
 
-        assert_eq!(table_entry.local_shards.len(), 0);
-        assert_eq!(table_entry.remote_shards.len(), 2);
+        let entry = table.table.get(&key).unwrap();
+        assert_eq!(entry.nodes.len(), 3);
 
-        assert_eq!(table_entry.remote_shards[0].shard_id, ShardId::from(1));
-        assert_eq!(table_entry.remote_shards[0].shard_state, ShardState::Open);
-        assert_eq!(table_entry.remote_shards[1].shard_id, ShardId::from(2));
-        assert_eq!(table_entry.remote_shards[1].shard_state, ShardState::Closed);
+        let n1 = entry.nodes.get("node-1").unwrap();
+        assert_eq!(n1.open_shard_count, 2);
+        assert_eq!(n1.capacity_score, 5); // Score given to nodes first seen via a merge.
 
-        // Update index incarnation.
-        let index_uid_1 = IndexUid::for_test("test-index", 1);
-        table_entry.insert_open_shards(
-            &local_node_id,
-            &local_node_id,
-            &index_uid_1,
-            &[ShardId::from(1)],
-        );
+        let n2 = entry.nodes.get("node-2").unwrap();
+        assert_eq!(n2.open_shard_count, 1);
 
-        assert_eq!(table_entry.index_uid, index_uid_1);
-        assert_eq!(table_entry.local_shards.len(), 1);
-        assert_eq!(table_entry.remote_shards.len(), 0);
+        let n3 = entry.nodes.get("node-3").unwrap();
+        assert_eq!(n3.open_shard_count, 0);
 
-        assert_eq!(table_entry.local_shards[0].index_uid, index_uid_1);
-        assert_eq!(table_entry.local_shards[0].source_id, source_id);
-        assert_eq!(table_entry.local_shards[0].shard_id, ShardId::from(1));
-        assert_eq!(table_entry.local_shards[0].shard_state, ShardState::Open);
-        assert_eq!(table_entry.local_shards[0].leader_id, local_node_id);
+        // A second merge adds node-4; nodes absent from its shard list are left in place.
+        let shards = vec![make_shard(10, "node-4", true)];
+        table.merge_from_shards(index_uid, "test-source".into(), shards);
 
-        // Ignore previous index incarnation.
-        table_entry.insert_open_shards(
-            &local_node_id,
-            &local_node_id,
-            &index_uid_0,
-            &[ShardId::from(12), ShardId::from(42), ShardId::from(1337)],
-        );
-        assert_eq!(table_entry.index_uid, index_uid_1);
-        assert_eq!(table_entry.local_shards.len(), 1);
-        assert_eq!(table_entry.remote_shards.len(), 0);
+        let entry = table.table.get(&key).unwrap();
+        assert_eq!(entry.nodes.len(), 4);
+        assert!(entry.nodes.contains_key("node-1"));
+        assert!(entry.nodes.contains_key("node-2"));
+        assert!(entry.nodes.contains_key("node-3"));
+        assert!(entry.nodes.contains_key("node-4"));
     }
 
     #[test]
-    fn test_routing_table_entry_close_shards() {
-        let index_uid = IndexUid::for_test("test-index", 0);
-        let source_id: SourceId = "test-source".into();
-
-        let mut table_entry = RoutingTableEntry::empty(index_uid.clone(), source_id.clone());
-        table_entry.close_shards(&index_uid, &[]);
-        table_entry.close_shards(&index_uid, &[ShardId::from(1)]);
-        assert!(table_entry.local_shards.is_empty());
-        assert!(table_entry.remote_shards.is_empty());
-
-        let mut table_entry = RoutingTableEntry {
-            index_uid: index_uid.clone(),
-            source_id: source_id.clone(),
-            local_shards: vec![
-                RoutingEntry {
-                    index_uid: index_uid.clone(),
-                    source_id: "test-source".to_string(),
-                    shard_id: ShardId::from(1),
-                    shard_state: ShardState::Open,
-                    leader_id: "test-ingester-0".into(),
-                },
-                RoutingEntry {
-                    index_uid: index_uid.clone(),
-                    source_id: "test-source".to_string(),
-                    shard_id: ShardId::from(2),
-                    shard_state: ShardState::Open,
-                    leader_id: "test-ingester-0".into(),
-                },
-                RoutingEntry {
-                    index_uid: index_uid.clone(),
-                    source_id: "test-source".to_string(),
-                    shard_id: ShardId::from(3),
-                    shard_state: ShardState::Open,
-                    leader_id: "test-ingester-0".into(),
-                },
-            ],
-            local_round_robin_idx: AtomicUsize::default(),
-            remote_shards: vec![
-                RoutingEntry {
-                    index_uid: index_uid.clone(),
-                    source_id: "test-source".to_string(),
-                    shard_id: ShardId::from(5),
-                    shard_state: ShardState::Open,
-                    leader_id: "test-ingester-1".into(),
-                },
-                RoutingEntry {
-                    index_uid: index_uid.clone(),
-                    source_id: "test-source".to_string(),
-                    shard_id: ShardId::from(6),
-                    shard_state: ShardState::Open,
-                    leader_id: "test-ingester-1".into(),
-                },
-                RoutingEntry {
-                    index_uid: index_uid.clone(),
-                    source_id: "test-source".to_string(),
-                    shard_id: ShardId::from(7),
-                    shard_state: ShardState::Open,
-                    leader_id: "test-ingester-1".into(),
-                },
-            ],
-            remote_round_robin_idx: AtomicUsize::default(),
-        };
-        table_entry.close_shards(
-            &index_uid,
-            &[
-                ShardId::from(1),
-                ShardId::from(3),
-                ShardId::from(4),
-                ShardId::from(6),
-                ShardId::from(8),
-            ],
+    fn test_classify_az_locality() {
+        let table = RoutingTable::new(Some("az-1".to_string()));
+        let pool = IngesterPool::default();
+        pool.insert("node-local".into(), mocked_ingester(Some("az-1")));
+        pool.insert("node-remote".into(), mocked_ingester(Some("az-2")));
+        pool.insert("node-no-az".into(), mocked_ingester(None));
+
+        assert_eq!(
+            table.classify_az_locality(&"node-local".into(), &pool),
+            "same_az"
+        );
+        assert_eq!(
+            table.classify_az_locality(&"node-remote".into(), &pool),
+            "cross_az"
+        );
+        assert_eq!(
+            table.classify_az_locality(&"node-no-az".into(), &pool),
+            "az_unaware"
         );
-        assert!(table_entry.local_shards[0].shard_state.is_closed());
-        assert!(table_entry.local_shards[1].shard_state.is_open());
-        assert!(table_entry.local_shards[2].shard_state.is_closed());
-        assert!(table_entry.remote_shards[0].shard_state.is_open());
-        assert!(table_entry.remote_shards[1].shard_state.is_closed());
-        assert!(table_entry.remote_shards[2].shard_state.is_open());
-    }
-
-    #[test]
-    fn test_routing_table_entry_delete_shards() {
-        let index_uid = IndexUid::for_test("test-index", 0);
-        let source_id: SourceId = "test-source".into();
-
-        let mut table_entry = RoutingTableEntry::empty(index_uid.clone(), source_id.clone());
-        table_entry.delete_shards(&index_uid, &[]);
-        table_entry.delete_shards(&index_uid, &[ShardId::from(1)]);
-        assert!(table_entry.local_shards.is_empty());
-        assert!(table_entry.remote_shards.is_empty());
 
-        let mut table_entry = RoutingTableEntry {
-            index_uid: index_uid.clone(),
-            source_id: source_id.clone(),
-            local_shards: vec![
-                RoutingEntry {
-                    index_uid: index_uid.clone(),
-                    source_id: "test-source".to_string(),
-                    shard_id: ShardId::from(1),
-                    shard_state: ShardState::Open,
-                    leader_id: "test-ingester-0".into(),
-                },
-                RoutingEntry {
-                    index_uid: index_uid.clone(),
-                    source_id: "test-source".to_string(),
-                    shard_id: ShardId::from(2),
-                    shard_state: ShardState::Open,
-                    leader_id: "test-ingester-0".into(),
-                },
-                RoutingEntry {
-                    index_uid: index_uid.clone(),
-                    source_id: "test-source".to_string(),
-                    shard_id: ShardId::from(3),
-                    shard_state: ShardState::Open,
-                    leader_id: "test-ingester-0".into(),
-                },
-            ],
-            local_round_robin_idx: AtomicUsize::default(),
-            remote_shards: vec![
-                RoutingEntry {
-                    index_uid: index_uid.clone(),
-                    source_id: "test-source".to_string(),
-                    shard_id: ShardId::from(5),
-                    shard_state: ShardState::Open,
-                    leader_id: "test-ingester-1".into(),
-                },
-                RoutingEntry {
-                    index_uid: index_uid.clone(),
-                    source_id: "test-source".to_string(),
-                    shard_id: ShardId::from(6),
-                    shard_state: ShardState::Open,
-                    leader_id: "test-ingester-1".into(),
-                },
-                RoutingEntry {
-                    index_uid: index_uid.clone(),
-                    source_id: "test-source".to_string(),
-                    shard_id: ShardId::from(7),
-                    shard_state: ShardState::Open,
-                    leader_id: "test-ingester-1".into(),
-                },
-            ],
-            remote_round_robin_idx: AtomicUsize::default(),
-        };
-        table_entry.delete_shards(
-            &index_uid,
-            &[
-                ShardId::from(1),
-                ShardId::from(3),
-                ShardId::from(4),
-                ShardId::from(6),
-                ShardId::from(8),
-            ],
+        let table_no_az = RoutingTable::default();
+        assert_eq!(
+            table_no_az.classify_az_locality(&"node-local".into(), &pool),
+            "az_unaware"
         );
-        assert_eq!(table_entry.local_shards.len(), 1);
-        assert_eq!(table_entry.local_shards[0].shard_id, ShardId::from(2));
-        assert_eq!(table_entry.remote_shards.len(), 2);
-        assert_eq!(table_entry.remote_shards[0].shard_id, ShardId::from(5));
-        assert_eq!(table_entry.remote_shards[1].shard_id, ShardId::from(7));
     }
 }
diff --git a/quickwit/quickwit-ingest/src/ingest_v2/state.rs b/quickwit/quickwit-ingest/src/ingest_v2/state.rs
index e158bce7c58..b77de8d608a 100644
--- a/quickwit/quickwit-ingest/src/ingest_v2/state.rs
+++ b/quickwit/quickwit-ingest/src/ingest_v2/state.rs
@@ -35,7 +35,7 @@ use tracing::{error, info};
 use super::models::IngesterShard;
 use super::rate_meter::RateMeter;
 use super::replication::{ReplicationStreamTaskHandle, ReplicationTaskHandle};
-use super::wal_capacity_timeseries::WalDiskCapacityTimeSeries;
+use super::wal_capacity_tracker::WalCapacityTracker;
 use crate::ingest_v2::mrecordlog_utils::{force_delete_queue, queue_position_range};
 use crate::mrecordlog_async::MultiRecordLogAsync;
 use crate::{FollowerId, LeaderId, OpenShardCounts};
@@ -61,7 +61,7 @@ pub(super) struct InnerIngesterState {
     pub replication_streams: HashMap<FollowerId, ReplicationStreamTaskHandle>,
     // Replication tasks running for each replication stream opened with leaders.
     pub replication_tasks: HashMap<LeaderId, ReplicationTaskHandle>,
-    pub wal_capacity_time_series: WalDiskCapacityTimeSeries,
+    pub wal_capacity_tracker: WalCapacityTracker,
     status: IngesterStatus,
     status_tx: watch::Sender<IngesterStatus>,
 }
@@ -130,7 +130,7 @@ impl InnerIngesterState {
 }
 
 impl IngesterState {
-    fn new(disk_capacity: ByteSize) -> Self {
+    fn new(disk_capacity: ByteSize, memory_capacity: ByteSize) -> Self {
         let status = IngesterStatus::Initializing;
         let (status_tx, status_rx) = watch::channel(status);
         let inner = InnerIngesterState {
@@ -138,7 +138,7 @@ impl IngesterState {
             doc_mappers: Default::default(),
             replication_streams: Default::default(),
             replication_tasks: Default::default(),
-            wal_capacity_time_series: WalDiskCapacityTimeSeries::new(disk_capacity),
+            wal_capacity_tracker: WalCapacityTracker::new(disk_capacity, memory_capacity),
             status,
             status_tx,
         };
@@ -155,9 +155,10 @@ impl IngesterState {
     pub fn load(
         wal_dir_path: &Path,
         disk_capacity: ByteSize,
+        memory_capacity: ByteSize,
         rate_limiter_settings: RateLimiterSettings,
     ) -> Self {
-        let state = Self::new(disk_capacity);
+        let state = Self::new(disk_capacity, memory_capacity);
         let state_clone = state.clone();
         let wal_dir_path = wal_dir_path.to_path_buf();
 
@@ -180,6 +181,7 @@ impl IngesterState {
         let mut state = IngesterState::load(
             temp_dir.path(),
             disk_capacity,
+            ByteSize::mb(256),
             RateLimiterSettings::default(),
         );
 
@@ -530,7 +532,7 @@ mod tests {
 
     #[tokio::test]
     async fn test_ingester_state_does_not_lock_while_initializing() {
-        let state = IngesterState::new(ByteSize::mb(256));
+        let state = IngesterState::new(ByteSize::mb(256), ByteSize::mb(256));
         let inner_guard = state.inner.lock().await;
 
         assert_eq!(inner_guard.status(), IngesterStatus::Initializing);
@@ -545,7 +547,7 @@ mod tests {
 
     #[tokio::test]
     async fn test_ingester_state_failed() {
-        let state = IngesterState::new(ByteSize::mb(256));
+        let state = IngesterState::new(ByteSize::mb(256), ByteSize::mb(256));
 
         state.inner.lock().await.set_status(IngesterStatus::Failed);
 
@@ -558,7 +560,7 @@ mod tests {
 
     #[tokio::test]
     async fn test_ingester_state_init() {
-        let mut state = IngesterState::new(ByteSize::mb(256));
+        let mut state = IngesterState::new(ByteSize::mb(256), ByteSize::mb(256));
         let temp_dir = tempfile::tempdir().unwrap();
 
         state
diff --git a/quickwit/quickwit-ingest/src/ingest_v2/wal_capacity_timeseries.rs b/quickwit/quickwit-ingest/src/ingest_v2/wal_capacity_tracker.rs
similarity index 76%
rename from quickwit/quickwit-ingest/src/ingest_v2/wal_capacity_timeseries.rs
rename to quickwit/quickwit-ingest/src/ingest_v2/wal_capacity_tracker.rs
index 58f030cbf74..f24e8254053 100644
--- a/quickwit/quickwit-ingest/src/ingest_v2/wal_capacity_timeseries.rs
+++ b/quickwit/quickwit-ingest/src/ingest_v2/wal_capacity_tracker.rs
@@ -25,38 +25,36 @@ const WAL_CAPACITY_LOOKBACK_WINDOW_LEN: usize = 6;
 /// reading would be discarded when the next reading is inserted.
 const WAL_CAPACITY_READINGS_LEN: usize = WAL_CAPACITY_LOOKBACK_WINDOW_LEN + 1;
 
-pub struct WalDiskCapacityTimeSeries {
-    disk_capacity: ByteSize,
+struct WalCapacityTimeSeries {
+    capacity: ByteSize,
     readings: RingBuffer<f64, WAL_CAPACITY_READINGS_LEN>,
 }
 
-impl WalDiskCapacityTimeSeries {
-    pub fn new(disk_capacity: ByteSize) -> Self {
+impl WalCapacityTimeSeries {
+    fn new(capacity: ByteSize) -> Self {
         #[cfg(not(test))]
-        assert!(disk_capacity.as_u64() > 0);
+        assert!(capacity.as_u64() > 0);
         Self {
-            disk_capacity,
+            capacity,
             readings: RingBuffer::default(),
         }
     }
 
-    /// Records a disk usage reading and returns the resulting capacity score.
-    pub fn record_and_score(&mut self, disk_used: ByteSize) -> usize {
-        self.record(disk_used);
+    fn record_and_score(&mut self, used: ByteSize) -> usize {
+        self.record(used);
         let remaining = self.current().unwrap_or(1.0);
         let delta = self.delta().unwrap_or(0.0);
         compute_capacity_score(remaining, delta)
     }
 
-    /// Computes a capacity score for the given disk usage without recording it.
-    pub fn score(&self, disk_used: ByteSize) -> usize {
-        let remaining = 1.0 - (disk_used.as_u64() as f64 / self.disk_capacity.as_u64() as f64);
+    fn score(&self, used: ByteSize) -> usize {
+        let remaining = 1.0 - (used.as_u64() as f64 / self.capacity.as_u64() as f64);
         let delta = self.delta().unwrap_or(0.0);
         compute_capacity_score(remaining, delta)
     }
 
-    fn record(&mut self, disk_used: ByteSize) {
-        let remaining = 1.0 - (disk_used.as_u64() as f64 / self.disk_capacity.as_u64() as f64);
+    fn record(&mut self, used: ByteSize) {
+        let remaining = 1.0 - (used.as_u64() as f64 / self.capacity.as_u64() as f64);
         self.readings.push_back(remaining.clamp(0.0, 1.0));
     }
 
@@ -64,8 +62,6 @@ impl WalDiskCapacityTimeSeries {
         self.readings.last()
     }
 
-    /// How much remaining capacity changed between the oldest and newest readings.
-    /// Positive = improving, negative = draining.
     fn delta(&self) -> Option<f64> {
         let current = self.readings.last()?;
         let oldest = self.readings.front()?;
@@ -73,6 +69,35 @@ impl WalDiskCapacityTimeSeries {
     }
 }
 
+/// Tracks WAL disk and memory capacity; its score is the lower of the two individual scores.
+pub struct WalCapacityTracker {
+    disk: WalCapacityTimeSeries,
+    memory: WalCapacityTimeSeries,
+}
+
+impl WalCapacityTracker {
+    pub fn new(disk_capacity: ByteSize, memory_capacity: ByteSize) -> Self {
+        let disk = WalCapacityTimeSeries::new(disk_capacity);
+        let memory = WalCapacityTimeSeries::new(memory_capacity);
+        Self { disk, memory }
+    }
+
+    /// Records the disk and memory usage readings, then returns the lower of the two
+    /// resulting capacity scores.
+    pub fn record_and_score(&mut self, disk_used: ByteSize, memory_used: ByteSize) -> usize {
+        let disk = self.disk.record_and_score(disk_used);
+        let memory = self.memory.record_and_score(memory_used);
+        std::cmp::min(disk, memory)
+    }
+
+    /// Scores the given disk and memory usage without recording it (lower of the two scores).
+    pub fn score(&self, disk_used: ByteSize, memory_used: ByteSize) -> usize {
+        let disk = self.disk.score(disk_used);
+        let memory = self.memory.score(memory_used);
+        std::cmp::min(disk, memory)
+    }
+}
+
 /// Computes a capacity score from 0 to 10 using a PD controller.
 ///
 /// The score has two components:
@@ -115,18 +140,18 @@ fn compute_capacity_score(remaining_capacity: f64, capacity_delta: f64) -> usize
 mod tests {
     use super::*;
 
-    fn ts() -> WalDiskCapacityTimeSeries {
-        WalDiskCapacityTimeSeries::new(ByteSize::b(100))
+    fn ts() -> WalCapacityTimeSeries {
+        WalCapacityTimeSeries::new(ByteSize::b(100))
     }
 
     /// Helper: record a reading with `used` bytes against the series' fixed capacity.
-    fn record(series: &mut WalDiskCapacityTimeSeries, used: u64) {
+    fn record(series: &mut WalCapacityTimeSeries, used: u64) {
         series.record(ByteSize::b(used));
     }
 
     #[test]
     fn test_wal_disk_capacity_current_after_record() {
-        let mut series = WalDiskCapacityTimeSeries::new(ByteSize::b(256));
+        let mut series = WalCapacityTimeSeries::new(ByteSize::b(256));
         // 192 of 256 used => 25% remaining
         series.record(ByteSize::b(192));
         assert_eq!(series.current(), Some(0.25));
@@ -211,4 +236,14 @@ mod tests {
         record(&mut series, 0);
         assert_eq!(series.delta(), Some(0.50));
     }
+
+    #[test]
+    fn test_wal_capacity_tracker_returns_min() {
+        let (disk_used, memory_used) = (ByteSize::b(10), ByteSize::b(90));
+        let mut tracker = WalCapacityTracker::new(ByteSize::b(100), ByteSize::b(100));
+        // With disk 10% used (score 9) and memory 90% used (score 2), the tracker must
+        // report the lower of the two scores.
+        let score = tracker.record_and_score(disk_used, memory_used);
+        assert_eq!(score, 2);
+    }
 }

From 056113e2a0f22414e3aa52f77a453edb7a7189f0 Mon Sep 17 00:00:00 2001
From: Nadav Gov-Ari <nadav.govari@datadoghq.com>
Date: Fri, 13 Mar 2026 15:51:21 -0400
Subject: [PATCH 8/9] PR comments

---
 .github/workflows/ci.yml                      |  1 -
 quickwit/quickwit-control-plane/Cargo.toml    |  1 +
 .../src/control_plane.rs                      | 15 +--
 .../src/ingest/ingest_controller.rs           | 66 ++++++-------
 .../src/source/ingest/mod.rs                  | 22 ++---
 .../src/ingest_v2/broadcast/capacity_score.rs |  3 +-
 .../quickwit-ingest/src/ingest_v2/fetch.rs    | 22 ++---
 .../quickwit-ingest/src/ingest_v2/ingester.rs | 92 +++++++++++++++++++
 quickwit/quickwit-ingest/src/ingest_v2/mod.rs | 20 ++++
 .../quickwit-ingest/src/ingest_v2/router.rs   | 18 +---
 .../protos/quickwit/ingester.proto            |  2 +
 quickwit/quickwit-serve/src/lib.rs            |  2 +
 12 files changed, 171 insertions(+), 93 deletions(-)

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 502950b5399..85093c255f6 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -56,7 +56,6 @@ jobs:
         run: |
           df -h
 
-
           if [ "$(df -BG / | awk 'NR==2 {gsub("G","",$4); print $4}')" -lt 30 ]; then
             echo "Less than 30GiB available. Running cleanup..."
             sudo rm -rf /usr/share/dotnet
diff --git a/quickwit/quickwit-control-plane/Cargo.toml b/quickwit/quickwit-control-plane/Cargo.toml
index 2957c9858c4..abaf672e908 100644
--- a/quickwit/quickwit-control-plane/Cargo.toml
+++ b/quickwit/quickwit-control-plane/Cargo.toml
@@ -47,6 +47,7 @@ quickwit-cluster = { workspace = true, features = ["testsuite"] }
 quickwit-common = { workspace = true, features = ["testsuite"] }
 quickwit-config = { workspace = true, features = ["testsuite"] }
 quickwit-indexing = { workspace = true }
+quickwit-ingest = { workspace = true, features = ["testsuite"] }
 quickwit-metastore = { workspace = true, features = ["testsuite"] }
 quickwit-proto = { workspace = true, features = ["testsuite"] }
 
diff --git a/quickwit/quickwit-control-plane/src/control_plane.rs b/quickwit/quickwit-control-plane/src/control_plane.rs
index 3603a2b3b7d..4e08b3a6b44 100644
--- a/quickwit/quickwit-control-plane/src/control_plane.rs
+++ b/quickwit/quickwit-control-plane/src/control_plane.rs
@@ -1216,13 +1216,6 @@ mod tests {
     use super::*;
     use crate::IndexerNodeInfo;
 
-    fn ingester_pool_entry(client: IngesterServiceClient) -> IngesterPoolEntry {
-        IngesterPoolEntry {
-            client,
-            status: IngesterStatus::Ready,
-            availability_zone: None,
-        }
-    }
     #[tokio::test]
     async fn test_control_plane_create_index() {
         let universe = Universe::with_accelerated_time();
@@ -2230,7 +2223,7 @@ mod tests {
                 assert!(&retain_shards_for_source.shard_ids.is_empty());
                 Ok(RetainShardsResponse {})
             });
-        let ingester = ingester_pool_entry(IngesterServiceClient::from_mock(mock_ingester));
+        let ingester = IngesterPoolEntry::ready_with_client(IngesterServiceClient::from_mock(mock_ingester));
         ingester_pool.insert("node1".into(), ingester);
 
         let cluster_config = ClusterConfig::for_test();
@@ -2276,7 +2269,7 @@ mod tests {
                 );
                 Ok(RetainShardsResponse {})
             });
-        let ingester = ingester_pool_entry(IngesterServiceClient::from_mock(mock_ingester));
+        let ingester = IngesterPoolEntry::ready_with_client(IngesterServiceClient::from_mock(mock_ingester));
         ingester_pool.insert("node1".into(), ingester);
 
         let mut index_0 = IndexMetadata::for_test("test-index-0", "ram:///test-index-0");
@@ -2651,7 +2644,7 @@ mod tests {
             };
             Ok(response)
         });
-        let ingester = ingester_pool_entry(IngesterServiceClient::from_mock(mock_ingester));
+        let ingester = IngesterPoolEntry::ready_with_client(IngesterServiceClient::from_mock(mock_ingester));
         ingester_pool.insert(ingester_id, ingester);
 
         let mut mock_metastore = MockMetastoreService::new();
@@ -2805,7 +2798,7 @@ mod tests {
             };
             Ok(response)
         });
-        let ingester = ingester_pool_entry(IngesterServiceClient::from_mock(mock_ingester));
+        let ingester = IngesterPoolEntry::ready_with_client(IngesterServiceClient::from_mock(mock_ingester));
         ingester_pool.insert(ingester_id, ingester);
 
         let mut mock_metastore = MockMetastoreService::new();
diff --git a/quickwit/quickwit-control-plane/src/ingest/ingest_controller.rs b/quickwit/quickwit-control-plane/src/ingest/ingest_controller.rs
index 85def904c0a..94c2e23c87f 100644
--- a/quickwit/quickwit-control-plane/src/ingest/ingest_controller.rs
+++ b/quickwit/quickwit-control-plane/src/ingest/ingest_controller.rs
@@ -325,17 +325,17 @@ impl IngestController {
     /// Syncs the ingester in a fire and forget manner.
     ///
     /// The returned oneshot is just here for unit test to wait for the operation to terminate.
-    fn sync_with_ingester(&self, ingester_node: &NodeId, model: &ControlPlaneModel) -> WaitHandle {
-        info!(ingester = %ingester_node, "sync_with_ingester");
+    fn sync_with_ingester(&self, ingester_id: &NodeId, model: &ControlPlaneModel) -> WaitHandle {
+        info!(ingester = %ingester_id, "sync_with_ingester");
         let (wait_drop_guard, wait_handle) = WaitHandle::new();
-        let Some(ingester) = self.ingester_pool.get(ingester_node) else {
+        let Some(ingester) = self.ingester_pool.get(ingester_id) else {
             // TODO: (Maybe) We should mark the ingester as unavailable, and stop advertise its
             // shard to routers.
-            warn!("failed to sync with ingester `{ingester_node}`: not available");
+            warn!("failed to sync with ingester `{ingester_id}`: not available");
             return wait_handle;
         };
         let mut retain_shards_req = RetainShardsRequest::default();
-        for (source_uid, shard_ids) in &*model.list_shards_for_node(ingester_node) {
+        for (source_uid, shard_ids) in &*model.list_shards_for_node(ingester_id) {
             let shards_for_source = RetainShardsForSource {
                 index_uid: Some(source_uid.index_uid.clone()),
                 source_id: source_uid.source_id.clone(),
@@ -345,8 +345,8 @@ impl IngestController {
                 .retain_shards_for_sources
                 .push(shards_for_source);
         }
-        info!(ingester = %ingester_node, "retain shards ingester");
-        let operation: String = format!("retain shards `{ingester_node}`");
+        info!(ingester = %ingester_id, "retain shards ingester");
+        let operation: String = format!("retain shards `{ingester_id}`");
         fire_and_forget(
             async move {
                 if let Err(retain_shards_err) =
@@ -1341,14 +1341,6 @@ mod tests {
 
     use super::*;
 
-    fn ingester_pool_entry(client: IngesterServiceClient) -> IngesterPoolEntry {
-        IngesterPoolEntry {
-            client,
-            status: IngesterStatus::Ready,
-            availability_zone: None,
-        }
-    }
-
     const TEST_SHARD_THROUGHPUT_LIMIT_MIB: f32 =
         DEFAULT_SHARD_THROUGHPUT_LIMIT.as_u64() as f32 / quickwit_common::shared_consts::MIB as f32;
 
@@ -1408,7 +1400,7 @@ mod tests {
         let ingester_pool = IngesterPool::default();
         ingester_pool.insert(
             NodeId::from("test-ingester-1"),
-            ingester_pool_entry(client.clone()),
+            IngesterPoolEntry::ready_with_client(client.clone()),
         );
 
         let mut mock_ingester = MockIngesterService::new();
@@ -1439,7 +1431,7 @@ mod tests {
         let ingester = IngesterServiceClient::from_mock(mock_ingester);
         ingester_pool.insert(
             NodeId::from("test-ingester-2"),
-            ingester_pool_entry(ingester.clone()),
+            IngesterPoolEntry::ready_with_client(ingester.clone()),
         );
 
         let replication_factor = 2;
@@ -1628,7 +1620,7 @@ mod tests {
         let ingester_pool = IngesterPool::default();
         ingester_pool.insert(
             NodeId::from("test-ingester-1"),
-            ingester_pool_entry(client.clone()),
+            IngesterPoolEntry::ready_with_client(client.clone()),
         );
 
         let replication_factor = 1;
@@ -1742,7 +1734,7 @@ mod tests {
 
         ingester_pool.insert(
             NodeId::from("test-ingester-1"),
-            ingester_pool_entry(IngesterServiceClient::mocked()),
+            IngesterPoolEntry::ready_with_client(IngesterServiceClient::mocked()),
         );
 
         let leader_follower_pairs_opt =
@@ -1754,7 +1746,7 @@ mod tests {
 
         ingester_pool.insert(
             "test-ingester-2".into(),
-            ingester_pool_entry(IngesterServiceClient::mocked()),
+            IngesterPoolEntry::ready_with_client(IngesterServiceClient::mocked()),
         );
 
         let leader_follower_pairs = controller
@@ -1876,7 +1868,7 @@ mod tests {
 
         ingester_pool.insert(
             "test-ingester-3".into(),
-            ingester_pool_entry(IngesterServiceClient::mocked()),
+            IngesterPoolEntry::ready_with_client(IngesterServiceClient::mocked()),
         );
         let unavailable_leaders = FnvHashSet::from_iter([NodeId::from("test-ingester-2")]);
         let leader_follower_pairs = controller
@@ -1970,7 +1962,7 @@ mod tests {
                 Ok(response)
             });
         let ingester_0 = IngesterServiceClient::from_mock(mock_ingester_0);
-        ingester_pool.insert(ingester_id_0, ingester_pool_entry(ingester_0));
+        ingester_pool.insert(ingester_id_0, IngesterPoolEntry::ready_with_client(ingester_0));
 
         let ingester_id_1 = NodeId::from("test-ingester-1");
         let mut mock_ingester_1 = MockIngesterService::new();
@@ -1991,7 +1983,7 @@ mod tests {
 
                 Err(IngestV2Error::Internal("internal error".to_string()))
             });
-        let ingester_1 = ingester_pool_entry(IngesterServiceClient::from_mock(mock_ingester_1));
+        let ingester_1 = IngesterPoolEntry::ready_with_client(IngesterServiceClient::from_mock(mock_ingester_1));
         ingester_pool.insert(ingester_id_1, ingester_1);
 
         let ingester_id_2 = NodeId::from("test-ingester-2");
@@ -2001,7 +1993,7 @@ mod tests {
         let client_2 = IngesterServiceClient::tower()
             .stack_init_shards_layer(DelayLayer::new(INIT_SHARDS_REQUEST_TIMEOUT * 2))
             .build_from_mock(mock_ingester_2);
-        ingester_pool.insert(ingester_id_2, ingester_pool_entry(client_2));
+        ingester_pool.insert(ingester_id_2, IngesterPoolEntry::ready_with_client(client_2));
 
         let init_shards_response = controller
             .init_shards(Vec::new(), &Progress::default())
@@ -2209,7 +2201,7 @@ mod tests {
 
         ingester_pool.insert(
             NodeId::from("test-ingester-1"),
-            ingester_pool_entry(IngesterServiceClient::from_mock(mock_ingester)),
+            IngesterPoolEntry::ready_with_client(IngesterServiceClient::from_mock(mock_ingester)),
         );
         let source_uids: HashMap<SourceUid, usize> = HashMap::from_iter([(source_uid.clone(), 1)]);
         let unavailable_leaders = FnvHashSet::default();
@@ -2379,7 +2371,7 @@ mod tests {
                     "failed to close shards".to_string(),
                 ))
             });
-        let ingester = ingester_pool_entry(IngesterServiceClient::from_mock(mock_ingester));
+        let ingester = IngesterPoolEntry::ready_with_client(IngesterServiceClient::from_mock(mock_ingester));
         ingester_pool.insert("test-ingester".into(), ingester);
 
         let shard_infos = BTreeSet::from_iter([
@@ -2532,7 +2524,7 @@ mod tests {
             },
         );
 
-        let ingester = ingester_pool_entry(IngesterServiceClient::from_mock(mock_ingester));
+        let ingester = IngesterPoolEntry::ready_with_client(IngesterServiceClient::from_mock(mock_ingester));
         ingester_pool.insert("test-ingester".into(), ingester);
 
         let shard_infos = BTreeSet::from_iter([ShardInfo {
@@ -2678,7 +2670,7 @@ mod tests {
                 };
                 Ok(response)
             });
-        let ingester = ingester_pool_entry(IngesterServiceClient::from_mock(mock_ingester));
+        let ingester = IngesterPoolEntry::ready_with_client(IngesterServiceClient::from_mock(mock_ingester));
         ingester_pool.insert("test-ingester".into(), ingester);
 
         // Test failed to open shards.
@@ -2800,7 +2792,7 @@ mod tests {
                 };
                 Ok(response)
             });
-        let ingester = ingester_pool_entry(IngesterServiceClient::from_mock(mock_ingester));
+        let ingester = IngesterPoolEntry::ready_with_client(IngesterServiceClient::from_mock(mock_ingester));
         ingester_pool.insert("test-ingester".into(), ingester);
 
         // Test failed to close shard.
@@ -3032,15 +3024,15 @@ mod tests {
             });
         ingester_pool.insert(
             "node-1".into(),
-            ingester_pool_entry(IngesterServiceClient::from_mock(mock_ingester_1)),
+            IngesterPoolEntry::ready_with_client(IngesterServiceClient::from_mock(mock_ingester_1)),
         );
         ingester_pool.insert(
             "node-2".into(),
-            ingester_pool_entry(IngesterServiceClient::from_mock(mock_ingester_2)),
+            IngesterPoolEntry::ready_with_client(IngesterServiceClient::from_mock(mock_ingester_2)),
         );
         ingester_pool.insert(
             "node-3".into(),
-            ingester_pool_entry(IngesterServiceClient::from_mock(mock_ingester_3)),
+            IngesterPoolEntry::ready_with_client(IngesterServiceClient::from_mock(mock_ingester_3)),
         );
         let node_id = "node-1".into();
         let wait_handle = controller.sync_with_ingester(&node_id, &model);
@@ -3170,7 +3162,7 @@ mod tests {
                 Ok(response)
             });
         let ingester_0 = IngesterServiceClient::from_mock(mock_ingester_0);
-        ingester_pool.insert(ingester_id_0.clone(), ingester_pool_entry(ingester_0));
+        ingester_pool.insert(ingester_id_0.clone(), IngesterPoolEntry::ready_with_client(ingester_0));
 
         let ingester_id_1 = NodeId::from("test-ingester-1");
         let mut mock_ingester_1 = MockIngesterService::new();
@@ -3188,7 +3180,7 @@ mod tests {
                 Err(IngestV2Error::Internal("internal error".to_string()))
             });
         let ingester_1 = IngesterServiceClient::from_mock(mock_ingester_1);
-        ingester_pool.insert(ingester_id_1.clone(), ingester_pool_entry(ingester_1));
+        ingester_pool.insert(ingester_id_1.clone(), IngesterPoolEntry::ready_with_client(ingester_1));
 
         let ingester_id_2 = NodeId::from("test-ingester-2");
         let mut mock_ingester_2 = MockIngesterService::new();
@@ -3197,7 +3189,7 @@ mod tests {
         let client_2 = IngesterServiceClient::tower()
             .stack_close_shards_layer(DelayLayer::new(CLOSE_SHARDS_REQUEST_TIMEOUT * 2))
             .build_from_mock(mock_ingester_2);
-        ingester_pool.insert(ingester_id_2.clone(), ingester_pool_entry(client_2));
+        ingester_pool.insert(ingester_id_2.clone(), IngesterPoolEntry::ready_with_client(client_2));
 
         // In this test:
         // - ingester 0 will close shard 0 successfully and fail to close shard 1;
@@ -3377,7 +3369,7 @@ mod tests {
                 Ok(response)
             });
         let ingester_0 = IngesterServiceClient::from_mock(mock_ingester_0);
-        ingester_pool.insert(ingester_id_0.clone(), ingester_pool_entry(ingester_0));
+        ingester_pool.insert(ingester_id_0.clone(), IngesterPoolEntry::ready_with_client(ingester_0));
 
         let ingester_id_1 = NodeId::from("test-ingester-1");
         let mut mock_ingester_1 = MockIngesterService::new();
@@ -3418,7 +3410,7 @@ mod tests {
             };
             Ok(response)
         });
-        let ingester_1 = ingester_pool_entry(IngesterServiceClient::from_mock(mock_ingester_1));
+        let ingester_1 = IngesterPoolEntry::ready_with_client(IngesterServiceClient::from_mock(mock_ingester_1));
         ingester_pool.insert(ingester_id_1.clone(), ingester_1);
 
         let num_opened_shards = controller
diff --git a/quickwit/quickwit-indexing/src/source/ingest/mod.rs b/quickwit/quickwit-indexing/src/source/ingest/mod.rs
index 4d5ae8179ca..2aa02c09963 100644
--- a/quickwit/quickwit-indexing/src/source/ingest/mod.rs
+++ b/quickwit/quickwit-indexing/src/source/ingest/mod.rs
@@ -690,14 +690,6 @@ mod tests {
     use crate::models::RawDocBatch;
     use crate::source::SourceActor;
 
-    fn ingester_pool_entry(client: IngesterServiceClient) -> IngesterPoolEntry {
-        IngesterPoolEntry {
-            client,
-            status: IngesterStatus::Ready,
-            availability_zone: None,
-        }
-    }
-
     // In this test, we simulate a source to which we sequentially assign the following set of
     // shards []
     // [1] (triggers a reset, and the creation of a publish lock)
@@ -940,7 +932,7 @@ mod tests {
                 Ok(response)
             });
 
-        let ingester_0 = ingester_pool_entry(IngesterServiceClient::from_mock(mock_ingester_0));
+        let ingester_0 = IngesterPoolEntry::ready_with_client(IngesterServiceClient::from_mock(mock_ingester_0));
         ingester_pool.insert("test-ingester-0".into(), ingester_0.clone());
 
         let event_broker = EventBroker::default();
@@ -1137,7 +1129,7 @@ mod tests {
                 Ok(response)
             });
 
-        let ingester_0 = ingester_pool_entry(IngesterServiceClient::from_mock(mock_ingester_0));
+        let ingester_0 = IngesterPoolEntry::ready_with_client(IngesterServiceClient::from_mock(mock_ingester_0));
         ingester_pool.insert("test-ingester-0".into(), ingester_0.clone());
 
         let event_broker = EventBroker::default();
@@ -1302,7 +1294,7 @@ mod tests {
                 Ok(response)
             });
 
-        let ingester_0 = ingester_pool_entry(IngesterServiceClient::from_mock(mock_ingester_0));
+        let ingester_0 = IngesterPoolEntry::ready_with_client(IngesterServiceClient::from_mock(mock_ingester_0));
         ingester_pool.insert("test-ingester-0".into(), ingester_0.clone());
 
         let event_broker = EventBroker::default();
@@ -1610,7 +1602,7 @@ mod tests {
                 })
             });
 
-        let ingester_0 = ingester_pool_entry(IngesterServiceClient::from_mock(mock_ingester_0));
+        let ingester_0 = IngesterPoolEntry::ready_with_client(IngesterServiceClient::from_mock(mock_ingester_0));
         ingester_pool.insert("test-ingester-0".into(), ingester_0.clone());
 
         let event_broker = EventBroker::default();
@@ -1710,7 +1702,7 @@ mod tests {
 
                 Ok(TruncateShardsResponse {})
             });
-        let ingester_0 = ingester_pool_entry(IngesterServiceClient::from_mock(mock_ingester_0));
+        let ingester_0 = IngesterPoolEntry::ready_with_client(IngesterServiceClient::from_mock(mock_ingester_0));
         ingester_pool.insert("test-ingester-0".into(), ingester_0.clone());
 
         let mut mock_ingester_1 = MockIngesterService::new();
@@ -1737,7 +1729,7 @@ mod tests {
 
                 Ok(TruncateShardsResponse {})
             });
-        let ingester_1 = ingester_pool_entry(IngesterServiceClient::from_mock(mock_ingester_1));
+        let ingester_1 = IngesterPoolEntry::ready_with_client(IngesterServiceClient::from_mock(mock_ingester_1));
         ingester_pool.insert("test-ingester-1".into(), ingester_1.clone());
 
         let mut mock_ingester_3 = MockIngesterService::new();
@@ -1757,7 +1749,7 @@ mod tests {
 
                 Ok(TruncateShardsResponse {})
             });
-        let ingester_3 = ingester_pool_entry(IngesterServiceClient::from_mock(mock_ingester_3));
+        let ingester_3 = IngesterPoolEntry::ready_with_client(IngesterServiceClient::from_mock(mock_ingester_3));
         ingester_pool.insert("test-ingester-3".into(), ingester_3.clone());
 
         let event_broker = EventBroker::default();
diff --git a/quickwit/quickwit-ingest/src/ingest_v2/broadcast/capacity_score.rs b/quickwit/quickwit-ingest/src/ingest_v2/broadcast/capacity_score.rs
index abaee5c7f5f..062539db797 100644
--- a/quickwit/quickwit-ingest/src/ingest_v2/broadcast/capacity_score.rs
+++ b/quickwit/quickwit-ingest/src/ingest_v2/broadcast/capacity_score.rs
@@ -66,7 +66,8 @@ impl BroadcastIngesterCapacityScoreTask {
         let mut guard = state
             .lock_fully()
             .await
-            .map_err(|_| anyhow::anyhow!("failed to acquire ingester state lock"))?;
+            .context("failed to acquire ingester state lock")?;
+
         let usage = guard.mrecordlog.resource_usage();
         let disk_used = ByteSize::b(usage.disk_used_bytes as u64);
         let memory_used = ByteSize::b(usage.memory_used_bytes as u64);
diff --git a/quickwit/quickwit-ingest/src/ingest_v2/fetch.rs b/quickwit/quickwit-ingest/src/ingest_v2/fetch.rs
index bed2c693a09..96970fc4e20 100644
--- a/quickwit/quickwit-ingest/src/ingest_v2/fetch.rs
+++ b/quickwit/quickwit-ingest/src/ingest_v2/fetch.rs
@@ -650,14 +650,6 @@ pub(super) mod tests {
         }
     }
 
-    fn ingester_pool_entry(client: IngesterServiceClient) -> IngesterPoolEntry {
-        IngesterPoolEntry {
-            client,
-            status: IngesterStatus::Ready,
-            availability_zone: None,
-        }
-    }
-
     #[tokio::test]
     async fn test_fetch_task_happy_path() {
         let tempdir = tempfile::tempdir().unwrap();
@@ -1339,7 +1331,7 @@ pub(super) mod tests {
 
                 Ok(service_stream_1)
             });
-        let ingester_1 = ingester_pool_entry(IngesterServiceClient::from_mock(mock_ingester_1));
+        let ingester_1 = IngesterPoolEntry::ready_with_client(IngesterServiceClient::from_mock(mock_ingester_1));
         ingester_pool.insert("test-ingester-1".into(), ingester_1);
 
         let fetch_payload = FetchPayload {
@@ -1438,7 +1430,7 @@ pub(super) mod tests {
                     "open fetch stream error".to_string(),
                 ))
             });
-        let ingester_0 = ingester_pool_entry(IngesterServiceClient::from_mock(mock_ingester_0));
+        let ingester_0 = IngesterPoolEntry::ready_with_client(IngesterServiceClient::from_mock(mock_ingester_0));
 
         let mut mock_ingester_1 = MockIngesterService::new();
         let index_uid_clone = index_uid.clone();
@@ -1453,7 +1445,7 @@ pub(super) mod tests {
 
                 Ok(service_stream_1)
             });
-        let ingester_1 = ingester_pool_entry(IngesterServiceClient::from_mock(mock_ingester_1));
+        let ingester_1 = IngesterPoolEntry::ready_with_client(IngesterServiceClient::from_mock(mock_ingester_1));
 
         ingester_pool.insert("test-ingester-0".into(), ingester_0);
         ingester_pool.insert("test-ingester-1".into(), ingester_1);
@@ -1553,7 +1545,7 @@ pub(super) mod tests {
 
                 Ok(service_stream_0)
             });
-        let ingester_0 = ingester_pool_entry(IngesterServiceClient::from_mock(mock_ingester_0));
+        let ingester_0 = IngesterPoolEntry::ready_with_client(IngesterServiceClient::from_mock(mock_ingester_0));
 
         let mut mock_ingester_1 = MockIngesterService::new();
         let index_uid_clone = index_uid.clone();
@@ -1568,7 +1560,7 @@ pub(super) mod tests {
 
                 Ok(service_stream_1)
             });
-        let ingester_1 = ingester_pool_entry(IngesterServiceClient::from_mock(mock_ingester_1));
+        let ingester_1 = IngesterPoolEntry::ready_with_client(IngesterServiceClient::from_mock(mock_ingester_1));
 
         ingester_pool.insert("test-ingester-0".into(), ingester_0);
         ingester_pool.insert("test-ingester-1".into(), ingester_1);
@@ -1671,7 +1663,7 @@ pub(super) mod tests {
                     shard_id: ShardId::from(1),
                 })
             });
-        let ingester_0 = ingester_pool_entry(IngesterServiceClient::from_mock(mock_ingester_0));
+        let ingester_0 = IngesterPoolEntry::ready_with_client(IngesterServiceClient::from_mock(mock_ingester_0));
         ingester_pool.insert("test-ingester-0".into(), ingester_0);
 
         fault_tolerant_fetch_stream(
@@ -1759,7 +1751,7 @@ pub(super) mod tests {
 
                 Ok(service_stream_2)
             });
-        let ingester = ingester_pool_entry(IngesterServiceClient::from_mock(mock_ingester));
+        let ingester = IngesterPoolEntry::ready_with_client(IngesterServiceClient::from_mock(mock_ingester));
 
         ingester_pool.insert("test-ingester".into(), ingester);
 
diff --git a/quickwit/quickwit-ingest/src/ingest_v2/ingester.rs b/quickwit/quickwit-ingest/src/ingest_v2/ingester.rs
index b01500398c0..5091784b885 100644
--- a/quickwit/quickwit-ingest/src/ingest_v2/ingester.rs
+++ b/quickwit/quickwit-ingest/src/ingest_v2/ingester.rs
@@ -2872,6 +2872,98 @@ mod tests {
             .assert_records_eq(&queue_id_01, .., &[]);
     }
 
+    // Checks the routing update attached to a persist response. Two shards are opened on the
+    // same ingester, one per index, and one document is persisted into each. The response must
+    // then carry a non-zero capacity score, one open-shard entry per index, and no closed
+    // shards.
+    #[tokio::test]
+    async fn test_ingester_persist_returns_routing_update() {
+        let (ingester_ctx, ingester) = IngesterForTest::default().build().await;
+        let leader_id = ingester_ctx.node_id.to_string();
+
+        let source_id = SourceId::from("test-source");
+        let index_uid_0 = IndexUid::for_test("test-index-0", 0);
+        let index_uid_1 = IndexUid::for_test("test-index-1", 0);
+
+        // Minimal doc mapping JSON: it only carries the UID referenced by both shards.
+        let doc_mapping_uid = DocMappingUid::random();
+        let doc_mapping_json = format!(
+            r#"{{
+                "doc_mapping_uid": "{doc_mapping_uid}"
+            }}"#
+        );
+
+        // The two shards are identical except for the index they belong to.
+        let init_subrequest = |subrequest_id, index_uid: &IndexUid| InitShardSubrequest {
+            subrequest_id,
+            shard: Some(Shard {
+                index_uid: Some(index_uid.clone()),
+                source_id: source_id.clone(),
+                shard_id: Some(ShardId::from(1)),
+                shard_state: ShardState::Open as i32,
+                leader_id: leader_id.clone(),
+                doc_mapping_uid: Some(doc_mapping_uid),
+                ..Default::default()
+            }),
+            doc_mapping_json: doc_mapping_json.clone(),
+            validate_docs: false,
+        };
+
+        let init_shards_request = InitShardsRequest {
+            subrequests: vec![
+                init_subrequest(0, &index_uid_0),
+                init_subrequest(1, &index_uid_1),
+            ],
+        };
+        ingester.init_shards(init_shards_request).await.unwrap();
+
+        // One single-document subrequest per index.
+        let persist_subrequest = |subrequest_id, index_uid: &IndexUid, doc: &'static str| {
+            PersistSubrequest {
+                subrequest_id,
+                index_uid: Some(index_uid.clone()),
+                source_id: source_id.clone(),
+                doc_batch: Some(DocBatchV2::for_test([doc])),
+            }
+        };
+
+        let persist_request = PersistRequest {
+            leader_id: leader_id.clone(),
+            commit_type: CommitTypeV2::Force as i32,
+            subrequests: vec![
+                persist_subrequest(0, &index_uid_0, r#"{"doc": "test-doc-010"}"#),
+                persist_subrequest(1, &index_uid_1, r#"{"doc": "test-doc-110"}"#),
+            ],
+        };
+        let persist_response = ingester.persist(persist_request).await.unwrap();
+        // Both subrequests are expected to succeed.
+        assert_eq!(persist_response.successes.len(), 2);
+
+        let routing_update = persist_response
+            .routing_update
+            .expect("routing update should be present");
+        // The persisted payload is tiny, so the capacity score is expected to stay above zero.
+        assert!(
+            routing_update.capacity_score > 0,
+            "capacity score should be non-zero after a small persist"
+        );
+
+        // Sort by index UID so the updates line up with `index_uid_0` and `index_uid_1`.
+        let mut source_shard_updates = routing_update.source_shard_updates;
+        source_shard_updates.sort_by(|left, right| left.index_uid().cmp(right.index_uid()));
+        assert_eq!(source_shard_updates.len(), 2);
+
+        let expected_index_uids = [&index_uid_0, &index_uid_1];
+        for (shard_update, index_uid) in source_shard_updates.iter().zip(expected_index_uids) {
+            assert_eq!(shard_update.index_uid(), index_uid);
+            assert_eq!(shard_update.source_id, source_id.as_str());
+            assert_eq!(shard_update.open_shard_count, 1);
+        }
+
+        // Nothing was closed during this test, so no closed shards should be reported.
+        assert!(routing_update.closed_shards.is_empty());
+    }
+
     #[tokio::test]
     async fn test_ingester_open_replication_stream() {
         let (_ingester_ctx, ingester) = IngesterForTest::default()
diff --git a/quickwit/quickwit-ingest/src/ingest_v2/mod.rs b/quickwit/quickwit-ingest/src/ingest_v2/mod.rs
index 9e5122abff3..a2b1095bdf3 100644
--- a/quickwit/quickwit-ingest/src/ingest_v2/mod.rs
+++ b/quickwit/quickwit-ingest/src/ingest_v2/mod.rs
@@ -73,6 +73,26 @@ pub struct IngesterPoolEntry {
     pub availability_zone: Option<String>,
 }
 
+impl IngesterPoolEntry {
+    /// Builds a pool entry wrapping `client`, with status [`IngesterStatus::Ready`] and no
+    /// availability zone. Only compiled for tests and the `testsuite` feature.
+    #[cfg(any(test, feature = "testsuite"))]
+    pub fn ready_with_client(client: IngesterServiceClient) -> Self {
+        Self {
+            client,
+            status: IngesterStatus::Ready,
+            availability_zone: None,
+        }
+    }
+
+    /// Builds a ready pool entry backed by [`IngesterServiceClient::mocked`].
+    /// Delegates to [`Self::ready_with_client`] so both helpers share the same defaults.
+    #[cfg(any(test, feature = "testsuite"))]
+    pub fn mocked_ingester() -> Self {
+        Self::ready_with_client(IngesterServiceClient::mocked())
+    }
+}
+
 pub type IngesterPool = Pool<NodeId, IngesterPoolEntry>;
 
 /// Identifies an ingester client, typically a source, for logging and debugging purposes.
diff --git a/quickwit/quickwit-ingest/src/ingest_v2/router.rs b/quickwit/quickwit-ingest/src/ingest_v2/router.rs
index 8ee5b9ea863..ad37032e550 100644
--- a/quickwit/quickwit-ingest/src/ingest_v2/router.rs
+++ b/quickwit/quickwit-ingest/src/ingest_v2/router.rs
@@ -647,14 +647,6 @@ mod tests {
     use crate::IngesterPoolEntry;
     use crate::ingest_v2::workbench::SubworkbenchFailure;
 
-    fn mocked_ingester() -> IngesterPoolEntry {
-        IngesterPoolEntry {
-            client: IngesterServiceClient::mocked(),
-            status: IngesterStatus::Ready,
-            availability_zone: None,
-        }
-    }
-
     #[tokio::test]
     async fn test_router_make_get_or_create_open_shard_request() {
         let self_node_id = "test-router".into();
@@ -743,7 +735,7 @@ mod tests {
         drop(rendezvous_1);
         drop(rendezvous_2);
 
-        ingester_pool.insert("test-ingester-0".into(), mocked_ingester());
+        ingester_pool.insert("test-ingester-0".into(), IngesterPoolEntry::mocked_ingester());
         {
             // Ingester-0 is in pool and in table, but marked unavailable on the workbench
             // (simulating a prior transport error). has_open_nodes returns false → both
@@ -1178,8 +1170,8 @@ mod tests {
         let control_plane = ControlPlaneServiceClient::from_mock(MockControlPlaneService::new());
 
         let ingester_pool = IngesterPool::default();
-        ingester_pool.insert("test-ingester-0".into(), mocked_ingester());
-        ingester_pool.insert("test-ingester-1".into(), mocked_ingester());
+        ingester_pool.insert("test-ingester-0".into(), IngesterPoolEntry::mocked_ingester());
+        ingester_pool.insert("test-ingester-1".into(), IngesterPoolEntry::mocked_ingester());
 
         let replication_factor = 1;
         let router = IngestRouter::new(
@@ -1696,7 +1688,7 @@ mod tests {
         // Give the async subscriber a moment to process.
         tokio::time::sleep(Duration::from_millis(10)).await;
 
-        ingester_pool.insert("test-ingester-0".into(), mocked_ingester());
+        ingester_pool.insert("test-ingester-0".into(), IngesterPoolEntry::mocked_ingester());
         let state_guard = router.state.lock().await;
         let node = state_guard
             .routing_table
@@ -1843,7 +1835,7 @@ mod tests {
             .process_persist_results(&mut workbench, persist_futures)
             .await;
 
-        ingester_pool.insert("test-ingester-0".into(), mocked_ingester());
+        ingester_pool.insert("test-ingester-0".into(), IngesterPoolEntry::mocked_ingester());
         let state_guard = router.state.lock().await;
         let node = state_guard
             .routing_table
diff --git a/quickwit/quickwit-proto/protos/quickwit/ingester.proto b/quickwit/quickwit-proto/protos/quickwit/ingester.proto
index 57568f46b8b..d48bc553a62 100644
--- a/quickwit/quickwit-proto/protos/quickwit/ingester.proto
+++ b/quickwit/quickwit-proto/protos/quickwit/ingester.proto
@@ -74,6 +74,7 @@ message PersistSubrequest {
   quickwit.common.IndexUid index_uid = 2;
   string source_id = 3;
   quickwit.ingest.DocBatchV2 doc_batch = 5;
+  reserved 4;
 }
 
 message PersistResponse {
@@ -119,6 +120,7 @@ message PersistFailure {
   quickwit.common.IndexUid index_uid = 2;
   string source_id = 3;
   PersistFailureReason reason = 5;
+  reserved 4;
 }
 
 message SynReplicationMessage {
diff --git a/quickwit/quickwit-serve/src/lib.rs b/quickwit/quickwit-serve/src/lib.rs
index d9b4bc0e3f2..3ad5bd72aa7 100644
--- a/quickwit/quickwit-serve/src/lib.rs
+++ b/quickwit/quickwit-serve/src/lib.rs
@@ -1013,6 +1013,8 @@ fn setup_ingester_pool(
                     );
                     Some(change)
                 }
+                // Only update the ingester pool when the ingester status changes, to avoid
+                // unnecessary churn.
                 ClusterChange::Update { previous, updated }
                     if updated.is_indexer()
                         && previous.ingester_status() != updated.ingester_status() =>

From 1f17dfd0bbf58963024eb4695bff4edd6c6bab78 Mon Sep 17 00:00:00 2001
From: Nadav Gov-Ari <nadav.govari@datadoghq.com>
Date: Mon, 16 Mar 2026 11:16:04 -0400
Subject: [PATCH 9/9] lints

---
 .../src/control_plane.rs                      | 12 +++--
 .../src/ingest/ingest_controller.rs           | 48 ++++++++++++++-----
 .../src/source/ingest/mod.rs                  | 24 ++++++----
 .../quickwit-ingest/src/ingest_v2/fetch.rs    | 25 ++++++----
 .../quickwit-ingest/src/ingest_v2/router.rs   | 25 ++++++++--
 5 files changed, 94 insertions(+), 40 deletions(-)

diff --git a/quickwit/quickwit-control-plane/src/control_plane.rs b/quickwit/quickwit-control-plane/src/control_plane.rs
index 4e08b3a6b44..1056aba6eb8 100644
--- a/quickwit/quickwit-control-plane/src/control_plane.rs
+++ b/quickwit/quickwit-control-plane/src/control_plane.rs
@@ -2223,7 +2223,8 @@ mod tests {
                 assert!(&retain_shards_for_source.shard_ids.is_empty());
                 Ok(RetainShardsResponse {})
             });
-        let ingester = IngesterPoolEntry::ready_with_client(IngesterServiceClient::from_mock(mock_ingester));
+        let ingester =
+            IngesterPoolEntry::ready_with_client(IngesterServiceClient::from_mock(mock_ingester));
         ingester_pool.insert("node1".into(), ingester);
 
         let cluster_config = ClusterConfig::for_test();
@@ -2269,7 +2270,8 @@ mod tests {
                 );
                 Ok(RetainShardsResponse {})
             });
-        let ingester = IngesterPoolEntry::ready_with_client(IngesterServiceClient::from_mock(mock_ingester));
+        let ingester =
+            IngesterPoolEntry::ready_with_client(IngesterServiceClient::from_mock(mock_ingester));
         ingester_pool.insert("node1".into(), ingester);
 
         let mut index_0 = IndexMetadata::for_test("test-index-0", "ram:///test-index-0");
@@ -2644,7 +2646,8 @@ mod tests {
             };
             Ok(response)
         });
-        let ingester = IngesterPoolEntry::ready_with_client(IngesterServiceClient::from_mock(mock_ingester));
+        let ingester =
+            IngesterPoolEntry::ready_with_client(IngesterServiceClient::from_mock(mock_ingester));
         ingester_pool.insert(ingester_id, ingester);
 
         let mut mock_metastore = MockMetastoreService::new();
@@ -2798,7 +2801,8 @@ mod tests {
             };
             Ok(response)
         });
-        let ingester = IngesterPoolEntry::ready_with_client(IngesterServiceClient::from_mock(mock_ingester));
+        let ingester =
+            IngesterPoolEntry::ready_with_client(IngesterServiceClient::from_mock(mock_ingester));
         ingester_pool.insert(ingester_id, ingester);
 
         let mut mock_metastore = MockMetastoreService::new();
diff --git a/quickwit/quickwit-control-plane/src/ingest/ingest_controller.rs b/quickwit/quickwit-control-plane/src/ingest/ingest_controller.rs
index 94c2e23c87f..53d3ee4034f 100644
--- a/quickwit/quickwit-control-plane/src/ingest/ingest_controller.rs
+++ b/quickwit/quickwit-control-plane/src/ingest/ingest_controller.rs
@@ -1962,7 +1962,10 @@ mod tests {
                 Ok(response)
             });
         let ingester_0 = IngesterServiceClient::from_mock(mock_ingester_0);
-        ingester_pool.insert(ingester_id_0, IngesterPoolEntry::ready_with_client(ingester_0));
+        ingester_pool.insert(
+            ingester_id_0,
+            IngesterPoolEntry::ready_with_client(ingester_0),
+        );
 
         let ingester_id_1 = NodeId::from("test-ingester-1");
         let mut mock_ingester_1 = MockIngesterService::new();
@@ -1983,7 +1986,8 @@ mod tests {
 
                 Err(IngestV2Error::Internal("internal error".to_string()))
             });
-        let ingester_1 = IngesterPoolEntry::ready_with_client(IngesterServiceClient::from_mock(mock_ingester_1));
+        let ingester_1 =
+            IngesterPoolEntry::ready_with_client(IngesterServiceClient::from_mock(mock_ingester_1));
         ingester_pool.insert(ingester_id_1, ingester_1);
 
         let ingester_id_2 = NodeId::from("test-ingester-2");
@@ -1993,7 +1997,10 @@ mod tests {
         let client_2 = IngesterServiceClient::tower()
             .stack_init_shards_layer(DelayLayer::new(INIT_SHARDS_REQUEST_TIMEOUT * 2))
             .build_from_mock(mock_ingester_2);
-        ingester_pool.insert(ingester_id_2, IngesterPoolEntry::ready_with_client(client_2));
+        ingester_pool.insert(
+            ingester_id_2,
+            IngesterPoolEntry::ready_with_client(client_2),
+        );
 
         let init_shards_response = controller
             .init_shards(Vec::new(), &Progress::default())
@@ -2371,7 +2378,8 @@ mod tests {
                     "failed to close shards".to_string(),
                 ))
             });
-        let ingester = IngesterPoolEntry::ready_with_client(IngesterServiceClient::from_mock(mock_ingester));
+        let ingester =
+            IngesterPoolEntry::ready_with_client(IngesterServiceClient::from_mock(mock_ingester));
         ingester_pool.insert("test-ingester".into(), ingester);
 
         let shard_infos = BTreeSet::from_iter([
@@ -2524,7 +2532,8 @@ mod tests {
             },
         );
 
-        let ingester = IngesterPoolEntry::ready_with_client(IngesterServiceClient::from_mock(mock_ingester));
+        let ingester =
+            IngesterPoolEntry::ready_with_client(IngesterServiceClient::from_mock(mock_ingester));
         ingester_pool.insert("test-ingester".into(), ingester);
 
         let shard_infos = BTreeSet::from_iter([ShardInfo {
@@ -2670,7 +2679,8 @@ mod tests {
                 };
                 Ok(response)
             });
-        let ingester = IngesterPoolEntry::ready_with_client(IngesterServiceClient::from_mock(mock_ingester));
+        let ingester =
+            IngesterPoolEntry::ready_with_client(IngesterServiceClient::from_mock(mock_ingester));
         ingester_pool.insert("test-ingester".into(), ingester);
 
         // Test failed to open shards.
@@ -2792,7 +2802,8 @@ mod tests {
                 };
                 Ok(response)
             });
-        let ingester = IngesterPoolEntry::ready_with_client(IngesterServiceClient::from_mock(mock_ingester));
+        let ingester =
+            IngesterPoolEntry::ready_with_client(IngesterServiceClient::from_mock(mock_ingester));
         ingester_pool.insert("test-ingester".into(), ingester);
 
         // Test failed to close shard.
@@ -3162,7 +3173,10 @@ mod tests {
                 Ok(response)
             });
         let ingester_0 = IngesterServiceClient::from_mock(mock_ingester_0);
-        ingester_pool.insert(ingester_id_0.clone(), IngesterPoolEntry::ready_with_client(ingester_0));
+        ingester_pool.insert(
+            ingester_id_0.clone(),
+            IngesterPoolEntry::ready_with_client(ingester_0),
+        );
 
         let ingester_id_1 = NodeId::from("test-ingester-1");
         let mut mock_ingester_1 = MockIngesterService::new();
@@ -3180,7 +3194,10 @@ mod tests {
                 Err(IngestV2Error::Internal("internal error".to_string()))
             });
         let ingester_1 = IngesterServiceClient::from_mock(mock_ingester_1);
-        ingester_pool.insert(ingester_id_1.clone(), IngesterPoolEntry::ready_with_client(ingester_1));
+        ingester_pool.insert(
+            ingester_id_1.clone(),
+            IngesterPoolEntry::ready_with_client(ingester_1),
+        );
 
         let ingester_id_2 = NodeId::from("test-ingester-2");
         let mut mock_ingester_2 = MockIngesterService::new();
@@ -3189,7 +3206,10 @@ mod tests {
         let client_2 = IngesterServiceClient::tower()
             .stack_close_shards_layer(DelayLayer::new(CLOSE_SHARDS_REQUEST_TIMEOUT * 2))
             .build_from_mock(mock_ingester_2);
-        ingester_pool.insert(ingester_id_2.clone(), IngesterPoolEntry::ready_with_client(client_2));
+        ingester_pool.insert(
+            ingester_id_2.clone(),
+            IngesterPoolEntry::ready_with_client(client_2),
+        );
 
         // In this test:
         // - ingester 0 will close shard 0 successfully and fail to close shard 1;
@@ -3369,7 +3389,10 @@ mod tests {
                 Ok(response)
             });
         let ingester_0 = IngesterServiceClient::from_mock(mock_ingester_0);
-        ingester_pool.insert(ingester_id_0.clone(), IngesterPoolEntry::ready_with_client(ingester_0));
+        ingester_pool.insert(
+            ingester_id_0.clone(),
+            IngesterPoolEntry::ready_with_client(ingester_0),
+        );
 
         let ingester_id_1 = NodeId::from("test-ingester-1");
         let mut mock_ingester_1 = MockIngesterService::new();
@@ -3410,7 +3433,8 @@ mod tests {
             };
             Ok(response)
         });
-        let ingester_1 = IngesterPoolEntry::ready_with_client(IngesterServiceClient::from_mock(mock_ingester_1));
+        let ingester_1 =
+            IngesterPoolEntry::ready_with_client(IngesterServiceClient::from_mock(mock_ingester_1));
         ingester_pool.insert(ingester_id_1.clone(), ingester_1);
 
         let num_opened_shards = controller
diff --git a/quickwit/quickwit-indexing/src/source/ingest/mod.rs b/quickwit/quickwit-indexing/src/source/ingest/mod.rs
index 2aa02c09963..46b7ddd8e0e 100644
--- a/quickwit/quickwit-indexing/src/source/ingest/mod.rs
+++ b/quickwit/quickwit-indexing/src/source/ingest/mod.rs
@@ -676,8 +676,7 @@ mod tests {
     use quickwit_ingest::IngesterPoolEntry;
     use quickwit_proto::indexing::IndexingPipelineId;
     use quickwit_proto::ingest::ingester::{
-        FetchMessage, IngesterServiceClient, IngesterStatus, MockIngesterService,
-        TruncateShardsResponse,
+        FetchMessage, IngesterServiceClient, MockIngesterService, TruncateShardsResponse,
     };
     use quickwit_proto::ingest::{IngestV2Error, MRecordBatch, Shard, ShardState};
     use quickwit_proto::metastore::{AcquireShardsResponse, MockMetastoreService};
@@ -932,7 +931,8 @@ mod tests {
                 Ok(response)
             });
 
-        let ingester_0 = IngesterPoolEntry::ready_with_client(IngesterServiceClient::from_mock(mock_ingester_0));
+        let ingester_0 =
+            IngesterPoolEntry::ready_with_client(IngesterServiceClient::from_mock(mock_ingester_0));
         ingester_pool.insert("test-ingester-0".into(), ingester_0.clone());
 
         let event_broker = EventBroker::default();
@@ -1129,7 +1129,8 @@ mod tests {
                 Ok(response)
             });
 
-        let ingester_0 = IngesterPoolEntry::ready_with_client(IngesterServiceClient::from_mock(mock_ingester_0));
+        let ingester_0 =
+            IngesterPoolEntry::ready_with_client(IngesterServiceClient::from_mock(mock_ingester_0));
         ingester_pool.insert("test-ingester-0".into(), ingester_0.clone());
 
         let event_broker = EventBroker::default();
@@ -1294,7 +1295,8 @@ mod tests {
                 Ok(response)
             });
 
-        let ingester_0 = IngesterPoolEntry::ready_with_client(IngesterServiceClient::from_mock(mock_ingester_0));
+        let ingester_0 =
+            IngesterPoolEntry::ready_with_client(IngesterServiceClient::from_mock(mock_ingester_0));
         ingester_pool.insert("test-ingester-0".into(), ingester_0.clone());
 
         let event_broker = EventBroker::default();
@@ -1602,7 +1604,8 @@ mod tests {
                 })
             });
 
-        let ingester_0 = IngesterPoolEntry::ready_with_client(IngesterServiceClient::from_mock(mock_ingester_0));
+        let ingester_0 =
+            IngesterPoolEntry::ready_with_client(IngesterServiceClient::from_mock(mock_ingester_0));
         ingester_pool.insert("test-ingester-0".into(), ingester_0.clone());
 
         let event_broker = EventBroker::default();
@@ -1702,7 +1705,8 @@ mod tests {
 
                 Ok(TruncateShardsResponse {})
             });
-        let ingester_0 = IngesterPoolEntry::ready_with_client(IngesterServiceClient::from_mock(mock_ingester_0));
+        let ingester_0 =
+            IngesterPoolEntry::ready_with_client(IngesterServiceClient::from_mock(mock_ingester_0));
         ingester_pool.insert("test-ingester-0".into(), ingester_0.clone());
 
         let mut mock_ingester_1 = MockIngesterService::new();
@@ -1729,7 +1733,8 @@ mod tests {
 
                 Ok(TruncateShardsResponse {})
             });
-        let ingester_1 = IngesterPoolEntry::ready_with_client(IngesterServiceClient::from_mock(mock_ingester_1));
+        let ingester_1 =
+            IngesterPoolEntry::ready_with_client(IngesterServiceClient::from_mock(mock_ingester_1));
         ingester_pool.insert("test-ingester-1".into(), ingester_1.clone());
 
         let mut mock_ingester_3 = MockIngesterService::new();
@@ -1749,7 +1754,8 @@ mod tests {
 
                 Ok(TruncateShardsResponse {})
             });
-        let ingester_3 = IngesterPoolEntry::ready_with_client(IngesterServiceClient::from_mock(mock_ingester_3));
+        let ingester_3 =
+            IngesterPoolEntry::ready_with_client(IngesterServiceClient::from_mock(mock_ingester_3));
         ingester_pool.insert("test-ingester-3".into(), ingester_3.clone());
 
         let event_broker = EventBroker::default();
diff --git a/quickwit/quickwit-ingest/src/ingest_v2/fetch.rs b/quickwit/quickwit-ingest/src/ingest_v2/fetch.rs
index 96970fc4e20..73c1fb2858d 100644
--- a/quickwit/quickwit-ingest/src/ingest_v2/fetch.rs
+++ b/quickwit/quickwit-ingest/src/ingest_v2/fetch.rs
@@ -627,9 +627,7 @@ pub(super) mod tests {
 
     use bytes::Bytes;
     use quickwit_proto::ingest::ShardState;
-    use quickwit_proto::ingest::ingester::{
-        IngesterServiceClient, IngesterStatus, MockIngesterService,
-    };
+    use quickwit_proto::ingest::ingester::{IngesterServiceClient, MockIngesterService};
     use quickwit_proto::types::queue_id;
     use tokio::time::timeout;
 
@@ -1331,7 +1329,8 @@ pub(super) mod tests {
 
                 Ok(service_stream_1)
             });
-        let ingester_1 = IngesterPoolEntry::ready_with_client(IngesterServiceClient::from_mock(mock_ingester_1));
+        let ingester_1 =
+            IngesterPoolEntry::ready_with_client(IngesterServiceClient::from_mock(mock_ingester_1));
         ingester_pool.insert("test-ingester-1".into(), ingester_1);
 
         let fetch_payload = FetchPayload {
@@ -1430,7 +1429,8 @@ pub(super) mod tests {
                     "open fetch stream error".to_string(),
                 ))
             });
-        let ingester_0 = IngesterPoolEntry::ready_with_client(IngesterServiceClient::from_mock(mock_ingester_0));
+        let ingester_0 =
+            IngesterPoolEntry::ready_with_client(IngesterServiceClient::from_mock(mock_ingester_0));
 
         let mut mock_ingester_1 = MockIngesterService::new();
         let index_uid_clone = index_uid.clone();
@@ -1445,7 +1445,8 @@ pub(super) mod tests {
 
                 Ok(service_stream_1)
             });
-        let ingester_1 = IngesterPoolEntry::ready_with_client(IngesterServiceClient::from_mock(mock_ingester_1));
+        let ingester_1 =
+            IngesterPoolEntry::ready_with_client(IngesterServiceClient::from_mock(mock_ingester_1));
 
         ingester_pool.insert("test-ingester-0".into(), ingester_0);
         ingester_pool.insert("test-ingester-1".into(), ingester_1);
@@ -1545,7 +1546,8 @@ pub(super) mod tests {
 
                 Ok(service_stream_0)
             });
-        let ingester_0 = IngesterPoolEntry::ready_with_client(IngesterServiceClient::from_mock(mock_ingester_0));
+        let ingester_0 =
+            IngesterPoolEntry::ready_with_client(IngesterServiceClient::from_mock(mock_ingester_0));
 
         let mut mock_ingester_1 = MockIngesterService::new();
         let index_uid_clone = index_uid.clone();
@@ -1560,7 +1562,8 @@ pub(super) mod tests {
 
                 Ok(service_stream_1)
             });
-        let ingester_1 = IngesterPoolEntry::ready_with_client(IngesterServiceClient::from_mock(mock_ingester_1));
+        let ingester_1 =
+            IngesterPoolEntry::ready_with_client(IngesterServiceClient::from_mock(mock_ingester_1));
 
         ingester_pool.insert("test-ingester-0".into(), ingester_0);
         ingester_pool.insert("test-ingester-1".into(), ingester_1);
@@ -1663,7 +1666,8 @@ pub(super) mod tests {
                     shard_id: ShardId::from(1),
                 })
             });
-        let ingester_0 = IngesterPoolEntry::ready_with_client(IngesterServiceClient::from_mock(mock_ingester_0));
+        let ingester_0 =
+            IngesterPoolEntry::ready_with_client(IngesterServiceClient::from_mock(mock_ingester_0));
         ingester_pool.insert("test-ingester-0".into(), ingester_0);
 
         fault_tolerant_fetch_stream(
@@ -1751,7 +1755,8 @@ pub(super) mod tests {
 
                 Ok(service_stream_2)
             });
-        let ingester = IngesterPoolEntry::ready_with_client(IngesterServiceClient::from_mock(mock_ingester));
+        let ingester =
+            IngesterPoolEntry::ready_with_client(IngesterServiceClient::from_mock(mock_ingester));
 
         ingester_pool.insert("test-ingester".into(), ingester);
 
diff --git a/quickwit/quickwit-ingest/src/ingest_v2/router.rs b/quickwit/quickwit-ingest/src/ingest_v2/router.rs
index ad37032e550..581b648ae71 100644
--- a/quickwit/quickwit-ingest/src/ingest_v2/router.rs
+++ b/quickwit/quickwit-ingest/src/ingest_v2/router.rs
@@ -735,7 +735,10 @@ mod tests {
         drop(rendezvous_1);
         drop(rendezvous_2);
 
-        ingester_pool.insert("test-ingester-0".into(), IngesterPoolEntry::mocked_ingester());
+        ingester_pool.insert(
+            "test-ingester-0".into(),
+            IngesterPoolEntry::mocked_ingester(),
+        );
         {
             // Ingester-0 is in pool and in table, but marked unavailable on the workbench
             // (simulating a prior transport error). has_open_nodes returns false → both
@@ -1170,8 +1173,14 @@ mod tests {
         let control_plane = ControlPlaneServiceClient::from_mock(MockControlPlaneService::new());
 
         let ingester_pool = IngesterPool::default();
-        ingester_pool.insert("test-ingester-0".into(), IngesterPoolEntry::mocked_ingester());
-        ingester_pool.insert("test-ingester-1".into(), IngesterPoolEntry::mocked_ingester());
+        ingester_pool.insert(
+            "test-ingester-0".into(),
+            IngesterPoolEntry::mocked_ingester(),
+        );
+        ingester_pool.insert(
+            "test-ingester-1".into(),
+            IngesterPoolEntry::mocked_ingester(),
+        );
 
         let replication_factor = 1;
         let router = IngestRouter::new(
@@ -1688,7 +1697,10 @@ mod tests {
         // Give the async subscriber a moment to process.
         tokio::time::sleep(Duration::from_millis(10)).await;
 
-        ingester_pool.insert("test-ingester-0".into(), IngesterPoolEntry::mocked_ingester());
+        ingester_pool.insert(
+            "test-ingester-0".into(),
+            IngesterPoolEntry::mocked_ingester(),
+        );
         let state_guard = router.state.lock().await;
         let node = state_guard
             .routing_table
@@ -1835,7 +1847,10 @@ mod tests {
             .process_persist_results(&mut workbench, persist_futures)
             .await;
 
-        ingester_pool.insert("test-ingester-0".into(), IngesterPoolEntry::mocked_ingester());
+        ingester_pool.insert(
+            "test-ingester-0".into(),
+            IngesterPoolEntry::mocked_ingester(),
+        );
         let state_guard = router.state.lock().await;
         let node = state_guard
             .routing_table