From 0f588debd35839f1236886af37eff4c4b798f981 Mon Sep 17 00:00:00 2001 From: Jacob Magar Date: Sun, 26 Apr 2026 19:19:54 -0400 Subject: [PATCH 01/23] chore(noxa-bd4): delete dead IngestionContext struct from types.rs --- crates/noxa-rag/src/types.rs | 60 ++---------------------------------- 1 file changed, 2 insertions(+), 58 deletions(-) diff --git a/crates/noxa-rag/src/types.rs b/crates/noxa-rag/src/types.rs index 92d7611..8ecef7b 100644 --- a/crates/noxa-rag/src/types.rs +++ b/crates/noxa-rag/src/types.rs @@ -71,14 +71,14 @@ pub struct PointPayload { /// Git branch detected from .git/HEAD walk-up (file:// sources only). #[serde(default, skip_serializing_if = "Option::is_none")] pub git_branch: Option, - // ── Ingestion-provenance fields from IngestionContext ─────────────────── + // ── Ingestion-provenance fields ───────────────────────────────────────── /// Opaque platform id: 'linkding:42', 'memos:7' (Wave 3+). #[serde(default, skip_serializing_if = "Option::is_none")] pub external_id: Option, /// Native platform UI URL (Wave 3+). #[serde(default, skip_serializing_if = "Option::is_none")] pub platform_url: Option, - // ── Web-provenance fields from IngestionContext ───────────────────────── + // ── Web-provenance fields ──────────────────────────────────────────────── #[serde(default, skip_serializing_if = "Option::is_none")] pub seed_url: Option, #[serde(default, skip_serializing_if = "Option::is_none")] @@ -211,62 +211,6 @@ pub struct SearchMetadataFilter { pub hnsw_ef: Option, } -/// RAG-pipeline provenance carried alongside ExtractionResult through ingestion. -/// -/// These fields have no meaning to noxa-fetch, noxa-mcp, or WASM consumers — they -/// live here in noxa-rag, not in noxa-core. At upsert time both Metadata and -/// IngestionContext are serialized into PointPayload. 
-#[derive(Debug, Clone, Default, Serialize, Deserialize)] -pub struct IngestionContext { - /// Matches Metadata.source_type: 'web' | 'file' | 'mcp' | 'notebook' | 'email' - pub source_type: String, - /// SHA-256 hex digest — duplicated from Metadata.content_hash for fast access. - #[serde(default, skip_serializing_if = "Option::is_none")] - pub content_hash: Option, - // Platform fields — populated when MCP sources land in Wave 3. - /// Opaque platform identifier: 'linkding:42', 'memos:7', 'paperless:15'. - #[serde(default, skip_serializing_if = "Option::is_none")] - pub external_id: Option, - /// Native UI URL (not the canonical content URL). - #[serde(default, skip_serializing_if = "Option::is_none")] - pub platform_url: Option, - // AI session fields — populated when AI session sources land. - #[serde(default, skip_serializing_if = "Option::is_none")] - pub session_tool: Option, - #[serde(default, skip_serializing_if = "Option::is_none")] - pub conversation_id: Option, - #[serde(default, skip_serializing_if = "Option::is_none")] - pub model_id: Option, - // Web provenance — populated by noxa-fetch. 
- #[serde(default, skip_serializing_if = "Option::is_none")] - pub seed_url: Option, - #[serde(default, skip_serializing_if = "Option::is_none")] - pub search_query: Option, - #[serde(default, skip_serializing_if = "Option::is_none")] - pub crawl_depth: Option, - #[serde(default, skip_serializing_if = "Vec::is_empty")] - pub email_to: Vec, - #[serde(default, skip_serializing_if = "Option::is_none")] - pub email_message_id: Option, - #[serde(default, skip_serializing_if = "Option::is_none")] - pub email_thread_id: Option, - #[serde(default, skip_serializing_if = "Option::is_none")] - pub email_has_attachments: Option, - #[serde(default, skip_serializing_if = "Option::is_none")] - pub feed_url: Option, - #[serde(default, skip_serializing_if = "Option::is_none")] - pub feed_item_id: Option, - #[serde(default, skip_serializing_if = "Option::is_none")] - pub pptx_slide_count: Option, - #[serde(default, skip_serializing_if = "Option::is_none")] - pub pptx_has_notes: Option, - #[serde(default, skip_serializing_if = "Option::is_none")] - pub subtitle_start_s: Option, - #[serde(default, skip_serializing_if = "Option::is_none")] - pub subtitle_end_s: Option, - #[serde(default, skip_serializing_if = "Option::is_none")] - pub subtitle_source_file: Option, -} #[cfg(test)] mod tests { From 59a195c7f3d702046380f1d293618ba5968dde02 Mon Sep 17 00:00:00 2001 From: Jacob Magar Date: Sun, 26 Apr 2026 19:23:34 -0400 Subject: [PATCH 02/23] refactor(noxa-4jt): split mcp_bridge.rs into per-platform module directory Splits the 1429-line mcp_bridge.rs into a mcp_bridge/ directory module: - mod.rs: shared types (McpSource, BridgeConfig, SyncReport, BridgeDocument, WriteStatus), McporterExecutor trait, McpBridge struct + sync() dispatch, call_data(), private helper functions, and all tests - executor.rs: ProcessMcporterExecutor implementation (59 lines) - io.rs: write_bridge_document(), relative_output_path(), build_extraction(), and file I/O helpers (129 lines) - linkding.rs: sync_linkding() 
+ normalize_linkding_record() (120 lines) - memos.rs: sync_memos() + normalize_memo_record() (86 lines) - bytestash.rs: sync_bytestash() + normalize_bytestash_record() (105 lines) - paperless.rs: sync_paperless() + fetch_paperless_lookup() + normalize_paperless_record() (127 lines) Zero behavior change. Public API identical. All platform files < 200 lines. --- crates/noxa-rag/src/mcp_bridge/bytestash.rs | 105 +++ crates/noxa-rag/src/mcp_bridge/executor.rs | 59 ++ crates/noxa-rag/src/mcp_bridge/io.rs | 129 +++ crates/noxa-rag/src/mcp_bridge/linkding.rs | 120 +++ crates/noxa-rag/src/mcp_bridge/memos.rs | 86 ++ crates/noxa-rag/src/mcp_bridge/mod.rs | 900 ++++++++++++++++++++ crates/noxa-rag/src/mcp_bridge/paperless.rs | 127 +++ 7 files changed, 1526 insertions(+) create mode 100644 crates/noxa-rag/src/mcp_bridge/bytestash.rs create mode 100644 crates/noxa-rag/src/mcp_bridge/executor.rs create mode 100644 crates/noxa-rag/src/mcp_bridge/io.rs create mode 100644 crates/noxa-rag/src/mcp_bridge/linkding.rs create mode 100644 crates/noxa-rag/src/mcp_bridge/memos.rs create mode 100644 crates/noxa-rag/src/mcp_bridge/mod.rs create mode 100644 crates/noxa-rag/src/mcp_bridge/paperless.rs diff --git a/crates/noxa-rag/src/mcp_bridge/bytestash.rs b/crates/noxa-rag/src/mcp_bridge/bytestash.rs new file mode 100644 index 0000000..921217f --- /dev/null +++ b/crates/noxa-rag/src/mcp_bridge/bytestash.rs @@ -0,0 +1,105 @@ +use serde_json::Value; + +use crate::RagError; + +use super::{ + BridgeDocument, McpBridge, McpSource, McporterExecutor, SyncReport, WriteStatus, + array_field, join_base_url, join_non_empty, optional_string, required_base_url, + required_string, string_array, + io::{build_extraction, write_bridge_document}, +}; + +impl McpBridge +where + E: McporterExecutor, +{ + pub(super) async fn sync_bytestash(&self) -> Result { + let base_url = required_base_url(&self.config, McpSource::Bytestash)?; + let data = self + .call_data(McpSource::Bytestash, "snippets.list", 
serde_json::json!({})) + .await?; + let records = if let Some(array) = data.as_array() { + array.iter().collect::>() + } else { + array_field(&data, "snippets")? + }; + + let mut report = SyncReport::default(); + for record in records { + let document = normalize_bytestash_record(record, base_url)?; + report.fetched += 1; + match write_bridge_document(&self.config.watch_dir, &document).await? { + WriteStatus::Written => report.written += 1, + WriteStatus::Unchanged => report.skipped += 1, + } + } + + Ok(report) + } +} + +pub fn normalize_bytestash_record( + record: &Value, + platform_base_url: &str, +) -> Result { + let id = required_string(record, "id")?; + let title = optional_string(record, "title"); + let description = optional_string(record, "description"); + let language = optional_string(record, "language"); + let fragments = record + .get("fragments") + .and_then(Value::as_array) + .ok_or_else(|| RagError::Parse("bytestash record missing fragments array".to_string()))?; + + let mut markdown_parts = Vec::new(); + if let Some(value) = title.as_deref() { + markdown_parts.push(format!("# {value}")); + } + if let Some(value) = description.as_deref() { + markdown_parts.push(value.to_string()); + } + for fragment in fragments { + let file_name = fragment + .get("fileName") + .or_else(|| fragment.get("file_name")) + .and_then(Value::as_str) + .unwrap_or("snippet"); + let code = fragment + .get("code") + .and_then(Value::as_str) + .unwrap_or_default(); + markdown_parts.push(format!( + "## {file_name}\n```{}\n{}\n```", + language.clone().unwrap_or_default(), + code + )); + } + let plain_text = join_non_empty([ + title.clone(), + description.clone(), + Some( + fragments + .iter() + .filter_map(|fragment| fragment.get("code").and_then(Value::as_str)) + .collect::>() + .join("\n\n"), + ), + ]); + let url = join_base_url(platform_base_url, &format!("/api/snippets/{id}"))?; + + Ok(BridgeDocument { + source: McpSource::Bytestash, + external_id: format!("bytestash:{id}"), + 
platform_url: Some(url.clone()), + extraction: build_extraction( + url, + title, + None, + None, + language, + string_array(record.get("categories")), + markdown_parts.join("\n\n"), + plain_text, + ), + }) +} diff --git a/crates/noxa-rag/src/mcp_bridge/executor.rs b/crates/noxa-rag/src/mcp_bridge/executor.rs new file mode 100644 index 0000000..f363fa1 --- /dev/null +++ b/crates/noxa-rag/src/mcp_bridge/executor.rs @@ -0,0 +1,59 @@ +use async_trait::async_trait; +use serde_json::Value; +use tokio::process::Command; + +use crate::RagError; + +use super::{McpSource, McporterExecutor}; + +#[derive(Debug, Clone)] +pub struct ProcessMcporterExecutor { + executable: String, +} + +impl ProcessMcporterExecutor { + pub fn new(executable: impl Into) -> Self { + Self { + executable: executable.into(), + } + } +} + +#[async_trait] +impl McporterExecutor for ProcessMcporterExecutor { + async fn call( + &self, + server: &str, + service: McpSource, + action: &str, + params: Value, + ) -> Result { + let selector = format!("{}.{}", server, service.as_str()); + let args = serde_json::json!({ + "action": action, + "params": params, + }); + let output = Command::new(&self.executable) + .arg("call") + .arg(&selector) + .arg("--args") + .arg(args.to_string()) + .arg("--output") + .arg("json") + .output() + .await + .map_err(|e| RagError::Generic(format!("failed to execute mcporter: {e}")))?; + + if !output.status.success() { + return Err(RagError::Generic(format!( + "mcporter call {} {} failed: {}", + selector, + action, + String::from_utf8_lossy(&output.stderr).trim() + ))); + } + + serde_json::from_slice(&output.stdout) + .map_err(|e| RagError::Generic(format!("mcporter returned invalid JSON: {e}"))) + } +} diff --git a/crates/noxa-rag/src/mcp_bridge/io.rs b/crates/noxa-rag/src/mcp_bridge/io.rs new file mode 100644 index 0000000..826df46 --- /dev/null +++ b/crates/noxa-rag/src/mcp_bridge/io.rs @@ -0,0 +1,129 @@ +use std::path::{Path, PathBuf}; + +use noxa_core::{Content, 
ExtractionResult, Metadata}; +use serde::{Deserialize, Serialize}; + +use crate::RagError; + +use super::{BridgeDocument, McpSource, WriteStatus}; + +pub fn relative_output_path(source: McpSource, external_id: &str) -> PathBuf { + PathBuf::from("mcp").join(source.as_str()).join(format!( + "{}-{:016x}.json", + sanitize_component(external_id), + stable_component_hash(external_id) + )) +} + +pub async fn write_bridge_document( + root: &Path, + document: &BridgeDocument, +) -> Result { + let path = root.join(relative_output_path(document.source, &document.external_id)); + if let Some(parent) = path.parent() { + tokio::fs::create_dir_all(parent).await?; + } + + let payload = StoredExtractionResult { + extraction: document.extraction.clone(), + external_id: Some(document.external_id.clone()), + platform_url: document.platform_url.clone(), + }; + let serialized = serde_json::to_vec_pretty(&payload)?; + if tokio::fs::read(&path).await.ok().as_deref() == Some(serialized.as_slice()) { + return Ok(WriteStatus::Unchanged); + } + + let tmp_path = temp_output_path(&path); + tokio::fs::write(&tmp_path, &serialized).await?; + tokio::fs::rename(&tmp_path, &path).await?; + Ok(WriteStatus::Written) +} + +#[allow(clippy::too_many_arguments)] +pub fn build_extraction( + url: String, + title: Option, + published_date: Option, + author: Option, + language: Option, + technologies: Vec, + markdown: String, + plain_text: String, +) -> ExtractionResult { + ExtractionResult { + metadata: Metadata { + title, + description: None, + author, + published_date, + language, + url: Some(url), + site_name: None, + image: None, + favicon: None, + word_count: count_words(&plain_text), + content_hash: None, + source_type: Some("mcp".to_string()), + file_path: None, + last_modified: None, + is_truncated: None, + technologies, + seed_url: None, + crawl_depth: None, + search_query: None, + fetched_at: None, + }, + content: Content { + markdown, + plain_text, + links: Vec::new(), + images: Vec::new(), + 
code_blocks: Vec::new(), + raw_html: None, + }, + domain_data: None, + vertical_data: None, + structured_data: Vec::new(), + } +} + +fn sanitize_component(value: &str) -> String { + value + .chars() + .map(|ch| match ch { + 'a'..='z' | 'A'..='Z' | '0'..='9' | '-' | '_' => ch, + _ => '_', + }) + .collect() +} + +fn stable_component_hash(value: &str) -> u64 { + use std::hash::{DefaultHasher, Hasher}; + let mut hasher = DefaultHasher::new(); + hasher.write(value.as_bytes()); + hasher.finish() +} + +fn temp_output_path(path: &Path) -> PathBuf { + let suffix = format!( + "tmp-{}-{}", + std::process::id(), + uuid::Uuid::new_v4().simple() + ); + path.with_extension(format!("json.{suffix}")) +} + +fn count_words(value: &str) -> usize { + value.split_whitespace().count() +} + +#[derive(Debug, Serialize, Deserialize)] +pub(super) struct StoredExtractionResult { + #[serde(flatten)] + pub extraction: ExtractionResult, + #[serde(skip_serializing_if = "Option::is_none")] + pub external_id: Option, + #[serde(skip_serializing_if = "Option::is_none")] + pub platform_url: Option, +} diff --git a/crates/noxa-rag/src/mcp_bridge/linkding.rs b/crates/noxa-rag/src/mcp_bridge/linkding.rs new file mode 100644 index 0000000..7f7be87 --- /dev/null +++ b/crates/noxa-rag/src/mcp_bridge/linkding.rs @@ -0,0 +1,120 @@ +use std::collections::HashSet; + +use serde_json::Value; +use url::Url; + +use crate::RagError; + +use super::{ + BridgeDocument, McpBridge, McpSource, McporterExecutor, SyncReport, WriteStatus, + array_field, as_u64_value, join_non_empty, optional_string, required_string, required_value, + string_array, + io::{build_extraction, write_bridge_document}, +}; + +impl McpBridge +where + E: McporterExecutor, +{ + pub(super) async fn sync_linkding(&self) -> Result { + let mut report = SyncReport::default(); + let mut offset = 0_u32; + let mut seen_ids = HashSet::new(); + + loop { + let data = self + .call_data( + McpSource::Linkding, + "bookmark.list", + serde_json::json!({ + "limit": 
self.config.page_size, + "offset": offset, + }), + ) + .await?; + let records = array_field(&data, "results")?; + if records.is_empty() { + break; + } + + let mut new_records = 0_usize; + for record in records { + let document = + normalize_linkding_record(record, self.config.platform_base_url.as_deref())?; + if !seen_ids.insert(document.external_id.clone()) { + continue; + } + new_records += 1; + report.fetched += 1; + match write_bridge_document(&self.config.watch_dir, &document).await? { + WriteStatus::Written => report.written += 1, + WriteStatus::Unchanged => report.skipped += 1, + } + } + + if data.get("next").is_none() || data.get("next").is_some_and(Value::is_null) { + break; + } + if new_records == 0 { + break; + } + offset = offset.saturating_add(self.config.page_size.max(1)); + } + + Ok(report) + } +} + +pub fn normalize_linkding_record( + record: &Value, + platform_base_url: Option<&str>, +) -> Result { + let id = required_value(record, "id").and_then(as_u64_value)?; + let url = required_string(record, "url")?; + let title = optional_string(record, "title"); + let description = optional_string(record, "description"); + let notes = optional_string(record, "notes"); + let markdown = join_non_empty([ + title.as_deref().map(|value| format!("# {value}")), + description.clone(), + notes.clone(), + ]); + let plain_text = join_non_empty([title.clone(), description, notes]); + let platform_url = match platform_base_url { + Some(base) => Some(linkding_platform_url(base, &url)?), + None => None, + }; + + Ok(BridgeDocument { + source: McpSource::Linkding, + external_id: format!("linkding:{id}"), + platform_url, + extraction: build_extraction( + url, + title, + optional_string(record, "date_added"), + None, + None, + string_array(record.get("tag_names")), + markdown, + plain_text, + ), + }) +} + +fn linkding_platform_url(base: &str, bookmark_url: &str) -> Result { + let mut url = Url::parse(base) + .map_err(|e| RagError::Parse(format!("invalid linkding base URL 
{base:?}: {e}")))?; + let current_path = url.path().trim_end_matches('/'); + let next_path = if current_path.is_empty() { + "/bookmarks".to_string() + } else { + format!("{current_path}/bookmarks") + }; + url.set_path(&next_path); + let query = url::form_urlencoded::Serializer::new(String::new()) + .append_pair("q", bookmark_url) + .finish(); + url.set_query(Some(&query)); + Ok(url.to_string()) +} diff --git a/crates/noxa-rag/src/mcp_bridge/memos.rs b/crates/noxa-rag/src/mcp_bridge/memos.rs new file mode 100644 index 0000000..a81937b --- /dev/null +++ b/crates/noxa-rag/src/mcp_bridge/memos.rs @@ -0,0 +1,86 @@ +use serde_json::Value; + +use crate::RagError; + +use super::{ + BridgeDocument, McpBridge, McpSource, McporterExecutor, SyncReport, WriteStatus, + array_field, first_line_title, join_base_url, optional_string, required_base_url, + required_string, string_array, + io::{build_extraction, write_bridge_document}, +}; + +impl McpBridge +where + E: McporterExecutor, +{ + pub(super) async fn sync_memos(&self) -> Result { + let base_url = required_base_url(&self.config, McpSource::Memos)?; + let mut report = SyncReport::default(); + let mut page_token: Option = None; + + loop { + let mut params = serde_json::json!({ "page_size": self.config.page_size }); + if let Some(token) = &page_token { + params["page_token"] = Value::String(token.clone()); + } + let data = self + .call_data(McpSource::Memos, "memos.list", params) + .await?; + let records = array_field(&data, "memos")?; + if records.is_empty() { + break; + } + + for record in records { + let document = normalize_memo_record(record, base_url)?; + report.fetched += 1; + match write_bridge_document(&self.config.watch_dir, &document).await? 
{ + WriteStatus::Written => report.written += 1, + WriteStatus::Unchanged => report.skipped += 1, + } + } + + let next_page_token = data + .get("nextPageToken") + .and_then(Value::as_str) + .map(str::trim) + .filter(|value| !value.is_empty()) + .map(ToOwned::to_owned); + if next_page_token.is_none() || next_page_token == page_token { + break; + } + page_token = next_page_token; + } + + Ok(report) + } +} + +pub fn normalize_memo_record( + record: &Value, + platform_base_url: &str, +) -> Result { + let name = required_string(record, "name")?; + let memo_id = name.strip_prefix("memos/").unwrap_or(&name).to_string(); + let content = required_string(record, "content")?; + let title = first_line_title(&content); + let url = join_base_url(platform_base_url, &format!("/api/v1/{name}"))?; + let published_date = + optional_string(record, "displayTime").or_else(|| optional_string(record, "createTime")); + + Ok(BridgeDocument { + source: McpSource::Memos, + external_id: format!("memos:{memo_id}"), + platform_url: Some(url.clone()), + extraction: build_extraction( + url, + title, + published_date, + None, + None, + string_array(record.get("tags")), + content.clone(), + content, + ), + }) +} diff --git a/crates/noxa-rag/src/mcp_bridge/mod.rs b/crates/noxa-rag/src/mcp_bridge/mod.rs new file mode 100644 index 0000000..559b3b5 --- /dev/null +++ b/crates/noxa-rag/src/mcp_bridge/mod.rs @@ -0,0 +1,900 @@ +use std::path::PathBuf; + +use async_trait::async_trait; +use serde_json::Value; +use url::Url; + +use crate::RagError; + +pub mod executor; +mod io; +mod linkding; +mod memos; +mod bytestash; +mod paperless; + +pub use executor::ProcessMcporterExecutor; +pub use io::{relative_output_path, write_bridge_document}; +pub use linkding::normalize_linkding_record; +pub use memos::normalize_memo_record; +pub use bytestash::normalize_bytestash_record; +pub use paperless::normalize_paperless_record; + +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum McpSource { + Linkding, + Memos, + 
Bytestash, + Paperless, +} + +impl McpSource { + pub fn as_str(self) -> &'static str { + match self { + Self::Linkding => "linkding", + Self::Memos => "memos", + Self::Bytestash => "bytestash", + Self::Paperless => "paperless", + } + } +} + +#[derive(Debug, Clone, PartialEq, Eq)] +pub struct BridgeConfig { + pub server: String, + /// Directory where bridge JSON files are written. Must be one of the dirs + /// listed in the daemon's `source.watch_dirs` so the daemon picks them up. + pub watch_dir: PathBuf, + pub page_size: u32, + pub platform_base_url: Option, +} + +#[derive(Debug, Clone, Default, PartialEq, Eq)] +pub struct SyncReport { + pub fetched: usize, + pub written: usize, + pub skipped: usize, +} + +#[derive(Debug, Clone)] +pub struct BridgeDocument { + pub source: McpSource, + pub external_id: String, + pub platform_url: Option, + pub extraction: noxa_core::ExtractionResult, +} + +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum WriteStatus { + Written, + Unchanged, +} + +#[async_trait] +pub trait McporterExecutor: Send + Sync { + async fn call( + &self, + server: &str, + service: McpSource, + action: &str, + params: Value, + ) -> Result; +} + +pub struct McpBridge { + executor: E, + config: BridgeConfig, +} + +impl McpBridge +where + E: McporterExecutor, +{ + pub fn new(executor: E, config: BridgeConfig) -> Self { + Self { executor, config } + } + + pub async fn sync(&self, source: McpSource) -> Result { + match source { + McpSource::Linkding => self.sync_linkding().await, + McpSource::Memos => self.sync_memos().await, + McpSource::Bytestash => self.sync_bytestash().await, + McpSource::Paperless => self.sync_paperless().await, + } + } + + pub(self) async fn call_data( + &self, + source: McpSource, + action: &str, + params: Value, + ) -> Result { + let raw = self + .executor + .call(&self.config.server, source, action, params) + .await?; + extract_mcporter_data(raw) + } +} + +// 
--------------------------------------------------------------------------- +// Shared helper functions (used by platform submodules via `use super::...`) +// --------------------------------------------------------------------------- + +pub(super) fn extract_mcporter_data(raw: Value) -> Result { + let ok = raw.get("ok").and_then(Value::as_bool).unwrap_or(true); + if ok { + raw.get("data") + .cloned() + .ok_or_else(|| RagError::Parse("mcporter response missing data".to_string())) + } else { + let message = raw + .get("error") + .and_then(|value| value.get("message")) + .and_then(Value::as_str) + .unwrap_or("unknown mcporter error"); + Err(RagError::Generic(message.to_string())) + } +} + +pub(super) fn array_field<'a>(value: &'a Value, key: &str) -> Result, RagError> { + value + .get(key) + .and_then(Value::as_array) + .map(|items| items.iter().collect::>()) + .ok_or_else(|| RagError::Parse(format!("expected array field {key}"))) +} + +pub(super) fn required_base_url(config: &BridgeConfig, source: McpSource) -> Result<&str, RagError> { + config.platform_base_url.as_deref().ok_or_else(|| { + RagError::Config(format!("{} requires --platform-base-url", source.as_str())) + }) +} + +pub(super) fn join_base_url(base: &str, path: &str) -> Result { + let base = base.trim_end_matches('/'); + let url = format!("{base}{path}"); + Url::parse(&url) + .map(|parsed| parsed.to_string()) + .map_err(|e| RagError::Parse(format!("invalid base URL {base:?}: {e}"))) +} + +pub(super) fn required_value<'a>(value: &'a Value, key: &str) -> Result<&'a Value, RagError> { + value + .get(key) + .ok_or_else(|| RagError::Parse(format!("missing required field {key}"))) +} + +pub(super) fn required_string(value: &Value, key: &str) -> Result { + value + .get(key) + .and_then(Value::as_str) + .map(ToOwned::to_owned) + .ok_or_else(|| RagError::Parse(format!("missing required string field {key}"))) +} + +pub(super) fn optional_string(value: &Value, key: &str) -> Option { + value + .get(key) + 
.and_then(Value::as_str) + .map(ToOwned::to_owned) +} + +pub(super) fn as_u64(value: &Value) -> Option { + value + .as_u64() + .or_else(|| value.as_str().and_then(|raw| raw.parse::().ok())) +} + +pub(super) fn as_u64_value(value: &Value) -> Result { + as_u64(value).ok_or_else(|| RagError::Parse("expected integer id".to_string())) +} + +pub(super) fn string_array(value: Option<&Value>) -> Vec { + value + .and_then(Value::as_array) + .map(|items| { + items + .iter() + .filter_map(|item| { + item.as_str().map(ToOwned::to_owned).or_else(|| { + item.get("name") + .and_then(Value::as_str) + .map(ToOwned::to_owned) + }) + }) + .collect::>() + }) + .unwrap_or_default() +} + +pub(super) fn join_non_empty(parts: I) -> String +where + I: IntoIterator>, +{ + parts + .into_iter() + .flatten() + .map(|value| value.trim().to_string()) + .filter(|value| !value.is_empty()) + .collect::>() + .join("\n\n") +} + +pub(super) fn first_line_title(content: &str) -> Option { + content + .lines() + .map(str::trim) + .find(|line| !line.is_empty() && !line.starts_with('#')) + .map(|line| line.chars().take(80).collect::()) +} + +#[cfg(test)] +mod tests { + use std::collections::{HashMap, VecDeque}; + use std::sync::{Arc, Mutex}; + + use super::*; + + type ExecutorCall = (String, McpSource, String, Value); + + #[derive(Clone, Default)] + struct MockExecutor { + calls: Arc>>, + responses: Arc>>>, + } + + impl MockExecutor { + fn with_responses(responses: Vec>) -> Self { + Self { + calls: Arc::new(Mutex::new(Vec::new())), + responses: Arc::new(Mutex::new(VecDeque::from(responses))), + } + } + + fn calls(&self) -> Vec { + self.calls.lock().expect("calls lock").clone() + } + } + + #[async_trait] + impl McporterExecutor for MockExecutor { + async fn call( + &self, + server: &str, + service: McpSource, + action: &str, + params: Value, + ) -> Result { + self.calls.lock().expect("calls lock").push(( + server.to_string(), + service, + action.to_string(), + params, + )); + self.responses + .lock() + 
.expect("responses lock") + .pop_front() + .expect("response queued") + } + } + + #[test] + fn relative_output_path_sanitizes_external_ids() { + let path = relative_output_path(McpSource::Memos, "memos/G3BLCD3swV4Fuxk4uMT97d"); + + let rendered = path.display().to_string(); + assert!(rendered.starts_with("mcp/memos/memos_G3BLCD3swV4Fuxk4uMT97d-")); + assert!(rendered.ends_with(".json")); + } + + #[test] + fn relative_output_path_distinguishes_colliding_sanitized_ids() { + let first = relative_output_path(McpSource::Memos, "memos/G3"); + let second = relative_output_path(McpSource::Memos, "memos_G3"); + assert_ne!(first, second); + } + + #[test] + fn normalize_linkding_record_maps_bookmark_fields() { + let record = serde_json::json!({ + "id": 441, + "url": "https://pipenet.dev/", + "title": "pipenet", + "description": "", + "notes": "MCP client", + "tag_names": ["mcp", "rust"], + "date_added": "2026-02-02T15:23:27.821564-05:00" + }); + + let document = + normalize_linkding_record(&record, Some("https://ding.tootie.tv")).expect("maps"); + + assert_eq!(document.external_id, "linkding:441"); + assert_eq!( + document.extraction.metadata.url.as_deref(), + Some("https://pipenet.dev/") + ); + assert_eq!( + document.platform_url.as_deref(), + Some("https://ding.tootie.tv/bookmarks?q=https%3A%2F%2Fpipenet.dev%2F") + ); + assert_eq!( + document.extraction.metadata.technologies, + vec!["mcp".to_string(), "rust".to_string()] + ); + assert_eq!( + document.extraction.metadata.published_date.as_deref(), + Some("2026-02-02T15:23:27.821564-05:00") + ); + assert!(document.extraction.content.markdown.contains("MCP client")); + } + + #[test] + fn normalize_linkding_record_preserves_base_path_prefix() { + let record = serde_json::json!({ + "id": 12, + "url": "https://example.com/doc", + "title": "Doc", + "description": "", + "notes": "" + }); + + let document = + normalize_linkding_record(&record, Some("https://ding.example/app")).expect("maps"); + + assert_eq!( + 
document.platform_url.as_deref(), + Some("https://ding.example/app/bookmarks?q=https%3A%2F%2Fexample.com%2Fdoc") + ); + } + + #[test] + fn normalize_memo_record_maps_tags_and_dates() { + let record = serde_json::json!({ + "name": "memos/G3BLCD3swV4Fuxk4uMT97d", + "content": "Use `docker inspect`.\n#docker #cmd", + "tags": ["docker", "cmd"], + "createTime": "2024-05-07T00:26:59Z", + "displayTime": "2024-05-17T22:39:50Z" + }); + + let document = normalize_memo_record(&record, "https://memos.example.com").expect("maps"); + + assert_eq!(document.external_id, "memos:G3BLCD3swV4Fuxk4uMT97d"); + assert_eq!( + document.extraction.metadata.url.as_deref(), + Some("https://memos.example.com/api/v1/memos/G3BLCD3swV4Fuxk4uMT97d") + ); + assert_eq!( + document.platform_url.as_deref(), + Some("https://memos.example.com/api/v1/memos/G3BLCD3swV4Fuxk4uMT97d") + ); + assert_eq!( + document.extraction.metadata.published_date.as_deref(), + Some("2024-05-17T22:39:50Z") + ); + assert_eq!( + document.extraction.metadata.technologies, + vec!["docker".to_string(), "cmd".to_string()] + ); + } + + #[test] + fn normalize_bytestash_record_maps_fragments_and_categories() { + let record = serde_json::json!({ + "id": "abc123", + "title": "docker env", + "description": "Inspect container env vars", + "language": "bash", + "categories": ["docker", "ops"], + "fragments": [ + { + "fileName": "inspect.sh", + "code": "docker inspect my-container | jq '.[0].Config.Env'" + } + ] + }); + + let document = + normalize_bytestash_record(&record, "https://stash.example.com").expect("maps"); + + assert_eq!(document.external_id, "bytestash:abc123"); + assert_eq!( + document.extraction.metadata.url.as_deref(), + Some("https://stash.example.com/api/snippets/abc123") + ); + assert_eq!( + document.extraction.metadata.language.as_deref(), + Some("bash") + ); + assert_eq!( + document.extraction.metadata.technologies, + vec!["docker".to_string(), "ops".to_string()] + ); + 
assert!(document.extraction.content.markdown.contains("```bash\n")); + } + + #[test] + fn normalize_paperless_record_maps_tag_ids_and_correspondent() { + let record = serde_json::json!({ + "id": 6, + "title": "Alcohol Server Training Certificate", + "content": "SERVSAFE certificate text", + "tags": [1, 4], + "correspondent": 9, + "created": "2026-04-01" + }); + let tag_names = HashMap::from([ + (1_u64, "paperless".to_string()), + (4_u64, "certification".to_string()), + ]); + let correspondent_names = HashMap::from([(9_u64, "ServSafe".to_string())]); + + let document = normalize_paperless_record( + &record, + &tag_names, + &correspondent_names, + "https://paperless.example.com", + ) + .expect("maps"); + + assert_eq!(document.external_id, "paperless:6"); + assert_eq!( + document.extraction.metadata.url.as_deref(), + Some("https://paperless.example.com/api/documents/6/") + ); + assert_eq!( + document.extraction.metadata.author.as_deref(), + Some("ServSafe") + ); + assert_eq!( + document.extraction.metadata.published_date.as_deref(), + Some("2026-04-01") + ); + assert_eq!( + document.extraction.metadata.technologies, + vec!["paperless".to_string(), "certification".to_string()] + ); + } + + #[tokio::test] + async fn write_bridge_document_skips_unchanged_payloads() { + let temp = tempfile::tempdir().expect("tempdir"); + let document = BridgeDocument { + source: McpSource::Linkding, + external_id: "linkding:441".to_string(), + platform_url: Some( + "https://ding.tootie.tv/bookmarks?q=https%3A%2F%2Fpipenet.dev%2F".to_string(), + ), + extraction: io::build_extraction( + "https://pipenet.dev/".to_string(), + Some("pipenet".to_string()), + Some("2026-02-02T15:23:27.821564-05:00".to_string()), + None, + None, + vec!["mcp".to_string()], + "# pipenet".to_string(), + "pipenet".to_string(), + ), + }; + + let first = write_bridge_document(temp.path(), &document) + .await + .expect("first write"); + let second = write_bridge_document(temp.path(), &document) + .await + 
.expect("second write"); + + assert_eq!(first, WriteStatus::Written); + assert_eq!(second, WriteStatus::Unchanged); + } + + #[tokio::test] + async fn sync_linkding_pages_and_writes_documents() { + let temp = tempfile::tempdir().expect("tempdir"); + let executor = MockExecutor::with_responses(vec![ + Ok(serde_json::json!({ + "ok": true, + "data": { + "results": [ + { + "id": 1, + "url": "https://example.com/one", + "title": "One", + "description": "", + "notes": "", + "tag_names": ["rust"], + "date_added": "2026-04-01T00:00:00Z" + } + ], + "next": "https://ding.example/api/bookmarks/?limit=1&offset=1" + } + })), + Ok(serde_json::json!({ + "ok": true, + "data": { + "results": [ + { + "id": 2, + "url": "https://example.com/two", + "title": "Two", + "description": "Second", + "notes": "", + "tag_names": [], + "date_added": "2026-04-02T00:00:00Z" + } + ], + "next": null + } + })), + ]); + let bridge = McpBridge::new( + executor.clone(), + BridgeConfig { + server: "lab".to_string(), + watch_dir: temp.path().to_path_buf(), + page_size: 1, + platform_base_url: Some("https://ding.example".to_string()), + }, + ); + + let report = bridge.sync(McpSource::Linkding).await.expect("sync"); + + assert_eq!( + report, + SyncReport { + fetched: 2, + written: 2, + skipped: 0 + } + ); + let calls = executor.calls(); + assert_eq!(calls.len(), 2); + assert_eq!(calls[0].0, "lab"); + assert_eq!(calls[0].1, McpSource::Linkding); + assert_eq!(calls[0].2, "bookmark.list"); + assert_eq!(calls[0].3, serde_json::json!({ "limit": 1, "offset": 0 })); + assert_eq!(calls[1].3, serde_json::json!({ "limit": 1, "offset": 1 })); + assert!( + temp.path() + .join(relative_output_path(McpSource::Linkding, "linkding:1")) + .exists() + ); + assert!( + temp.path() + .join(relative_output_path(McpSource::Linkding, "linkding:2")) + .exists() + ); + } + + #[tokio::test] + async fn sync_linkding_stops_when_pagination_repeats_same_records() { + let temp = tempfile::tempdir().expect("tempdir"); + let executor = 
MockExecutor::with_responses(vec![ + Ok(serde_json::json!({ + "ok": true, + "data": { + "results": [ + { + "id": 1, + "url": "https://example.com/one", + "title": "One", + "description": "", + "notes": "", + "tag_names": [], + "date_added": "2026-04-01T00:00:00Z" + } + ], + "next": "https://ding.example/api/bookmarks/?limit=1&offset=1" + } + })), + Ok(serde_json::json!({ + "ok": true, + "data": { + "results": [ + { + "id": 1, + "url": "https://example.com/one", + "title": "One", + "description": "", + "notes": "", + "tag_names": [], + "date_added": "2026-04-01T00:00:00Z" + } + ], + "next": "https://ding.example/api/bookmarks/?limit=1&offset=2" + } + })), + ]); + let bridge = McpBridge::new( + executor.clone(), + BridgeConfig { + server: "lab".to_string(), + watch_dir: temp.path().to_path_buf(), + page_size: 1, + platform_base_url: Some("https://ding.example".to_string()), + }, + ); + + let report = bridge.sync(McpSource::Linkding).await.expect("sync"); + + assert_eq!( + report, + SyncReport { + fetched: 1, + written: 1, + skipped: 0 + } + ); + let calls = executor.calls(); + assert_eq!(calls.len(), 2); + assert!( + temp.path() + .join(relative_output_path(McpSource::Linkding, "linkding:1")) + .exists() + ); + } + + #[tokio::test] + async fn sync_memos_uses_page_tokens_and_writes_documents() { + let temp = tempfile::tempdir().expect("tempdir"); + let executor = MockExecutor::with_responses(vec![ + Ok(serde_json::json!({ + "ok": true, + "data": { + "memos": [ + { + "name": "memos/first", + "content": "first memo", + "tags": ["ops"], + "displayTime": "2026-04-01T00:00:00Z" + } + ], + "nextPageToken": "page-2" + } + })), + Ok(serde_json::json!({ + "ok": true, + "data": { + "memos": [ + { + "name": "memos/second", + "content": "second memo", + "tags": [], + "createTime": "2026-04-02T00:00:00Z" + } + ], + "nextPageToken": "" + } + })), + ]); + let bridge = McpBridge::new( + executor.clone(), + BridgeConfig { + server: "lab".to_string(), + watch_dir: 
temp.path().to_path_buf(), + page_size: 1, + platform_base_url: Some("https://memos.example.com".to_string()), + }, + ); + + let report = bridge.sync(McpSource::Memos).await.expect("sync memos"); + + assert_eq!( + report, + SyncReport { + fetched: 2, + written: 2, + skipped: 0 + } + ); + let calls = executor.calls(); + assert_eq!(calls.len(), 2); + assert_eq!(calls[0].2, "memos.list"); + assert_eq!(calls[0].3, serde_json::json!({ "page_size": 1 })); + assert_eq!( + calls[1].3, + serde_json::json!({ "page_size": 1, "page_token": "page-2" }) + ); + assert!( + temp.path() + .join(relative_output_path(McpSource::Memos, "memos:first")) + .exists() + ); + assert!( + temp.path() + .join(relative_output_path(McpSource::Memos, "memos:second")) + .exists() + ); + } + + #[tokio::test] + async fn sync_memos_stops_when_next_page_token_repeats() { + let temp = tempfile::tempdir().expect("tempdir"); + let executor = MockExecutor::with_responses(vec![ + Ok(serde_json::json!({ + "ok": true, + "data": { + "memos": [ + { + "name": "memos/first", + "content": "first memo", + "tags": [] + } + ], + "nextPageToken": "page-2" + } + })), + Ok(serde_json::json!({ + "ok": true, + "data": { + "memos": [ + { + "name": "memos/second", + "content": "second memo", + "tags": [] + } + ], + "nextPageToken": "page-2" + } + })), + ]); + let bridge = McpBridge::new( + executor.clone(), + BridgeConfig { + server: "lab".to_string(), + watch_dir: temp.path().to_path_buf(), + page_size: 1, + platform_base_url: Some("https://memos.example.com".to_string()), + }, + ); + + let report = bridge.sync(McpSource::Memos).await.expect("sync memos"); + + assert_eq!( + report, + SyncReport { + fetched: 2, + written: 2, + skipped: 0 + } + ); + assert_eq!(executor.calls().len(), 2); + } + + #[tokio::test] + async fn sync_bytestash_accepts_array_payloads() { + let temp = tempfile::tempdir().expect("tempdir"); + let executor = MockExecutor::with_responses(vec![Ok(serde_json::json!({ + "ok": true, + "data": [ + { + "id": 
"snippet-1", + "title": "docker env", + "description": "Inspect container env vars", + "language": "bash", + "categories": ["docker"], + "fragments": [ + { + "fileName": "inspect.sh", + "code": "docker inspect app" + } + ] + } + ] + }))]); + let bridge = McpBridge::new( + executor.clone(), + BridgeConfig { + server: "lab".to_string(), + watch_dir: temp.path().to_path_buf(), + page_size: 50, + platform_base_url: Some("https://stash.example.com".to_string()), + }, + ); + + let report = bridge + .sync(McpSource::Bytestash) + .await + .expect("sync bytestash"); + + assert_eq!( + report, + SyncReport { + fetched: 1, + written: 1, + skipped: 0 + } + ); + let calls = executor.calls(); + assert_eq!(calls.len(), 1); + assert_eq!(calls[0].2, "snippets.list"); + assert_eq!(calls[0].3, serde_json::json!({})); + assert!( + temp.path() + .join(relative_output_path( + McpSource::Bytestash, + "bytestash:snippet-1" + )) + .exists() + ); + } + + #[tokio::test] + async fn sync_paperless_resolves_lookup_tables_before_documents() { + let temp = tempfile::tempdir().expect("tempdir"); + let executor = MockExecutor::with_responses(vec![ + Ok(serde_json::json!({ + "ok": true, + "data": { + "results": [ + { "id": 1, "name": "finance" } + ] + } + })), + Ok(serde_json::json!({ + "ok": true, + "data": { + "results": [ + { "id": 9, "name": "Bank" } + ] + } + })), + Ok(serde_json::json!({ + "ok": true, + "data": { + "results": [ + { + "id": 17, + "title": "Statement", + "content": "balance: 100", + "tags": [1], + "correspondent": 9, + "created": "2026-04-01" + } + ], + "next": null + } + })), + ]); + let bridge = McpBridge::new( + executor.clone(), + BridgeConfig { + server: "lab".to_string(), + watch_dir: temp.path().to_path_buf(), + page_size: 100, + platform_base_url: Some("https://paperless.example.com".to_string()), + }, + ); + + let report = bridge + .sync(McpSource::Paperless) + .await + .expect("sync paperless"); + + assert_eq!( + report, + SyncReport { + fetched: 1, + written: 1, + 
skipped: 0 + } + ); + let calls = executor.calls(); + assert_eq!(calls.len(), 3); + assert_eq!(calls[0].2, "tags.list"); + assert_eq!(calls[1].2, "correspondents.list"); + assert_eq!(calls[2].2, "documents.list"); + assert_eq!( + calls[2].3, + serde_json::json!({ "page_size": 100, "page": 1 }) + ); + assert!( + temp.path() + .join(relative_output_path(McpSource::Paperless, "paperless:17")) + .exists() + ); + } +} diff --git a/crates/noxa-rag/src/mcp_bridge/paperless.rs b/crates/noxa-rag/src/mcp_bridge/paperless.rs new file mode 100644 index 0000000..8104806 --- /dev/null +++ b/crates/noxa-rag/src/mcp_bridge/paperless.rs @@ -0,0 +1,127 @@ +use std::collections::HashMap; + +use serde_json::Value; + +use crate::RagError; + +use super::{ + BridgeDocument, McpBridge, McpSource, McporterExecutor, SyncReport, WriteStatus, + array_field, as_u64, as_u64_value, join_base_url, optional_string, required_base_url, + required_value, + io::{build_extraction, write_bridge_document}, +}; + +impl McpBridge +where + E: McporterExecutor, +{ + pub(super) async fn sync_paperless(&self) -> Result { + let base_url = required_base_url(&self.config, McpSource::Paperless)?; + let tag_names = self.fetch_paperless_lookup("tags.list").await?; + let correspondent_names = self.fetch_paperless_lookup("correspondents.list").await?; + let mut report = SyncReport::default(); + let mut page = 1_u32; + + loop { + let data = self + .call_data( + McpSource::Paperless, + "documents.list", + serde_json::json!({ + "page_size": self.config.page_size, + "page": page, + }), + ) + .await?; + let records = array_field(&data, "results")?; + if records.is_empty() { + break; + } + + for record in records { + let document = + normalize_paperless_record(record, &tag_names, &correspondent_names, base_url)?; + report.fetched += 1; + match write_bridge_document(&self.config.watch_dir, &document).await? 
{ + WriteStatus::Written => report.written += 1, + WriteStatus::Unchanged => report.skipped += 1, + } + } + + if data.get("next").is_none() || data.get("next").is_some_and(Value::is_null) { + break; + } + page = page.saturating_add(1); + } + + Ok(report) + } + + pub(super) async fn fetch_paperless_lookup( + &self, + action: &str, + ) -> Result, RagError> { + let data = self + .call_data(McpSource::Paperless, action, serde_json::json!({})) + .await?; + let items = if let Some(array) = data.as_array() { + array.iter().collect::>() + } else { + array_field(&data, "results")? + }; + let mut lookup = HashMap::new(); + for item in items { + let Some(id) = item.get("id").and_then(as_u64) else { + continue; + }; + let Some(name) = item.get("name").and_then(Value::as_str) else { + continue; + }; + lookup.insert(id, name.to_string()); + } + Ok(lookup) + } +} + +pub fn normalize_paperless_record( + record: &Value, + tag_names: &HashMap, + correspondent_names: &HashMap, + platform_base_url: &str, +) -> Result { + let id = required_value(record, "id").and_then(as_u64_value)?; + let tags = record + .get("tags") + .and_then(Value::as_array) + .map(|items| { + items + .iter() + .filter_map(as_u64) + .filter_map(|value| tag_names.get(&value).cloned()) + .collect::>() + }) + .unwrap_or_default(); + let author = record + .get("correspondent") + .and_then(as_u64) + .and_then(|value| correspondent_names.get(&value).cloned()); + let title = optional_string(record, "title"); + let content = optional_string(record, "content").unwrap_or_default(); + let url = join_base_url(platform_base_url, &format!("/api/documents/{id}/"))?; + + Ok(BridgeDocument { + source: McpSource::Paperless, + external_id: format!("paperless:{id}"), + platform_url: Some(url.clone()), + extraction: build_extraction( + url, + title, + optional_string(record, "created").or_else(|| optional_string(record, "created_date")), + author, + None, + tags, + content.clone(), + content, + ), + }) +} From 
cf7eed21e1512d5273f25875d1ebd9734fd4eda1 Mon Sep 17 00:00:00 2001 From: Jacob Magar Date: Sun, 26 Apr 2026 19:23:47 -0400 Subject: [PATCH 03/23] fix(noxa-bkq): change upsert to wait=true to ensure ordering with delete_stale_by_url --- .../noxa-rag/src/store/qdrant/vector_store.rs | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/crates/noxa-rag/src/store/qdrant/vector_store.rs b/crates/noxa-rag/src/store/qdrant/vector_store.rs index cb32a0c..a335621 100644 --- a/crates/noxa-rag/src/store/qdrant/vector_store.rs +++ b/crates/noxa-rag/src/store/qdrant/vector_store.rs @@ -38,15 +38,22 @@ async fn check_response( #[async_trait] impl VectorStore for QdrantStore { - /// PUT /collections/{name}/points?wait=false — idempotent with deterministic - /// UUID v5 point IDs, so we do not block on WAL+index flush. This saves the - /// ~30ms RTT per upsert that `wait=true` imposes. Delete operations still - /// use `wait=true` because stale-chunk cleanup must observably complete - /// before the URL-lock is released. + /// PUT /collections/{name}/points?wait=true — we use wait=true here even + /// though the deterministic UUID v5 IDs make the upsert idempotent, because + /// `delete_stale_by_url` runs immediately after with `must_not: has_id + /// [new_ids]`. With wait=false, Qdrant returns `Acknowledged` once the + /// operation is written to WAL but *before* it is applied to segments. + /// The delete filter evaluates against current segment state, so new point + /// IDs that are WAL-only are not yet visible — meaning the `has_id` guard + /// could fail to protect them if they were somehow targeted. Qdrant's docs + /// do not guarantee that subsequent requests see wait=false WAL entries + /// before segment flush (no per-connection ordering guarantee is documented). + /// wait=true ensures the upserted points are committed to segments before + /// the delete filter runs, closing the race at the cost of ~30ms RTT. 
async fn upsert(&self, points: Vec) -> Result { let n = points.len(); let url = format!( - "{}/collections/{}/points?wait=false", + "{}/collections/{}/points?wait=true", self.base_url, self.collection ); From 1b46471e67a952d96fbec3470a708550a6346252 Mon Sep 17 00:00:00 2001 From: Jacob Magar Date: Sun, 26 Apr 2026 19:26:18 -0400 Subject: [PATCH 04/23] fix(noxa-5gf): add measured decompression cap to ODT/PPTX in parse_office_zip_file Replace bare entry.read_to_string() in the ODT/PPTX branch with the same two-pass pattern already used by the DOCX branch: advisory entry.size() checks remain as a fast-path, but actual reads are now capped by .take(per_entry_cap+1) and measured against MAX_ODT_PPTX_PER_ENTRY_BYTES (10 MiB) and MAX_ODT_PPTX_TOTAL_BYTES (50 MiB cumulative). A crafted zip that declares tiny central-directory sizes but expands to gigabytes is now rejected before allocating unbounded memory. Tests: odt_decompression_bomb_rejected_by_measured_guard, odt_cumulative_decompression_bomb_rejected, odt_small_legitimate_file_parses_ok, docx_entry_count_bomb_rejected --- crates/noxa-rag/src/pipeline/parse/binary.rs | 189 ++++++++++++++++++- 1 file changed, 187 insertions(+), 2 deletions(-) diff --git a/crates/noxa-rag/src/pipeline/parse/binary.rs b/crates/noxa-rag/src/pipeline/parse/binary.rs index 10f6df6..7cf2a55 100644 --- a/crates/noxa-rag/src/pipeline/parse/binary.rs +++ b/crates/noxa-rag/src/pipeline/parse/binary.rs @@ -32,6 +32,10 @@ pub(crate) fn parse_office_zip_file( // stream (NOT the attacker-controlled central directory size field). This defends // against zip bombs that lie about entry.size() in the central directory. const MAX_DOCX_EXTRACTED_BYTES: u64 = 50 * 1024 * 1024; + // Per-entry and cumulative measured caps for ODT/PPTX. + // Named separately from the DOCX constants so each format's limit is explicit. 
+ const MAX_ODT_PPTX_PER_ENTRY_BYTES: u64 = 10 * 1024 * 1024; // 10 MiB per XML entry + const MAX_ODT_PPTX_TOTAL_BYTES: u64 = 50 * 1024 * 1024; // 50 MiB cumulative let cursor = std::io::Cursor::new(bytes); let mut archive = zip::ZipArchive::new(cursor) @@ -117,17 +121,23 @@ pub(crate) fn parse_office_zip_file( let mut text_parts: Vec = Vec::new(); let mut slide_count = 0u32; let mut has_notes = false; + // Authoritative measured total for ODT/PPTX decompressed bytes. + // This is incremented from the actual read count, NOT the advisory entry.size(). + let mut odt_pptx_measured_total: u64 = 0; for i in 0..archive.len() { let mut entry = archive .by_index(i) .map_err(|e| RagError::Parse(format!("{ext} entry {i}: {e}")))?; + + // Advisory pre-checks using the central directory declared size. + // These are fast-path guards only — the central directory is attacker-controlled + // so they must be backed up by the measured read below. total_uncompressed_size = total_uncompressed_size.saturating_add(entry.size()); if total_uncompressed_size > MAX_TOTAL_UNCOMPRESSED_SIZE { return Err(RagError::Parse(format!( "{ext}: archive expands to more than 250 MiB — possible zip bomb" ))); } - if entry.size() > MAX_ENTRY_SIZE { return Err(RagError::Parse(format!( "{ext}: entry '{}' decompresses to {} bytes (max 100 MiB) — possible zip bomb", @@ -150,10 +160,27 @@ pub(crate) fn parse_office_zip_file( continue; } + // Authoritative measured guard: cap the actual decompression stream. + // A crafted file can declare a tiny entry.size() in the central directory + // while the actual decompressed content expands to gigabytes. We read at most + // (cap + 1) bytes so we can distinguish "exactly at budget" from "overran". 
+ let remaining = MAX_ODT_PPTX_TOTAL_BYTES.saturating_sub(odt_pptx_measured_total); + let per_entry_cap = MAX_ODT_PPTX_PER_ENTRY_BYTES.min(remaining); + let read_cap = per_entry_cap.saturating_add(1); let mut xml_buf = String::new(); - entry + (&mut entry) + .take(read_cap) .read_to_string(&mut xml_buf) .map_err(|e| RagError::Parse(format!("{ext} read '{name}': {e}")))?; + let read_bytes = xml_buf.len() as u64; + if read_bytes > per_entry_cap { + return Err(RagError::Parse(format!( + "{ext}: entry '{name}' exceeds decompression limit ({} MiB per entry, {} MiB total) — possible zip bomb", + MAX_ODT_PPTX_PER_ENTRY_BYTES / (1024 * 1024), + MAX_ODT_PPTX_TOTAL_BYTES / (1024 * 1024), + ))); + } + odt_pptx_measured_total = odt_pptx_measured_total.saturating_add(read_bytes); let fragment = super::extract_xml_text(&xml_buf).unwrap_or_else(|_| xml_buf.clone()); if !fragment.trim().is_empty() { @@ -181,3 +208,161 @@ pub(crate) fn parse_office_zip_file( provenance, }) } + +#[cfg(test)] +mod tests { + use super::*; + + /// Build an in-memory zip archive with a single entry. + /// `entry_name`: path inside the zip (e.g. "content.xml") + /// `content`: raw bytes to store (Deflated, which compresses repeated data heavily) + fn make_zip_with_entry(entry_name: &str, content: &[u8]) -> Vec { + use std::io::Write; + let buf = std::io::Cursor::new(Vec::new()); + let mut writer = zip::ZipWriter::new(buf); + let options = zip::write::SimpleFileOptions::default() + .compression_method(zip::CompressionMethod::Deflated); + writer.start_file(entry_name, options).expect("start_file"); + writer.write_all(content).expect("write_all"); + writer.finish().expect("finish").into_inner() + } + + /// Verify that the DOCX guard is untouched: the existing DOCX path compiles and + /// the entry-count bomb check fires correctly (the measured guard is tested by DOCX's + /// own test suite; we just ensure structural integrity here). 
+ #[test] + fn docx_entry_count_bomb_rejected() { + // Build a zip with MAX_ENTRIES+1 (1001) empty entries as a DOCX. + use std::io::Write; + let buf = std::io::Cursor::new(Vec::new()); + let mut writer = zip::ZipWriter::new(buf); + let options = zip::write::SimpleFileOptions::default() + .compression_method(zip::CompressionMethod::Stored); + for i in 0..1001usize { + writer + .start_file(format!("file{i}.txt"), options) + .expect("start_file"); + writer.write_all(b"").expect("write_all"); + } + let bytes = writer.finish().expect("finish").into_inner(); + + let result = parse_office_zip_file( + &bytes, + "file:///test.docx".to_string(), + "test".to_string(), + "docx", + ); + assert!( + result.is_err(), + "expected error for zip with 1001 entries, got Ok" + ); + let msg = result.unwrap_err().to_string(); + assert!( + msg.contains("zip bomb") || msg.contains("entries"), + "unexpected error: {msg}" + ); + } + + /// Core regression for noxa-5gf: an ODT file whose single content.xml entry + /// expands to more than MAX_ODT_PPTX_PER_ENTRY_BYTES (10 MiB) must be rejected + /// via the *measured* decompression guard — even if the advisory entry.size() + /// value in the central directory declares a small size. + /// + /// We use a highly compressible XML payload (11 MiB of repeated ASCII) so the + /// in-memory zip is only ~40 KiB but decompresses beyond the 10 MiB cap. + #[test] + fn odt_decompression_bomb_rejected_by_measured_guard() { + // 11 MiB of valid-ish XML content — highly compressible. + const BOMB_SIZE: usize = 11 * 1024 * 1024; + let xml_content: String = std::iter::repeat("AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA\n") + .flat_map(|s| s.chars()) + .take(BOMB_SIZE) + .collect(); + + // The zip crate writes the actual uncompressed size into the central directory, + // so entry.size() will be 11 MiB. 
The advisory check (MAX_ENTRY_SIZE = 100 MiB) + // would NOT fire here — only the measured per-entry cap (10 MiB) fires. + let zip_bytes = make_zip_with_entry("content.xml", xml_content.as_bytes()); + + let result = parse_office_zip_file( + &zip_bytes, + "file:///test.odt".to_string(), + "Test Document".to_string(), + "odt", + ); + + assert!( + result.is_err(), + "expected Err for ODT entry exceeding 10 MiB measured cap, got Ok" + ); + let msg = result.unwrap_err().to_string(); + assert!( + msg.contains("zip bomb") || msg.contains("decompression limit"), + "expected zip bomb error message, got: {msg}" + ); + } + + /// Verify the cumulative measured cap fires when multiple entries each stay + /// under the per-entry cap but together exceed MAX_ODT_PPTX_TOTAL_BYTES (50 MiB). + #[test] + fn odt_cumulative_decompression_bomb_rejected() { + // 6 entries × 9 MiB each = 54 MiB total > 50 MiB limit. + // Each 9 MiB entry is under the 10 MiB per-entry cap, but cumulative > 50 MiB. + const ENTRIES: usize = 6; + const ENTRY_SIZE: usize = 9 * 1024 * 1024; + + use std::io::Write; + let buf = std::io::Cursor::new(Vec::new()); + let mut writer = zip::ZipWriter::new(buf); + let options = zip::write::SimpleFileOptions::default() + .compression_method(zip::CompressionMethod::Deflated); + + let xml_chunk: String = std::iter::repeat("BBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBB\n") + .flat_map(|s| s.chars()) + .take(ENTRY_SIZE) + .collect(); + + for i in 0..ENTRIES { + // All entries contain "content" in the name to pass the target_prefix filter. 
+ writer + .start_file(format!("content{i}.xml"), options) + .expect("start_file"); + writer + .write_all(xml_chunk.as_bytes()) + .expect("write_all"); + } + let zip_bytes = writer.finish().expect("finish").into_inner(); + + let result = parse_office_zip_file( + &zip_bytes, + "file:///test.odt".to_string(), + "Test Document".to_string(), + "odt", + ); + + assert!( + result.is_err(), + "expected Err for cumulative ODT entries exceeding 50 MiB measured cap, got Ok" + ); + let msg = result.unwrap_err().to_string(); + assert!( + msg.contains("zip bomb") || msg.contains("decompression limit"), + "expected zip bomb error message, got: {msg}" + ); + } + + /// A legitimate small ODT must parse successfully. + #[test] + fn odt_small_legitimate_file_parses_ok() { + let xml_content = r#"Hello world"#; + let zip_bytes = make_zip_with_entry("content.xml", xml_content.as_bytes()); + + let result = parse_office_zip_file( + &zip_bytes, + "file:///test.odt".to_string(), + "Test".to_string(), + "odt", + ); + assert!(result.is_ok(), "expected Ok for small ODT, got: {result:?}"); + } +} From d11219b8a4f3cf7258a007400bda1bf5f7e6594a Mon Sep 17 00:00:00 2001 From: Jacob Magar Date: Sun, 26 Apr 2026 19:26:29 -0400 Subject: [PATCH 05/23] fix(noxa-gs8): replace double tokenization with word_count approximation in chunk() The token_estimate diagnostic field was previously computed by calling tokenizer.encode() on every emitted chunk, doubling tokenizer invocations on the spawn_blocking hot path (the splitter already runs the tokenizer internally via ChunkConfig::with_sizer for accurate boundary placement). ChunkCharIndex in text-splitter 0.25 does not expose a token count field, so the fix replaces token_estimate(&text, tokenizer) with word_count(&text). The splitter's boundary accuracy is unchanged; only the diagnostic metadata field now uses a word-count approximation. Also removes the now-unused token_estimate() helper to keep clippy clean. 
Tests: chunk_token_estimate_uses_word_count_not_tokenizer, chunk_empty_content_returns_empty --- crates/noxa-rag/src/chunker.rs | 151 +++++++++++++++++++++++++++++++-- 1 file changed, 142 insertions(+), 9 deletions(-) diff --git a/crates/noxa-rag/src/chunker.rs b/crates/noxa-rag/src/chunker.rs index 9aa191b..e79699b 100644 --- a/crates/noxa-rag/src/chunker.rs +++ b/crates/noxa-rag/src/chunker.rs @@ -42,14 +42,6 @@ fn extract_domain(url: &str) -> String { .unwrap_or_default() } -/// Approximate token count — use the tokenizer when possible, fall back to word count. -fn token_estimate(text: &str, tokenizer: &Tokenizer) -> usize { - tokenizer - .encode(text, false) - .map(|enc| enc.len()) - .unwrap_or_else(|_| text.split_whitespace().count()) -} - /// Chunk an `ExtractionResult` into a `Vec`. /// /// - Uses `content.markdown` if non-empty, otherwise `content.plain_text`. @@ -57,6 +49,12 @@ fn token_estimate(text: &str, tokenizer: &Tokenizer) -> usize { /// - Uses `ChunkConfig::with_overlap()` for sliding-window overlap (built into text-splitter ≥0.25). /// - Filters chunks below `config.min_words`. /// - Caps output at `config.max_chunks_per_page`. +/// +/// # Token estimate +/// The `Chunk::token_estimate` field is populated with a word-count approximation. +/// The tokenizer is still used exclusively by the splitter for accurate boundary placement +/// via `ChunkConfig::with_sizer`; re-encoding every emitted chunk would halve throughput +/// on the `spawn_blocking` hot path for no practical gain (the field is diagnostic only). pub fn chunk( result: &ExtractionResult, config: &ChunkerConfig, @@ -112,7 +110,10 @@ pub fn chunk( .into_iter() .enumerate() .map(|(chunk_index, (char_offset, text))| { - let t_est = token_estimate(&text, tokenizer); + // Use word count as a cheap approximation for the diagnostic token_estimate + // field. 
The tokenizer is already used by the splitter for boundary accuracy; + // re-encoding every chunk here would double tokenization work on the hot path. + let t_est = word_count(&text); let section_header = nearest_heading(&headings, char_offset).map(|s| s.to_string()); Chunk { text, @@ -131,6 +132,86 @@ pub fn chunk( #[cfg(test)] mod tests { use super::*; + use noxa_core::types::{Content, Metadata}; + use std::str::FromStr as _; + + fn make_extraction_result(markdown: &str) -> ExtractionResult { + ExtractionResult { + metadata: Metadata { + title: None, + description: None, + author: None, + published_date: None, + language: None, + url: Some("https://example.com/test".to_string()), + site_name: None, + image: None, + favicon: None, + word_count: markdown.split_whitespace().count(), + content_hash: None, + source_type: None, + file_path: None, + last_modified: None, + is_truncated: None, + technologies: Vec::new(), + seed_url: None, + crawl_depth: None, + search_query: None, + fetched_at: None, + }, + content: Content { + markdown: markdown.to_string(), + plain_text: String::new(), + links: Vec::new(), + images: Vec::new(), + code_blocks: Vec::new(), + raw_html: None, + }, + domain_data: None, + vertical_data: None, + structured_data: Vec::new(), + } + } + + /// Build a minimal whitespace-pretokenized WordLevel tokenizer suitable for + /// unit tests. Every word becomes a distinct token (unk token used for anything + /// not in the small vocab), which is sufficient for splitter boundary logic. + fn make_test_tokenizer() -> Tokenizer { + // A minimal valid tokenizer JSON: WordLevel model with whitespace pre-tokenizer. + // Using from_str avoids the ahash::AHashMap type constraint on WordLevelBuilder::vocab + // and the TokenizerImpl→Tokenizer conversion from TokenizerBuilder. 
+ let json = serde_json::json!({ + "version": "1.0", + "truncation": null, + "padding": null, + "added_tokens": [], + "normalizer": null, + "pre_tokenizer": { + "type": "Whitespace" + }, + "post_processor": null, + "decoder": null, + "model": { + "type": "WordLevel", + "vocab": { + "[UNK]": 0, + "the": 1, + "and": 2, + "of": 3, + "a": 4, + "to": 5, + "in": 6, + "is": 7, + "it": 8, + "that": 9 + }, + "unk_token": "[UNK]" + } + }); + json.to_string() + .parse::() + .expect("test tokenizer from JSON") + } #[test] fn domain_extraction() { @@ -148,4 +229,56 @@ mod tests { assert_eq!(word_count(" "), 0); assert_eq!(word_count(""), 0); } + + /// Verify that no double-tokenization occurs: token_estimate is populated with + /// word_count (not from the tokenizer), and chunks are produced for non-trivial input. + #[test] + fn chunk_token_estimate_uses_word_count_not_tokenizer() { + let tokenizer = make_test_tokenizer(); + let config = crate::config::ChunkerConfig { + target_tokens: 50, + overlap_tokens: 0, + min_words: 1, + max_chunks_per_page: 200, + }; + + // A body of text long enough to produce at least one chunk. + let body = (0..200) + .map(|i| format!("word{i}")) + .collect::>() + .join(" "); + let result = make_extraction_result(&body); + let chunks = chunk(&result, &config, &tokenizer); + + assert!( + !chunks.is_empty(), + "expected at least one chunk for a 200-word body" + ); + + for c in &chunks { + // token_estimate must be populated (> 0 for any non-empty chunk). + assert!( + c.token_estimate > 0, + "token_estimate must be > 0, got {}", + c.token_estimate + ); + // token_estimate == word_count of the chunk text (the word_count approximation). + let expected = word_count(&c.text); + assert_eq!( + c.token_estimate, expected, + "token_estimate should equal word_count for chunk at index {}", + c.chunk_index + ); + } + } + + /// Confirm empty content returns no chunks. 
+ #[test] + fn chunk_empty_content_returns_empty() { + let tokenizer = make_test_tokenizer(); + let config = crate::config::ChunkerConfig::default(); + let result = make_extraction_result(""); + let chunks = chunk(&result, &config, &tokenizer); + assert!(chunks.is_empty()); + } } From c9c9b150975a51e5c785ccfd956af62ae531840d Mon Sep 17 00:00:00 2001 From: Jacob Magar Date: Sun, 26 Apr 2026 19:30:59 -0400 Subject: [PATCH 06/23] refactor(noxa-mqm): add record_parse_failure() to SessionCounters, remove direct field access --- crates/noxa-rag/src/pipeline.rs | 5 +++++ crates/noxa-rag/src/pipeline/process.rs | 3 +-- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/crates/noxa-rag/src/pipeline.rs b/crates/noxa-rag/src/pipeline.rs index a696fee..73abcdf 100644 --- a/crates/noxa-rag/src/pipeline.rs +++ b/crates/noxa-rag/src/pipeline.rs @@ -75,6 +75,11 @@ impl SessionCounters { self.files_failed .fetch_add(1, std::sync::atomic::Ordering::Relaxed); } + + fn record_parse_failure(&self) { + self.parse_failures + .fetch_add(1, std::sync::atomic::Ordering::Relaxed); + } } struct IndexJob { diff --git a/crates/noxa-rag/src/pipeline/process.rs b/crates/noxa-rag/src/pipeline/process.rs index 0ab1725..7cf7b8b 100644 --- a/crates/noxa-rag/src/pipeline/process.rs +++ b/crates/noxa-rag/src/pipeline/process.rs @@ -1,6 +1,5 @@ use std::path::Path; use std::sync::Arc; -use std::sync::atomic::Ordering; use tokio::io::AsyncReadExt; @@ -64,7 +63,7 @@ pub(crate) async fn validate_url_scheme(url: &str) -> Result<(), RagError> { async fn append_failed_job(path: &Path, error: &impl std::fmt::Display, ctx: &WorkerContext) { // Increment the parse-failure counter regardless of whether a log path is // configured — this ensures the heartbeat metric is always accurate. 
- ctx.counters.parse_failures.fetch_add(1, Ordering::Relaxed); + ctx.counters.record_parse_failure(); let Some(ref log_path) = ctx.config.pipeline.failed_jobs_log else { return; From c9c9b150975a51e5c785ccfd956af62ae531840d Mon Sep 17 00:00:00 2001 From: Jacob Magar Date: Sun, 26 Apr 2026 19:32:14 -0400 Subject: [PATCH 07/23] refactor(noxa-26r): wrap RagConfig in Arc for WorkerContext MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Pipeline::new still accepts RagConfig by value and wraps it with Arc::new internally — call sites are unchanged. WorkerContext.config: Arc<RagConfig> clones the Arc pointer (O(1)) instead of deep-cloning the config value per worker. --- crates/noxa-rag/src/pipeline.rs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/crates/noxa-rag/src/pipeline.rs b/crates/noxa-rag/src/pipeline.rs index 73abcdf..408c94b 100644 --- a/crates/noxa-rag/src/pipeline.rs +++ b/crates/noxa-rag/src/pipeline.rs @@ -127,7 +127,7 @@ struct WorkerContext { embed: DynEmbedProvider, store: DynVectorStore, tokenizer: Arc, - config: RagConfig, + config: Arc<RagConfig>, url_locks: Arc>>>, git_branch_cache: Arc>>, watch_roots: Arc<Vec<PathBuf>>, @@ -158,7 +158,7 @@ impl WorkerContext { } pub struct Pipeline { - config: RagConfig, + config: Arc<RagConfig>, embed: DynEmbedProvider, store: DynVectorStore, tokenizer: Arc, @@ -188,7 +188,7 @@ impl Pipeline { shutdown: CancellationToken, ) -> Self { Self { - config, + config: Arc::new(config), embed, store, tokenizer, From 0d995327357795e374f01f3a9a5a8a4aca178e4e Mon Sep 17 00:00:00 2001 From: Jacob Magar Date: Sun, 26 Apr 2026 19:32:31 -0400 Subject: [PATCH 08/23] fix(noxa-3b8): add 50 MiB size guard to startup_scan_key before JSON deserialization --- crates/noxa-rag/src/pipeline/scan.rs | 60 +++++++++++++++++++++++++++- 1 file changed, 59 insertions(+), 1 deletion(-) diff --git a/crates/noxa-rag/src/pipeline/scan.rs b/crates/noxa-rag/src/pipeline/scan.rs index 11eb96d..b7bdb44 100644 ---
a/crates/noxa-rag/src/pipeline/scan.rs +++ b/crates/noxa-rag/src/pipeline/scan.rs @@ -88,6 +88,13 @@ fn collect_indexable_paths_recursive(path: &Path, found: &mut Vec) { } } +/// Maximum JSON file size that `startup_scan_key` will deserialize to extract a content-hash +/// and URL. Files larger than this are returned as `None` (causing unconditional queueing), +/// which is safe because `process_job` enforces `max_file_size_bytes` independently. +/// +/// Matches the default `max_file_size_bytes` in `PipelineConfig` (50 MiB). +const MAX_SCAN_JSON_BYTES: u64 = 50 * 1024 * 1024; + /// Compute the (content_hash, url) key used by the startup delta scan. /// /// For `.json` ExtractionResult files: peeks at `metadata.url` and `metadata.content_hash` @@ -100,7 +107,10 @@ fn collect_indexable_paths_recursive(path: &Path, found: &mut Vec) { /// /// Returns `None` when the path is a symlink (confinement-safety: avoid following symlinks /// out of the watch roots before `process_job` validates confinement), or when the file -/// metadata cannot be read, or when a file:// URL cannot be constructed. +/// metadata cannot be read, or when a file:// URL cannot be constructed, or when a `.json` +/// file exceeds [`MAX_SCAN_JSON_BYTES`] (preventing unbounded heap allocation before pipeline +/// workers start; the oversized file is still queued unconditionally and rejected by +/// `process_job` under its own `max_file_size_bytes` guard). /// /// Must be called inside `spawn_blocking` — this function reads from disk synchronously. pub(crate) fn startup_scan_key(path: &Path) -> Option<(String, String)> { @@ -114,6 +124,18 @@ pub(crate) fn startup_scan_key(path: &Path) -> Option<(String, String)> { } if path.extension().and_then(|e| e.to_str()) == Some("json") { + // Guard against unbounded heap allocation: serde_json::from_reader will read the + // full JSON value into memory. 
Reuse sym_meta (already fetched above) to avoid an + // extra stat syscall — for non-symlinks, symlink_metadata.len() == metadata.len(). + if sym_meta.len() > MAX_SCAN_JSON_BYTES { + tracing::debug!( + path = %path.display(), + size = sym_meta.len(), + limit = MAX_SCAN_JSON_BYTES, + "startup_scan_key: JSON file exceeds size limit, queuing unconditionally" + ); + return None; + } #[derive(serde::Deserialize)] struct Q { metadata: QM, @@ -516,4 +538,40 @@ mod tests { assert!(hash.starts_with("mtime:"), "fallback hash: {hash}"); assert!(url.starts_with("file://"), "fallback url: {url}"); } + + #[test] + fn startup_scan_key_large_json_returns_none() { + let tmp = tempfile::tempdir().expect("tempdir"); + let file = tmp.path().join("big.json"); + + // Use a sparse file (set_len without writing data) to create a >50 MiB + // file instantly without actually writing 50 MiB to disk. + { + let f = std::fs::File::create(&file).expect("create big.json"); + let over_limit = super::MAX_SCAN_JSON_BYTES + 1; + f.set_len(over_limit).expect("set_len for sparse file"); + } + + // The size guard must trigger before any attempt to deserialize. + assert_eq!( + startup_scan_key(&file), + None, + "JSON file exceeding MAX_SCAN_JSON_BYTES must return None to prevent unbounded heap allocation" + ); + } + + #[test] + fn startup_scan_key_json_at_size_limit_is_processed() { + // A JSON file exactly AT the limit (not over) should still be attempted. + // We write a tiny valid JSON file to verify the check is strictly >. + let tmp = tempfile::tempdir().expect("tempdir"); + let file = tmp.path().join("doc.json"); + let body = r#"{"metadata":{"url":"https://example.com/b","content_hash":"def456"}}"#; + fs::write(&file, body).expect("write json"); + + // Tiny file is well below the limit — must parse normally. 
+ let (hash, url) = startup_scan_key(&file).expect("small json should succeed"); + assert_eq!(hash, "def456"); + assert_eq!(url, "https://example.com/b"); + } } From 87799147ef6fda75d5a4c8a8c007334f9a2adb61 Mon Sep 17 00:00:00 2001 From: Jacob Magar Date: Sun, 26 Apr 2026 19:32:34 -0400 Subject: [PATCH 09/23] perf(noxa-c28): eliminate redundant chunk text clone in TeiProvider::embed --- crates/noxa-rag/src/embed/tei.rs | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/crates/noxa-rag/src/embed/tei.rs b/crates/noxa-rag/src/embed/tei.rs index 7fd9d30..70f90ed 100644 --- a/crates/noxa-rag/src/embed/tei.rs +++ b/crates/noxa-rag/src/embed/tei.rs @@ -318,15 +318,20 @@ impl EmbedProvider for TeiProvider { // Keep EMBED_PIPELINE_DEPTH batches in-flight concurrently so HTTP // round-trip latency overlaps with GPU compute on the TEI server. // buffered() preserves batch ordering. - let batches: Vec<(usize, Vec)> = texts - .chunks(BATCH_SIZE) - .enumerate() - .map(|(i, chunk)| (i, chunk.to_vec())) + // + // We stream (batch_idx, start, end) index tuples and slice `texts` inside + // the async block to avoid cloning the text data into owned Vec batches. 
+ let batch_ranges: Vec<(usize, usize, usize)> = (0..total_batches) + .map(|i| { + let start = i * BATCH_SIZE; + let end = (start + BATCH_SIZE).min(texts.len()); + (i, start, end) + }) .collect(); - let results: Vec>> = futures::stream::iter(batches) - .map(|(batch_idx, batch)| async move { - self.embed_batch_adaptive(&batch, batch_idx, total_batches) + let results: Vec>> = futures::stream::iter(batch_ranges) + .map(|(batch_idx, start, end)| async move { + self.embed_batch_adaptive(&texts[start..end], batch_idx, total_batches) .await }) .buffered(EMBED_PIPELINE_DEPTH) From 5b596877e7a278d1f34c07b110648d2e2f2d798c Mon Sep 17 00:00:00 2001 From: Jacob Magar Date: Sun, 26 Apr 2026 19:33:18 -0400 Subject: [PATCH 10/23] docs(noxa-dkl): document is_indexable symlink TOCTOU window and defense-in-depth design --- crates/noxa-rag/src/pipeline/scan.rs | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/crates/noxa-rag/src/pipeline/scan.rs b/crates/noxa-rag/src/pipeline/scan.rs index b7bdb44..2475548 100644 --- a/crates/noxa-rag/src/pipeline/scan.rs +++ b/crates/noxa-rag/src/pipeline/scan.rs @@ -47,6 +47,19 @@ pub(crate) fn has_indexable_extension(path: &Path) -> bool { /// temp files that are gone by the time we process them. /// /// Deferred (no confirmed use case, would add new crate deps): .epub, .mbox +/// +/// # Symlink safety (TOCTOU note) +/// +/// `path.is_file()` follows symlinks: it returns `true` for a symlink whose target is a +/// regular file. Callers that care about confinement must guard against this with an +/// explicit `path.is_symlink()` check **before** calling `is_indexable`. See +/// [`collect_indexable_paths_recursive`] for the canonical call-site pattern (lines 79–82). +/// +/// This function itself does **not** skip symlinks — that would require two `stat` calls +/// per entry and the function predates the confinement requirement. 
The authoritative +/// defence-in-depth is the `canonicalize + starts_with(watch_root)` confinement check +/// inside `process_job`, which runs unconditionally regardless of how the path was +/// discovered. pub(crate) fn is_indexable(path: &Path) -> bool { if !path.is_file() { return false; From bfa3b53423181eac481b2f53f00c6812ccfdf359 Mon Sep 17 00:00:00 2001 From: Jacob Magar Date: Sun, 26 Apr 2026 19:33:29 -0400 Subject: [PATCH 11/23] refactor(noxa-qgq): remove OnceLock from Pipeline, pass roots to spawn_workers watch_roots is now computed in runtime::run(), passed directly to spawn_workers() as Arc>, and forwarded to WorkerContext::from_pipeline() as a parameter. Eliminates the runtime panic from OnceLock::get().expect() and enforces the invariant at compile time instead. --- crates/noxa-rag/src/pipeline.rs | 12 ++---------- crates/noxa-rag/src/pipeline/runtime.rs | 7 ++----- crates/noxa-rag/src/pipeline/worker.rs | 6 +++++- 3 files changed, 9 insertions(+), 16 deletions(-) diff --git a/crates/noxa-rag/src/pipeline.rs b/crates/noxa-rag/src/pipeline.rs index 408c94b..e450a23 100644 --- a/crates/noxa-rag/src/pipeline.rs +++ b/crates/noxa-rag/src/pipeline.rs @@ -137,7 +137,7 @@ struct WorkerContext { } impl WorkerContext { - fn from_pipeline(pipeline: &Pipeline) -> Self { + fn from_pipeline(pipeline: &Pipeline, watch_roots: Arc>) -> Self { Self { embed: pipeline.embed.clone(), store: pipeline.store.clone(), @@ -145,11 +145,7 @@ impl WorkerContext { config: pipeline.config.clone(), url_locks: pipeline.url_locks.clone(), git_branch_cache: pipeline.git_branch_cache.clone(), - watch_roots: pipeline - .watch_roots - .get() - .expect("watch_roots set before spawn_workers") - .clone(), + watch_roots, counters: pipeline.counters.clone(), failed_jobs_log_lock: pipeline.failed_jobs_log_lock.clone(), shutdown: pipeline.shutdown.clone(), @@ -174,9 +170,6 @@ pub struct Pipeline { /// Serialises failed-jobs log rotation: check-size → rotate → append must be atomic /// across 
concurrent workers to avoid double-rename races. failed_jobs_log_lock: Arc>, - /// Canonicalized watch roots, set once during `run()` before workers are spawned. - /// Workers access this directly instead of receiving it as a spawn parameter. - watch_roots: std::sync::OnceLock>>, } impl Pipeline { @@ -197,7 +190,6 @@ impl Pipeline { git_branch_cache: Arc::new(DashMap::new()), counters: Arc::new(SessionCounters::default()), failed_jobs_log_lock: Arc::new(tokio::sync::Mutex::new(())), - watch_roots: std::sync::OnceLock::new(), } } diff --git a/crates/noxa-rag/src/pipeline/runtime.rs b/crates/noxa-rag/src/pipeline/runtime.rs index ea4035b..74a4a72 100644 --- a/crates/noxa-rag/src/pipeline/runtime.rs +++ b/crates/noxa-rag/src/pipeline/runtime.rs @@ -83,14 +83,11 @@ pub(crate) async fn run(pipeline: &Pipeline) -> Result<(), RagError> { ); let session_start = Instant::now(); - pipeline - .watch_roots - .set(Arc::new(scan::canonical_watch_roots(&watch_dirs).await?)) - .ok(); + let watch_roots = Arc::new(scan::canonical_watch_roots(&watch_dirs).await?); let (tx, rx) = async_channel::bounded(pipeline.config.pipeline.job_queue_capacity); - let worker_handles = spawn_workers(pipeline, rx); + let worker_handles = spawn_workers(pipeline, rx, Arc::clone(&watch_roots)); let bridge_handle = setup_watcher( &watch_dirs, diff --git a/crates/noxa-rag/src/pipeline/worker.rs b/crates/noxa-rag/src/pipeline/worker.rs index 8723e04..edd0285 100644 --- a/crates/noxa-rag/src/pipeline/worker.rs +++ b/crates/noxa-rag/src/pipeline/worker.rs @@ -1,3 +1,6 @@ +use std::path::PathBuf; +use std::sync::Arc; + use tokio::task::JoinHandle; use tracing::Instrument; @@ -7,8 +10,9 @@ use super::{Pipeline, PipelineJob, WorkerContext}; pub(super) fn spawn_workers( pipeline: &Pipeline, rx: async_channel::Receiver, + watch_roots: Arc>, ) -> Vec> { - let ctx = WorkerContext::from_pipeline(pipeline); + let ctx = WorkerContext::from_pipeline(pipeline, watch_roots); let mut handles = 
Vec::with_capacity(pipeline.config.pipeline.embed_concurrency); for worker_id in 0..pipeline.config.pipeline.embed_concurrency { From 9b220ca3cd4e937563ad6e1d732369aeaa55ca01 Mon Sep 17 00:00:00 2001 From: Jacob Magar Date: Sun, 26 Apr 2026 19:33:32 -0400 Subject: [PATCH 12/23] fix(noxa-byr): omit TEI response body from retry logs when auth_token configured --- crates/noxa-rag/src/embed/tei.rs | 33 +++++++++++++++++++++----------- 1 file changed, 22 insertions(+), 11 deletions(-) diff --git a/crates/noxa-rag/src/embed/tei.rs b/crates/noxa-rag/src/embed/tei.rs index 70f90ed..c6b2d2c 100644 --- a/crates/noxa-rag/src/embed/tei.rs +++ b/crates/noxa-rag/src/embed/tei.rs @@ -236,17 +236,28 @@ impl TeiProvider { } if should_retry(status_u16, attempt) { - let body = resp.text().await.unwrap_or_default(); - let preview: String = body.chars().take(512).collect(); - tracing::warn!( - batch = batch_idx + 1, - attempt = attempt + 1, - max_attempts = MAX_RETRIES + 1, - status = status_u16, - delay_ms, - body = preview, - "TEI retry" - ); + if self.auth_token.is_some() { + tracing::warn!( + batch = batch_idx + 1, + attempt = attempt + 1, + max_attempts = MAX_RETRIES + 1, + status = status_u16, + delay_ms, + "TEI retry (body omitted: auth token configured)" + ); + } else { + let body = resp.text().await.unwrap_or_default(); + let preview: String = body.chars().take(512).collect(); + tracing::warn!( + batch = batch_idx + 1, + attempt = attempt + 1, + max_attempts = MAX_RETRIES + 1, + status = status_u16, + delay_ms, + body = preview, + "TEI retry" + ); + } tokio::time::sleep(std::time::Duration::from_millis(delay_ms)).await; delay_ms = (delay_ms * 2).min(2_000); continue; From debc76045f74e4b0ac072c7897ea9cc4e9ebeac2 Mon Sep 17 00:00:00 2001 From: Jacob Magar Date: Sun, 26 Apr 2026 19:34:15 -0400 Subject: [PATCH 13/23] perf(noxa-5tl): batch startup_scan spawn_blocking calls to reduce per-file dispatch overhead --- crates/noxa-rag/src/pipeline/startup_scan.rs | 141 
++++++++++++------- 1 file changed, 89 insertions(+), 52 deletions(-) diff --git a/crates/noxa-rag/src/pipeline/startup_scan.rs b/crates/noxa-rag/src/pipeline/startup_scan.rs index 181a905..c37daf6 100644 --- a/crates/noxa-rag/src/pipeline/startup_scan.rs +++ b/crates/noxa-rag/src/pipeline/startup_scan.rs @@ -10,6 +10,14 @@ use crate::store::{DynVectorStore, HashExistsResult}; use super::scan; use super::{IndexJob, PipelineJob}; +/// Number of paths batched into a single `spawn_blocking` call during startup scan. +/// +/// With ~few µs per `startup_scan_key` call, a batch of 256 completes in the low-ms range, +/// keeping cancellation latency bounded while amortizing per-task scheduler overhead across +/// 256 paths instead of 1. At 10 000 files this produces ≈40 batches — comfortably spread +/// across `scan_concurrency` blocking workers. +const STARTUP_SCAN_BATCH: usize = 256; + pub(super) fn spawn_startup_scan( tx: async_channel::Sender, store: DynVectorStore, @@ -46,8 +54,17 @@ pub(super) fn spawn_startup_scan( let skipped = Arc::new(AtomicUsize::new(0)); let backend_errors = Arc::new(AtomicUsize::new(0)); - stream::iter(paths) - .for_each_concurrent(scan_concurrency, |path| { + // Batch paths into groups of STARTUP_SCAN_BATCH before dispatching to the blocking + // thread pool. This reduces per-task scheduler overhead from O(N) spawn_blocking calls + // (one per file) to O(N / STARTUP_SCAN_BATCH) calls, saving ~2–5µs per file on large + // startup scans while keeping cancellation latency bounded to one batch worth of work. 
+ let batches: Vec> = paths + .chunks(STARTUP_SCAN_BATCH) + .map(|c| c.to_vec()) + .collect(); + + stream::iter(batches) + .for_each_concurrent(scan_concurrency, |batch| { let tx = tx.clone(); let store = store.clone(); let shutdown = shutdown.clone(); @@ -60,60 +77,80 @@ pub(super) fn spawn_startup_scan( return; } - let path2 = path.clone(); - let hash_and_url = - tokio::task::spawn_blocking(move || scan::startup_scan_key(&path2)) - .await - .ok() - .flatten(); - - let (hash, url) = match hash_and_url { - Some(t) => t, - None => { - tracing::debug!( - path = %path.display(), - "startup scan: no url/hash, queuing" - ); - let span = - tracing::info_span!("index_job", path = %path.display()); - tokio::select! { - _ = tx.send(PipelineJob::Index(IndexJob { path, span })) => {} - _ = shutdown.cancelled() => {} + // Compute startup_scan_key for every path in the batch inside a single + // blocking task, returning (PathBuf, Option<(hash, url)>) per path. + let keys: Vec<(PathBuf, Option<(String, String)>)> = + match tokio::task::spawn_blocking(move || { + batch + .into_iter() + .map(|p| { + let k = scan::startup_scan_key(&p); + (p, k) + }) + .collect() + }) + .await + { + Ok(v) => v, + Err(e) => { + tracing::error!(error = %e, "startup scan: batch spawn_blocking panicked"); + return; } - queued.fetch_add(1, Ordering::Relaxed); + }; + + for (path, hash_and_url) in keys { + if shutdown.is_cancelled() { return; } - }; - - match store.url_with_file_hash_exists_checked(&url, &hash).await { - HashExistsResult::Exists => { - skipped.fetch_add(1, Ordering::Relaxed); - tracing::debug!( - path = %path.display(), - url = %url, - "startup scan: already indexed, skipping" - ); - } - HashExistsResult::NotIndexed => { - let span = - tracing::info_span!("index_job", path = %path.display()); - tokio::select! 
{ - _ = tx.send(PipelineJob::Index(IndexJob { path, span })) => {} - _ = shutdown.cancelled() => {} + + let (hash, url) = match hash_and_url { + Some(t) => t, + None => { + tracing::debug!( + path = %path.display(), + "startup scan: no url/hash, queuing" + ); + let span = + tracing::info_span!("index_job", path = %path.display()); + tokio::select! { + _ = tx.send(PipelineJob::Index(IndexJob { path, span })) => {} + _ = shutdown.cancelled() => {} + } + queued.fetch_add(1, Ordering::Relaxed); + continue; + } + }; + + match store.url_with_file_hash_exists_checked(&url, &hash).await { + HashExistsResult::Exists => { + skipped.fetch_add(1, Ordering::Relaxed); + tracing::debug!( + path = %path.display(), + url = %url, + "startup scan: already indexed, skipping" + ); + } + HashExistsResult::NotIndexed => { + let span = + tracing::info_span!("index_job", path = %path.display()); + tokio::select! { + _ = tx.send(PipelineJob::Index(IndexJob { path, span })) => {} + _ = shutdown.cancelled() => {} + } + queued.fetch_add(1, Ordering::Relaxed); + } + HashExistsResult::BackendError(ref msg) => { + // Do NOT re-queue on backend failure — a degraded Qdrant endpoint + // must not trigger a full reindex storm. The file will be + // re-evaluated on next startup once the backend recovers. + backend_errors.fetch_add(1, Ordering::Relaxed); + tracing::warn!( + path = %path.display(), + url = %url, + error = %msg, + "startup scan: backend error during delta check — skipping requeue to avoid reindex storm" + ); } - queued.fetch_add(1, Ordering::Relaxed); - } - HashExistsResult::BackendError(ref msg) => { - // Do NOT re-queue on backend failure — a degraded Qdrant endpoint - // must not trigger a full reindex storm. The file will be - // re-evaluated on next startup once the backend recovers. 
- backend_errors.fetch_add(1, Ordering::Relaxed); - tracing::warn!( - path = %path.display(), - url = %url, - error = %msg, - "startup scan: backend error during delta check — skipping requeue to avoid reindex storm" - ); } } } From 35db1ad051f6df94ba5afad59c936c8142e4ced1 Mon Sep 17 00:00:00 2001 From: Jacob Magar Date: Sun, 26 Apr 2026 19:39:40 -0400 Subject: [PATCH 14/23] refactor(noxa-ngd): normalize parse_html_file to sync fn via spawn_blocking_parse helper --- crates/noxa-rag/src/pipeline/parse/mod.rs | 4 +++- crates/noxa-rag/src/pipeline/parse/text.rs | 19 +++++-------------- 2 files changed, 8 insertions(+), 15 deletions(-) diff --git a/crates/noxa-rag/src/pipeline/parse/mod.rs b/crates/noxa-rag/src/pipeline/parse/mod.rs index 6bf3488..e7baf10 100644 --- a/crates/noxa-rag/src/pipeline/parse/mod.rs +++ b/crates/noxa-rag/src/pipeline/parse/mod.rs @@ -87,7 +87,9 @@ pub(crate) async fn parse_file(path: &Path, bytes: Vec) -> Result Ok(parse_plain_text_file(bytes, file_url, title)), "yaml" | "yml" | "toml" => Ok(parse_plain_text_file(bytes, file_url, title)), "log" => Ok(parse_log_file(bytes, file_url, title)), - "html" | "htm" => parse_html_file(bytes, file_url).await, + "html" | "htm" => { + spawn_blocking_parse("HTML", move || parse_html_file(bytes, file_url)).await + } "ipynb" => { spawn_blocking_parse("ipynb", move || parse_ipynb_file(&bytes, file_url, title)).await } diff --git a/crates/noxa-rag/src/pipeline/parse/text.rs b/crates/noxa-rag/src/pipeline/parse/text.rs index 378bdb0..52ea0dd 100644 --- a/crates/noxa-rag/src/pipeline/parse/text.rs +++ b/crates/noxa-rag/src/pipeline/parse/text.rs @@ -110,24 +110,15 @@ pub(crate) fn parse_log_file(bytes: Vec, file_url: String, title: String) -> } } -pub(crate) async fn parse_html_file( +pub(crate) fn parse_html_file( bytes: Vec, file_url: String, ) -> Result { let html = String::from_utf8_lossy(&bytes).into_owned(); - let url_for_extract = file_url.clone(); - let extraction = tokio::task::spawn_blocking( - move 
|| -> Result { - let mut r = noxa_core::extract(&html, Some(&url_for_extract)) - .map_err(|e| RagError::Parse(format!("HTML extract: {e}")))?; - r.metadata.url = Some(url_for_extract); - r.metadata.source_type = Some("file".to_string()); - Ok(r) - }, - ) - .await - .map_err(|e| RagError::Parse(format!("HTML spawn_blocking: {e}")))??; - + let mut extraction = noxa_core::extract(&html, Some(&file_url)) + .map_err(|e| RagError::Parse(format!("HTML extract: {e}")))?; + extraction.metadata.url = Some(file_url); + extraction.metadata.source_type = Some("file".to_string()); Ok(ParsedFile { extraction, provenance: IngestionProvenance::default(), From c390dd3429530ea9fd1f74a37dc938ea64255e12 Mon Sep 17 00:00:00 2001 From: Jacob Magar Date: Sun, 26 Apr 2026 19:40:44 -0400 Subject: [PATCH 15/23] fix(noxa-u90): split parse_ms timer into io_ms + parse_ms in process_job MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit t0 previously captured canonicalize + open + read_to_end time, then parse_ms was recorded before parse_file() was called — measuring I/O, not parse CPU. Split into io_t0 (file I/O through read_to_end) and parse_t0 (wrapping parse_file only). Add io_ms field to JobStats, total_io_ms to SessionCounters/CounterSnapshot, and emit both in the per-job tracing::info and the heartbeat log so operators can distinguish I/O-bound from parse-bound workloads. 
--- crates/noxa-rag/src/pipeline.rs | 9 +++++++++ crates/noxa-rag/src/pipeline/heartbeat.rs | 1 + crates/noxa-rag/src/pipeline/process.rs | 8 ++++++-- 3 files changed, 16 insertions(+), 2 deletions(-) diff --git a/crates/noxa-rag/src/pipeline.rs b/crates/noxa-rag/src/pipeline.rs index e450a23..326e66a 100644 --- a/crates/noxa-rag/src/pipeline.rs +++ b/crates/noxa-rag/src/pipeline.rs @@ -24,6 +24,7 @@ struct CounterSnapshot { failed: usize, parse_failures: usize, total_chunks: usize, + total_io_ms: u64, total_parse_ms: u64, total_chunk_ms: u64, total_embed_ms: u64, @@ -38,6 +39,9 @@ struct SessionCounters { /// broader process errors so the heartbeat can report them independently. parse_failures: std::sync::atomic::AtomicUsize, total_chunks: std::sync::atomic::AtomicUsize, + /// Total time spent on file I/O (canonicalize, open, read) across all jobs. + total_io_ms: std::sync::atomic::AtomicU64, + /// Total time spent in the parser (CPU-bound format decoding) across all jobs. total_parse_ms: std::sync::atomic::AtomicU64, total_chunk_ms: std::sync::atomic::AtomicU64, total_embed_ms: std::sync::atomic::AtomicU64, @@ -52,6 +56,7 @@ impl SessionCounters { failed: self.files_failed.load(Relaxed), parse_failures: self.parse_failures.load(Relaxed), total_chunks: self.total_chunks.load(Relaxed), + total_io_ms: self.total_io_ms.load(Relaxed), total_parse_ms: self.total_parse_ms.load(Relaxed), total_chunk_ms: self.total_chunk_ms.load(Relaxed), total_embed_ms: self.total_embed_ms.load(Relaxed), @@ -65,6 +70,7 @@ impl SessionCounters { self.files_indexed.fetch_add(1, Relaxed); } self.total_chunks.fetch_add(stats.chunks, Relaxed); + self.total_io_ms.fetch_add(stats.io_ms, Relaxed); self.total_parse_ms.fetch_add(stats.parse_ms, Relaxed); self.total_chunk_ms.fetch_add(stats.chunk_ms, Relaxed); self.total_embed_ms.fetch_add(stats.embed_ms, Relaxed); @@ -111,6 +117,9 @@ enum PipelineJob { #[derive(Default)] struct JobStats { chunks: usize, + /// Time spent on file I/O (canonicalize, 
open, read) — does not include parse CPU time. + io_ms: u64, + /// Time spent in the parser (CPU-bound format decoding) — does not include file I/O. parse_ms: u64, chunk_ms: u64, embed_ms: u64, diff --git a/crates/noxa-rag/src/pipeline/heartbeat.rs b/crates/noxa-rag/src/pipeline/heartbeat.rs index 99bc953..7113b0b 100644 --- a/crates/noxa-rag/src/pipeline/heartbeat.rs +++ b/crates/noxa-rag/src/pipeline/heartbeat.rs @@ -41,6 +41,7 @@ pub(super) fn spawn_heartbeat( indexed = snap.indexed, failed = snap.failed, parse_failures = snap.parse_failures, + io_ms = snap.total_io_ms, parse_ms = snap.total_parse_ms, chunk_ms = snap.total_chunk_ms, embed_ms = snap.total_embed_ms, diff --git a/crates/noxa-rag/src/pipeline/process.rs b/crates/noxa-rag/src/pipeline/process.rs index 7cf7b8b..a3435c1 100644 --- a/crates/noxa-rag/src/pipeline/process.rs +++ b/crates/noxa-rag/src/pipeline/process.rs @@ -118,7 +118,7 @@ async fn append_failed_job(path: &Path, error: &impl std::fmt::Display, ctx: &Wo pub(crate) async fn process_job(job: IndexJob, ctx: &WorkerContext) -> Result { let job_start = std::time::Instant::now(); - let t0 = std::time::Instant::now(); + let io_t0 = std::time::Instant::now(); // Canonicalize can fail for benign reasons — most commonly ENOENT when the // file was deleted between the watcher event and job execution. 
That's a // race, not a backend failure, so we must NOT return `Err` (which would @@ -168,11 +168,12 @@ pub(crate) async fn process_job(job: IndexJob, ctx: &WorkerContext) -> Result r, Err(e) => { @@ -181,6 +182,7 @@ pub(crate) async fn process_job(job: IndexJob, ctx: &WorkerContext) -> Result Result Result Date: Sun, 26 Apr 2026 19:40:51 -0400 Subject: [PATCH 16/23] fix(noxa-qkg): guard XML/OPML/feed parsers against entity expansion attacks --- crates/noxa-rag/src/pipeline/parse/mod.rs | 2 +- crates/noxa-rag/src/pipeline/parse/rich.rs | 7 +++ crates/noxa-rag/src/pipeline/parse/tests.rs | 49 +++++++++++++++++++++ crates/noxa-rag/src/pipeline/parse/text.rs | 31 +++++++++++-- 4 files changed, 85 insertions(+), 4 deletions(-) diff --git a/crates/noxa-rag/src/pipeline/parse/mod.rs b/crates/noxa-rag/src/pipeline/parse/mod.rs index e7baf10..9f073f2 100644 --- a/crates/noxa-rag/src/pipeline/parse/mod.rs +++ b/crates/noxa-rag/src/pipeline/parse/mod.rs @@ -113,7 +113,7 @@ pub(crate) async fn parse_file(path: &Path, bytes: Vec) -> Result Ok(parse_jsonl_file(bytes, file_url, title)), - "xml" | "opml" => Ok(parse_xml_file(bytes, file_url, title)), + "xml" | "opml" => parse_xml_file(bytes, file_url, title), "rss" | "atom" => parse_feed_file(bytes, file_url, title), "eml" => parse_email_file(&bytes, file_url, title), "vtt" | "srt" => Ok(parse_subtitle_file(bytes, file_url, title)), diff --git a/crates/noxa-rag/src/pipeline/parse/rich.rs b/crates/noxa-rag/src/pipeline/parse/rich.rs index aec10a4..27cb8ff 100644 --- a/crates/noxa-rag/src/pipeline/parse/rich.rs +++ b/crates/noxa-rag/src/pipeline/parse/rich.rs @@ -2,6 +2,7 @@ use crate::error::RagError; use super::{ FormatProvenance, IngestionProvenance, ParsedFile, extract_xml_text, make_text_result, + text::contains_xml_entity_expansion_risk, }; pub(crate) fn parse_feed_file( @@ -9,6 +10,12 @@ pub(crate) fn parse_feed_file( file_url: String, title: String, ) -> Result { + if contains_xml_entity_expansion_risk(&bytes) { + return 
Err(RagError::Parse( + "XML entity expansion risk detected: file contains DOCTYPE/ENTITY declarations" + .to_string(), + )); + } let content = String::from_utf8_lossy(&bytes).into_owned(); let (extraction, provenance) = parse_feed_text(&content, file_url, title)?; Ok(ParsedFile { diff --git a/crates/noxa-rag/src/pipeline/parse/tests.rs b/crates/noxa-rag/src/pipeline/parse/tests.rs index 6dbe019..bbd8835 100644 --- a/crates/noxa-rag/src/pipeline/parse/tests.rs +++ b/crates/noxa-rag/src/pipeline/parse/tests.rs @@ -414,3 +414,52 @@ fn build_point_payload_serializes_subtitle_variant() { Some("demo.mp4") ); } + +/// Entity expansion (billion-laughs) in XML should be rejected before parsing. +#[tokio::test] +async fn parse_xml_rejects_entity_expansion() { + let tmp = tempfile::tempdir().expect("tempdir"); + let path = tmp.path().join("evil.xml"); + let payload = b"\ + ]>\ + &lol;"; + std::fs::write(&path, payload).expect("write"); + let result = parse_file(&path, payload.to_vec()).await; + assert!( + result.is_err(), + "expected Err for DOCTYPE/ENTITY XML, got Ok" + ); + let msg = result.unwrap_err().to_string(); + assert!( + msg.contains("DOCTYPE") || msg.contains("ENTITY") || msg.contains("entity expansion"), + "error message should mention DOCTYPE/ENTITY: {msg}" + ); +} + +/// Entity expansion guard in RSS/Atom feed parser. +#[tokio::test] +async fn parse_feed_rejects_entity_expansion() { + let tmp = tempfile::tempdir().expect("tempdir"); + let path = tmp.path().join("evil.rss"); + let payload = b"\ + ]>\ + &lol;"; + std::fs::write(&path, payload).expect("write"); + let result = parse_file(&path, payload.to_vec()).await; + assert!( + result.is_err(), + "expected Err for DOCTYPE/ENTITY feed, got Ok" + ); +} + +/// Normal XML without DOCTYPE/ENTITY should still parse successfully. 
+#[tokio::test] +async fn parse_xml_allows_normal_content() { + let tmp = tempfile::tempdir().expect("tempdir"); + let path = tmp.path().join("normal.xml"); + let payload = + b"hello worldfoo bar"; + std::fs::write(&path, payload).expect("write"); + let result = parse_file(&path, payload.to_vec()).await; + assert!(result.is_ok(), "normal XML should parse without error"); +} diff --git a/crates/noxa-rag/src/pipeline/parse/text.rs b/crates/noxa-rag/src/pipeline/parse/text.rs index 52ea0dd..2ca7d1f 100644 --- a/crates/noxa-rag/src/pipeline/parse/text.rs +++ b/crates/noxa-rag/src/pipeline/parse/text.rs @@ -151,14 +151,39 @@ pub(crate) fn parse_jsonl_file(bytes: Vec, file_url: String, title: String) } } -pub(crate) fn parse_xml_file(bytes: Vec, file_url: String, title: String) -> ParsedFile { +/// Scan the first 8 KiB of bytes for DOCTYPE/ENTITY declarations that could +/// trigger exponential entity expansion (the "billion laughs" attack). Returns +/// `true` if such declarations are found. +/// +/// quick-xml 0.37.x does NOT apply DTD expansion limits, so this pre-scan is +/// the primary — and not merely defensive — guard against out-of-memory +/// attacks. Later versions of quick-xml may add limits, but we keep the scan +/// regardless (defense-in-depth). 
+pub(super) fn contains_xml_entity_expansion_risk(bytes: &[u8]) -> bool { + let scan_limit = bytes.len().min(8192); + let header = &bytes[..scan_limit]; + let s = std::str::from_utf8(header).unwrap_or(""); + s.contains("<!DOCTYPE") || s.contains("<!ENTITY") +} + +pub(crate) fn parse_xml_file( + bytes: Vec<u8>, + file_url: String, + title: String, +) -> Result<ParsedFile, RagError> { + if contains_xml_entity_expansion_risk(&bytes) { + return Err(RagError::Parse( + "XML entity expansion risk detected: file contains DOCTYPE/ENTITY declarations" + .to_string(), + )); + } let content = String::from_utf8_lossy(&bytes).into_owned(); let text = extract_xml_text(&content).unwrap_or_else(|e| { tracing::warn!(error = %e, "xml text extraction failed; falling back to raw text"); content.clone() }); let word_count = text.split_whitespace().count(); - ParsedFile { + Ok(ParsedFile { extraction: make_text_result( text.clone(), text, @@ -168,7 +193,7 @@ word_count, ), provenance: IngestionProvenance::default(), - } + }) } /// Extract plain text from XML/OPML/RSS/Atom by collecting all text and CDATA nodes. From 4e71049f6c879114a73f3db5789aeffed35e5b77 Mon Sep 17 00:00:00 2001 From: Jacob Magar Date: Sun, 26 Apr 2026 19:41:38 -0400 Subject: [PATCH 17/23] refactor(noxa-3g7): add UrlValidation/WorkerPanic RagError variants, surface confinement violations MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Replace RagError::Generic at all five validate_url_scheme call sites with RagError::UrlValidation so callers can pattern-match on URL-specific failures without parsing error strings. Replace the chunker spawn_blocking join-handle panic with RagError::WorkerPanic for the same reason. Surface path confinement rejections to monitoring by calling ctx.counters.record_failure() before the silent Ok(default) return at the TOCTOU guard — confinement rejections are now visible in files_failed heartbeat metrics even though no Err is propagated.
--- crates/noxa-rag/src/error.rs | 4 ++++ crates/noxa-rag/src/pipeline/process.rs | 15 ++++++++------- 2 files changed, 12 insertions(+), 7 deletions(-) diff --git a/crates/noxa-rag/src/error.rs b/crates/noxa-rag/src/error.rs index eab5bf5..91b1788 100644 --- a/crates/noxa-rag/src/error.rs +++ b/crates/noxa-rag/src/error.rs @@ -32,6 +32,10 @@ pub enum RagError { Json(#[from] serde_json::Error), #[error("parse error: {0}")] Parse(String), + #[error("url validation failed: {0}")] + UrlValidation(String), + #[error("worker panicked: {0}")] + WorkerPanic(String), #[error("error: {0}")] Generic(String), } diff --git a/crates/noxa-rag/src/pipeline/process.rs b/crates/noxa-rag/src/pipeline/process.rs index a3435c1..ebf41e5 100644 --- a/crates/noxa-rag/src/pipeline/process.rs +++ b/crates/noxa-rag/src/pipeline/process.rs @@ -21,29 +21,29 @@ use super::{DeleteJob, IndexJob, JobStats, WorkerContext}; /// - All other schemes: rejected. pub(crate) async fn validate_url_scheme(url: &str) -> Result<(), RagError> { if url.is_empty() { - return Err(RagError::Generic( + return Err(RagError::UrlValidation( "extraction result has no URL".to_string(), )); } - let parsed = - url::Url::parse(url).map_err(|e| RagError::Generic(format!("invalid URL {url:?}: {e}")))?; + let parsed = url::Url::parse(url) + .map_err(|e| RagError::UrlValidation(format!("invalid URL {url:?}: {e}")))?; match parsed.scheme() { "http" | "https" => { noxa_store::url_validation::validate_public_http_url(url) .await - .map_err(|e| RagError::Generic(format!("URL {url:?} blocked: {e}")))?; + .map_err(|e| RagError::UrlValidation(format!("URL {url:?} blocked: {e}")))?; } "file" => match parsed.host_str() { None | Some("") | Some("localhost") => {} Some(host) => { - return Err(RagError::Generic(format!( + return Err(RagError::UrlValidation(format!( "file:// URL with remote host {host:?} is not allowed (only local paths)" ))); } }, other => { - return Err(RagError::Generic(format!( + return 
Err(RagError::UrlValidation(format!( "URL scheme {other:?} is not allowed (only http/https/file)" ))); } @@ -146,6 +146,7 @@ pub(crate) async fn process_job(job: IndexJob, ctx: &WorkerContext) -> Result Result Date: Sun, 26 Apr 2026 19:42:00 -0400 Subject: [PATCH 18/23] refactor(noxa-udb): move FormatProvenance match into apply() method, eliminate 14 mut locals in build_point_payload --- crates/noxa-rag/src/pipeline/parse/mod.rs | 167 +++++++++++----------- 1 file changed, 85 insertions(+), 82 deletions(-) diff --git a/crates/noxa-rag/src/pipeline/parse/mod.rs b/crates/noxa-rag/src/pipeline/parse/mod.rs index 9f073f2..13a5a90 100644 --- a/crates/noxa-rag/src/pipeline/parse/mod.rs +++ b/crates/noxa-rag/src/pipeline/parse/mod.rs @@ -61,6 +61,68 @@ pub(crate) enum FormatProvenance { Generic, } +impl FormatProvenance { + /// Apply format-specific fields onto `payload`. The payload must already + /// have its web fallback fields (seed_url, search_query, crawl_depth) + /// pre-initialised from `result.metadata.*` before this method is called. + /// Variant fields take precedence: a `Some(...)` from the variant + /// overwrites the pre-set metadata value; a `None` leaves it unchanged. 
+ pub(crate) fn apply(&self, payload: &mut PointPayload) { + match self { + FormatProvenance::Web { + seed_url, + search_query, + crawl_depth, + } => { + if seed_url.is_some() { + payload.seed_url = seed_url.clone(); + } + if search_query.is_some() { + payload.search_query = search_query.clone(); + } + if crawl_depth.is_some() { + payload.crawl_depth = *crawl_depth; + } + } + FormatProvenance::Email { + to, + message_id, + thread_id, + has_attachments, + } => { + payload.email_to = to.clone(); + payload.email_message_id = message_id.clone(); + payload.email_thread_id = thread_id.clone(); + payload.email_has_attachments = *has_attachments; + } + FormatProvenance::Feed { + feed_url, + item_id, + } => { + payload.feed_url = feed_url.clone(); + payload.feed_item_id = item_id.clone(); + } + FormatProvenance::Subtitle { + start_s, + end_s, + source_file, + } => { + payload.subtitle_start_s = *start_s; + payload.subtitle_end_s = *end_s; + payload.subtitle_source_file = source_file.clone(); + } + FormatProvenance::Presentation { + slide_count, + has_notes, + } => { + payload.pptx_slide_count = *slide_count; + payload.pptx_has_notes = *has_notes; + } + FormatProvenance::Generic => {} + } + } +} + #[derive(Debug, Clone, Default)] pub(crate) struct IngestionProvenance { pub external_id: Option, @@ -183,73 +245,11 @@ pub(crate) fn build_point_payload( url: &str, file_hash: Option<&str>, ) -> PointPayload { - // Default values for every format-specific field. Only the fields - // belonging to the active variant are overridden below; the rest stay - // at their type default (None / empty Vec), matching the old flat - // struct's behaviour where inactive fields were simply not set. 
- let mut seed_url: Option = None; - let mut search_query: Option = None; - let mut crawl_depth: Option = None; - let mut email_to: Vec = Vec::new(); - let mut email_message_id: Option = None; - let mut email_thread_id: Option = None; - let mut email_has_attachments: Option = None; - let mut feed_url: Option = None; - let mut feed_item_id: Option = None; - let mut pptx_slide_count: Option = None; - let mut pptx_has_notes: Option = None; - let mut subtitle_start_s: Option = None; - let mut subtitle_end_s: Option = None; - let mut subtitle_source_file: Option = None; - - match &provenance.format { - FormatProvenance::Web { - seed_url: s, - search_query: q, - crawl_depth: d, - } => { - seed_url = s.clone(); - search_query = q.clone(); - crawl_depth = *d; - } - FormatProvenance::Email { - to, - message_id, - thread_id, - has_attachments, - } => { - email_to = to.clone(); - email_message_id = message_id.clone(); - email_thread_id = thread_id.clone(); - email_has_attachments = *has_attachments; - } - FormatProvenance::Feed { - feed_url: f, - item_id, - } => { - feed_url = f.clone(); - feed_item_id = item_id.clone(); - } - FormatProvenance::Subtitle { - start_s, - end_s, - source_file, - } => { - subtitle_start_s = *start_s; - subtitle_end_s = *end_s; - subtitle_source_file = source_file.clone(); - } - FormatProvenance::Presentation { - slide_count, - has_notes, - } => { - pptx_slide_count = *slide_count; - pptx_has_notes = *has_notes; - } - FormatProvenance::Generic => {} - } - - PointPayload { + // Pre-initialise web-provenance fields from result.metadata as fallbacks. + // FormatProvenance::apply() will override these when the active variant + // carries its own Some(...) values. Non-web variants leave these at the + // metadata default (which is typically None). 
+ let mut payload = PointPayload { text: chunk.text.clone(), url: url.to_string(), domain: chunk.domain.clone(), @@ -269,23 +269,26 @@ pub(crate) fn build_point_payload( git_branch, external_id: provenance.external_id.clone(), platform_url: provenance.platform_url.clone(), - seed_url: seed_url.or_else(|| result.metadata.seed_url.clone()), - search_query: search_query.or_else(|| result.metadata.search_query.clone()), - crawl_depth: crawl_depth.or(result.metadata.crawl_depth), - email_to, - email_message_id, - email_thread_id, - email_has_attachments, - feed_url, - feed_item_id, - pptx_slide_count, - pptx_has_notes, - subtitle_start_s, - subtitle_end_s, - subtitle_source_file, + seed_url: result.metadata.seed_url.clone(), + search_query: result.metadata.search_query.clone(), + crawl_depth: result.metadata.crawl_depth, + email_to: Vec::new(), + email_message_id: None, + email_thread_id: None, + email_has_attachments: None, + feed_url: None, + feed_item_id: None, + pptx_slide_count: None, + pptx_has_notes: None, + subtitle_start_s: None, + subtitle_end_s: None, + subtitle_source_file: None, section_header: chunk.section_header.clone(), file_hash: file_hash.map(str::to_owned), - } + }; + + provenance.format.apply(&mut payload); + payload } pub(crate) fn make_text_result( From 8d2cc7b38ae3a07086ebdcb1327b2464233a4860 Mon Sep 17 00:00:00 2001 From: Jacob Magar Date: Sun, 26 Apr 2026 19:42:57 -0400 Subject: [PATCH 19/23] perf(noxa-rso): check git_branch_cache before spawning blocking task in process_job Add find_cached_branch() helper that walks ancestor directories of the already-canonical file path against the DashMap cache (O(depth) in-memory lookups, no I/O). If any ancestor matches a cached git root, return the stored branch immediately without spawning a blocking thread. On cache miss, canonicalize the detected git root before storing it so future pre-checks via canonical path ancestors always match, regardless of symlinks or non-normalized input paths. 
Also cache the miss (None) at the immediate parent directory to skip re-stating .git for files outside any git repository. This eliminates spawn_blocking for every file after the first in each git repository, which is the common case for large watched directories. --- crates/noxa-rag/src/pipeline/process.rs | 59 ++++++++++++++++++++++--- 1 file changed, 52 insertions(+), 7 deletions(-) diff --git a/crates/noxa-rag/src/pipeline/process.rs b/crates/noxa-rag/src/pipeline/process.rs index ebf41e5..e773cca 100644 --- a/crates/noxa-rag/src/pipeline/process.rs +++ b/crates/noxa-rag/src/pipeline/process.rs @@ -1,6 +1,7 @@ -use std::path::Path; +use std::path::{Path, PathBuf}; use std::sync::Arc; +use dashmap::DashMap; use tokio::io::AsyncReadExt; use crate::error::RagError; @@ -11,6 +12,29 @@ use super::parse; use super::scan; use super::{DeleteJob, IndexJob, JobStats, WorkerContext}; +/// Walk up from `path`'s parent checking each ancestor directory against `cache`. +/// +/// Returns `Some(branch)` on a cache hit (where the inner `Option` is the stored +/// branch value — `None` means "not in a git repo"), or `None` when no ancestor is cached. +/// +/// O(depth) DashMap lookups; typical repo depth ≤ 5, so this avoids a `spawn_blocking` +/// dispatch for every file after the first in each git repository. +fn find_cached_branch( + path: &Path, + cache: &DashMap<PathBuf, Option<String>>, +) -> Option<Option<String>> { + let mut dir = path.parent()?; + loop { + if let Some(entry) = cache.get(dir) { + return Some(entry.clone()); + } + match dir.parent() { + Some(parent) => dir = parent, + None => return None, + } + } +} + /// Validate that `url` is a permitted indexing target. 
/// /// - `http`/`https`: delegates to `noxa_store::url_validation::validate_public_http_url`, which @@ -195,18 +219,39 @@ pub(crate) async fn process_job(job: IndexJob, ctx: &WorkerContext) -> Result::from(mtime).to_rfc3339()); } - let git_branch = { - let path = job.path.clone(); + let git_branch = if let Some(cached) = + find_cached_branch(&canonical, &ctx.git_branch_cache) + { + // Cache hit — no spawn_blocking needed. `canonical` ancestors were walked + // synchronously in async context (pure in-memory DashMap lookups, no I/O). + cached + } else { + // Cache miss — dispatch to a blocking thread to read .git/HEAD from disk. + // Store the result keyed by the canonical git root so future files in the + // same repo hit the pre-check without spawning. + let canonical_clone = canonical.clone(); let cache = ctx.git_branch_cache.clone(); tokio::task::spawn_blocking(move || { - // Walk up to find the git root first so we can use it as a stable cache key. - if let Some((git_root, branch)) = scan::detect_git_root_and_branch(&path) { + if let Some((git_root, branch)) = scan::detect_git_root_and_branch(&canonical_clone) { + // Canonicalize the git root so subsequent lookups via `canonical` + // ancestors always match, regardless of symlinks or path normalization. + let key = std::fs::canonicalize(&git_root).unwrap_or(git_root); cache - .entry(git_root) + .entry(key) .or_insert_with(|| Some(branch.clone())) .clone() } else { - None + // Cache the miss so we don't re-stat .git for every file outside a repo. 
+ let key = std::fs::canonicalize( + canonical_clone.parent().unwrap_or(&canonical_clone), + ) + .unwrap_or_else(|_| { + canonical_clone + .parent() + .unwrap_or(&canonical_clone) + .to_path_buf() + }); + cache.entry(key).or_insert(None).clone() } }) .await From 8c09c8a5e0e9472b2e2a0a190d6352d8c26c6a86 Mon Sep 17 00:00:00 2001 From: Jacob Magar Date: Sun, 26 Apr 2026 19:49:45 -0400 Subject: [PATCH 20/23] perf(noxa-346): factor per-file metadata into FileMetadata struct, eliminate N-per-chunk clones --- crates/noxa-rag/src/pipeline/parse/mod.rs | 170 +++++++++++++++------- crates/noxa-rag/src/pipeline/process.rs | 15 +- 2 files changed, 131 insertions(+), 54 deletions(-) diff --git a/crates/noxa-rag/src/pipeline/parse/mod.rs b/crates/noxa-rag/src/pipeline/parse/mod.rs index 13a5a90..22f5186 100644 --- a/crates/noxa-rag/src/pipeline/parse/mod.rs +++ b/crates/noxa-rag/src/pipeline/parse/mod.rs @@ -3,7 +3,7 @@ use std::path::Path; use noxa_core::types::ExtractionResult; use crate::error::RagError; -use crate::types::PointPayload; +use crate::types::{Chunk, PointPayload}; mod binary; mod rich; @@ -62,27 +62,16 @@ pub(crate) enum FormatProvenance { } impl FormatProvenance { - /// Apply format-specific fields onto `payload`. The payload must already - /// have its web fallback fields (seed_url, search_query, crawl_depth) - /// pre-initialised from `result.metadata.*` before this method is called. - /// Variant fields take precedence: a `Some(...)` from the variant - /// overwrites the pre-set metadata value; a `None` leaves it unchanged. + /// Apply format-specific fields onto `payload`. + /// + /// Web-variant web fields (seed_url, search_query, crawl_depth) are + /// pre-resolved in `FileMetadata::from_result_and_provenance` (variant + /// Some takes precedence over metadata fallback). This method therefore + /// only needs to handle non-web format variants. 
pub(crate) fn apply(&self, payload: &mut PointPayload) { match self { - FormatProvenance::Web { - seed_url, - search_query, - crawl_depth, - } => { - if seed_url.is_some() { - payload.seed_url = seed_url.clone(); - } - if search_query.is_some() { - payload.search_query = search_query.clone(); - } - if crawl_depth.is_some() { - payload.crawl_depth = *crawl_depth; - } + FormatProvenance::Web { .. } => { + // Already resolved into FileMetadata — nothing to do here. } FormatProvenance::Email { to, @@ -130,6 +119,90 @@ pub(crate) struct IngestionProvenance { pub format: FormatProvenance, } +/// Per-file metadata that is identical for every chunk produced from the same +/// document. Built once before the chunk loop via +/// `FileMetadata::from_result_and_provenance` so the file-level fields are +/// cloned once rather than once per chunk. +/// +/// `file_hash` is NOT included here because it is computed separately in +/// `process.rs` and passed directly to `build_point_payload`. +pub(crate) struct FileMetadata { + pub title: Option, + pub author: Option, + pub published_date: Option, + pub language: Option, + pub source_type: Option, + pub content_hash: Option, + pub technologies: Vec, + pub is_truncated: Option, + pub file_path: Option, + pub last_modified: Option, + pub git_branch: Option, + pub external_id: Option, + pub platform_url: Option, + /// Web-provenance: variant `Some` wins over metadata fallback. + pub seed_url: Option, + pub search_query: Option, + pub crawl_depth: Option, + /// Format variant — used by `build_point_payload` to apply non-web fields + /// (Email, Feed, Subtitle, Presentation). Stored here so `build_point_payload` + /// doesn't need to receive `&IngestionProvenance` separately. + pub format: FormatProvenance, +} + +impl FileMetadata { + /// Build `FileMetadata` once per document. 
The merge rule for web-provenance + /// fields is: `FormatProvenance::Web { Some(x) }` takes precedence over + /// `result.metadata.*`; `None` in the variant falls back to `metadata.*`. + /// Non-web variants leave those fields at the metadata value (typically None). + pub(crate) fn from_result_and_provenance( + result: &ExtractionResult, + git_branch: Option, + provenance: &IngestionProvenance, + ) -> Self { + let (seed_url, search_query, crawl_depth) = match &provenance.format { + FormatProvenance::Web { + seed_url, + search_query, + crawl_depth, + } => ( + seed_url + .clone() + .or_else(|| result.metadata.seed_url.clone()), + search_query + .clone() + .or_else(|| result.metadata.search_query.clone()), + crawl_depth.or(result.metadata.crawl_depth), + ), + _ => ( + result.metadata.seed_url.clone(), + result.metadata.search_query.clone(), + result.metadata.crawl_depth, + ), + }; + + Self { + title: result.metadata.title.clone(), + author: result.metadata.author.clone(), + published_date: result.metadata.published_date.clone(), + language: result.metadata.language.clone(), + source_type: result.metadata.source_type.clone(), + content_hash: result.metadata.content_hash.clone(), + technologies: result.metadata.technologies.clone(), + is_truncated: result.metadata.is_truncated, + file_path: result.metadata.file_path.clone(), + last_modified: result.metadata.last_modified.clone(), + git_branch, + external_id: provenance.external_id.clone(), + platform_url: provenance.platform_url.clone(), + seed_url, + search_query, + crawl_depth, + format: provenance.format.clone(), + } + } +} + pub(crate) async fn parse_file(path: &Path, bytes: Vec) -> Result { let ext = path .extension() @@ -238,40 +311,38 @@ pub(crate) fn extract_ingestion_provenance(value: &serde_json::Value) -> Ingesti } pub(crate) fn build_point_payload( - chunk: &crate::types::Chunk, - result: &ExtractionResult, - git_branch: Option, - provenance: &IngestionProvenance, + chunk: Chunk, + file_meta: &FileMetadata, 
url: &str, file_hash: Option<&str>, ) -> PointPayload { - // Pre-initialise web-provenance fields from result.metadata as fallbacks. - // FormatProvenance::apply() will override these when the active variant - // carries its own Some(...) values. Non-web variants leave these at the - // metadata default (which is typically None). let mut payload = PointPayload { - text: chunk.text.clone(), - url: url.to_string(), - domain: chunk.domain.clone(), + // Per-chunk fields — moved out of chunk (no clone for text/domain/section_header). + text: chunk.text, + domain: chunk.domain, chunk_index: chunk.chunk_index, total_chunks: chunk.total_chunks, token_estimate: chunk.token_estimate, - title: result.metadata.title.clone(), - author: result.metadata.author.clone(), - published_date: result.metadata.published_date.clone(), - language: result.metadata.language.clone(), - source_type: result.metadata.source_type.clone(), - content_hash: result.metadata.content_hash.clone(), - technologies: result.metadata.technologies.clone(), - is_truncated: result.metadata.is_truncated, - file_path: result.metadata.file_path.clone(), - last_modified: result.metadata.last_modified.clone(), - git_branch, - external_id: provenance.external_id.clone(), - platform_url: provenance.platform_url.clone(), - seed_url: result.metadata.seed_url.clone(), - search_query: result.metadata.search_query.clone(), - crawl_depth: result.metadata.crawl_depth, + section_header: chunk.section_header, + // Per-file fields — cloned once from FileMetadata (built once per document). 
+ url: url.to_string(), + title: file_meta.title.clone(), + author: file_meta.author.clone(), + published_date: file_meta.published_date.clone(), + language: file_meta.language.clone(), + source_type: file_meta.source_type.clone(), + content_hash: file_meta.content_hash.clone(), + technologies: file_meta.technologies.clone(), + is_truncated: file_meta.is_truncated, + file_path: file_meta.file_path.clone(), + last_modified: file_meta.last_modified.clone(), + git_branch: file_meta.git_branch.clone(), + external_id: file_meta.external_id.clone(), + platform_url: file_meta.platform_url.clone(), + seed_url: file_meta.seed_url.clone(), + search_query: file_meta.search_query.clone(), + crawl_depth: file_meta.crawl_depth, + // Format-variant fields default to empty/None; apply() fills them in. email_to: Vec::new(), email_message_id: None, email_thread_id: None, @@ -283,11 +354,12 @@ pub(crate) fn build_point_payload( subtitle_start_s: None, subtitle_end_s: None, subtitle_source_file: None, - section_header: chunk.section_header.clone(), file_hash: file_hash.map(str::to_owned), }; - provenance.format.apply(&mut payload); + // apply() now only handles non-web variants (Email, Feed, Subtitle, Presentation). + // Web-provenance fields were already resolved into FileMetadata once per document. 
+ file_meta.format.apply(&mut payload); payload } diff --git a/crates/noxa-rag/src/pipeline/process.rs b/crates/noxa-rag/src/pipeline/process.rs index e773cca..c0a23d8 100644 --- a/crates/noxa-rag/src/pipeline/process.rs +++ b/crates/noxa-rag/src/pipeline/process.rs @@ -321,9 +321,16 @@ pub(crate) async fn process_job(job: IndexJob, ctx: &WorkerContext) -> Result = chunks - .iter() - .zip(vectors) + .into_iter() + .zip(vectors.into_iter()) .enumerate() .map(|(i, (chunk, vector))| { let id = uuid::Uuid::new_v5( @@ -335,9 +342,7 @@ pub(crate) async fn process_job(job: IndexJob, ctx: &WorkerContext) -> Result Date: Sun, 26 Apr 2026 20:16:51 -0400 Subject: [PATCH 21/23] =?UTF-8?q?refactor:=20simplify=20=E2=80=94=20shared?= =?UTF-8?q?=20word=5Fcount,=20remove=20redundant=20comments,=20eliminate?= =?UTF-8?q?=20double-collect?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- crates/noxa-rag/src/chunker.rs | 49 ++++---------------- crates/noxa-rag/src/pipeline.rs | 2 +- crates/noxa-rag/src/pipeline/parse/binary.rs | 4 +- crates/noxa-rag/src/pipeline/parse/rich.rs | 6 +-- crates/noxa-rag/src/pipeline/parse/text.rs | 12 ++--- crates/noxa-rag/src/pipeline/process.rs | 3 -- crates/noxa-rag/src/pipeline/startup_scan.rs | 11 +---- 7 files changed, 22 insertions(+), 65 deletions(-) diff --git a/crates/noxa-rag/src/chunker.rs b/crates/noxa-rag/src/chunker.rs index e79699b..2e07380 100644 --- a/crates/noxa-rag/src/chunker.rs +++ b/crates/noxa-rag/src/chunker.rs @@ -6,7 +6,7 @@ use crate::config::ChunkerConfig; use crate::types::Chunk; /// Count whitespace-separated words in a string. -fn word_count(s: &str) -> usize { +pub(crate) fn word_count(s: &str) -> usize { s.split_whitespace().count() } @@ -110,9 +110,6 @@ pub fn chunk( .into_iter() .enumerate() .map(|(chunk_index, (char_offset, text))| { - // Use word count as a cheap approximation for the diagnostic token_estimate - // field. 
The tokenizer is already used by the splitter for boundary accuracy; - // re-encoding every chunk here would double tokenization work on the hot path. let t_est = word_count(&text); let section_header = nearest_heading(&headings, char_offset).map(|s| s.to_string()); Chunk { @@ -132,45 +129,17 @@ pub fn chunk( #[cfg(test)] mod tests { use super::*; - use noxa_core::types::{Content, Metadata}; use std::str::FromStr as _; fn make_extraction_result(markdown: &str) -> ExtractionResult { - ExtractionResult { - metadata: Metadata { - title: None, - description: None, - author: None, - published_date: None, - language: None, - url: Some("https://example.com/test".to_string()), - site_name: None, - image: None, - favicon: None, - word_count: markdown.split_whitespace().count(), - content_hash: None, - source_type: None, - file_path: None, - last_modified: None, - is_truncated: None, - technologies: Vec::new(), - seed_url: None, - crawl_depth: None, - search_query: None, - fetched_at: None, - }, - content: Content { - markdown: markdown.to_string(), - plain_text: String::new(), - links: Vec::new(), - images: Vec::new(), - code_blocks: Vec::new(), - raw_html: None, - }, - domain_data: None, - vertical_data: None, - structured_data: Vec::new(), - } + crate::pipeline::parse::make_text_result( + markdown.to_string(), + String::new(), + "https://example.com/test".to_string(), + None, + "test", + crate::chunker::word_count(markdown), + ) } /// Build a minimal whitespace-pretokenized WordLevel tokenizer suitable for diff --git a/crates/noxa-rag/src/pipeline.rs b/crates/noxa-rag/src/pipeline.rs index 326e66a..f1f30f6 100644 --- a/crates/noxa-rag/src/pipeline.rs +++ b/crates/noxa-rag/src/pipeline.rs @@ -11,7 +11,7 @@ use crate::error::RagError; use crate::store::DynVectorStore; mod heartbeat; -mod parse; +pub(crate) mod parse; mod process; mod runtime; mod scan; diff --git a/crates/noxa-rag/src/pipeline/parse/binary.rs b/crates/noxa-rag/src/pipeline/parse/binary.rs index 
7cf2a55..20bde35 100644 --- a/crates/noxa-rag/src/pipeline/parse/binary.rs +++ b/crates/noxa-rag/src/pipeline/parse/binary.rs @@ -12,7 +12,7 @@ pub(crate) fn parse_pdf_file( let result = noxa_pdf::extract_pdf(bytes, noxa_pdf::PdfMode::Auto) .map_err(|e| RagError::Parse(format!("PDF extract: {e}")))?; let text = noxa_pdf::to_markdown(&result); - let word_count = text.split_whitespace().count(); + let word_count = crate::chunker::word_count(&text); Ok(ParsedFile { extraction: make_text_result(text.clone(), text, url, Some(title), "file", word_count), provenance: IngestionProvenance::default(), @@ -189,7 +189,7 @@ pub(crate) fn parse_office_zip_file( } let text = text_parts.join("\n\n"); - let word_count = text.split_whitespace().count(); + let word_count = crate::chunker::word_count(&text); let extraction = make_text_result(text.clone(), text, url, Some(title), "file", word_count); let provenance = if ext == "pptx" { IngestionProvenance { diff --git a/crates/noxa-rag/src/pipeline/parse/rich.rs b/crates/noxa-rag/src/pipeline/parse/rich.rs index 27cb8ff..4200c36 100644 --- a/crates/noxa-rag/src/pipeline/parse/rich.rs +++ b/crates/noxa-rag/src/pipeline/parse/rich.rs @@ -40,7 +40,7 @@ pub(crate) fn parse_subtitle_file(bytes: Vec, file_url: String, title: Strin let content = String::from_utf8_lossy(&bytes).into_owned(); let text = strip_subtitle_timestamps(&content); let provenance = subtitle_provenance(&content); - let word_count = text.split_whitespace().count(); + let word_count = crate::chunker::word_count(&text); ParsedFile { extraction: make_text_result( text.clone(), @@ -101,7 +101,7 @@ fn parse_email_text( .params .contains_key("filename") }); - let word_count = body.split_whitespace().count(); + let word_count = crate::chunker::word_count(&body); let mut extraction = make_text_result(body.clone(), body, file_url, subject, "email", word_count); @@ -195,7 +195,7 @@ fn parse_feed_text( parts.push(extract_xml_text(content).unwrap_or_else(|_| content.to_string())); } 
let text = parts.join("\n\n"); - let word_count = text.split_whitespace().count(); + let word_count = crate::chunker::word_count(&text); let mut extraction = make_text_result( text.clone(), diff --git a/crates/noxa-rag/src/pipeline/parse/text.rs b/crates/noxa-rag/src/pipeline/parse/text.rs index 2ca7d1f..c783642 100644 --- a/crates/noxa-rag/src/pipeline/parse/text.rs +++ b/crates/noxa-rag/src/pipeline/parse/text.rs @@ -32,7 +32,7 @@ pub(crate) fn parse_ipynb_file( } let text = parts.join("\n\n"); - let word_count = text.split_whitespace().count(); + let word_count = crate::chunker::word_count(&text); Ok(ParsedFile { extraction: make_text_result(text.clone(), text, url, Some(title), "notebook", word_count), provenance: IngestionProvenance::default(), @@ -63,7 +63,7 @@ pub(crate) fn parse_json_file( pub(crate) fn parse_markdown_file(bytes: Vec, file_url: String, title: String) -> ParsedFile { let content = String::from_utf8_lossy(&bytes).into_owned(); - let word_count = content.split_whitespace().count(); + let word_count = crate::chunker::word_count(&content); ParsedFile { extraction: make_text_result( content, @@ -79,7 +79,7 @@ pub(crate) fn parse_markdown_file(bytes: Vec, file_url: String, title: Strin pub(crate) fn parse_plain_text_file(bytes: Vec, file_url: String, title: String) -> ParsedFile { let content = String::from_utf8_lossy(&bytes).into_owned(); - let word_count = content.split_whitespace().count(); + let word_count = crate::chunker::word_count(&content); ParsedFile { extraction: make_text_result( content.clone(), @@ -96,7 +96,7 @@ pub(crate) fn parse_plain_text_file(bytes: Vec, file_url: String, title: Str pub(crate) fn parse_log_file(bytes: Vec, file_url: String, title: String) -> ParsedFile { let raw = String::from_utf8_lossy(&bytes).into_owned(); let stripped = strip_ansi_escapes::strip_str(&raw); - let word_count = stripped.split_whitespace().count(); + let word_count = crate::chunker::word_count(&stripped); ParsedFile { extraction: 
make_text_result( stripped.clone(), @@ -137,7 +137,7 @@ pub(crate) fn parse_jsonl_file(bytes: Vec, file_url: String, title: String) }) .collect::>() .join("\n\n"); - let word_count = text.split_whitespace().count(); + let word_count = crate::chunker::word_count(&text); ParsedFile { extraction: make_text_result( text.clone(), @@ -182,7 +182,7 @@ pub(crate) fn parse_xml_file( tracing::warn!(error = %e, "xml text extraction failed; falling back to raw text"); content.clone() }); - let word_count = text.split_whitespace().count(); + let word_count = crate::chunker::word_count(&text); Ok(ParsedFile { extraction: make_text_result( text.clone(), diff --git a/crates/noxa-rag/src/pipeline/process.rs b/crates/noxa-rag/src/pipeline/process.rs index c0a23d8..a23159e 100644 --- a/crates/noxa-rag/src/pipeline/process.rs +++ b/crates/noxa-rag/src/pipeline/process.rs @@ -275,9 +275,6 @@ pub(crate) async fn process_job(job: IndexJob, ctx: &WorkerContext) -> Result> = paths - .chunks(STARTUP_SCAN_BATCH) - .map(|c| c.to_vec()) - .collect(); - - stream::iter(batches) + stream::iter(paths.chunks(STARTUP_SCAN_BATCH).map(|c| c.to_vec())) .for_each_concurrent(scan_concurrency, |batch| { let tx = tx.clone(); let store = store.clone(); From 47d420bcd4b6703b147447d6884e7754ecc75010 Mon Sep 17 00:00:00 2001 From: Jacob Magar Date: Mon, 27 Apr 2026 06:49:17 -0400 Subject: [PATCH 22/23] fix: address PR #14 review comments MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Remove mcp_bridge.rs (was deleted on disk but not staged in commit 59a195c; caused E0761 duplicate module root alongside mcp_bridge/) - mcp_bridge/io.rs: remove destination before rename so write_bridge_document is idempotent on Windows (rename fails if dst exists) - parse/text.rs: scan raw bytes in contains_xml_entity_expansion_risk instead of from_utf8().unwrap_or("") — non-UTF-8 bytes no longer silently suppress the &'static str { - match self { - Self::Linkding => "linkding", - 
Self::Memos => "memos", - Self::Bytestash => "bytestash", - Self::Paperless => "paperless", - } - } -} - -#[derive(Debug, Clone, PartialEq, Eq)] -pub struct BridgeConfig { - pub server: String, - /// Directory where bridge JSON files are written. Must be one of the dirs - /// listed in the daemon's `source.watch_dirs` so the daemon picks them up. - pub watch_dir: PathBuf, - pub page_size: u32, - pub platform_base_url: Option, -} - -#[derive(Debug, Clone, Default, PartialEq, Eq)] -pub struct SyncReport { - pub fetched: usize, - pub written: usize, - pub skipped: usize, -} - -#[derive(Debug, Clone)] -pub struct BridgeDocument { - pub source: McpSource, - pub external_id: String, - pub platform_url: Option, - pub extraction: ExtractionResult, -} - -#[derive(Debug, Clone, Copy, PartialEq, Eq)] -pub enum WriteStatus { - Written, - Unchanged, -} - -#[async_trait] -pub trait McporterExecutor: Send + Sync { - async fn call( - &self, - server: &str, - service: McpSource, - action: &str, - params: Value, - ) -> Result; -} - -#[derive(Debug, Clone)] -pub struct ProcessMcporterExecutor { - executable: String, -} - -impl ProcessMcporterExecutor { - pub fn new(executable: impl Into) -> Self { - Self { - executable: executable.into(), - } - } -} - -#[async_trait] -impl McporterExecutor for ProcessMcporterExecutor { - async fn call( - &self, - server: &str, - service: McpSource, - action: &str, - params: Value, - ) -> Result { - let selector = format!("{}.{}", server, service.as_str()); - let args = serde_json::json!({ - "action": action, - "params": params, - }); - let output = Command::new(&self.executable) - .arg("call") - .arg(&selector) - .arg("--args") - .arg(args.to_string()) - .arg("--output") - .arg("json") - .output() - .await - .map_err(|e| RagError::Generic(format!("failed to execute mcporter: {e}")))?; - - if !output.status.success() { - return Err(RagError::Generic(format!( - "mcporter call {} {} failed: {}", - selector, - action, - 
String::from_utf8_lossy(&output.stderr).trim() - ))); - } - - serde_json::from_slice(&output.stdout) - .map_err(|e| RagError::Generic(format!("mcporter returned invalid JSON: {e}"))) - } -} - -pub struct McpBridge { - executor: E, - config: BridgeConfig, -} - -impl McpBridge -where - E: McporterExecutor, -{ - pub fn new(executor: E, config: BridgeConfig) -> Self { - Self { executor, config } - } - - pub async fn sync(&self, source: McpSource) -> Result { - match source { - McpSource::Linkding => self.sync_linkding().await, - McpSource::Memos => self.sync_memos().await, - McpSource::Bytestash => self.sync_bytestash().await, - McpSource::Paperless => self.sync_paperless().await, - } - } - - async fn sync_linkding(&self) -> Result { - let mut report = SyncReport::default(); - let mut offset = 0_u32; - let mut seen_ids = HashSet::new(); - - loop { - let data = self - .call_data( - McpSource::Linkding, - "bookmark.list", - serde_json::json!({ - "limit": self.config.page_size, - "offset": offset, - }), - ) - .await?; - let records = array_field(&data, "results")?; - if records.is_empty() { - break; - } - - let mut new_records = 0_usize; - for record in records { - let document = - normalize_linkding_record(record, self.config.platform_base_url.as_deref())?; - if !seen_ids.insert(document.external_id.clone()) { - continue; - } - new_records += 1; - report.fetched += 1; - match write_bridge_document(&self.config.watch_dir, &document).await? 
{ - WriteStatus::Written => report.written += 1, - WriteStatus::Unchanged => report.skipped += 1, - } - } - - if data.get("next").is_none() || data.get("next").is_some_and(Value::is_null) { - break; - } - if new_records == 0 { - break; - } - offset = offset.saturating_add(self.config.page_size.max(1)); - } - - Ok(report) - } - - async fn sync_memos(&self) -> Result { - let base_url = required_base_url(&self.config, McpSource::Memos)?; - let mut report = SyncReport::default(); - let mut page_token: Option = None; - - loop { - let mut params = serde_json::json!({ "page_size": self.config.page_size }); - if let Some(token) = &page_token { - params["page_token"] = Value::String(token.clone()); - } - let data = self - .call_data(McpSource::Memos, "memos.list", params) - .await?; - let records = array_field(&data, "memos")?; - if records.is_empty() { - break; - } - - for record in records { - let document = normalize_memo_record(record, base_url)?; - report.fetched += 1; - match write_bridge_document(&self.config.watch_dir, &document).await? { - WriteStatus::Written => report.written += 1, - WriteStatus::Unchanged => report.skipped += 1, - } - } - - let next_page_token = data - .get("nextPageToken") - .and_then(Value::as_str) - .map(str::trim) - .filter(|value| !value.is_empty()) - .map(ToOwned::to_owned); - if next_page_token.is_none() || next_page_token == page_token { - break; - } - page_token = next_page_token; - } - - Ok(report) - } - - async fn sync_bytestash(&self) -> Result { - let base_url = required_base_url(&self.config, McpSource::Bytestash)?; - let data = self - .call_data(McpSource::Bytestash, "snippets.list", serde_json::json!({})) - .await?; - let records = if let Some(array) = data.as_array() { - array.iter().collect::>() - } else { - array_field(&data, "snippets")? 
- }; - - let mut report = SyncReport::default(); - for record in records { - let document = normalize_bytestash_record(record, base_url)?; - report.fetched += 1; - match write_bridge_document(&self.config.watch_dir, &document).await? { - WriteStatus::Written => report.written += 1, - WriteStatus::Unchanged => report.skipped += 1, - } - } - - Ok(report) - } - - async fn sync_paperless(&self) -> Result { - let base_url = required_base_url(&self.config, McpSource::Paperless)?; - let tag_names = self.fetch_paperless_lookup("tags.list").await?; - let correspondent_names = self.fetch_paperless_lookup("correspondents.list").await?; - let mut report = SyncReport::default(); - let mut page = 1_u32; - - loop { - let data = self - .call_data( - McpSource::Paperless, - "documents.list", - serde_json::json!({ - "page_size": self.config.page_size, - "page": page, - }), - ) - .await?; - let records = array_field(&data, "results")?; - if records.is_empty() { - break; - } - - for record in records { - let document = - normalize_paperless_record(record, &tag_names, &correspondent_names, base_url)?; - report.fetched += 1; - match write_bridge_document(&self.config.watch_dir, &document).await? { - WriteStatus::Written => report.written += 1, - WriteStatus::Unchanged => report.skipped += 1, - } - } - - if data.get("next").is_none() || data.get("next").is_some_and(Value::is_null) { - break; - } - page = page.saturating_add(1); - } - - Ok(report) - } - - async fn call_data( - &self, - source: McpSource, - action: &str, - params: Value, - ) -> Result { - let raw = self - .executor - .call(&self.config.server, source, action, params) - .await?; - extract_mcporter_data(raw) - } - - async fn fetch_paperless_lookup(&self, action: &str) -> Result, RagError> { - let data = self - .call_data(McpSource::Paperless, action, serde_json::json!({})) - .await?; - let items = if let Some(array) = data.as_array() { - array.iter().collect::>() - } else { - array_field(&data, "results")? 
- }; - let mut lookup = HashMap::new(); - for item in items { - let Some(id) = item.get("id").and_then(as_u64) else { - continue; - }; - let Some(name) = item.get("name").and_then(Value::as_str) else { - continue; - }; - lookup.insert(id, name.to_string()); - } - Ok(lookup) - } -} - -pub fn relative_output_path(source: McpSource, external_id: &str) -> PathBuf { - PathBuf::from("mcp").join(source.as_str()).join(format!( - "{}-{:016x}.json", - sanitize_component(external_id), - stable_component_hash(external_id) - )) -} - -pub async fn write_bridge_document( - root: &Path, - document: &BridgeDocument, -) -> Result { - let path = root.join(relative_output_path(document.source, &document.external_id)); - if let Some(parent) = path.parent() { - tokio::fs::create_dir_all(parent).await?; - } - - let payload = StoredExtractionResult { - extraction: document.extraction.clone(), - external_id: Some(document.external_id.clone()), - platform_url: document.platform_url.clone(), - }; - let serialized = serde_json::to_vec_pretty(&payload)?; - if tokio::fs::read(&path).await.ok().as_deref() == Some(serialized.as_slice()) { - return Ok(WriteStatus::Unchanged); - } - - let tmp_path = temp_output_path(&path); - tokio::fs::write(&tmp_path, &serialized).await?; - tokio::fs::rename(&tmp_path, &path).await?; - Ok(WriteStatus::Written) -} - -pub fn normalize_linkding_record( - record: &Value, - platform_base_url: Option<&str>, -) -> Result { - let id = required_value(record, "id").and_then(as_u64_value)?; - let url = required_string(record, "url")?; - let title = optional_string(record, "title"); - let description = optional_string(record, "description"); - let notes = optional_string(record, "notes"); - let markdown = join_non_empty([ - title.as_deref().map(|value| format!("# {value}")), - description.clone(), - notes.clone(), - ]); - let plain_text = join_non_empty([title.clone(), description, notes]); - let platform_url = match platform_base_url { - Some(base) => 
Some(linkding_platform_url(base, &url)?), - None => None, - }; - - Ok(BridgeDocument { - source: McpSource::Linkding, - external_id: format!("linkding:{id}"), - platform_url, - extraction: build_extraction( - url, - title, - optional_string(record, "date_added"), - None, - None, - string_array(record.get("tag_names")), - markdown, - plain_text, - ), - }) -} - -pub fn normalize_memo_record( - record: &Value, - platform_base_url: &str, -) -> Result { - let name = required_string(record, "name")?; - let memo_id = name.strip_prefix("memos/").unwrap_or(&name).to_string(); - let content = required_string(record, "content")?; - let title = first_line_title(&content); - let url = join_base_url(platform_base_url, &format!("/api/v1/{name}"))?; - let published_date = - optional_string(record, "displayTime").or_else(|| optional_string(record, "createTime")); - - Ok(BridgeDocument { - source: McpSource::Memos, - external_id: format!("memos:{memo_id}"), - platform_url: Some(url.clone()), - extraction: build_extraction( - url, - title, - published_date, - None, - None, - string_array(record.get("tags")), - content.clone(), - content, - ), - }) -} - -pub fn normalize_bytestash_record( - record: &Value, - platform_base_url: &str, -) -> Result { - let id = required_string(record, "id")?; - let title = optional_string(record, "title"); - let description = optional_string(record, "description"); - let language = optional_string(record, "language"); - let fragments = record - .get("fragments") - .and_then(Value::as_array) - .ok_or_else(|| RagError::Parse("bytestash record missing fragments array".to_string()))?; - - let mut markdown_parts = Vec::new(); - if let Some(value) = title.as_deref() { - markdown_parts.push(format!("# {value}")); - } - if let Some(value) = description.as_deref() { - markdown_parts.push(value.to_string()); - } - for fragment in fragments { - let file_name = fragment - .get("fileName") - .or_else(|| fragment.get("file_name")) - .and_then(Value::as_str) - 
.unwrap_or("snippet"); - let code = fragment - .get("code") - .and_then(Value::as_str) - .unwrap_or_default(); - markdown_parts.push(format!( - "## {file_name}\n```{}\n{}\n```", - language.clone().unwrap_or_default(), - code - )); - } - let plain_text = join_non_empty([ - title.clone(), - description.clone(), - Some( - fragments - .iter() - .filter_map(|fragment| fragment.get("code").and_then(Value::as_str)) - .collect::>() - .join("\n\n"), - ), - ]); - let url = join_base_url(platform_base_url, &format!("/api/snippets/{id}"))?; - - Ok(BridgeDocument { - source: McpSource::Bytestash, - external_id: format!("bytestash:{id}"), - platform_url: Some(url.clone()), - extraction: build_extraction( - url, - title, - None, - None, - language, - string_array(record.get("categories")), - markdown_parts.join("\n\n"), - plain_text, - ), - }) -} - -pub fn normalize_paperless_record( - record: &Value, - tag_names: &std::collections::HashMap, - correspondent_names: &std::collections::HashMap, - platform_base_url: &str, -) -> Result { - let id = required_value(record, "id").and_then(as_u64_value)?; - let tags = record - .get("tags") - .and_then(Value::as_array) - .map(|items| { - items - .iter() - .filter_map(as_u64) - .filter_map(|value| tag_names.get(&value).cloned()) - .collect::>() - }) - .unwrap_or_default(); - let author = record - .get("correspondent") - .and_then(as_u64) - .and_then(|value| correspondent_names.get(&value).cloned()); - let title = optional_string(record, "title"); - let content = optional_string(record, "content").unwrap_or_default(); - let url = join_base_url(platform_base_url, &format!("/api/documents/{id}/"))?; - - Ok(BridgeDocument { - source: McpSource::Paperless, - external_id: format!("paperless:{id}"), - platform_url: Some(url.clone()), - extraction: build_extraction( - url, - title, - optional_string(record, "created").or_else(|| optional_string(record, "created_date")), - author, - None, - tags, - content.clone(), - content, - ), - }) -} - -fn 
sanitize_component(value: &str) -> String { - value - .chars() - .map(|ch| match ch { - 'a'..='z' | 'A'..='Z' | '0'..='9' | '-' | '_' => ch, - _ => '_', - }) - .collect() -} - -fn stable_component_hash(value: &str) -> u64 { - use std::hash::{DefaultHasher, Hasher}; - let mut hasher = DefaultHasher::new(); - hasher.write(value.as_bytes()); - hasher.finish() -} - -fn temp_output_path(path: &Path) -> PathBuf { - let suffix = format!( - "tmp-{}-{}", - std::process::id(), - uuid::Uuid::new_v4().simple() - ); - path.with_extension(format!("json.{suffix}")) -} - -#[derive(Debug, Serialize, Deserialize)] -struct StoredExtractionResult { - #[serde(flatten)] - extraction: ExtractionResult, - #[serde(skip_serializing_if = "Option::is_none")] - external_id: Option, - #[serde(skip_serializing_if = "Option::is_none")] - platform_url: Option, -} - -fn count_words(value: &str) -> usize { - value.split_whitespace().count() -} - -fn extract_mcporter_data(raw: Value) -> Result { - let ok = raw.get("ok").and_then(Value::as_bool).unwrap_or(true); - if ok { - raw.get("data") - .cloned() - .ok_or_else(|| RagError::Parse("mcporter response missing data".to_string())) - } else { - let message = raw - .get("error") - .and_then(|value| value.get("message")) - .and_then(Value::as_str) - .unwrap_or("unknown mcporter error"); - Err(RagError::Generic(message.to_string())) - } -} - -fn array_field<'a>(value: &'a Value, key: &str) -> Result, RagError> { - value - .get(key) - .and_then(Value::as_array) - .map(|items| items.iter().collect::>()) - .ok_or_else(|| RagError::Parse(format!("expected array field {key}"))) -} - -fn required_base_url(config: &BridgeConfig, source: McpSource) -> Result<&str, RagError> { - config.platform_base_url.as_deref().ok_or_else(|| { - RagError::Config(format!("{} requires --platform-base-url", source.as_str())) - }) -} - -fn join_base_url(base: &str, path: &str) -> Result { - let base = base.trim_end_matches('/'); - let url = format!("{base}{path}"); - 
Url::parse(&url) - .map(|parsed| parsed.to_string()) - .map_err(|e| RagError::Parse(format!("invalid base URL {base:?}: {e}"))) -} - -fn linkding_platform_url(base: &str, bookmark_url: &str) -> Result { - let mut url = Url::parse(base) - .map_err(|e| RagError::Parse(format!("invalid linkding base URL {base:?}: {e}")))?; - let current_path = url.path().trim_end_matches('/'); - let next_path = if current_path.is_empty() { - "/bookmarks".to_string() - } else { - format!("{current_path}/bookmarks") - }; - url.set_path(&next_path); - let query = url::form_urlencoded::Serializer::new(String::new()) - .append_pair("q", bookmark_url) - .finish(); - url.set_query(Some(&query)); - Ok(url.to_string()) -} - -fn required_value<'a>(value: &'a Value, key: &str) -> Result<&'a Value, RagError> { - value - .get(key) - .ok_or_else(|| RagError::Parse(format!("missing required field {key}"))) -} - -fn required_string(value: &Value, key: &str) -> Result { - value - .get(key) - .and_then(Value::as_str) - .map(ToOwned::to_owned) - .ok_or_else(|| RagError::Parse(format!("missing required string field {key}"))) -} - -fn optional_string(value: &Value, key: &str) -> Option { - value - .get(key) - .and_then(Value::as_str) - .map(ToOwned::to_owned) -} - -fn as_u64(value: &Value) -> Option { - value - .as_u64() - .or_else(|| value.as_str().and_then(|raw| raw.parse::().ok())) -} - -fn as_u64_value(value: &Value) -> Result { - as_u64(value).ok_or_else(|| RagError::Parse("expected integer id".to_string())) -} - -fn string_array(value: Option<&Value>) -> Vec { - value - .and_then(Value::as_array) - .map(|items| { - items - .iter() - .filter_map(|item| { - item.as_str().map(ToOwned::to_owned).or_else(|| { - item.get("name") - .and_then(Value::as_str) - .map(ToOwned::to_owned) - }) - }) - .collect::>() - }) - .unwrap_or_default() -} - -fn join_non_empty(parts: I) -> String -where - I: IntoIterator>, -{ - parts - .into_iter() - .flatten() - .map(|value| value.trim().to_string()) - .filter(|value| 
!value.is_empty()) - .collect::>() - .join("\n\n") -} - -fn first_line_title(content: &str) -> Option { - content - .lines() - .map(str::trim) - .find(|line| !line.is_empty() && !line.starts_with('#')) - .map(|line| line.chars().take(80).collect::()) -} - -#[allow(clippy::too_many_arguments)] -fn build_extraction( - url: String, - title: Option, - published_date: Option, - author: Option, - language: Option, - technologies: Vec, - markdown: String, - plain_text: String, -) -> ExtractionResult { - ExtractionResult { - metadata: Metadata { - title, - description: None, - author, - published_date, - language, - url: Some(url), - site_name: None, - image: None, - favicon: None, - word_count: count_words(&plain_text), - content_hash: None, - source_type: Some("mcp".to_string()), - file_path: None, - last_modified: None, - is_truncated: None, - technologies, - seed_url: None, - crawl_depth: None, - search_query: None, - fetched_at: None, - }, - content: Content { - markdown, - plain_text, - links: Vec::new(), - images: Vec::new(), - code_blocks: Vec::new(), - raw_html: None, - }, - domain_data: None, - vertical_data: None, - structured_data: Vec::new(), - } -} - -#[cfg(test)] -mod tests { - use std::collections::{HashMap, VecDeque}; - use std::sync::{Arc, Mutex}; - - use super::*; - - type ExecutorCall = (String, McpSource, String, Value); - - #[derive(Clone, Default)] - struct MockExecutor { - calls: Arc>>, - responses: Arc>>>, - } - - impl MockExecutor { - fn with_responses(responses: Vec>) -> Self { - Self { - calls: Arc::new(Mutex::new(Vec::new())), - responses: Arc::new(Mutex::new(VecDeque::from(responses))), - } - } - - fn calls(&self) -> Vec { - self.calls.lock().expect("calls lock").clone() - } - } - - #[async_trait] - impl McporterExecutor for MockExecutor { - async fn call( - &self, - server: &str, - service: McpSource, - action: &str, - params: Value, - ) -> Result { - self.calls.lock().expect("calls lock").push(( - server.to_string(), - service, - 
action.to_string(), - params, - )); - self.responses - .lock() - .expect("responses lock") - .pop_front() - .expect("response queued") - } - } - - #[test] - fn relative_output_path_sanitizes_external_ids() { - let path = relative_output_path(McpSource::Memos, "memos/G3BLCD3swV4Fuxk4uMT97d"); - - let rendered = path.display().to_string(); - assert!(rendered.starts_with("mcp/memos/memos_G3BLCD3swV4Fuxk4uMT97d-")); - assert!(rendered.ends_with(".json")); - } - - #[test] - fn relative_output_path_distinguishes_colliding_sanitized_ids() { - let first = relative_output_path(McpSource::Memos, "memos/G3"); - let second = relative_output_path(McpSource::Memos, "memos_G3"); - assert_ne!(first, second); - } - - #[test] - fn normalize_linkding_record_maps_bookmark_fields() { - let record = serde_json::json!({ - "id": 441, - "url": "https://pipenet.dev/", - "title": "pipenet", - "description": "", - "notes": "MCP client", - "tag_names": ["mcp", "rust"], - "date_added": "2026-02-02T15:23:27.821564-05:00" - }); - - let document = - normalize_linkding_record(&record, Some("https://ding.tootie.tv")).expect("maps"); - - assert_eq!(document.external_id, "linkding:441"); - assert_eq!( - document.extraction.metadata.url.as_deref(), - Some("https://pipenet.dev/") - ); - assert_eq!( - document.platform_url.as_deref(), - Some("https://ding.tootie.tv/bookmarks?q=https%3A%2F%2Fpipenet.dev%2F") - ); - assert_eq!( - document.extraction.metadata.technologies, - vec!["mcp".to_string(), "rust".to_string()] - ); - assert_eq!( - document.extraction.metadata.published_date.as_deref(), - Some("2026-02-02T15:23:27.821564-05:00") - ); - assert!(document.extraction.content.markdown.contains("MCP client")); - } - - #[test] - fn normalize_linkding_record_preserves_base_path_prefix() { - let record = serde_json::json!({ - "id": 12, - "url": "https://example.com/doc", - "title": "Doc", - "description": "", - "notes": "" - }); - - let document = - normalize_linkding_record(&record, 
Some("https://ding.example/app")).expect("maps"); - - assert_eq!( - document.platform_url.as_deref(), - Some("https://ding.example/app/bookmarks?q=https%3A%2F%2Fexample.com%2Fdoc") - ); - } - - #[test] - fn normalize_memo_record_maps_tags_and_dates() { - let record = serde_json::json!({ - "name": "memos/G3BLCD3swV4Fuxk4uMT97d", - "content": "Use `docker inspect`.\n#docker #cmd", - "tags": ["docker", "cmd"], - "createTime": "2024-05-07T00:26:59Z", - "displayTime": "2024-05-17T22:39:50Z" - }); - - let document = normalize_memo_record(&record, "https://memos.example.com").expect("maps"); - - assert_eq!(document.external_id, "memos:G3BLCD3swV4Fuxk4uMT97d"); - assert_eq!( - document.extraction.metadata.url.as_deref(), - Some("https://memos.example.com/api/v1/memos/G3BLCD3swV4Fuxk4uMT97d") - ); - assert_eq!( - document.platform_url.as_deref(), - Some("https://memos.example.com/api/v1/memos/G3BLCD3swV4Fuxk4uMT97d") - ); - assert_eq!( - document.extraction.metadata.published_date.as_deref(), - Some("2024-05-17T22:39:50Z") - ); - assert_eq!( - document.extraction.metadata.technologies, - vec!["docker".to_string(), "cmd".to_string()] - ); - } - - #[test] - fn normalize_bytestash_record_maps_fragments_and_categories() { - let record = serde_json::json!({ - "id": "abc123", - "title": "docker env", - "description": "Inspect container env vars", - "language": "bash", - "categories": ["docker", "ops"], - "fragments": [ - { - "fileName": "inspect.sh", - "code": "docker inspect my-container | jq '.[0].Config.Env'" - } - ] - }); - - let document = - normalize_bytestash_record(&record, "https://stash.example.com").expect("maps"); - - assert_eq!(document.external_id, "bytestash:abc123"); - assert_eq!( - document.extraction.metadata.url.as_deref(), - Some("https://stash.example.com/api/snippets/abc123") - ); - assert_eq!( - document.extraction.metadata.language.as_deref(), - Some("bash") - ); - assert_eq!( - document.extraction.metadata.technologies, - vec!["docker".to_string(), 
"ops".to_string()] - ); - assert!(document.extraction.content.markdown.contains("```bash\n")); - } - - #[test] - fn normalize_paperless_record_maps_tag_ids_and_correspondent() { - let record = serde_json::json!({ - "id": 6, - "title": "Alcohol Server Training Certificate", - "content": "SERVSAFE certificate text", - "tags": [1, 4], - "correspondent": 9, - "created": "2026-04-01" - }); - let tag_names = HashMap::from([ - (1_u64, "paperless".to_string()), - (4_u64, "certification".to_string()), - ]); - let correspondent_names = HashMap::from([(9_u64, "ServSafe".to_string())]); - - let document = normalize_paperless_record( - &record, - &tag_names, - &correspondent_names, - "https://paperless.example.com", - ) - .expect("maps"); - - assert_eq!(document.external_id, "paperless:6"); - assert_eq!( - document.extraction.metadata.url.as_deref(), - Some("https://paperless.example.com/api/documents/6/") - ); - assert_eq!( - document.extraction.metadata.author.as_deref(), - Some("ServSafe") - ); - assert_eq!( - document.extraction.metadata.published_date.as_deref(), - Some("2026-04-01") - ); - assert_eq!( - document.extraction.metadata.technologies, - vec!["paperless".to_string(), "certification".to_string()] - ); - } - - #[tokio::test] - async fn write_bridge_document_skips_unchanged_payloads() { - let temp = tempfile::tempdir().expect("tempdir"); - let document = BridgeDocument { - source: McpSource::Linkding, - external_id: "linkding:441".to_string(), - platform_url: Some( - "https://ding.tootie.tv/bookmarks?q=https%3A%2F%2Fpipenet.dev%2F".to_string(), - ), - extraction: build_extraction( - "https://pipenet.dev/".to_string(), - Some("pipenet".to_string()), - Some("2026-02-02T15:23:27.821564-05:00".to_string()), - None, - None, - vec!["mcp".to_string()], - "# pipenet".to_string(), - "pipenet".to_string(), - ), - }; - - let first = write_bridge_document(temp.path(), &document) - .await - .expect("first write"); - let second = write_bridge_document(temp.path(), &document) - 
.await - .expect("second write"); - - assert_eq!(first, WriteStatus::Written); - assert_eq!(second, WriteStatus::Unchanged); - } - - #[tokio::test] - async fn sync_linkding_pages_and_writes_documents() { - let temp = tempfile::tempdir().expect("tempdir"); - let executor = MockExecutor::with_responses(vec![ - Ok(serde_json::json!({ - "ok": true, - "data": { - "results": [ - { - "id": 1, - "url": "https://example.com/one", - "title": "One", - "description": "", - "notes": "", - "tag_names": ["rust"], - "date_added": "2026-04-01T00:00:00Z" - } - ], - "next": "https://ding.example/api/bookmarks/?limit=1&offset=1" - } - })), - Ok(serde_json::json!({ - "ok": true, - "data": { - "results": [ - { - "id": 2, - "url": "https://example.com/two", - "title": "Two", - "description": "Second", - "notes": "", - "tag_names": [], - "date_added": "2026-04-02T00:00:00Z" - } - ], - "next": null - } - })), - ]); - let bridge = McpBridge::new( - executor.clone(), - BridgeConfig { - server: "lab".to_string(), - watch_dir: temp.path().to_path_buf(), - page_size: 1, - platform_base_url: Some("https://ding.example".to_string()), - }, - ); - - let report = bridge.sync(McpSource::Linkding).await.expect("sync"); - - assert_eq!( - report, - SyncReport { - fetched: 2, - written: 2, - skipped: 0 - } - ); - let calls = executor.calls(); - assert_eq!(calls.len(), 2); - assert_eq!(calls[0].0, "lab"); - assert_eq!(calls[0].1, McpSource::Linkding); - assert_eq!(calls[0].2, "bookmark.list"); - assert_eq!(calls[0].3, serde_json::json!({ "limit": 1, "offset": 0 })); - assert_eq!(calls[1].3, serde_json::json!({ "limit": 1, "offset": 1 })); - assert!( - temp.path() - .join(relative_output_path(McpSource::Linkding, "linkding:1")) - .exists() - ); - assert!( - temp.path() - .join(relative_output_path(McpSource::Linkding, "linkding:2")) - .exists() - ); - } - - #[tokio::test] - async fn sync_linkding_stops_when_pagination_repeats_same_records() { - let temp = tempfile::tempdir().expect("tempdir"); - let 
executor = MockExecutor::with_responses(vec![ - Ok(serde_json::json!({ - "ok": true, - "data": { - "results": [ - { - "id": 1, - "url": "https://example.com/one", - "title": "One", - "description": "", - "notes": "", - "tag_names": [], - "date_added": "2026-04-01T00:00:00Z" - } - ], - "next": "https://ding.example/api/bookmarks/?limit=1&offset=1" - } - })), - Ok(serde_json::json!({ - "ok": true, - "data": { - "results": [ - { - "id": 1, - "url": "https://example.com/one", - "title": "One", - "description": "", - "notes": "", - "tag_names": [], - "date_added": "2026-04-01T00:00:00Z" - } - ], - "next": "https://ding.example/api/bookmarks/?limit=1&offset=2" - } - })), - ]); - let bridge = McpBridge::new( - executor.clone(), - BridgeConfig { - server: "lab".to_string(), - watch_dir: temp.path().to_path_buf(), - page_size: 1, - platform_base_url: Some("https://ding.example".to_string()), - }, - ); - - let report = bridge.sync(McpSource::Linkding).await.expect("sync"); - - assert_eq!( - report, - SyncReport { - fetched: 1, - written: 1, - skipped: 0 - } - ); - let calls = executor.calls(); - assert_eq!(calls.len(), 2); - assert!( - temp.path() - .join(relative_output_path(McpSource::Linkding, "linkding:1")) - .exists() - ); - } - - #[tokio::test] - async fn sync_memos_uses_page_tokens_and_writes_documents() { - let temp = tempfile::tempdir().expect("tempdir"); - let executor = MockExecutor::with_responses(vec![ - Ok(serde_json::json!({ - "ok": true, - "data": { - "memos": [ - { - "name": "memos/first", - "content": "first memo", - "tags": ["ops"], - "displayTime": "2026-04-01T00:00:00Z" - } - ], - "nextPageToken": "page-2" - } - })), - Ok(serde_json::json!({ - "ok": true, - "data": { - "memos": [ - { - "name": "memos/second", - "content": "second memo", - "tags": [], - "createTime": "2026-04-02T00:00:00Z" - } - ], - "nextPageToken": "" - } - })), - ]); - let bridge = McpBridge::new( - executor.clone(), - BridgeConfig { - server: "lab".to_string(), - watch_dir: 
temp.path().to_path_buf(), - page_size: 1, - platform_base_url: Some("https://memos.example.com".to_string()), - }, - ); - - let report = bridge.sync(McpSource::Memos).await.expect("sync memos"); - - assert_eq!( - report, - SyncReport { - fetched: 2, - written: 2, - skipped: 0 - } - ); - let calls = executor.calls(); - assert_eq!(calls.len(), 2); - assert_eq!(calls[0].2, "memos.list"); - assert_eq!(calls[0].3, serde_json::json!({ "page_size": 1 })); - assert_eq!( - calls[1].3, - serde_json::json!({ "page_size": 1, "page_token": "page-2" }) - ); - assert!( - temp.path() - .join(relative_output_path(McpSource::Memos, "memos:first")) - .exists() - ); - assert!( - temp.path() - .join(relative_output_path(McpSource::Memos, "memos:second")) - .exists() - ); - } - - #[tokio::test] - async fn sync_memos_stops_when_next_page_token_repeats() { - let temp = tempfile::tempdir().expect("tempdir"); - let executor = MockExecutor::with_responses(vec![ - Ok(serde_json::json!({ - "ok": true, - "data": { - "memos": [ - { - "name": "memos/first", - "content": "first memo", - "tags": [] - } - ], - "nextPageToken": "page-2" - } - })), - Ok(serde_json::json!({ - "ok": true, - "data": { - "memos": [ - { - "name": "memos/second", - "content": "second memo", - "tags": [] - } - ], - "nextPageToken": "page-2" - } - })), - ]); - let bridge = McpBridge::new( - executor.clone(), - BridgeConfig { - server: "lab".to_string(), - watch_dir: temp.path().to_path_buf(), - page_size: 1, - platform_base_url: Some("https://memos.example.com".to_string()), - }, - ); - - let report = bridge.sync(McpSource::Memos).await.expect("sync memos"); - - assert_eq!( - report, - SyncReport { - fetched: 2, - written: 2, - skipped: 0 - } - ); - assert_eq!(executor.calls().len(), 2); - } - - #[tokio::test] - async fn sync_bytestash_accepts_array_payloads() { - let temp = tempfile::tempdir().expect("tempdir"); - let executor = MockExecutor::with_responses(vec![Ok(serde_json::json!({ - "ok": true, - "data": [ - { - "id": 
"snippet-1", - "title": "docker env", - "description": "Inspect container env vars", - "language": "bash", - "categories": ["docker"], - "fragments": [ - { - "fileName": "inspect.sh", - "code": "docker inspect app" - } - ] - } - ] - }))]); - let bridge = McpBridge::new( - executor.clone(), - BridgeConfig { - server: "lab".to_string(), - watch_dir: temp.path().to_path_buf(), - page_size: 50, - platform_base_url: Some("https://stash.example.com".to_string()), - }, - ); - - let report = bridge - .sync(McpSource::Bytestash) - .await - .expect("sync bytestash"); - - assert_eq!( - report, - SyncReport { - fetched: 1, - written: 1, - skipped: 0 - } - ); - let calls = executor.calls(); - assert_eq!(calls.len(), 1); - assert_eq!(calls[0].2, "snippets.list"); - assert_eq!(calls[0].3, serde_json::json!({})); - assert!( - temp.path() - .join(relative_output_path( - McpSource::Bytestash, - "bytestash:snippet-1" - )) - .exists() - ); - } - - #[tokio::test] - async fn sync_paperless_resolves_lookup_tables_before_documents() { - let temp = tempfile::tempdir().expect("tempdir"); - let executor = MockExecutor::with_responses(vec![ - Ok(serde_json::json!({ - "ok": true, - "data": { - "results": [ - { "id": 1, "name": "finance" } - ] - } - })), - Ok(serde_json::json!({ - "ok": true, - "data": { - "results": [ - { "id": 9, "name": "Bank" } - ] - } - })), - Ok(serde_json::json!({ - "ok": true, - "data": { - "results": [ - { - "id": 17, - "title": "Statement", - "content": "balance: 100", - "tags": [1], - "correspondent": 9, - "created": "2026-04-01" - } - ], - "next": null - } - })), - ]); - let bridge = McpBridge::new( - executor.clone(), - BridgeConfig { - server: "lab".to_string(), - watch_dir: temp.path().to_path_buf(), - page_size: 100, - platform_base_url: Some("https://paperless.example.com".to_string()), - }, - ); - - let report = bridge - .sync(McpSource::Paperless) - .await - .expect("sync paperless"); - - assert_eq!( - report, - SyncReport { - fetched: 1, - written: 1, - 
skipped: 0 - } - ); - let calls = executor.calls(); - assert_eq!(calls.len(), 3); - assert_eq!(calls[0].2, "tags.list"); - assert_eq!(calls[1].2, "correspondents.list"); - assert_eq!(calls[2].2, "documents.list"); - assert_eq!( - calls[2].3, - serde_json::json!({ "page_size": 100, "page": 1 }) - ); - assert!( - temp.path() - .join(relative_output_path(McpSource::Paperless, "paperless:17")) - .exists() - ); - } -} diff --git a/crates/noxa-rag/src/mcp_bridge/io.rs b/crates/noxa-rag/src/mcp_bridge/io.rs index 826df46..d1de788 100644 --- a/crates/noxa-rag/src/mcp_bridge/io.rs +++ b/crates/noxa-rag/src/mcp_bridge/io.rs @@ -36,6 +36,9 @@ pub async fn write_bridge_document( let tmp_path = temp_output_path(&path); tokio::fs::write(&tmp_path, &serialized).await?; + // Remove destination before rename so the operation succeeds on Windows, + // where rename(src, dst) errors when dst already exists. + let _ = tokio::fs::remove_file(&path).await; tokio::fs::rename(&tmp_path, &path).await?; Ok(WriteStatus::Written) } diff --git a/crates/noxa-rag/src/pipeline/parse/tests.rs b/crates/noxa-rag/src/pipeline/parse/tests.rs index bbd8835..024aa1d 100644 --- a/crates/noxa-rag/src/pipeline/parse/tests.rs +++ b/crates/noxa-rag/src/pipeline/parse/tests.rs @@ -1,7 +1,7 @@ use serde_json::json; use std::fs; -use super::{FormatProvenance, IngestionProvenance, build_point_payload, parse_file}; +use super::{FileMetadata, FormatProvenance, IngestionProvenance, build_point_payload, parse_file}; #[tokio::test] async fn parse_file_json_recovers_provenance_fields() { @@ -126,10 +126,11 @@ async fn parse_file_json_keeps_crawler_provenance_in_point_payload() { .metadata .url .as_deref() - .expect("parser should set file url"); + .expect("parser should set file url") + .to_string(); let chunk = crate::types::Chunk { text: parsed.extraction.content.markdown.clone(), - source_url: url.to_string(), + source_url: url.clone(), domain: "example.com".to_string(), chunk_index: 0, total_chunks: 1, @@ -138,14 
+139,12 @@ async fn parse_file_json_keeps_crawler_provenance_in_point_payload() { section_header: None, }; - let payload = build_point_payload( - &chunk, + let file_meta = FileMetadata::from_result_and_provenance( &parsed.extraction, None, &parsed.provenance, - url, - None, ); + let payload = build_point_payload(chunk, &file_meta, &url, None); let json = serde_json::to_value(&payload).expect("serialize payload"); assert_eq!( @@ -210,12 +209,19 @@ fn sample_extraction_with_metadata() -> noxa_core::ExtractionResult { } } +fn sample_file_metadata(provenance: &IngestionProvenance) -> FileMetadata { + FileMetadata::from_result_and_provenance( + &sample_extraction_with_metadata(), + None, + provenance, + ) +} + /// Web variant: external_id/platform_url at the top level, plus seed_url /// and friends falling back to metadata when the variant fields are None. #[test] fn build_point_payload_serializes_web_variant() { let chunk = sample_chunk(); - let extraction = sample_extraction_with_metadata(); let provenance = IngestionProvenance { external_id: Some("linkding:42".to_string()), platform_url: Some("https://platform.example/items/42".to_string()), @@ -225,15 +231,10 @@ fn build_point_payload_serializes_web_variant() { crawl_depth: None, }, }; + let url = chunk.source_url.clone(); + let file_meta = sample_file_metadata(&provenance); - let payload = build_point_payload( - &chunk, - &extraction, - None, - &provenance, - &chunk.source_url, - None, - ); + let payload = build_point_payload(chunk, &file_meta, &url, None); let json = serde_json::to_value(&payload).expect("serialize payload"); assert_eq!( @@ -266,7 +267,6 @@ fn build_point_payload_serializes_web_variant() { #[test] fn build_point_payload_serializes_email_variant() { let chunk = sample_chunk(); - let extraction = sample_extraction_with_metadata(); let provenance = IngestionProvenance { external_id: Some("msg@example.com".to_string()), platform_url: None, @@ -277,15 +277,10 @@ fn 
build_point_payload_serializes_email_variant() { has_attachments: Some(true), }, }; + let url = chunk.source_url.clone(); + let file_meta = sample_file_metadata(&provenance); - let payload = build_point_payload( - &chunk, - &extraction, - None, - &provenance, - &chunk.source_url, - None, - ); + let payload = build_point_payload(chunk, &file_meta, &url, None); let json = serde_json::to_value(&payload).expect("serialize payload"); assert_eq!( @@ -312,7 +307,6 @@ fn build_point_payload_serializes_email_variant() { #[test] fn build_point_payload_serializes_feed_variant() { let chunk = sample_chunk(); - let extraction = sample_extraction_with_metadata(); let provenance = IngestionProvenance { external_id: Some("entry-1".to_string()), platform_url: None, @@ -321,15 +315,10 @@ fn build_point_payload_serializes_feed_variant() { item_id: Some("entry-1".to_string()), }, }; + let url = chunk.source_url.clone(); + let file_meta = sample_file_metadata(&provenance); - let payload = build_point_payload( - &chunk, - &extraction, - None, - &provenance, - &chunk.source_url, - None, - ); + let payload = build_point_payload(chunk, &file_meta, &url, None); let json = serde_json::to_value(&payload).expect("serialize payload"); assert_eq!( @@ -346,7 +335,6 @@ fn build_point_payload_serializes_feed_variant() { #[test] fn build_point_payload_serializes_presentation_variant() { let chunk = sample_chunk(); - let extraction = sample_extraction_with_metadata(); let provenance = IngestionProvenance { external_id: None, platform_url: None, @@ -355,15 +343,10 @@ fn build_point_payload_serializes_presentation_variant() { has_notes: Some(true), }, }; + let url = chunk.source_url.clone(); + let file_meta = sample_file_metadata(&provenance); - let payload = build_point_payload( - &chunk, - &extraction, - None, - &provenance, - &chunk.source_url, - None, - ); + let payload = build_point_payload(chunk, &file_meta, &url, None); let json = serde_json::to_value(&payload).expect("serialize payload"); 
assert_eq!( @@ -380,7 +363,6 @@ fn build_point_payload_serializes_presentation_variant() { #[test] fn build_point_payload_serializes_subtitle_variant() { let chunk = sample_chunk(); - let extraction = sample_extraction_with_metadata(); let provenance = IngestionProvenance { external_id: None, platform_url: None, @@ -390,15 +372,10 @@ fn build_point_payload_serializes_subtitle_variant() { source_file: Some("demo.mp4".to_string()), }, }; + let url = chunk.source_url.clone(); + let file_meta = sample_file_metadata(&provenance); - let payload = build_point_payload( - &chunk, - &extraction, - None, - &provenance, - &chunk.source_url, - None, - ); + let payload = build_point_payload(chunk, &file_meta, &url, None); let json = serde_json::to_value(&payload).expect("serialize payload"); assert_eq!( diff --git a/crates/noxa-rag/src/pipeline/parse/text.rs b/crates/noxa-rag/src/pipeline/parse/text.rs index c783642..3c3e370 100644 --- a/crates/noxa-rag/src/pipeline/parse/text.rs +++ b/crates/noxa-rag/src/pipeline/parse/text.rs @@ -160,10 +160,12 @@ pub(crate) fn parse_jsonl_file(bytes: Vec, file_url: String, title: String) /// attacks. Later versions of quick-xml may add limits, but we keep the scan /// regardless (defense-in-depth). 
pub(super) fn contains_xml_entity_expansion_risk(bytes: &[u8]) -> bool { - let scan_limit = bytes.len().min(8192); - let header = &bytes[..scan_limit]; - let s = std::str::from_utf8(header).unwrap_or(""); - s.contains(" Result<(), RagError> { pipeline.shutdown.clone(), watch_dirs, pipeline.config.pipeline.startup_scan_concurrency, + pipeline.config.pipeline.max_file_size_bytes, ); let heartbeat_handle = spawn_heartbeat( diff --git a/crates/noxa-rag/src/pipeline/scan.rs b/crates/noxa-rag/src/pipeline/scan.rs index 2475548..32844e7 100644 --- a/crates/noxa-rag/src/pipeline/scan.rs +++ b/crates/noxa-rag/src/pipeline/scan.rs @@ -101,15 +101,12 @@ fn collect_indexable_paths_recursive(path: &Path, found: &mut Vec) { } } -/// Maximum JSON file size that `startup_scan_key` will deserialize to extract a content-hash -/// and URL. Files larger than this are returned as `None` (causing unconditional queueing), -/// which is safe because `process_job` enforces `max_file_size_bytes` independently. -/// -/// Matches the default `max_file_size_bytes` in `PipelineConfig` (50 MiB). -const MAX_SCAN_JSON_BYTES: u64 = 50 * 1024 * 1024; - /// Compute the (content_hash, url) key used by the startup delta scan. /// +/// `max_json_bytes` caps JSON deserialization to prevent unbounded heap allocation before +/// pipeline workers start. Pass `pipeline.max_file_size_bytes` from the caller so the +/// limit stays in sync with the per-job guard in `process_job`. +/// /// For `.json` ExtractionResult files: peeks at `metadata.url` and `metadata.content_hash` /// from inside the JSON (fast, avoids full deserialisation of large markdown content). /// Falls back to file:// URL + `mtime::` if the JSON lacks a URL. @@ -118,15 +115,12 @@ const MAX_SCAN_JSON_BYTES: u64 = 50 * 1024 * 1024; /// O(stat) dedup key. Collisions cause re-indexing (not data loss), which is acceptable /// versus re-reading and hashing every file on startup. 
/// -/// Returns `None` when the path is a symlink (confinement-safety: avoid following symlinks -/// out of the watch roots before `process_job` validates confinement), or when the file -/// metadata cannot be read, or when a file:// URL cannot be constructed, or when a `.json` -/// file exceeds [`MAX_SCAN_JSON_BYTES`] (preventing unbounded heap allocation before pipeline -/// workers start; the oversized file is still queued unconditionally and rejected by -/// `process_job` under its own `max_file_size_bytes` guard). +/// Returns `None` when the path is a symlink, metadata cannot be read, a file:// URL +/// cannot be constructed, or a `.json` file exceeds `max_json_bytes` (file is still +/// queued unconditionally and rejected by `process_job` under its own guard). /// /// Must be called inside `spawn_blocking` — this function reads from disk synchronously. -pub(crate) fn startup_scan_key(path: &Path) -> Option<(String, String)> { +pub(crate) fn startup_scan_key(path: &Path, max_json_bytes: u64) -> Option<(String, String)> { // Security: skip symlinks to avoid following links out of watch roots before // confinement validation runs in process_job. Mirrors the symlink skip in // collect_indexable_paths_recursive. @@ -140,11 +134,11 @@ pub(crate) fn startup_scan_key(path: &Path) -> Option<(String, String)> { // Guard against unbounded heap allocation: serde_json::from_reader will read the // full JSON value into memory. Reuse sym_meta (already fetched above) to avoid an // extra stat syscall — for non-symlinks, symlink_metadata.len() == metadata.len(). - if sym_meta.len() > MAX_SCAN_JSON_BYTES { + if sym_meta.len() > max_json_bytes { tracing::debug!( path = %path.display(), size = sym_meta.len(), - limit = MAX_SCAN_JSON_BYTES, + limit = max_json_bytes, "startup_scan_key: JSON file exceeds size limit, queuing unconditionally" ); return None; @@ -488,13 +482,13 @@ mod tests { // Symlink must return None — do not read its target bytes. 
assert_eq!( - startup_scan_key(&link), + startup_scan_key(&link, 50 * 1024 * 1024), None, "symlinks must be skipped to prevent reading files outside watch roots" ); // Regular file still produces a key. - assert!(startup_scan_key(&target).is_some()); + assert!(startup_scan_key(&target, 50 * 1024 * 1024).is_some()); } #[test] @@ -503,7 +497,7 @@ mod tests { let file = tmp.path().join("doc.md"); fs::write(&file, "hello world").expect("write file"); - let (hash, url) = startup_scan_key(&file).expect("key for md file"); + let (hash, url) = startup_scan_key(&file, 50 * 1024 * 1024).expect("key for md file"); assert!( hash.starts_with("mtime:"), "non-JSON key should be mtime-based, got: {hash}" @@ -518,12 +512,12 @@ mod tests { let tmp = tempfile::tempdir().expect("tempdir"); let file = tmp.path().join("doc.txt"); fs::write(&file, "one").expect("write v1"); - let (k1, _) = startup_scan_key(&file).expect("v1 key"); + let (k1, _) = startup_scan_key(&file, 50 * 1024 * 1024).expect("v1 key"); // Different size ensures the size component changes, so even if mtime // resolution is 1s and the test runs faster than that, the key still differs. fs::write(&file, "two two two").expect("write v2"); - let (k2, _) = startup_scan_key(&file).expect("v2 key"); + let (k2, _) = startup_scan_key(&file, 50 * 1024 * 1024).expect("v2 key"); assert_ne!(k1, k2, "key must change when file size changes"); } @@ -535,7 +529,7 @@ mod tests { let body = r#"{"metadata":{"url":"https://example.com/a","content_hash":"abc123"}}"#; fs::write(&file, body).expect("write json"); - let (hash, url) = startup_scan_key(&file).expect("json key"); + let (hash, url) = startup_scan_key(&file, 50 * 1024 * 1024).expect("json key"); assert_eq!(hash, "abc123"); assert_eq!(url, "https://example.com/a"); } @@ -547,7 +541,7 @@ mod tests { // No url inside metadata -> falls through to file:// + mtime:size. 
fs::write(&file, r#"{"metadata":{"content_hash":"x"}}"#).expect("write json"); - let (hash, url) = startup_scan_key(&file).expect("json fallback key"); + let (hash, url) = startup_scan_key(&file, 50 * 1024 * 1024).expect("json fallback key"); assert!(hash.starts_with("mtime:"), "fallback hash: {hash}"); assert!(url.starts_with("file://"), "fallback url: {url}"); } @@ -557,34 +551,43 @@ mod tests { let tmp = tempfile::tempdir().expect("tempdir"); let file = tmp.path().join("big.json"); - // Use a sparse file (set_len without writing data) to create a >50 MiB - // file instantly without actually writing 50 MiB to disk. + // Use a sparse file (set_len without writing data) to create a >1 KiB + // file instantly. We use a small limit (1024) to avoid touching the real + // 50 MiB default — the guard logic is identical regardless of the limit value. + const LIMIT: u64 = 1024; { let f = std::fs::File::create(&file).expect("create big.json"); - let over_limit = super::MAX_SCAN_JSON_BYTES + 1; - f.set_len(over_limit).expect("set_len for sparse file"); + f.set_len(LIMIT + 1).expect("set_len for sparse file"); } - // The size guard must trigger before any attempt to deserialize. assert_eq!( - startup_scan_key(&file), + startup_scan_key(&file, LIMIT), None, - "JSON file exceeding MAX_SCAN_JSON_BYTES must return None to prevent unbounded heap allocation" + "JSON file exceeding the limit must return None" ); } #[test] fn startup_scan_key_json_at_size_limit_is_processed() { - // A JSON file exactly AT the limit (not over) should still be attempted. - // We write a tiny valid JSON file to verify the check is strictly >. + // A file exactly AT the limit (len == limit, so `len > limit` is false) + // must still be attempted. We set the file to exactly LIMIT bytes. 
+ const LIMIT: u64 = 1024; let tmp = tempfile::tempdir().expect("tempdir"); let file = tmp.path().join("doc.json"); let body = r#"{"metadata":{"url":"https://example.com/b","content_hash":"def456"}}"#; fs::write(&file, body).expect("write json"); - // Tiny file is well below the limit — must parse normally. - let (hash, url) = startup_scan_key(&file).expect("small json should succeed"); - assert_eq!(hash, "def456"); - assert_eq!(url, "https://example.com/b"); + // Extend the file to exactly LIMIT bytes with null padding (sparse). + { + let f = std::fs::OpenOptions::new().write(true).open(&file).expect("open"); + f.set_len(LIMIT).expect("set_len"); + } + + // The guard is `len > limit`, so len == limit must NOT return None. + // JSON parsing will fail (trailing nulls), so we expect Some with a mtime key. + assert!( + startup_scan_key(&file, LIMIT).is_some(), + "file at exactly the limit must not be rejected by the size guard" + ); } } diff --git a/crates/noxa-rag/src/pipeline/startup_scan.rs b/crates/noxa-rag/src/pipeline/startup_scan.rs index 6a8767c..1396d5c 100644 --- a/crates/noxa-rag/src/pipeline/startup_scan.rs +++ b/crates/noxa-rag/src/pipeline/startup_scan.rs @@ -24,6 +24,7 @@ pub(super) fn spawn_startup_scan( shutdown: tokio_util::sync::CancellationToken, watch_dirs: Vec, scan_concurrency: usize, + max_json_bytes: u64, ) -> JoinHandle<()> { tokio::spawn(async move { let paths = match tokio::task::spawn_blocking({ @@ -75,7 +76,7 @@ pub(super) fn spawn_startup_scan( batch .into_iter() .map(|p| { - let k = scan::startup_scan_key(&p); + let k = scan::startup_scan_key(&p, max_json_bytes); (p, k) }) .collect() @@ -268,7 +269,7 @@ mod tests { let shutdown = CancellationToken::new(); let (tx, rx) = async_channel::bounded::(256); - let handle = spawn_startup_scan(tx.clone(), store, shutdown.clone(), vec![watch_dir], 16); + let handle = spawn_startup_scan(tx.clone(), store, shutdown.clone(), vec![watch_dir], 16, 50 * 1024 * 1024); handle.await.expect("startup scan 
panicked"); drop(tx); From b8bab4380675a54592b6a1d0623886d33d8df74a Mon Sep 17 00:00:00 2001 From: Jacob Magar Date: Mon, 27 Apr 2026 07:15:22 -0400 Subject: [PATCH 23/23] fix: address PR #14 round-2 review comments (coderabbitai) - executor.rs: add 30s timeout to mcporter subprocess; truncate stderr to 512 chars to prevent large/sensitive output leaking into error msgs - io.rs: replace DefaultHasher (not stable across Rust versions) with xxh3_64 for deterministic filenames in stable_component_hash - linkding.rs: simplify next-page termination check to map_or(true, is_null) - binary.rs: separate per-entry vs cumulative error messages in ODT/PPTX decompression guard so the rejection reason is explicit - parse/tests.rs: strengthen parse_feed_rejects_entity_expansion to assert the error message mentions DOCTYPE/ENTITY (mirrors XML test) - chunker.rs: remove unused `use std::str::FromStr as _` import (CI warning) - cargo fmt --all: fix import grouping in bytestash.rs, linkding.rs, memos.rs, parse/tests.rs; fix OpenOptions chain wrapping in scan.rs No action taken on: - io.rs:43 rename/remove_file: kept as-is; cross-platform safe regardless - bytestash.rs:38 pagination: no official pagination API confirmed by reviewer - scan.rs:568 boundary test: reviewer acknowledged implementation is correct Resolves review thread PRRT_kwDOR_mP6c59zaLc Resolves review thread PRRT_kwDOR_mP6c59zaLq Resolves review thread PRRT_kwDOR_mP6c59zaLs Resolves review thread PRRT_kwDOR_mP6c59zaL3 Resolves review thread PRRT_kwDOR_mP6c59zaL5 Resolves review thread PRRT_kwDOR_mP6c59zaL8 Resolves review thread PRRT_kwDOR_mP6c59zaMA Resolves review thread PRRT_kwDOR_mP6c59zaMJ Resolves review thread PRRT_kwDOR_mP6c59zaMM Resolves review thread PRRT_kwDOR_mP6c59zaMR Resolves review thread PRRT_kwDOR_mP6c59zaMT Resolves review thread PRRT_kwDOR_mP6c59zaMm Resolves review thread PRRT_kwDOR_mP6c59zaMq Resolves review thread PRRT_kwDOR_mP6c59zaMs Resolves review thread PRRT_kwDOR_mP6c59zaMz --- 
crates/noxa-rag/src/chunker.rs | 1 - crates/noxa-rag/src/mcp_bridge/bytestash.rs | 6 ++-- crates/noxa-rag/src/mcp_bridge/executor.rs | 38 ++++++++++++++------ crates/noxa-rag/src/mcp_bridge/io.rs | 7 ++-- crates/noxa-rag/src/mcp_bridge/linkding.rs | 8 ++--- crates/noxa-rag/src/mcp_bridge/memos.rs | 6 ++-- crates/noxa-rag/src/mcp_bridge/mod.rs | 9 +++-- crates/noxa-rag/src/mcp_bridge/paperless.rs | 6 ++-- crates/noxa-rag/src/pipeline/parse/binary.rs | 20 +++++++---- crates/noxa-rag/src/pipeline/parse/mod.rs | 5 +-- crates/noxa-rag/src/pipeline/parse/tests.rs | 18 ++++------ crates/noxa-rag/src/pipeline/parse/text.rs | 8 ++--- crates/noxa-rag/src/pipeline/process.rs | 35 +++++++----------- crates/noxa-rag/src/pipeline/scan.rs | 5 ++- crates/noxa-rag/src/pipeline/startup_scan.rs | 9 ++++- crates/noxa-rag/src/types.rs | 1 - 16 files changed, 96 insertions(+), 86 deletions(-) diff --git a/crates/noxa-rag/src/chunker.rs b/crates/noxa-rag/src/chunker.rs index 2e07380..bea2b48 100644 --- a/crates/noxa-rag/src/chunker.rs +++ b/crates/noxa-rag/src/chunker.rs @@ -129,7 +129,6 @@ pub fn chunk( #[cfg(test)] mod tests { use super::*; - use std::str::FromStr as _; fn make_extraction_result(markdown: &str) -> ExtractionResult { crate::pipeline::parse::make_text_result( diff --git a/crates/noxa-rag/src/mcp_bridge/bytestash.rs b/crates/noxa-rag/src/mcp_bridge/bytestash.rs index 921217f..084d211 100644 --- a/crates/noxa-rag/src/mcp_bridge/bytestash.rs +++ b/crates/noxa-rag/src/mcp_bridge/bytestash.rs @@ -3,10 +3,10 @@ use serde_json::Value; use crate::RagError; use super::{ - BridgeDocument, McpBridge, McpSource, McporterExecutor, SyncReport, WriteStatus, - array_field, join_base_url, join_non_empty, optional_string, required_base_url, - required_string, string_array, + BridgeDocument, McpBridge, McpSource, McporterExecutor, SyncReport, WriteStatus, array_field, io::{build_extraction, write_bridge_document}, + join_base_url, join_non_empty, optional_string, required_base_url, 
required_string, + string_array, }; impl McpBridge diff --git a/crates/noxa-rag/src/mcp_bridge/executor.rs b/crates/noxa-rag/src/mcp_bridge/executor.rs index f363fa1..70f3bec 100644 --- a/crates/noxa-rag/src/mcp_bridge/executor.rs +++ b/crates/noxa-rag/src/mcp_bridge/executor.rs @@ -33,23 +33,39 @@ impl McporterExecutor for ProcessMcporterExecutor { "action": action, "params": params, }); - let output = Command::new(&self.executable) - .arg("call") - .arg(&selector) - .arg("--args") - .arg(args.to_string()) - .arg("--output") - .arg("json") - .output() - .await - .map_err(|e| RagError::Generic(format!("failed to execute mcporter: {e}")))?; + const MCPORTER_TIMEOUT: std::time::Duration = std::time::Duration::from_secs(30); + + let output = tokio::time::timeout( + MCPORTER_TIMEOUT, + Command::new(&self.executable) + .arg("call") + .arg(&selector) + .arg("--args") + .arg(args.to_string()) + .arg("--output") + .arg("json") + .output(), + ) + .await + .map_err(|_| { + RagError::Generic(format!( + "mcporter call {} {} timed out after {}s", + selector, + action, + MCPORTER_TIMEOUT.as_secs() + )) + })? + .map_err(|e| RagError::Generic(format!("failed to execute mcporter: {e}")))?; if !output.status.success() { + // Truncate stderr to avoid leaking arbitrarily large subprocess output. 
+ let stderr = String::from_utf8_lossy(&output.stderr); + let preview: String = stderr.chars().take(512).collect(); return Err(RagError::Generic(format!( "mcporter call {} {} failed: {}", selector, action, - String::from_utf8_lossy(&output.stderr).trim() + preview.trim() ))); } diff --git a/crates/noxa-rag/src/mcp_bridge/io.rs b/crates/noxa-rag/src/mcp_bridge/io.rs index d1de788..2bf8ae1 100644 --- a/crates/noxa-rag/src/mcp_bridge/io.rs +++ b/crates/noxa-rag/src/mcp_bridge/io.rs @@ -102,10 +102,9 @@ fn sanitize_component(value: &str) -> String { } fn stable_component_hash(value: &str) -> u64 { - use std::hash::{DefaultHasher, Hasher}; - let mut hasher = DefaultHasher::new(); - hasher.write(value.as_bytes()); - hasher.finish() + // DefaultHasher is explicitly documented as not stable across Rust versions. + // Use xxh3_64 (already a workspace dep) for stable, deterministic filenames. + xxhash_rust::xxh3::xxh3_64(value.as_bytes()) } fn temp_output_path(path: &Path) -> PathBuf { diff --git a/crates/noxa-rag/src/mcp_bridge/linkding.rs b/crates/noxa-rag/src/mcp_bridge/linkding.rs index 7f7be87..27b866c 100644 --- a/crates/noxa-rag/src/mcp_bridge/linkding.rs +++ b/crates/noxa-rag/src/mcp_bridge/linkding.rs @@ -6,10 +6,10 @@ use url::Url; use crate::RagError; use super::{ - BridgeDocument, McpBridge, McpSource, McporterExecutor, SyncReport, WriteStatus, - array_field, as_u64_value, join_non_empty, optional_string, required_string, required_value, - string_array, + BridgeDocument, McpBridge, McpSource, McporterExecutor, SyncReport, WriteStatus, array_field, + as_u64_value, io::{build_extraction, write_bridge_document}, + join_non_empty, optional_string, required_string, required_value, string_array, }; impl McpBridge @@ -52,7 +52,7 @@ where } } - if data.get("next").is_none() || data.get("next").is_some_and(Value::is_null) { + if data.get("next").map_or(true, Value::is_null) { break; } if new_records == 0 { diff --git a/crates/noxa-rag/src/mcp_bridge/memos.rs 
b/crates/noxa-rag/src/mcp_bridge/memos.rs index a81937b..1d8e134 100644 --- a/crates/noxa-rag/src/mcp_bridge/memos.rs +++ b/crates/noxa-rag/src/mcp_bridge/memos.rs @@ -3,10 +3,10 @@ use serde_json::Value; use crate::RagError; use super::{ - BridgeDocument, McpBridge, McpSource, McporterExecutor, SyncReport, WriteStatus, - array_field, first_line_title, join_base_url, optional_string, required_base_url, - required_string, string_array, + BridgeDocument, McpBridge, McpSource, McporterExecutor, SyncReport, WriteStatus, array_field, + first_line_title, io::{build_extraction, write_bridge_document}, + join_base_url, optional_string, required_base_url, required_string, string_array, }; impl McpBridge diff --git a/crates/noxa-rag/src/mcp_bridge/mod.rs b/crates/noxa-rag/src/mcp_bridge/mod.rs index 559b3b5..1317425 100644 --- a/crates/noxa-rag/src/mcp_bridge/mod.rs +++ b/crates/noxa-rag/src/mcp_bridge/mod.rs @@ -6,18 +6,18 @@ use url::Url; use crate::RagError; +mod bytestash; pub mod executor; mod io; mod linkding; mod memos; -mod bytestash; mod paperless; +pub use bytestash::normalize_bytestash_record; pub use executor::ProcessMcporterExecutor; pub use io::{relative_output_path, write_bridge_document}; pub use linkding::normalize_linkding_record; pub use memos::normalize_memo_record; -pub use bytestash::normalize_bytestash_record; pub use paperless::normalize_paperless_record; #[derive(Debug, Clone, Copy, PartialEq, Eq)] @@ -145,7 +145,10 @@ pub(super) fn array_field<'a>(value: &'a Value, key: &str) -> Result Result<&str, RagError> { +pub(super) fn required_base_url( + config: &BridgeConfig, + source: McpSource, +) -> Result<&str, RagError> { config.platform_base_url.as_deref().ok_or_else(|| { RagError::Config(format!("{} requires --platform-base-url", source.as_str())) }) diff --git a/crates/noxa-rag/src/mcp_bridge/paperless.rs b/crates/noxa-rag/src/mcp_bridge/paperless.rs index 8104806..1f60113 100644 --- a/crates/noxa-rag/src/mcp_bridge/paperless.rs +++ 
b/crates/noxa-rag/src/mcp_bridge/paperless.rs @@ -5,10 +5,10 @@ use serde_json::Value; use crate::RagError; use super::{ - BridgeDocument, McpBridge, McpSource, McporterExecutor, SyncReport, WriteStatus, - array_field, as_u64, as_u64_value, join_base_url, optional_string, required_base_url, - required_value, + BridgeDocument, McpBridge, McpSource, McporterExecutor, SyncReport, WriteStatus, array_field, + as_u64, as_u64_value, io::{build_extraction, write_bridge_document}, + join_base_url, optional_string, required_base_url, required_value, }; impl McpBridge diff --git a/crates/noxa-rag/src/pipeline/parse/binary.rs b/crates/noxa-rag/src/pipeline/parse/binary.rs index 20bde35..55e7a6e 100644 --- a/crates/noxa-rag/src/pipeline/parse/binary.rs +++ b/crates/noxa-rag/src/pipeline/parse/binary.rs @@ -174,10 +174,20 @@ pub(crate) fn parse_office_zip_file( .map_err(|e| RagError::Parse(format!("{ext} read '{name}': {e}")))?; let read_bytes = xml_buf.len() as u64; if read_bytes > per_entry_cap { + // Distinguish which cap fired: per-entry or cumulative. 
+ let reason = if per_entry_cap < MAX_ODT_PPTX_PER_ENTRY_BYTES { + format!( + "cumulative budget ({} MiB total) reached", + MAX_ODT_PPTX_TOTAL_BYTES / (1024 * 1024) + ) + } else { + format!( + "per-entry limit ({} MiB) exceeded", + MAX_ODT_PPTX_PER_ENTRY_BYTES / (1024 * 1024) + ) + }; return Err(RagError::Parse(format!( - "{ext}: entry '{name}' exceeds decompression limit ({} MiB per entry, {} MiB total) — possible zip bomb", - MAX_ODT_PPTX_PER_ENTRY_BYTES / (1024 * 1024), - MAX_ODT_PPTX_TOTAL_BYTES / (1024 * 1024), + "{ext}: entry '{name}' — {reason} — possible zip bomb" ))); } odt_pptx_measured_total = odt_pptx_measured_total.saturating_add(read_bytes); @@ -327,9 +337,7 @@ mod tests { writer .start_file(format!("content{i}.xml"), options) .expect("start_file"); - writer - .write_all(xml_chunk.as_bytes()) - .expect("write_all"); + writer.write_all(xml_chunk.as_bytes()).expect("write_all"); } let zip_bytes = writer.finish().expect("finish").into_inner(); diff --git a/crates/noxa-rag/src/pipeline/parse/mod.rs b/crates/noxa-rag/src/pipeline/parse/mod.rs index 22f5186..7437582 100644 --- a/crates/noxa-rag/src/pipeline/parse/mod.rs +++ b/crates/noxa-rag/src/pipeline/parse/mod.rs @@ -84,10 +84,7 @@ impl FormatProvenance { payload.email_thread_id = thread_id.clone(); payload.email_has_attachments = *has_attachments; } - FormatProvenance::Feed { - feed_url, - item_id, - } => { + FormatProvenance::Feed { feed_url, item_id } => { payload.feed_url = feed_url.clone(); payload.feed_item_id = item_id.clone(); } diff --git a/crates/noxa-rag/src/pipeline/parse/tests.rs b/crates/noxa-rag/src/pipeline/parse/tests.rs index 024aa1d..5c64777 100644 --- a/crates/noxa-rag/src/pipeline/parse/tests.rs +++ b/crates/noxa-rag/src/pipeline/parse/tests.rs @@ -139,11 +139,8 @@ async fn parse_file_json_keeps_crawler_provenance_in_point_payload() { section_header: None, }; - let file_meta = FileMetadata::from_result_and_provenance( - &parsed.extraction, - None, - &parsed.provenance, - ); + let 
file_meta = + FileMetadata::from_result_and_provenance(&parsed.extraction, None, &parsed.provenance); let payload = build_point_payload(chunk, &file_meta, &url, None); let json = serde_json::to_value(&payload).expect("serialize payload"); @@ -210,11 +207,7 @@ fn sample_extraction_with_metadata() -> noxa_core::ExtractionResult { } fn sample_file_metadata(provenance: &IngestionProvenance) -> FileMetadata { - FileMetadata::from_result_and_provenance( - &sample_extraction_with_metadata(), - None, - provenance, - ) + FileMetadata::from_result_and_provenance(&sample_extraction_with_metadata(), None, provenance) } /// Web variant: external_id/platform_url at the top level, plus seed_url @@ -423,9 +416,10 @@ async fn parse_feed_rejects_entity_expansion() { &lol;"; std::fs::write(&path, payload).expect("write"); let result = parse_file(&path, payload.to_vec()).await; + let msg = result.unwrap_err().to_string(); assert!( - result.is_err(), - "expected Err for DOCTYPE/ENTITY feed, got Ok" + msg.contains("DOCTYPE") || msg.contains("ENTITY") || msg.contains("entity expansion"), + "expected entity-expansion rejection, got: {msg}" ); } diff --git a/crates/noxa-rag/src/pipeline/parse/text.rs b/crates/noxa-rag/src/pipeline/parse/text.rs index 3c3e370..44656a0 100644 --- a/crates/noxa-rag/src/pipeline/parse/text.rs +++ b/crates/noxa-rag/src/pipeline/parse/text.rs @@ -110,10 +110,7 @@ pub(crate) fn parse_log_file(bytes: Vec, file_url: String, title: String) -> } } -pub(crate) fn parse_html_file( - bytes: Vec, - file_url: String, -) -> Result { +pub(crate) fn parse_html_file(bytes: Vec, file_url: String) -> Result { let html = String::from_utf8_lossy(&bytes).into_owned(); let mut extraction = noxa_core::extract(&html, Some(&file_url)) .map_err(|e| RagError::Parse(format!("HTML extract: {e}")))?; @@ -164,8 +161,7 @@ pub(super) fn contains_xml_entity_expansion_risk(bytes: &[u8]) -> bool { // Scan raw bytes so non-UTF-8 sequences cannot silently suppress the guard. 
// `from_utf8().unwrap_or("")` would return "" on any non-UTF-8 byte in the // window, letting a Result::from(mtime).to_rfc3339()); } - let git_branch = if let Some(cached) = - find_cached_branch(&canonical, &ctx.git_branch_cache) - { + let git_branch = if let Some(cached) = find_cached_branch(&canonical, &ctx.git_branch_cache) { // Cache hit — no spawn_blocking needed. `canonical` ancestors were walked // synchronously in async context (pure in-memory DashMap lookups, no I/O). cached @@ -242,15 +240,14 @@ pub(crate) async fn process_job(job: IndexJob, ctx: &WorkerContext) -> Result Result = chunks .into_iter() .zip(vectors.into_iter()) @@ -337,12 +331,7 @@ pub(crate) async fn process_job(job: IndexJob, ctx: &WorkerContext) -> Result(256); - let handle = spawn_startup_scan(tx.clone(), store, shutdown.clone(), vec![watch_dir], 16, 50 * 1024 * 1024); + let handle = spawn_startup_scan( + tx.clone(), + store, + shutdown.clone(), + vec![watch_dir], + 16, + 50 * 1024 * 1024, + ); handle.await.expect("startup scan panicked"); drop(tx); diff --git a/crates/noxa-rag/src/types.rs b/crates/noxa-rag/src/types.rs index 8ecef7b..7c9493d 100644 --- a/crates/noxa-rag/src/types.rs +++ b/crates/noxa-rag/src/types.rs @@ -211,7 +211,6 @@ pub struct SearchMetadataFilter { pub hnsw_ef: Option, } - #[cfg(test)] mod tests { use super::SearchResult;