diff --git a/.dockerignore b/.dockerignore
index 9fafc2e4baea..ffa72eaf51e4 100644
--- a/.dockerignore
+++ b/.dockerignore
@@ -19,6 +19,7 @@
 !pageserver/
 !pgxn/
 !proxy/
+!object_storage/
 !storage_scrubber/
 !safekeeper/
 !storage_broker/
diff --git a/.gitignore b/.gitignore
index a07a65ccef1c..45eb4dbf0ee8 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,3 +1,4 @@
+/artifact_cache
 /pg_install
 /target
 /tmp_check
diff --git a/Cargo.lock b/Cargo.lock
index dbbf2c335737..5d2cdcea272e 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -2837,6 +2837,7 @@ dependencies = [
  "utils",
  "uuid",
  "workspace_hack",
+ "x509-cert",
 ]
 
 [[package]]
@@ -3991,6 +3992,33 @@ dependencies = [
  "memchr",
 ]
 
+[[package]]
+name = "object_storage"
+version = "0.0.1"
+dependencies = [
+ "anyhow",
+ "axum",
+ "axum-extra",
+ "camino",
+ "camino-tempfile",
+ "futures",
+ "http-body-util",
+ "itertools 0.10.5",
+ "jsonwebtoken",
+ "prometheus",
+ "rand 0.8.5",
+ "remote_storage",
+ "serde",
+ "serde_json",
+ "test-log",
+ "tokio",
+ "tokio-util",
+ "tower 0.5.2",
+ "tracing",
+ "utils",
+ "workspace_hack",
+]
+
 [[package]]
 name = "once_cell"
 version = "1.20.2"
@@ -4693,7 +4721,7 @@ dependencies = [
 [[package]]
 name = "postgres-protocol"
 version = "0.6.6"
-source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#1f21e7959a96a34dcfbfce1b14b73286cdadffe9"
+source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#f3cf448febde5fd298071d54d568a9c875a7a62b"
 dependencies = [
  "base64 0.22.1",
  "byteorder",
@@ -4727,7 +4755,7 @@ dependencies = [
 [[package]]
 name = "postgres-types"
 version = "0.2.6"
-source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#1f21e7959a96a34dcfbfce1b14b73286cdadffe9"
+source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#f3cf448febde5fd298071d54d568a9c875a7a62b"
 dependencies = [
  "bytes",
  "chrono",
@@ -6925,6 +6953,28 @@ dependencies = [
  "syn 2.0.100",
 ]
 
+[[package]]
+name = "test-log"
+version = "0.2.17"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e7f46083d221181166e5b6f6b1e5f1d499f3a76888826e6cb1d057554157cd0f"
+dependencies = [
+ "env_logger",
+ "test-log-macros",
+ "tracing-subscriber",
+]
+
+[[package]]
+name = "test-log-macros"
+version = "0.2.17"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "888d0c3c6db53c0fdab160d2ed5e12ba745383d3e85813f2ea0f2b1475ab553f"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "syn 2.0.100",
+]
+
 [[package]]
 name = "thiserror"
 version = "1.0.69"
@@ -7172,7 +7222,7 @@ dependencies = [
 [[package]]
 name = "tokio-postgres"
 version = "0.7.10"
-source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#1f21e7959a96a34dcfbfce1b14b73286cdadffe9"
+source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#f3cf448febde5fd298071d54d568a9c875a7a62b"
 dependencies = [
  "async-trait",
  "byteorder",
diff --git a/Cargo.toml b/Cargo.toml
index 1f605681dbb4..d957fa90708f 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -40,6 +40,7 @@ members = [
     "libs/proxy/postgres-protocol2",
     "libs/proxy/postgres-types2",
     "libs/proxy/tokio-postgres2",
+    "object_storage",
 ]
 
 [workspace.package]
@@ -208,6 +209,7 @@ tracing-opentelemetry = "0.28"
 tracing-serde = "0.2.0"
 tracing-subscriber = { version = "0.3", default-features = false, features = ["smallvec", "fmt", "tracing-log", "std", "env-filter", "json"] }
 try-lock = "0.2.5"
+test-log = { version = "0.2.17", default-features = false, features = ["log"] }
 twox-hash = { version = "1.6.3", default-features = false }
 typed-json = "0.1"
 url = "2.2"
diff --git a/Dockerfile b/Dockerfile
index 01540e192586..848bfab92196 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -89,6 +89,7 @@ RUN set -e \
       --bin storage_broker  \
       --bin storage_controller  \
       --bin proxy  \
+      --bin object_storage \
       --bin neon_local \
       --bin storage_scrubber \
       --locked --release
@@ -121,6 +122,7 @@ COPY --from=build --chown=neon:neon /home/nonroot/target/release/safekeeper
 COPY --from=build --chown=neon:neon /home/nonroot/target/release/storage_broker      /usr/local/bin
 COPY --from=build --chown=neon:neon /home/nonroot/target/release/storage_controller  /usr/local/bin
 COPY --from=build --chown=neon:neon /home/nonroot/target/release/proxy               /usr/local/bin
+COPY --from=build --chown=neon:neon /home/nonroot/target/release/object_storage      /usr/local/bin
 COPY --from=build --chown=neon:neon /home/nonroot/target/release/neon_local          /usr/local/bin
 COPY --from=build --chown=neon:neon /home/nonroot/target/release/storage_scrubber    /usr/local/bin
 
diff --git a/compute_tools/src/bin/compute_ctl.rs b/compute_tools/src/bin/compute_ctl.rs
index da11ac2860fd..16fd51d79aa5 100644
--- a/compute_tools/src/bin/compute_ctl.rs
+++ b/compute_tools/src/bin/compute_ctl.rs
@@ -29,13 +29,12 @@
 //! ```sh
 //! compute_ctl -D /var/db/postgres/compute \
 //!             -C 'postgresql://cloud_admin@localhost/postgres' \
-//!             -S /var/db/postgres/specs/current.json \
+//!             -c /var/db/postgres/configs/config.json \
 //!             -b /usr/local/bin/postgres \
 //!             -r http://pg-ext-s3-gateway \
 //! ```
 use std::ffi::OsString;
 use std::fs::File;
-use std::path::Path;
 use std::process::exit;
 use std::sync::mpsc;
 use std::thread;
@@ -43,8 +42,7 @@ use std::time::Duration;
 
 use anyhow::{Context, Result};
 use clap::Parser;
-use compute_api::responses::ComputeCtlConfig;
-use compute_api::spec::ComputeSpec;
+use compute_api::responses::ComputeConfig;
 use compute_tools::compute::{
     BUILD_TAG, ComputeNode, ComputeNodeParams, forward_termination_signal,
 };
@@ -118,16 +116,21 @@ struct Cli {
     #[arg(long)]
     pub set_disk_quota_for_fs: Option<String>,
 
-    #[arg(short = 's', long = "spec", group = "spec")]
-    pub spec_json: Option<String>,
-
-    #[arg(short = 'S', long, group = "spec-path")]
-    pub spec_path: Option<OsString>,
+    // TODO(tristan957): remove alias after compatibility tests are no longer
+    // an issue
+    #[arg(short = 'c', long, alias = "spec-path")]
+    pub config: Option<OsString>,
 
     #[arg(short = 'i', long, group = "compute-id")]
     pub compute_id: String,
 
-    #[arg(short = 'p', long, conflicts_with_all = ["spec", "spec-path"], value_name = "CONTROL_PLANE_API_BASE_URL")]
+    #[arg(
+        short = 'p',
+        long,
+        conflicts_with = "config",
+        value_name = "CONTROL_PLANE_API_BASE_URL",
+        requires = "compute-id"
+    )]
     pub control_plane_uri: Option<String>,
 }
 
@@ -136,7 +139,7 @@ fn main() -> Result<()> {
 
     let scenario = failpoint_support::init();
 
-    // For historical reasons, the main thread that processes the spec and launches postgres
+    // For historical reasons, the main thread that processes the config and launches postgres
     // is synchronous, but we always have this tokio runtime available and we "enter" it so
     // that you can use tokio::spawn() and tokio::runtime::Handle::current().block_on(...)
     // from all parts of compute_ctl.
@@ -152,7 +155,7 @@ fn main() -> Result<()> {
 
     let connstr = Url::parse(&cli.connstr).context("cannot parse connstr as a URL")?;
 
-    let cli_spec = try_spec_from_cli(&cli)?;
+    let config = get_config(&cli)?;
 
     let compute_node = ComputeNode::new(
         ComputeNodeParams {
@@ -172,10 +175,8 @@ fn main() -> Result<()> {
             cgroup: cli.cgroup,
             #[cfg(target_os = "linux")]
             vm_monitor_addr: cli.vm_monitor_addr,
-            live_config_allowed: cli_spec.live_config_allowed,
         },
-        cli_spec.spec,
-        cli_spec.compute_ctl_config,
+        config,
     )?;
 
     let exit_code = compute_node.run()?;
@@ -200,37 +201,17 @@ async fn init() -> Result<()> {
     Ok(())
 }
 
-fn try_spec_from_cli(cli: &Cli) -> Result<CliSpecParams> {
-    // First, try to get cluster spec from the cli argument
-    if let Some(ref spec_json) = cli.spec_json {
-        info!("got spec from cli argument {}", spec_json);
-        return Ok(CliSpecParams {
-            spec: Some(serde_json::from_str(spec_json)?),
-            compute_ctl_config: ComputeCtlConfig::default(),
-            live_config_allowed: false,
-        });
+fn get_config(cli: &Cli) -> Result<ComputeConfig> {
+    // First, read the config from the path if provided
+    if let Some(ref config) = cli.config {
+        let file = File::open(config)?;
+        return Ok(serde_json::from_reader(&file)?);
     }
 
-    // Second, try to read it from the file if path is provided
-    if let Some(ref spec_path) = cli.spec_path {
-        let file = File::open(Path::new(spec_path))?;
-        return Ok(CliSpecParams {
-            spec: Some(serde_json::from_reader(file)?),
-            compute_ctl_config: ComputeCtlConfig::default(),
-            live_config_allowed: true,
-        });
-    }
-
-    if cli.control_plane_uri.is_none() {
-        panic!("must specify --control-plane-uri");
-    };
-
-    match get_spec_from_control_plane(cli.control_plane_uri.as_ref().unwrap(), &cli.compute_id) {
-        Ok(resp) => Ok(CliSpecParams {
-            spec: resp.0,
-            compute_ctl_config: resp.1,
-            live_config_allowed: true,
-        }),
+    // If the config wasn't provided in the CLI arguments, then retrieve it from
+    // the control plane
+    match get_config_from_control_plane(cli.control_plane_uri.as_ref().unwrap(), &cli.compute_id) {
+        Ok(config) => Ok(config),
         Err(e) => {
             error!(
                 "cannot get response from control plane: {}\n\
@@ -242,14 +223,6 @@ fn try_spec_from_cli(cli: &Cli) -> Result<CliSpecParams> {
     }
 }
 
-struct CliSpecParams {
-    /// If a spec was provided via CLI or file, the [`ComputeSpec`]
-    spec: Option<ComputeSpec>,
-    #[allow(dead_code)]
-    compute_ctl_config: ComputeCtlConfig,
-    live_config_allowed: bool,
-}
-
 fn deinit_and_exit(exit_code: Option<i32>) -> ! {
     // Shutdown trace pipeline gracefully, so that it has a chance to send any
     // pending traces before we exit. Shutting down OTEL tracing provider may
diff --git a/compute_tools/src/catalog.rs b/compute_tools/src/catalog.rs
index db3e07e086b8..082ba62b8e99 100644
--- a/compute_tools/src/catalog.rs
+++ b/compute_tools/src/catalog.rs
@@ -98,13 +98,15 @@ pub async fn get_database_schema(
         .kill_on_drop(true)
         .spawn()?;
 
-    let stdout = cmd.stdout.take().ok_or_else(|| {
-        std::io::Error::new(std::io::ErrorKind::Other, "Failed to capture stdout.")
-    })?;
+    let stdout = cmd
+        .stdout
+        .take()
+        .ok_or_else(|| std::io::Error::other("Failed to capture stdout."))?;
 
-    let stderr = cmd.stderr.take().ok_or_else(|| {
-        std::io::Error::new(std::io::ErrorKind::Other, "Failed to capture stderr.")
-    })?;
+    let stderr = cmd
+        .stderr
+        .take()
+        .ok_or_else(|| std::io::Error::other("Failed to capture stderr."))?;
 
     let mut stdout_reader = FramedRead::new(stdout, BytesCodec::new());
     let stderr_reader = BufReader::new(stderr);
@@ -128,8 +130,7 @@ pub async fn get_database_schema(
                 }
             });
 
-            return Err(SchemaDumpError::IO(std::io::Error::new(
-                std::io::ErrorKind::Other,
+            return Err(SchemaDumpError::IO(std::io::Error::other(
                 "failed to start pg_dump",
             )));
         }
diff --git a/compute_tools/src/compute.rs b/compute_tools/src/compute.rs
index 70b91c781a64..c7b4bdd24013 100644
--- a/compute_tools/src/compute.rs
+++ b/compute_tools/src/compute.rs
@@ -11,7 +11,7 @@ use std::{env, fs};
 use anyhow::{Context, Result};
 use chrono::{DateTime, Utc};
 use compute_api::privilege::Privilege;
-use compute_api::responses::{ComputeCtlConfig, ComputeMetrics, ComputeStatus};
+use compute_api::responses::{ComputeConfig, ComputeCtlConfig, ComputeMetrics, ComputeStatus};
 use compute_api::spec::{
     ComputeAudit, ComputeFeature, ComputeMode, ComputeSpec, ExtVersion, PgIdent,
 };
@@ -93,20 +93,6 @@ pub struct ComputeNodeParams {
 
     /// the address of extension storage proxy gateway
     pub ext_remote_storage: Option<String>,
-
-    /// We should only allow live re- / configuration of the compute node if
-    /// it uses 'pull model', i.e. it can go to control-plane and fetch
-    /// the latest configuration. Otherwise, there could be a case:
-    /// - we start compute with some spec provided as argument
-    /// - we push new spec and it does reconfiguration
-    /// - but then something happens and compute pod / VM is destroyed,
-    ///   so k8s controller starts it again with the **old** spec
-    ///
-    /// and the same for empty computes:
-    /// - we started compute without any spec
-    /// - we push spec and it does configuration
-    /// - but then it is restarted without any spec again
-    pub live_config_allowed: bool,
 }
 
 /// Compute node info shared across several `compute_ctl` threads.
@@ -317,11 +303,7 @@ struct StartVmMonitorResult {
 }
 
 impl ComputeNode {
-    pub fn new(
-        params: ComputeNodeParams,
-        cli_spec: Option<ComputeSpec>,
-        compute_ctl_config: ComputeCtlConfig,
-    ) -> Result<Self> {
+    pub fn new(params: ComputeNodeParams, config: ComputeConfig) -> Result<Self> {
         let connstr = params.connstr.as_str();
         let conn_conf = postgres::config::Config::from_str(connstr)
             .context("cannot build postgres config from connstr")?;
@@ -329,8 +311,8 @@ impl ComputeNode {
             .context("cannot build tokio postgres config from connstr")?;
 
         let mut new_state = ComputeState::new();
-        if let Some(cli_spec) = cli_spec {
-            let pspec = ParsedSpec::try_from(cli_spec).map_err(|msg| anyhow::anyhow!(msg))?;
+        if let Some(spec) = config.spec {
+            let pspec = ParsedSpec::try_from(spec).map_err(|msg| anyhow::anyhow!(msg))?;
             new_state.pspec = Some(pspec);
         }
 
@@ -341,7 +323,7 @@ impl ComputeNode {
             state: Mutex::new(new_state),
             state_changed: Condvar::new(),
             ext_download_progress: RwLock::new(HashMap::new()),
-            compute_ctl_config,
+            compute_ctl_config: config.compute_ctl_config,
         })
     }
 
@@ -537,11 +519,14 @@ impl ComputeNode {
 
         let pspec = compute_state.pspec.as_ref().expect("spec must be set");
         info!(
-            "starting compute for project {}, operation {}, tenant {}, timeline {}, features {:?}, spec.remote_extensions {:?}",
+            "starting compute for project {}, operation {}, tenant {}, timeline {}, project {}, branch {}, endpoint {}, features {:?}, spec.remote_extensions {:?}",
             pspec.spec.cluster.cluster_id.as_deref().unwrap_or("None"),
             pspec.spec.operation_uuid.as_deref().unwrap_or("None"),
             pspec.tenant_id,
             pspec.timeline_id,
+            pspec.spec.project_id.as_deref().unwrap_or("None"),
+            pspec.spec.branch_id.as_deref().unwrap_or("None"),
+            pspec.spec.endpoint_id.as_deref().unwrap_or("None"),
             pspec.spec.features,
             pspec.spec.remote_extensions,
         );
@@ -645,31 +630,28 @@ impl ComputeNode {
             });
         }
 
-        // Configure and start rsyslog for HIPAA if necessary
-        if let ComputeAudit::Hipaa = pspec.spec.audit_log_level {
-            let remote_endpoint = std::env::var("AUDIT_LOGGING_ENDPOINT").unwrap_or("".to_string());
-            if remote_endpoint.is_empty() {
-                anyhow::bail!("AUDIT_LOGGING_ENDPOINT is empty");
-            }
+        // Configure and start rsyslog for compliance audit logging
+        match pspec.spec.audit_log_level {
+            ComputeAudit::Hipaa | ComputeAudit::Extended | ComputeAudit::Full => {
+                let remote_endpoint =
+                    std::env::var("AUDIT_LOGGING_ENDPOINT").unwrap_or("".to_string());
+                if remote_endpoint.is_empty() {
+                    anyhow::bail!("AUDIT_LOGGING_ENDPOINT is empty");
+                }
 
-            let log_directory_path = Path::new(&self.params.pgdata).join("log");
-            let log_directory_path = log_directory_path.to_string_lossy().to_string();
-            configure_audit_rsyslog(log_directory_path.clone(), "hipaa", &remote_endpoint)?;
+                let log_directory_path = Path::new(&self.params.pgdata).join("log");
+                let log_directory_path = log_directory_path.to_string_lossy().to_string();
+                configure_audit_rsyslog(log_directory_path.clone(), "hipaa", &remote_endpoint)?;
 
-            // Launch a background task to clean up the audit logs
-            launch_pgaudit_gc(log_directory_path);
+                // Launch a background task to clean up the audit logs
+                launch_pgaudit_gc(log_directory_path);
+            }
+            _ => {}
         }
 
         // Configure and start rsyslog for Postgres logs export
-        if self.has_feature(ComputeFeature::PostgresLogsExport) {
-            if let Some(ref project_id) = pspec.spec.cluster.cluster_id {
-                let host = PostgresLogsRsyslogConfig::default_host(project_id);
-                let conf = PostgresLogsRsyslogConfig::new(Some(&host));
-                configure_postgres_logs_export(conf)?;
-            } else {
-                warn!("not configuring rsyslog for Postgres logs export: project ID is missing")
-            }
-        }
+        let conf = PostgresLogsRsyslogConfig::new(pspec.spec.logs_export_host.as_deref());
+        configure_postgres_logs_export(conf)?;
 
         // Launch remaining service threads
         let _monitor_handle = launch_monitor(self);
@@ -1573,6 +1555,10 @@ impl ComputeNode {
             });
         }
 
+        // Reconfigure rsyslog for Postgres logs export
+        let conf = PostgresLogsRsyslogConfig::new(spec.logs_export_host.as_deref());
+        configure_postgres_logs_export(conf)?;
+
         // Write new config
         let pgdata_path = Path::new(&self.params.pgdata);
         config::write_postgres_conf(
diff --git a/compute_tools/src/config.rs b/compute_tools/src/config.rs
index 614ab076ffec..71c6123c3bf3 100644
--- a/compute_tools/src/config.rs
+++ b/compute_tools/src/config.rs
@@ -7,7 +7,7 @@ use std::io::prelude::*;
 use std::path::Path;
 
 use compute_api::responses::TlsConfig;
-use compute_api::spec::{ComputeAudit, ComputeFeature, ComputeMode, ComputeSpec, GenericOption};
+use compute_api::spec::{ComputeAudit, ComputeMode, ComputeSpec, GenericOption};
 
 use crate::pg_helpers::{
     GenericOptionExt, GenericOptionsSearch, PgOptionsSerialize, escape_conf_value,
@@ -89,6 +89,15 @@ pub fn write_postgres_conf(
             escape_conf_value(&s.to_string())
         )?;
     }
+    if let Some(s) = &spec.project_id {
+        writeln!(file, "neon.project_id={}", escape_conf_value(s))?;
+    }
+    if let Some(s) = &spec.branch_id {
+        writeln!(file, "neon.branch_id={}", escape_conf_value(s))?;
+    }
+    if let Some(s) = &spec.endpoint_id {
+        writeln!(file, "neon.endpoint_id={}", escape_conf_value(s))?;
+    }
 
     // tls
     if let Some(tls_config) = tls_config {
@@ -169,7 +178,7 @@ pub fn write_postgres_conf(
     // and don't allow the user or the control plane admin to change them.
     match spec.audit_log_level {
         ComputeAudit::Disabled => {}
-        ComputeAudit::Log => {
+        ComputeAudit::Log | ComputeAudit::Base => {
             writeln!(file, "# Managed by compute_ctl base audit settings: start")?;
             writeln!(file, "pgaudit.log='ddl,role'")?;
             // Disable logging of catalog queries to reduce the noise
@@ -193,16 +202,20 @@ pub fn write_postgres_conf(
             }
             writeln!(file, "# Managed by compute_ctl base audit settings: end")?;
         }
-        ComputeAudit::Hipaa => {
+        ComputeAudit::Hipaa | ComputeAudit::Extended | ComputeAudit::Full => {
             writeln!(
                 file,
                 "# Managed by compute_ctl compliance audit settings: begin"
             )?;
-            // This log level is very verbose
-            // but this is necessary for HIPAA compliance.
-            // Exclude 'misc' category, because it doesn't contain anythig relevant.
-            writeln!(file, "pgaudit.log='all, -misc'")?;
-            writeln!(file, "pgaudit.log_parameter=on")?;
+            // Enable logging of parameters.
+            // This is very verbose and may contain sensitive data.
+            if spec.audit_log_level == ComputeAudit::Full {
+                writeln!(file, "pgaudit.log_parameter=on")?;
+                writeln!(file, "pgaudit.log='all'")?;
+            } else {
+                writeln!(file, "pgaudit.log_parameter=off")?;
+                writeln!(file, "pgaudit.log='all, -misc'")?;
+            }
             // Disable logging of catalog queries
             // The catalog doesn't contain sensitive data, so we don't need to audit it.
             writeln!(file, "pgaudit.log_catalog=off")?;
@@ -255,7 +268,7 @@ pub fn write_postgres_conf(
 
     // We need Postgres to send logs to rsyslog so that we can forward them
     // further to customers' log aggregation systems.
-    if spec.features.contains(&ComputeFeature::PostgresLogsExport) {
+    if spec.logs_export_host.is_some() {
         writeln!(file, "log_destination='stderr,syslog'")?;
     }
 
diff --git a/compute_tools/src/http/middleware/authorize.rs b/compute_tools/src/http/middleware/authorize.rs
index 89d55e1af32b..f221752c38c3 100644
--- a/compute_tools/src/http/middleware/authorize.rs
+++ b/compute_tools/src/http/middleware/authorize.rs
@@ -6,20 +6,15 @@ use axum_extra::{
     TypedHeader,
     headers::{Authorization, authorization::Bearer},
 };
+use compute_api::requests::ComputeClaims;
 use futures::future::BoxFuture;
 use http::{Request, Response, StatusCode};
 use jsonwebtoken::{Algorithm, DecodingKey, TokenData, Validation, jwk::JwkSet};
-use serde::Deserialize;
 use tower_http::auth::AsyncAuthorizeRequest;
-use tracing::warn;
+use tracing::{debug, warn};
 
 use crate::http::{JsonResponse, extract::RequestId};
 
-#[derive(Clone, Debug, Deserialize)]
-pub(in crate::http) struct Claims {
-    compute_id: String,
-}
-
 #[derive(Clone, Debug)]
 pub(in crate::http) struct Authorize {
     compute_id: String,
@@ -97,7 +92,7 @@ impl AsyncAuthorizeRequest<Body> for Authorize {
             if data.claims.compute_id != compute_id {
                 return Err(JsonResponse::error(
                     StatusCode::UNAUTHORIZED,
-                    "invalid claims in authorization token",
+                    "invalid compute ID in authorization token claims",
                 ));
             }
 
@@ -112,13 +107,19 @@ impl AsyncAuthorizeRequest<Body> for Authorize {
 
 impl Authorize {
     /// Verify the token using the JSON Web Key set and return the token data.
-    fn verify(jwks: &JwkSet, token: &str, validation: &Validation) -> Result<TokenData<Claims>> {
+    fn verify(
+        jwks: &JwkSet,
+        token: &str,
+        validation: &Validation,
+    ) -> Result<TokenData<ComputeClaims>> {
+        debug!("verifying token {}", token);
+
         for jwk in jwks.keys.iter() {
             let decoding_key = match DecodingKey::from_jwk(jwk) {
                 Ok(key) => key,
                 Err(e) => {
                     warn!(
-                        "Failed to construct decoding key from {}: {}",
+                        "failed to construct decoding key from {}: {}",
                         jwk.common.key_id.as_ref().unwrap(),
                         e
                     );
@@ -127,11 +128,11 @@ impl Authorize {
                 }
             };
 
-            match jsonwebtoken::decode::<Claims>(token, &decoding_key, validation) {
+            match jsonwebtoken::decode::<ComputeClaims>(token, &decoding_key, validation) {
                 Ok(data) => return Ok(data),
                 Err(e) => {
                     warn!(
-                        "Failed to decode authorization token using {}: {}",
+                        "failed to decode authorization token using {}: {}",
                         jwk.common.key_id.as_ref().unwrap(),
                         e
                     );
@@ -141,6 +142,6 @@ impl Authorize {
             }
         }
 
-        Err(anyhow!("Failed to verify authorization token"))
+        Err(anyhow!("failed to verify authorization token"))
     }
 }
diff --git a/compute_tools/src/http/openapi_spec.yaml b/compute_tools/src/http/openapi_spec.yaml
index 7c8f72440f6b..bbdb7d091728 100644
--- a/compute_tools/src/http/openapi_spec.yaml
+++ b/compute_tools/src/http/openapi_spec.yaml
@@ -306,36 +306,6 @@ paths:
               schema:
                 $ref: "#/components/schemas/GenericError"
 
-  /configure_telemetry:
-    post:
-      tags:
-        - Configure
-      summary: Configure rsyslog
-      description: |
-        This API endpoint configures rsyslog to forward Postgres logs
-        to a specified otel collector.
-      operationId: configureTelemetry
-      requestBody:
-        required: true
-        content:
-          application/json:
-            schema:
-              type: object
-              properties:
-                logs_export_host:
-                  type: string
-                  description: |
-                    Hostname and the port of the otel collector. Leave empty to disable logs forwarding.
-                    Example: config-shy-breeze-123-collector-monitoring.neon-telemetry.svc.cluster.local:54526
-      responses:
-        204:
-          description: "Telemetry configured successfully"
-        500:
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/GenericError"
-
 components:
   securitySchemes:
     JWT:
diff --git a/compute_tools/src/http/routes/configure.rs b/compute_tools/src/http/routes/configure.rs
index 5c9dd22c3dab..f7a19da61156 100644
--- a/compute_tools/src/http/routes/configure.rs
+++ b/compute_tools/src/http/routes/configure.rs
@@ -1,11 +1,9 @@
 use std::sync::Arc;
 
-use axum::body::Body;
 use axum::extract::State;
 use axum::response::Response;
-use compute_api::requests::{ConfigurationRequest, ConfigureTelemetryRequest};
+use compute_api::requests::ConfigurationRequest;
 use compute_api::responses::{ComputeStatus, ComputeStatusResponse};
-use compute_api::spec::ComputeFeature;
 use http::StatusCode;
 use tokio::task;
 use tracing::info;
@@ -13,7 +11,6 @@ use tracing::info;
 use crate::compute::{ComputeNode, ParsedSpec};
 use crate::http::JsonResponse;
 use crate::http::extract::Json;
-use crate::rsyslog::{PostgresLogsRsyslogConfig, configure_postgres_logs_export};
 
 // Accept spec in JSON format and request compute configuration. If anything
 // goes wrong after we set the compute status to `ConfigurationPending` and
@@ -25,13 +22,6 @@ pub(in crate::http) async fn configure(
     State(compute): State<Arc<ComputeNode>>,
     request: Json<ConfigurationRequest>,
 ) -> Response {
-    if !compute.params.live_config_allowed {
-        return JsonResponse::error(
-            StatusCode::PRECONDITION_FAILED,
-            "live configuration is not allowed for this compute node".to_string(),
-        );
-    }
-
     let pspec = match ParsedSpec::try_from(request.spec.clone()) {
         Ok(p) => p,
         Err(e) => return JsonResponse::error(StatusCode::BAD_REQUEST, e),
@@ -95,25 +85,3 @@ pub(in crate::http) async fn configure(
 
     JsonResponse::success(StatusCode::OK, body)
 }
-
-pub(in crate::http) async fn configure_telemetry(
-    State(compute): State<Arc<ComputeNode>>,
-    request: Json<ConfigureTelemetryRequest>,
-) -> Response {
-    if !compute.has_feature(ComputeFeature::PostgresLogsExport) {
-        return JsonResponse::error(
-            StatusCode::PRECONDITION_FAILED,
-            "Postgres logs export feature is not enabled".to_string(),
-        );
-    }
-
-    let conf = PostgresLogsRsyslogConfig::new(request.logs_export_host.as_deref());
-    if let Err(err) = configure_postgres_logs_export(conf) {
-        return JsonResponse::error(StatusCode::INTERNAL_SERVER_ERROR, err.to_string());
-    }
-
-    Response::builder()
-        .status(StatusCode::NO_CONTENT)
-        .body(Body::from(""))
-        .unwrap()
-}
diff --git a/compute_tools/src/http/server.rs b/compute_tools/src/http/server.rs
index 179369e3efd9..10f767e97ca7 100644
--- a/compute_tools/src/http/server.rs
+++ b/compute_tools/src/http/server.rs
@@ -87,7 +87,6 @@ impl From<&Server> for Router<Arc<ComputeNode>> {
                 let authenticated_router = Router::<Arc<ComputeNode>>::new()
                     .route("/check_writability", post(check_writability::is_writable))
                     .route("/configure", post(configure::configure))
-                    .route("/configure_telemetry", post(configure::configure_telemetry))
                     .route("/database_schema", get(database_schema::get_schema_dump))
                     .route("/dbs_and_roles", get(dbs_and_roles::get_catalog_objects))
                     .route("/insights", get(insights::get_insights))
diff --git a/compute_tools/src/metrics.rs b/compute_tools/src/metrics.rs
index 52f1795703bb..fa00476fd2e8 100644
--- a/compute_tools/src/metrics.rs
+++ b/compute_tools/src/metrics.rs
@@ -19,13 +19,13 @@ pub(crate) static INSTALLED_EXTENSIONS: Lazy<UIntGaugeVec> = Lazy::new(|| {
 // but for all our APIs we defined a 'slug'/method/operationId in the OpenAPI spec.
 // And it's fair to call it a 'RPC' (Remote Procedure Call).
 pub enum CPlaneRequestRPC {
-    GetSpec,
+    GetConfig,
 }
 
 impl CPlaneRequestRPC {
     pub fn as_str(&self) -> &str {
         match self {
-            CPlaneRequestRPC::GetSpec => "GetSpec",
+            CPlaneRequestRPC::GetConfig => "GetConfig",
         }
     }
 }
diff --git a/compute_tools/src/rsyslog.rs b/compute_tools/src/rsyslog.rs
index 80594db3f152..ba08302df2b0 100644
--- a/compute_tools/src/rsyslog.rs
+++ b/compute_tools/src/rsyslog.rs
@@ -119,16 +119,9 @@ impl<'a> PostgresLogsRsyslogConfig<'a> {
         };
         Ok(config_content)
     }
-
-    /// Returns the default host for otel collector that receives Postgres logs
-    pub fn default_host(project_id: &str) -> String {
-        format!(
-            "config-{}-collector.neon-telemetry.svc.cluster.local:10514",
-            project_id
-        )
-    }
 }
 
+/// Writes rsyslogd configuration for Postgres logs export and restarts rsyslog.
 pub fn configure_postgres_logs_export(conf: PostgresLogsRsyslogConfig) -> Result<()> {
     let new_config = conf.build()?;
     let current_config = PostgresLogsRsyslogConfig::current_config()?;
@@ -261,16 +254,5 @@ mod tests {
             let res = conf.build();
             assert!(res.is_err());
         }
-
-        {
-            // Verify config with default host
-            let host = PostgresLogsRsyslogConfig::default_host("shy-breeze-123");
-            let conf = PostgresLogsRsyslogConfig::new(Some(&host));
-            let res = conf.build();
-            assert!(res.is_ok());
-            let conf_str = res.unwrap();
-            assert!(conf_str.contains(r#"shy-breeze-123"#));
-            assert!(conf_str.contains(r#"port="10514""#));
-        }
     }
 }
diff --git a/compute_tools/src/spec.rs b/compute_tools/src/spec.rs
index a76af21e9f28..4b38e6e29c72 100644
--- a/compute_tools/src/spec.rs
+++ b/compute_tools/src/spec.rs
@@ -3,9 +3,8 @@ use std::path::Path;
 
 use anyhow::{Result, anyhow, bail};
 use compute_api::responses::{
-    ComputeCtlConfig, ControlPlaneComputeStatus, ControlPlaneSpecResponse,
+    ComputeConfig, ControlPlaneComputeStatus, ControlPlaneConfigResponse,
 };
-use compute_api::spec::ComputeSpec;
 use reqwest::StatusCode;
 use tokio_postgres::Client;
 use tracing::{error, info, instrument};
@@ -21,7 +20,7 @@ use crate::params::PG_HBA_ALL_MD5;
 fn do_control_plane_request(
     uri: &str,
     jwt: &str,
-) -> Result<ControlPlaneSpecResponse, (bool, String, String)> {
+) -> Result<ControlPlaneConfigResponse, (bool, String, String)> {
     let resp = reqwest::blocking::Client::new()
         .get(uri)
         .header("Authorization", format!("Bearer {}", jwt))
@@ -29,14 +28,14 @@ fn do_control_plane_request(
         .map_err(|e| {
             (
                 true,
-                format!("could not perform spec request to control plane: {:?}", e),
+                format!("could not perform request to control plane: {:?}", e),
                 UNKNOWN_HTTP_STATUS.to_string(),
             )
         })?;
 
     let status = resp.status();
     match status {
-        StatusCode::OK => match resp.json::<ControlPlaneSpecResponse>() {
+        StatusCode::OK => match resp.json::<ControlPlaneConfigResponse>() {
             Ok(spec_resp) => Ok(spec_resp),
             Err(e) => Err((
                 true,
@@ -69,40 +68,35 @@ fn do_control_plane_request(
     }
 }
 
-/// Request spec from the control-plane by compute_id. If `NEON_CONTROL_PLANE_TOKEN`
-/// env variable is set, it will be used for authorization.
-pub fn get_spec_from_control_plane(
-    base_uri: &str,
-    compute_id: &str,
-) -> Result<(Option<ComputeSpec>, ComputeCtlConfig)> {
+/// Request config from the control-plane by compute_id. If
+/// `NEON_CONTROL_PLANE_TOKEN` env variable is set, it will be used for
+/// authorization.
+pub fn get_config_from_control_plane(base_uri: &str, compute_id: &str) -> Result<ComputeConfig> {
     let cp_uri = format!("{base_uri}/compute/api/v2/computes/{compute_id}/spec");
-    let jwt: String = match std::env::var("NEON_CONTROL_PLANE_TOKEN") {
-        Ok(v) => v,
-        Err(_) => "".to_string(),
-    };
+    let jwt: String = std::env::var("NEON_CONTROL_PLANE_TOKEN").unwrap_or_default();
     let mut attempt = 1;
 
-    info!("getting spec from control plane: {}", cp_uri);
+    info!("getting config from control plane: {}", cp_uri);
 
     // Do 3 attempts to get spec from the control plane using the following logic:
     // - network error -> then retry
     // - compute id is unknown or any other error -> bail out
     // - no spec for compute yet (Empty state) -> return Ok(None)
-    // - got spec -> return Ok(Some(spec))
+    // - got config -> return Ok(Some(config))
     while attempt < 4 {
         let result = match do_control_plane_request(&cp_uri, &jwt) {
-            Ok(spec_resp) => {
+            Ok(config_resp) => {
                 CPLANE_REQUESTS_TOTAL
                     .with_label_values(&[
-                        CPlaneRequestRPC::GetSpec.as_str(),
+                        CPlaneRequestRPC::GetConfig.as_str(),
                         &StatusCode::OK.to_string(),
                     ])
                     .inc();
-                match spec_resp.status {
-                    ControlPlaneComputeStatus::Empty => Ok((None, spec_resp.compute_ctl_config)),
+                match config_resp.status {
+                    ControlPlaneComputeStatus::Empty => Ok(config_resp.into()),
                     ControlPlaneComputeStatus::Attached => {
-                        if let Some(spec) = spec_resp.spec {
-                            Ok((Some(spec), spec_resp.compute_ctl_config))
+                        if config_resp.spec.is_some() {
+                            Ok(config_resp.into())
                         } else {
                             bail!("compute is attached, but spec is empty")
                         }
@@ -111,7 +105,7 @@ pub fn get_spec_from_control_plane(
             }
             Err((retry, msg, status)) => {
                 CPLANE_REQUESTS_TOTAL
-                    .with_label_values(&[CPlaneRequestRPC::GetSpec.as_str(), &status])
+                    .with_label_values(&[CPlaneRequestRPC::GetConfig.as_str(), &status])
                     .inc();
                 if retry {
                     Err(anyhow!(msg))
@@ -122,7 +116,7 @@ pub fn get_spec_from_control_plane(
         };
 
         if let Err(e) = &result {
-            error!("attempt {} to get spec failed with: {}", attempt, e);
+            error!("attempt {} to get config failed with: {}", attempt, e);
         } else {
             return result;
         }
@@ -133,13 +127,13 @@ pub fn get_spec_from_control_plane(
 
     // All attempts failed, return error.
     Err(anyhow::anyhow!(
-        "Exhausted all attempts to retrieve the spec from the control plane"
+        "Exhausted all attempts to retrieve the config from the control plane"
     ))
 }
 
 /// Check `pg_hba.conf` and update if needed to allow external connections.
 pub fn update_pg_hba(pgdata_path: &Path) -> Result<()> {
-    // XXX: consider making it a part of spec.json
+    // XXX: consider making it a part of config.json
     let pghba_path = pgdata_path.join("pg_hba.conf");
 
     if config::line_in_file(&pghba_path, PG_HBA_ALL_MD5)? {
@@ -153,7 +147,7 @@ pub fn update_pg_hba(pgdata_path: &Path) -> Result<()> {
 
 /// Create a standby.signal file
 pub fn add_standby_signal(pgdata_path: &Path) -> Result<()> {
-    // XXX: consider making it a part of spec.json
+    // XXX: consider making it a part of config.json
     let signalfile = pgdata_path.join("standby.signal");
 
     if !signalfile.exists() {
diff --git a/compute_tools/src/spec_apply.rs b/compute_tools/src/spec_apply.rs
index e7d67f6ac524..0d1389dbad04 100644
--- a/compute_tools/src/spec_apply.rs
+++ b/compute_tools/src/spec_apply.rs
@@ -278,12 +278,12 @@ impl ComputeNode {
             // so that all config operations are audit logged.
             match spec.audit_log_level
             {
-                ComputeAudit::Hipaa => {
+                ComputeAudit::Hipaa | ComputeAudit::Extended | ComputeAudit::Full => {
                     phases.push(CreatePgauditExtension);
                     phases.push(CreatePgauditlogtofileExtension);
                     phases.push(DisablePostgresDBPgAudit);
                 }
-                ComputeAudit::Log => {
+                ComputeAudit::Log | ComputeAudit::Base => {
                     phases.push(CreatePgauditExtension);
                     phases.push(DisablePostgresDBPgAudit);
                 }
diff --git a/control_plane/src/bin/neon_local.rs b/control_plane/src/bin/neon_local.rs
index 747268f80b29..db9715dc6264 100644
--- a/control_plane/src/bin/neon_local.rs
+++ b/control_plane/src/bin/neon_local.rs
@@ -20,8 +20,10 @@ use compute_api::spec::ComputeMode;
 use control_plane::endpoint::ComputeControlPlane;
 use control_plane::local_env::{
     InitForceMode, LocalEnv, NeonBroker, NeonLocalInitConf, NeonLocalInitPageserverConf,
-    SafekeeperConf,
+    ObjectStorageConf, SafekeeperConf,
 };
+use control_plane::object_storage::OBJECT_STORAGE_DEFAULT_PORT;
+use control_plane::object_storage::ObjectStorage;
 use control_plane::pageserver::PageServerNode;
 use control_plane::safekeeper::SafekeeperNode;
 use control_plane::storage_controller::{
@@ -39,7 +41,7 @@ use pageserver_api::controller_api::{
 use pageserver_api::models::{
     ShardParameters, TenantConfigRequest, TimelineCreateRequest, TimelineInfo,
 };
-use pageserver_api::shard::{ShardCount, ShardStripeSize, TenantShardId};
+use pageserver_api::shard::{DEFAULT_STRIPE_SIZE, ShardCount, ShardStripeSize, TenantShardId};
 use postgres_backend::AuthType;
 use postgres_connection::parse_host_port;
 use safekeeper_api::membership::SafekeeperGeneration;
@@ -91,6 +93,8 @@ enum NeonLocalCmd {
     #[command(subcommand)]
     Safekeeper(SafekeeperCmd),
     #[command(subcommand)]
+    ObjectStorage(ObjectStorageCmd),
+    #[command(subcommand)]
     Endpoint(EndpointCmd),
     #[command(subcommand)]
     Mappings(MappingsCmd),
@@ -454,6 +458,32 @@ enum SafekeeperCmd {
     Restart(SafekeeperRestartCmdArgs),
 }
 
+#[derive(clap::Subcommand)]
+#[clap(about = "Manage object storage")]
+enum ObjectStorageCmd {
+    Start(ObjectStorageStartCmd),
+    Stop(ObjectStorageStopCmd),
+}
+
+#[derive(clap::Args)]
+#[clap(about = "Start object storage")]
+struct ObjectStorageStartCmd {
+    #[clap(short = 't', long, help = "timeout until we fail the command")]
+    #[arg(default_value = "10s")]
+    start_timeout: humantime::Duration,
+}
+
+#[derive(clap::Args)]
+#[clap(about = "Stop object storage")]
+struct ObjectStorageStopCmd {
+    #[arg(value_enum, default_value = "fast")]
+    #[clap(
+        short = 'm',
+        help = "If 'immediate', don't flush repository data at shutdown"
+    )]
+    stop_mode: StopMode,
+}
+
 #[derive(clap::Args)]
 #[clap(about = "Start local safekeeper")]
 struct SafekeeperStartCmdArgs {
@@ -759,6 +789,7 @@ fn main() -> Result<()> {
             }
             NeonLocalCmd::StorageBroker(subcmd) => rt.block_on(handle_storage_broker(&subcmd, env)),
             NeonLocalCmd::Safekeeper(subcmd) => rt.block_on(handle_safekeeper(&subcmd, env)),
+            NeonLocalCmd::ObjectStorage(subcmd) => rt.block_on(handle_object_storage(&subcmd, env)),
             NeonLocalCmd::Endpoint(subcmd) => rt.block_on(handle_endpoint(&subcmd, env)),
             NeonLocalCmd::Mappings(subcmd) => handle_mappings(&subcmd, env),
         };
@@ -975,6 +1006,9 @@ fn handle_init(args: &InitCmdArgs) -> anyhow::Result<LocalEnv> {
                     }
                 })
                 .collect(),
+            object_storage: ObjectStorageConf {
+                port: OBJECT_STORAGE_DEFAULT_PORT,
+            },
             pg_distrib_dir: None,
             neon_distrib_dir: None,
             default_tenant_id: TenantId::from_array(std::array::from_fn(|_| 0)),
@@ -1083,7 +1117,7 @@ async fn handle_tenant(subcmd: &TenantCmd, env: &mut local_env::LocalEnv) -> any
                         stripe_size: args
                             .shard_stripe_size
                             .map(ShardStripeSize)
-                            .unwrap_or(ShardParameters::DEFAULT_STRIPE_SIZE),
+                            .unwrap_or(DEFAULT_STRIPE_SIZE),
                     },
                     placement_policy: args.placement_policy.clone(),
                     config: tenant_conf,
@@ -1396,7 +1430,7 @@ async fn handle_endpoint(subcmd: &EndpointCmd, env: &local_env::LocalEnv) -> Res
                     vec![(parsed.0, parsed.1.unwrap_or(5432))],
                     // If caller is telling us what pageserver to use, this is not a tenant which is
                     // full managed by storage controller, therefore not sharded.
-                    ShardParameters::DEFAULT_STRIPE_SIZE,
+                    DEFAULT_STRIPE_SIZE,
                 )
             } else {
                 // Look up the currently attached location of the tenant, and its striping metadata,
@@ -1683,6 +1717,41 @@ async fn handle_safekeeper(subcmd: &SafekeeperCmd, env: &local_env::LocalEnv) ->
     Ok(())
 }
 
+async fn handle_object_storage(subcmd: &ObjectStorageCmd, env: &local_env::LocalEnv) -> Result<()> {
+    use ObjectStorageCmd::*;
+    let storage = ObjectStorage::from_env(env);
+
+    // In tests like test_forward_compatibility or test_graceful_cluster_restart
+    // old neon binaries (without object_storage) are present
+    if !storage.bin.exists() {
+        eprintln!(
+            "{} binary not found. Ignore if this is a compatibility test",
+            storage.bin
+        );
+        return Ok(());
+    }
+
+    match subcmd {
+        Start(ObjectStorageStartCmd { start_timeout }) => {
+            if let Err(e) = storage.start(start_timeout).await {
+                eprintln!("object_storage start failed: {e}");
+                exit(1);
+            }
+        }
+        Stop(ObjectStorageStopCmd { stop_mode }) => {
+            let immediate = match stop_mode {
+                StopMode::Fast => false,
+                StopMode::Immediate => true,
+            };
+            if let Err(e) = storage.stop(immediate) {
+                eprintln!("proxy stop failed: {e}");
+                exit(1);
+            }
+        }
+    };
+    Ok(())
+}
+
 async fn handle_storage_broker(subcmd: &StorageBrokerCmd, env: &local_env::LocalEnv) -> Result<()> {
     match subcmd {
         StorageBrokerCmd::Start(args) => {
@@ -1777,6 +1846,13 @@ async fn handle_start_all_impl(
                     .map_err(|e| e.context(format!("start safekeeper {}", safekeeper.id)))
             });
         }
+
+        js.spawn(async move {
+            ObjectStorage::from_env(env)
+                .start(&retry_timeout)
+                .await
+                .map_err(|e| e.context("start object_storage"))
+        });
     })();
 
     let mut errors = Vec::new();
@@ -1874,6 +1950,11 @@ async fn try_stop_all(env: &local_env::LocalEnv, immediate: bool) {
         }
     }
 
+    let storage = ObjectStorage::from_env(env);
+    if let Err(e) = storage.stop(immediate) {
+        eprintln!("object_storage stop failed: {:#}", e);
+    }
+
     for ps_conf in &env.pageservers {
         let pageserver = PageServerNode::from_env(env, ps_conf);
         if let Err(e) = pageserver.stop(immediate) {
diff --git a/control_plane/src/endpoint.rs b/control_plane/src/endpoint.rs
index b46d61682794..2fa7a62f8fdc 100644
--- a/control_plane/src/endpoint.rs
+++ b/control_plane/src/endpoint.rs
@@ -29,7 +29,7 @@
 //!     compute.log               - log output of `compute_ctl` and `postgres`
 //!     endpoint.json             - serialized `EndpointConf` struct
 //!     postgresql.conf           - postgresql settings
-//!     spec.json                 - passed to `compute_ctl`
+//!     config.json                 - passed to `compute_ctl`
 //!     pgdata/
 //!         postgresql.conf       - copy of postgresql.conf created by `compute_ctl`
 //!         zenith.signal
@@ -46,7 +46,9 @@ use std::time::{Duration, Instant, SystemTime, UNIX_EPOCH};
 
 use anyhow::{Context, Result, anyhow, bail};
 use compute_api::requests::ConfigurationRequest;
-use compute_api::responses::{ComputeCtlConfig, ComputeStatus, ComputeStatusResponse};
+use compute_api::responses::{
+    ComputeConfig, ComputeCtlConfig, ComputeStatus, ComputeStatusResponse,
+};
 use compute_api::spec::{
     Cluster, ComputeAudit, ComputeFeature, ComputeMode, ComputeSpec, Database, PgIdent,
     RemoteExtSpec, Role,
@@ -619,86 +621,101 @@ impl Endpoint {
             remote_extensions = None;
         };
 
-        // Create spec file
-        let mut spec = ComputeSpec {
-            skip_pg_catalog_updates: self.skip_pg_catalog_updates,
-            format_version: 1.0,
-            operation_uuid: None,
-            features: self.features.clone(),
-            swap_size_bytes: None,
-            disk_quota_bytes: None,
-            disable_lfc_resizing: None,
-            cluster: Cluster {
-                cluster_id: None, // project ID: not used
-                name: None,       // project name: not used
-                state: None,
-                roles: if create_test_user {
-                    vec![Role {
+        // Create config file
+        let config = {
+            let mut spec = ComputeSpec {
+                skip_pg_catalog_updates: self.skip_pg_catalog_updates,
+                format_version: 1.0,
+                operation_uuid: None,
+                features: self.features.clone(),
+                swap_size_bytes: None,
+                disk_quota_bytes: None,
+                disable_lfc_resizing: None,
+                cluster: Cluster {
+                    cluster_id: None, // project ID: not used
+                    name: None,       // project name: not used
+                    state: None,
+                    roles: if create_test_user {
+                        vec![Role {
+                            name: PgIdent::from_str("test").unwrap(),
+                            encrypted_password: None,
+                            options: None,
+                        }]
+                    } else {
+                        Vec::new()
+                    },
+                    databases: if create_test_user {
+                        vec![Database {
+                            name: PgIdent::from_str("neondb").unwrap(),
+                            owner: PgIdent::from_str("test").unwrap(),
+                            options: None,
+                            restrict_conn: false,
+                            invalid: false,
+                        }]
+                    } else {
+                        Vec::new()
+                    },
+                    settings: None,
+                    postgresql_conf: Some(postgresql_conf.clone()),
+                },
+                delta_operations: None,
+                tenant_id: Some(self.tenant_id),
+                timeline_id: Some(self.timeline_id),
+                project_id: None,
+                branch_id: None,
+                endpoint_id: Some(self.endpoint_id.clone()),
+                mode: self.mode,
+                pageserver_connstring: Some(pageserver_connstring),
+                safekeepers_generation: safekeepers_generation.map(|g| g.into_inner()),
+                safekeeper_connstrings,
+                storage_auth_token: auth_token.clone(),
+                remote_extensions,
+                pgbouncer_settings: None,
+                shard_stripe_size: Some(shard_stripe_size),
+                local_proxy_config: None,
+                reconfigure_concurrency: self.reconfigure_concurrency,
+                drop_subscriptions_before_start: self.drop_subscriptions_before_start,
+                audit_log_level: ComputeAudit::Disabled,
+                logs_export_host: None::<String>,
+            };
+
+            // this strange code is needed to support respec() in tests
+            if self.cluster.is_some() {
+                debug!("Cluster is already set in the endpoint spec, using it");
+                spec.cluster = self.cluster.clone().unwrap();
+
+                debug!("spec.cluster {:?}", spec.cluster);
+
+                // fill missing fields again
+                if create_test_user {
+                    spec.cluster.roles.push(Role {
                         name: PgIdent::from_str("test").unwrap(),
                         encrypted_password: None,
                         options: None,
-                    }]
-                } else {
-                    Vec::new()
-                },
-                databases: if create_test_user {
-                    vec![Database {
+                    });
+                    spec.cluster.databases.push(Database {
                         name: PgIdent::from_str("neondb").unwrap(),
                         owner: PgIdent::from_str("test").unwrap(),
                         options: None,
                         restrict_conn: false,
                         invalid: false,
-                    }]
-                } else {
-                    Vec::new()
-                },
-                settings: None,
-                postgresql_conf: Some(postgresql_conf.clone()),
-            },
-            delta_operations: None,
-            tenant_id: Some(self.tenant_id),
-            timeline_id: Some(self.timeline_id),
-            mode: self.mode,
-            pageserver_connstring: Some(pageserver_connstring),
-            safekeepers_generation: safekeepers_generation.map(|g| g.into_inner()),
-            safekeeper_connstrings,
-            storage_auth_token: auth_token.clone(),
-            remote_extensions,
-            pgbouncer_settings: None,
-            shard_stripe_size: Some(shard_stripe_size),
-            local_proxy_config: None,
-            reconfigure_concurrency: self.reconfigure_concurrency,
-            drop_subscriptions_before_start: self.drop_subscriptions_before_start,
-            audit_log_level: ComputeAudit::Disabled,
-        };
+                    });
+                }
+                spec.cluster.postgresql_conf = Some(postgresql_conf);
+            }
 
-        // this strange code is needed to support respec() in tests
-        if self.cluster.is_some() {
-            debug!("Cluster is already set in the endpoint spec, using it");
-            spec.cluster = self.cluster.clone().unwrap();
-
-            debug!("spec.cluster {:?}", spec.cluster);
-
-            // fill missing fields again
-            if create_test_user {
-                spec.cluster.roles.push(Role {
-                    name: PgIdent::from_str("test").unwrap(),
-                    encrypted_password: None,
-                    options: None,
-                });
-                spec.cluster.databases.push(Database {
-                    name: PgIdent::from_str("neondb").unwrap(),
-                    owner: PgIdent::from_str("test").unwrap(),
-                    options: None,
-                    restrict_conn: false,
-                    invalid: false,
-                });
+            ComputeConfig {
+                spec: Some(spec),
+                compute_ctl_config: ComputeCtlConfig::default(),
             }
-            spec.cluster.postgresql_conf = Some(postgresql_conf);
-        }
+        };
 
+        // TODO(tristan957): Remove the write to spec.json after compatibility
+        // tests work themselves out
         let spec_path = self.endpoint_path().join("spec.json");
-        std::fs::write(spec_path, serde_json::to_string_pretty(&spec)?)?;
+        std::fs::write(spec_path, serde_json::to_string_pretty(&config.spec)?)?;
+        let config_path = self.endpoint_path().join("config.json");
+        std::fs::write(config_path, serde_json::to_string_pretty(&config)?)?;
 
         // Open log file. We'll redirect the stdout and stderr of `compute_ctl` to it.
         let logfile = std::fs::OpenOptions::new()
@@ -706,6 +723,16 @@ impl Endpoint {
             .append(true)
             .open(self.endpoint_path().join("compute.log"))?;
 
+        // TODO(tristan957): Remove when compatibility tests are no longer an
+        // issue
+        let old_compute_ctl = {
+            let mut cmd = Command::new(self.env.neon_distrib_dir.join("compute_ctl"));
+            let help_output = cmd.arg("--help").output()?;
+            let help_output = String::from_utf8_lossy(&help_output.stdout);
+
+            !help_output.contains("--config")
+        };
+
         // Launch compute_ctl
         let conn_str = self.connstr("cloud_admin", "postgres");
         println!("Starting postgres node at '{}'", conn_str);
@@ -724,9 +751,18 @@ impl Endpoint {
         ])
         .args(["--pgdata", self.pgdata().to_str().unwrap()])
         .args(["--connstr", &conn_str])
+        // TODO(tristan957): Change this to --config when compatibility tests
+        // are no longer an issue
         .args([
             "--spec-path",
-            self.endpoint_path().join("spec.json").to_str().unwrap(),
+            self.endpoint_path()
+                .join(if old_compute_ctl {
+                    "spec.json"
+                } else {
+                    "config.json"
+                })
+                .to_str()
+                .unwrap(),
         ])
         .args([
             "--pgbin",
@@ -869,10 +905,12 @@ impl Endpoint {
         stripe_size: Option<ShardStripeSize>,
         safekeepers: Option<Vec<NodeId>>,
     ) -> Result<()> {
-        let mut spec: ComputeSpec = {
-            let spec_path = self.endpoint_path().join("spec.json");
-            let file = std::fs::File::open(spec_path)?;
-            serde_json::from_reader(file)?
+        let (mut spec, compute_ctl_config) = {
+            let config_path = self.endpoint_path().join("config.json");
+            let file = std::fs::File::open(config_path)?;
+            let config: ComputeConfig = serde_json::from_reader(file)?;
+
+            (config.spec.unwrap(), config.compute_ctl_config)
         };
 
         let postgresql_conf = self.read_postgresql_conf()?;
@@ -922,7 +960,7 @@ impl Endpoint {
             .body(
                 serde_json::to_string(&ConfigurationRequest {
                     spec,
-                    compute_ctl_config: ComputeCtlConfig::default(),
+                    compute_ctl_config,
                 })
                 .unwrap(),
             )
diff --git a/control_plane/src/lib.rs b/control_plane/src/lib.rs
index 2af272f3885d..2d9fe2c807b3 100644
--- a/control_plane/src/lib.rs
+++ b/control_plane/src/lib.rs
@@ -10,6 +10,7 @@ mod background_process;
 pub mod broker;
 pub mod endpoint;
 pub mod local_env;
+pub mod object_storage;
 pub mod pageserver;
 pub mod postgresql_conf;
 pub mod safekeeper;
diff --git a/control_plane/src/local_env.rs b/control_plane/src/local_env.rs
index 3f3794c0eef4..fa10abe91a4c 100644
--- a/control_plane/src/local_env.rs
+++ b/control_plane/src/local_env.rs
@@ -15,9 +15,10 @@ use clap::ValueEnum;
 use postgres_backend::AuthType;
 use reqwest::Url;
 use serde::{Deserialize, Serialize};
-use utils::auth::{Claims, encode_from_key_file};
+use utils::auth::encode_from_key_file;
 use utils::id::{NodeId, TenantId, TenantTimelineId, TimelineId};
 
+use crate::object_storage::{OBJECT_STORAGE_REMOTE_STORAGE_DIR, ObjectStorage};
 use crate::pageserver::{PAGESERVER_REMOTE_STORAGE_DIR, PageServerNode};
 use crate::safekeeper::SafekeeperNode;
 
@@ -55,6 +56,7 @@ pub struct LocalEnv {
 
     // used to issue tokens during e.g pg start
     pub private_key_path: PathBuf,
+    pub public_key_path: PathBuf,
 
     pub broker: NeonBroker,
 
@@ -68,6 +70,8 @@ pub struct LocalEnv {
 
     pub safekeepers: Vec<SafekeeperConf>,
 
+    pub object_storage: ObjectStorageConf,
+
     // Control plane upcall API for pageserver: if None, we will not run storage_controller  If set, this will
     // be propagated into each pageserver's configuration.
     pub control_plane_api: Url,
@@ -95,6 +99,7 @@ pub struct OnDiskConfig {
     pub neon_distrib_dir: PathBuf,
     pub default_tenant_id: Option<TenantId>,
     pub private_key_path: PathBuf,
+    pub public_key_path: PathBuf,
     pub broker: NeonBroker,
     pub storage_controller: NeonStorageControllerConf,
     #[serde(
@@ -103,6 +108,7 @@ pub struct OnDiskConfig {
     )]
     pub pageservers: Vec<PageServerConf>,
     pub safekeepers: Vec<SafekeeperConf>,
+    pub object_storage: ObjectStorageConf,
     pub control_plane_api: Option<Url>,
     pub control_plane_hooks_api: Option<Url>,
     pub control_plane_compute_hook_api: Option<Url>,
@@ -136,11 +142,18 @@ pub struct NeonLocalInitConf {
     pub storage_controller: Option<NeonStorageControllerConf>,
     pub pageservers: Vec<NeonLocalInitPageserverConf>,
     pub safekeepers: Vec<SafekeeperConf>,
+    pub object_storage: ObjectStorageConf,
     pub control_plane_api: Option<Url>,
     pub control_plane_hooks_api: Option<Url>,
     pub generate_local_ssl_certs: bool,
 }
 
+#[derive(Serialize, Default, Deserialize, PartialEq, Eq, Clone, Debug)]
+#[serde(default)]
+pub struct ObjectStorageConf {
+    pub port: u16,
+}
+
 /// Broker config for cluster internal communication.
 #[derive(Serialize, Deserialize, PartialEq, Eq, Clone, Debug)]
 #[serde(default)]
@@ -398,6 +411,10 @@ impl LocalEnv {
         self.pg_dir(pg_version, "lib")
     }
 
+    pub fn object_storage_bin(&self) -> PathBuf {
+        self.neon_distrib_dir.join("object_storage")
+    }
+
     pub fn pageserver_bin(&self) -> PathBuf {
         self.neon_distrib_dir.join("pageserver")
     }
@@ -431,6 +448,10 @@ impl LocalEnv {
         self.base_data_dir.join("safekeepers").join(data_dir_name)
     }
 
+    pub fn object_storage_data_dir(&self) -> PathBuf {
+        self.base_data_dir.join("object_storage")
+    }
+
     pub fn get_pageserver_conf(&self, id: NodeId) -> anyhow::Result<&PageServerConf> {
         if let Some(conf) = self.pageservers.iter().find(|node| node.id == id) {
             Ok(conf)
@@ -582,6 +603,7 @@ impl LocalEnv {
                 neon_distrib_dir,
                 default_tenant_id,
                 private_key_path,
+                public_key_path,
                 broker,
                 storage_controller,
                 pageservers,
@@ -591,6 +613,7 @@ impl LocalEnv {
                 control_plane_compute_hook_api: _,
                 branch_name_mappings,
                 generate_local_ssl_certs,
+                object_storage,
             } = on_disk_config;
             LocalEnv {
                 base_data_dir: repopath.to_owned(),
@@ -598,6 +621,7 @@ impl LocalEnv {
                 neon_distrib_dir,
                 default_tenant_id,
                 private_key_path,
+                public_key_path,
                 broker,
                 storage_controller,
                 pageservers,
@@ -606,6 +630,7 @@ impl LocalEnv {
                 control_plane_hooks_api,
                 branch_name_mappings,
                 generate_local_ssl_certs,
+                object_storage,
             }
         };
 
@@ -705,6 +730,7 @@ impl LocalEnv {
                 neon_distrib_dir: self.neon_distrib_dir.clone(),
                 default_tenant_id: self.default_tenant_id,
                 private_key_path: self.private_key_path.clone(),
+                public_key_path: self.public_key_path.clone(),
                 broker: self.broker.clone(),
                 storage_controller: self.storage_controller.clone(),
                 pageservers: vec![], // it's skip_serializing anyway
@@ -714,6 +740,7 @@ impl LocalEnv {
                 control_plane_compute_hook_api: None,
                 branch_name_mappings: self.branch_name_mappings.clone(),
                 generate_local_ssl_certs: self.generate_local_ssl_certs,
+                object_storage: self.object_storage.clone(),
             },
         )
     }
@@ -730,7 +757,7 @@ impl LocalEnv {
     }
 
     // this function is used only for testing purposes in CLI e g generate tokens during init
-    pub fn generate_auth_token(&self, claims: &Claims) -> anyhow::Result<String> {
+    pub fn generate_auth_token<S: Serialize>(&self, claims: &S) -> anyhow::Result<String> {
         let private_key_path = self.get_private_key_path();
         let key_data = fs::read(private_key_path)?;
         encode_from_key_file(claims, &key_data)
@@ -797,6 +824,7 @@ impl LocalEnv {
             control_plane_api,
             generate_local_ssl_certs,
             control_plane_hooks_api,
+            object_storage,
         } = conf;
 
         // Find postgres binaries.
@@ -828,6 +856,7 @@ impl LocalEnv {
         )
         .context("generate auth keys")?;
         let private_key_path = PathBuf::from("auth_private_key.pem");
+        let public_key_path = PathBuf::from("auth_public_key.pem");
 
         // create the runtime type because the remaining initialization code below needs
         // a LocalEnv instance op operation
@@ -838,6 +867,7 @@ impl LocalEnv {
             neon_distrib_dir,
             default_tenant_id: Some(default_tenant_id),
             private_key_path,
+            public_key_path,
             broker,
             storage_controller: storage_controller.unwrap_or_default(),
             pageservers: pageservers.iter().map(Into::into).collect(),
@@ -846,6 +876,7 @@ impl LocalEnv {
             control_plane_hooks_api,
             branch_name_mappings: Default::default(),
             generate_local_ssl_certs,
+            object_storage,
         };
 
         if generate_local_ssl_certs {
@@ -873,8 +904,13 @@ impl LocalEnv {
                 .context("pageserver init failed")?;
         }
 
+        ObjectStorage::from_env(&env)
+            .init()
+            .context("object storage init failed")?;
+
         // setup remote remote location for default LocalFs remote storage
         std::fs::create_dir_all(env.base_data_dir.join(PAGESERVER_REMOTE_STORAGE_DIR))?;
+        std::fs::create_dir_all(env.base_data_dir.join(OBJECT_STORAGE_REMOTE_STORAGE_DIR))?;
 
         env.persist_config()
     }
@@ -944,7 +980,7 @@ fn generate_ssl_ca_cert(cert_path: &Path, key_path: &Path) -> anyhow::Result<()>
     // -out rootCA.crt -keyout rootCA.key
     let keygen_output = Command::new("openssl")
         .args([
-            "req", "-x509", "-newkey", "rsa:2048", "-nodes", "-days", "36500",
+            "req", "-x509", "-newkey", "ed25519", "-nodes", "-days", "36500",
         ])
         .args(["-subj", "/CN=Neon Local CA"])
         .args(["-out", cert_path.to_str().unwrap()])
@@ -974,7 +1010,7 @@ fn generate_ssl_cert(
     // -subj "/CN=localhost" -addext "subjectAltName=DNS:localhost,IP:127.0.0.1"
     let keygen_output = Command::new("openssl")
         .args(["req", "-new", "-nodes"])
-        .args(["-newkey", "rsa:2048"])
+        .args(["-newkey", "ed25519"])
         .args(["-subj", "/CN=localhost"])
         .args(["-addext", "subjectAltName=DNS:localhost,IP:127.0.0.1"])
         .args(["-keyout", key_path.to_str().unwrap()])
diff --git a/control_plane/src/object_storage.rs b/control_plane/src/object_storage.rs
new file mode 100644
index 000000000000..1a595b780972
--- /dev/null
+++ b/control_plane/src/object_storage.rs
@@ -0,0 +1,107 @@
+use crate::background_process::{self, start_process, stop_process};
+use crate::local_env::LocalEnv;
+use anyhow::anyhow;
+use anyhow::{Context, Result};
+use camino::Utf8PathBuf;
+use std::io::Write;
+use std::time::Duration;
+
+/// Directory within .neon which will be used by default for LocalFs remote storage.
+pub const OBJECT_STORAGE_REMOTE_STORAGE_DIR: &str = "local_fs_remote_storage/object_storage";
+pub const OBJECT_STORAGE_DEFAULT_PORT: u16 = 9993;
+
+pub struct ObjectStorage {
+    pub bin: Utf8PathBuf,
+    pub data_dir: Utf8PathBuf,
+    pub pemfile: Utf8PathBuf,
+    pub port: u16,
+}
+
+impl ObjectStorage {
+    pub fn from_env(env: &LocalEnv) -> ObjectStorage {
+        ObjectStorage {
+            bin: Utf8PathBuf::from_path_buf(env.object_storage_bin()).unwrap(),
+            data_dir: Utf8PathBuf::from_path_buf(env.object_storage_data_dir()).unwrap(),
+            pemfile: Utf8PathBuf::from_path_buf(env.public_key_path.clone()).unwrap(),
+            port: env.object_storage.port,
+        }
+    }
+
+    fn config_path(&self) -> Utf8PathBuf {
+        self.data_dir.join("object_storage.json")
+    }
+
+    fn listen_addr(&self) -> Utf8PathBuf {
+        format!("127.0.0.1:{}", self.port).into()
+    }
+
+    pub fn init(&self) -> Result<()> {
+        println!("Initializing object storage in {:?}", self.data_dir);
+        let parent = self.data_dir.parent().unwrap();
+
+        #[derive(serde::Serialize)]
+        struct Cfg {
+            listen: Utf8PathBuf,
+            pemfile: Utf8PathBuf,
+            local_path: Utf8PathBuf,
+            r#type: String,
+        }
+        let cfg = Cfg {
+            listen: self.listen_addr(),
+            pemfile: parent.join(self.pemfile.clone()),
+            local_path: parent.join(OBJECT_STORAGE_REMOTE_STORAGE_DIR),
+            r#type: "LocalFs".to_string(),
+        };
+        std::fs::create_dir_all(self.config_path().parent().unwrap())?;
+        std::fs::write(self.config_path(), serde_json::to_string(&cfg)?)
+            .context("write object storage config")?;
+        Ok(())
+    }
+
+    pub async fn start(&self, retry_timeout: &Duration) -> Result<()> {
+        println!("Starting s3 proxy at {}", self.listen_addr());
+        std::io::stdout().flush().context("flush stdout")?;
+
+        let process_status_check = || async {
+            tokio::time::sleep(Duration::from_millis(500)).await;
+            let res = reqwest::Client::new()
+                .get(format!("http://{}/metrics", self.listen_addr()))
+                .send()
+                .await;
+            match res {
+                Ok(response) if response.status().is_success() => Ok(true),
+                Ok(_) => Err(anyhow!("Failed to query /metrics")),
+                Err(e) => Err(anyhow!("Failed to check node status: {e}")),
+            }
+        };
+
+        let res = start_process(
+            "object_storage",
+            &self.data_dir.clone().into_std_path_buf(),
+            &self.bin.clone().into_std_path_buf(),
+            vec![self.config_path().to_string()],
+            vec![("RUST_LOG".into(), "debug".into())],
+            background_process::InitialPidFile::Create(self.pid_file()),
+            retry_timeout,
+            process_status_check,
+        )
+        .await;
+        if res.is_err() {
+            eprintln!("Logs:\n{}", std::fs::read_to_string(self.log_file())?);
+        }
+
+        res
+    }
+
+    pub fn stop(&self, immediate: bool) -> anyhow::Result<()> {
+        stop_process(immediate, "object_storage", &self.pid_file())
+    }
+
+    fn log_file(&self) -> Utf8PathBuf {
+        self.data_dir.join("object_storage.log")
+    }
+
+    fn pid_file(&self) -> Utf8PathBuf {
+        self.data_dir.join("object_storage.pid")
+    }
+}
diff --git a/control_plane/src/pageserver.rs b/control_plane/src/pageserver.rs
index 591eb3728b1d..5c985e6dc831 100644
--- a/control_plane/src/pageserver.rs
+++ b/control_plane/src/pageserver.rs
@@ -535,6 +535,11 @@ impl PageServerNode {
                 .map(|x| x.parse::<bool>())
                 .transpose()
                 .context("Failed to parse 'gc_compaction_enabled' as bool")?,
+            gc_compaction_verification: settings
+                .remove("gc_compaction_verification")
+                .map(|x| x.parse::<bool>())
+                .transpose()
+                .context("Failed to parse 'gc_compaction_verification' as bool")?,
             gc_compaction_initial_threshold_kb: settings
                 .remove("gc_compaction_initial_threshold_kb")
                 .map(|x| x.parse::<u64>())
diff --git a/control_plane/src/storage_controller.rs b/control_plane/src/storage_controller.rs
index 8000576e87ad..a4b56ae5c01b 100644
--- a/control_plane/src/storage_controller.rs
+++ b/control_plane/src/storage_controller.rs
@@ -13,7 +13,9 @@ use pageserver_api::controller_api::{
     NodeConfigureRequest, NodeDescribeResponse, NodeRegisterRequest, TenantCreateRequest,
     TenantCreateResponse, TenantLocateResponse,
 };
-use pageserver_api::models::{TenantConfigRequest, TimelineCreateRequest, TimelineInfo};
+use pageserver_api::models::{
+    TenantConfig, TenantConfigRequest, TimelineCreateRequest, TimelineInfo,
+};
 use pageserver_api::shard::TenantShardId;
 use pageserver_client::mgmt_api::ResponseErrorMessageExt;
 use postgres_backend::AuthType;
@@ -82,7 +84,8 @@ impl NeonStorageControllerStopArgs {
 pub struct AttachHookRequest {
     pub tenant_shard_id: TenantShardId,
     pub node_id: Option<NodeId>,
-    pub generation_override: Option<i32>,
+    pub generation_override: Option<i32>, // only new tenants
+    pub config: Option<TenantConfig>,     // only new tenants
 }
 
 #[derive(Serialize, Deserialize)]
@@ -805,6 +808,7 @@ impl StorageController {
             tenant_shard_id,
             node_id: Some(pageserver_id),
             generation_override: None,
+            config: None,
         };
 
         let response = self
diff --git a/control_plane/storcon_cli/src/main.rs b/control_plane/storcon_cli/src/main.rs
index b7e479d90cbd..19c686dcfd31 100644
--- a/control_plane/storcon_cli/src/main.rs
+++ b/control_plane/storcon_cli/src/main.rs
@@ -941,7 +941,7 @@ async fn main() -> anyhow::Result<()> {
             let mut node_to_fill_descs = Vec::new();
 
             for desc in node_descs {
-                let to_drain = nodes.iter().any(|id| *id == desc.id);
+                let to_drain = nodes.contains(&desc.id);
                 if to_drain {
                     node_to_drain_descs.push(desc);
                 } else {
diff --git a/docker-compose/compute_wrapper/shell/compute.sh b/docker-compose/compute_wrapper/shell/compute.sh
index 418aaf876da2..9409e9d055ff 100755
--- a/docker-compose/compute_wrapper/shell/compute.sh
+++ b/docker-compose/compute_wrapper/shell/compute.sh
@@ -11,8 +11,8 @@ generate_id() {
 
 PG_VERSION=${PG_VERSION:-14}
 
-SPEC_FILE_ORG=/var/db/postgres/specs/spec.json
-SPEC_FILE=/tmp/spec.json
+CONFIG_FILE_ORG=/var/db/postgres/configs/config.json
+CONFIG_FILE=/tmp/config.json
 
 echo "Waiting pageserver become ready."
 while ! nc -z pageserver 6400; do
@@ -20,7 +20,7 @@ while ! nc -z pageserver 6400; do
 done
 echo "Page server is ready."
 
-cp ${SPEC_FILE_ORG} ${SPEC_FILE}
+cp ${CONFIG_FILE_ORG} ${CONFIG_FILE}
 
  if [ -n "${TENANT_ID:-}" ] && [ -n "${TIMELINE_ID:-}" ]; then
    tenant_id=${TENANT_ID}
@@ -73,17 +73,27 @@ else
   ulid_extension=ulid
 fi
 echo "Adding pgx_ulid"
-shared_libraries=$(jq -r '.cluster.settings[] | select(.name=="shared_preload_libraries").value' ${SPEC_FILE})
-sed -i "s/${shared_libraries}/${shared_libraries},${ulid_extension}/" ${SPEC_FILE}
+shared_libraries=$(jq -r '.spec.cluster.settings[] | select(.name=="shared_preload_libraries").value' ${CONFIG_FILE})
+sed -i "s/${shared_libraries}/${shared_libraries},${ulid_extension}/" ${CONFIG_FILE}
 echo "Overwrite tenant id and timeline id in spec file"
-sed -i "s/TENANT_ID/${tenant_id}/" ${SPEC_FILE}
-sed -i "s/TIMELINE_ID/${timeline_id}/" ${SPEC_FILE}
+sed -i "s/TENANT_ID/${tenant_id}/" ${CONFIG_FILE}
+sed -i "s/TIMELINE_ID/${timeline_id}/" ${CONFIG_FILE}
 
-cat ${SPEC_FILE}
+cat ${CONFIG_FILE}
+
+# TODO(tristan957): Remove these workarounds for backwards compatibility after
+# the next compute release. That includes these next few lines and the
+# --spec-path in the compute_ctl invocation.
+if compute_ctl --help | grep --quiet -- '--config'; then
+  SPEC_PATH="$CONFIG_FILE"
+else
+  jq '.spec' < "$CONFIG_FILE" > /tmp/spec.json
+  SPEC_PATH=/tmp/spec.json
+fi
 
 echo "Start compute node"
 /usr/local/bin/compute_ctl --pgdata /var/db/postgres/compute \
      -C "postgresql://cloud_admin@localhost:55433/postgres"  \
      -b /usr/local/bin/postgres                              \
      --compute-id "compute-$RANDOM"                          \
-     -S ${SPEC_FILE}
+     --spec-path "$SPEC_PATH"
diff --git a/docker-compose/compute_wrapper/var/db/postgres/configs/config.json b/docker-compose/compute_wrapper/var/db/postgres/configs/config.json
new file mode 100644
index 000000000000..3ddf96512a3a
--- /dev/null
+++ b/docker-compose/compute_wrapper/var/db/postgres/configs/config.json
@@ -0,0 +1,148 @@
+{
+    "spec": {
+        "format_version": 1.0,
+
+        "timestamp": "2022-10-12T18:00:00.000Z",
+        "operation_uuid": "0f657b36-4b0f-4a2d-9c2e-1dcd615e7d8c",
+
+        "cluster": {
+            "cluster_id": "docker_compose",
+            "name": "docker_compose_test",
+            "state": "restarted",
+            "roles": [
+                {
+                    "name": "cloud_admin",
+                    "encrypted_password": "b093c0d3b281ba6da1eacc608620abd8",
+                    "options": null
+                }
+            ],
+            "databases": [
+            ],
+            "settings": [
+                {
+                    "name": "fsync",
+                    "value": "off",
+                    "vartype": "bool"
+                },
+                {
+                    "name": "wal_level",
+                    "value": "logical",
+                    "vartype": "enum"
+                },
+                {
+                    "name": "wal_log_hints",
+                    "value": "on",
+                    "vartype": "bool"
+                },
+                {
+                    "name": "log_connections",
+                    "value": "on",
+                    "vartype": "bool"
+                },
+                {
+                    "name": "port",
+                    "value": "55433",
+                    "vartype": "integer"
+                },
+                {
+                    "name": "shared_buffers",
+                    "value": "1MB",
+                    "vartype": "string"
+                },
+                {
+                    "name": "max_connections",
+                    "value": "100",
+                    "vartype": "integer"
+                },
+                {
+                    "name": "listen_addresses",
+                    "value": "0.0.0.0",
+                    "vartype": "string"
+                },
+                {
+                    "name": "max_wal_senders",
+                    "value": "10",
+                    "vartype": "integer"
+                },
+                {
+                    "name": "max_replication_slots",
+                    "value": "10",
+                    "vartype": "integer"
+                },
+                {
+                    "name": "wal_sender_timeout",
+                    "value": "5s",
+                    "vartype": "string"
+                },
+                {
+                    "name": "wal_keep_size",
+                    "value": "0",
+                    "vartype": "integer"
+                },
+                {
+                    "name": "password_encryption",
+                    "value": "md5",
+                    "vartype": "enum"
+                },
+                {
+                    "name": "restart_after_crash",
+                    "value": "off",
+                    "vartype": "bool"
+                },
+                {
+                    "name": "synchronous_standby_names",
+                    "value": "walproposer",
+                    "vartype": "string"
+                },
+                {
+                    "name": "shared_preload_libraries",
+                    "value": "neon,pg_cron,timescaledb,pg_stat_statements",
+                    "vartype": "string"
+                },
+                {
+                    "name": "neon.safekeepers",
+                    "value": "safekeeper1:5454,safekeeper2:5454,safekeeper3:5454",
+                    "vartype": "string"
+                },
+                {
+                    "name": "neon.timeline_id",
+                    "value": "TIMELINE_ID",
+                    "vartype": "string"
+                },
+                {
+                    "name": "neon.tenant_id",
+                    "value": "TENANT_ID",
+                    "vartype": "string"
+                },
+                {
+                    "name": "neon.pageserver_connstring",
+                    "value": "host=pageserver port=6400",
+                    "vartype": "string"
+                },
+                {
+                    "name": "max_replication_write_lag",
+                    "value": "500MB",
+                    "vartype": "string"
+                },
+                {
+                    "name": "max_replication_flush_lag",
+                    "value": "10GB",
+                    "vartype": "string"
+                },
+                {
+                    "name": "cron.database",
+                    "value": "postgres",
+                    "vartype": "string"
+                }
+            ]
+        },
+
+        "delta_operations": [
+        ]
+    },
+    "compute_ctl_config": {
+        "jwks": {
+            "keys": []
+        }
+    }
+}
diff --git a/docker-compose/compute_wrapper/var/db/postgres/specs/spec.json b/docker-compose/compute_wrapper/var/db/postgres/specs/spec.json
deleted file mode 100644
index 0308cab4515a..000000000000
--- a/docker-compose/compute_wrapper/var/db/postgres/specs/spec.json
+++ /dev/null
@@ -1,141 +0,0 @@
-{
-    "format_version": 1.0,
-
-    "timestamp": "2022-10-12T18:00:00.000Z",
-    "operation_uuid": "0f657b36-4b0f-4a2d-9c2e-1dcd615e7d8c",
-
-    "cluster": {
-        "cluster_id": "docker_compose",
-        "name": "docker_compose_test",
-        "state": "restarted",
-        "roles": [
-            {
-                "name": "cloud_admin",
-                "encrypted_password": "b093c0d3b281ba6da1eacc608620abd8",
-                "options": null
-            }
-        ],
-        "databases": [
-        ],
-        "settings": [
-            {
-                "name": "fsync",
-                "value": "off",
-                "vartype": "bool"
-            },
-            {
-                "name": "wal_level",
-                "value": "logical",
-                "vartype": "enum"
-            },
-            {
-                "name": "wal_log_hints",
-                "value": "on",
-                "vartype": "bool"
-            },
-            {
-                "name": "log_connections",
-                "value": "on",
-                "vartype": "bool"
-            },
-            {
-                "name": "port",
-                "value": "55433",
-                "vartype": "integer"
-            },
-            {
-                "name": "shared_buffers",
-                "value": "1MB",
-                "vartype": "string"
-            },
-            {
-                "name": "max_connections",
-                "value": "100",
-                "vartype": "integer"
-            },
-            {
-                "name": "listen_addresses",
-                "value": "0.0.0.0",
-                "vartype": "string"
-            },
-            {
-                "name": "max_wal_senders",
-                "value": "10",
-                "vartype": "integer"
-            },
-            {
-                "name": "max_replication_slots",
-                "value": "10",
-                "vartype": "integer"
-            },
-            {
-                "name": "wal_sender_timeout",
-                "value": "5s",
-                "vartype": "string"
-            },
-            {
-                "name": "wal_keep_size",
-                "value": "0",
-                "vartype": "integer"
-            },
-            {
-                "name": "password_encryption",
-                "value": "md5",
-                "vartype": "enum"
-            },
-            {
-                "name": "restart_after_crash",
-                "value": "off",
-                "vartype": "bool"
-            },
-            {
-                "name": "synchronous_standby_names",
-                "value": "walproposer",
-                "vartype": "string"
-            },
-            {
-                "name": "shared_preload_libraries",
-                "value": "neon,pg_cron,timescaledb,pg_stat_statements",
-                "vartype": "string"
-            },
-            {
-                "name": "neon.safekeepers",
-                "value": "safekeeper1:5454,safekeeper2:5454,safekeeper3:5454",
-                "vartype": "string"
-            },
-            {
-                "name": "neon.timeline_id",
-                "value": "TIMELINE_ID",
-                "vartype": "string"
-            },
-            {
-                "name": "neon.tenant_id",
-                "value": "TENANT_ID",
-                "vartype": "string"
-            },
-            {
-                "name": "neon.pageserver_connstring",
-                "value": "host=pageserver port=6400",
-                "vartype": "string"
-            },
-            {
-                "name": "max_replication_write_lag",
-                "value": "500MB",
-                "vartype": "string"
-            },
-            {
-                "name": "max_replication_flush_lag",
-                "value": "10GB",
-                "vartype": "string"
-            },
-            {
-                "name": "cron.database",
-                "value": "postgres",
-                "vartype": "string"
-            }
-        ]
-    },
-
-    "delta_operations": [
-    ]
-}
diff --git a/docker-compose/docker-compose.yml b/docker-compose/docker-compose.yml
index 493a0a552334..fd3ad1fffcad 100644
--- a/docker-compose/docker-compose.yml
+++ b/docker-compose/docker-compose.yml
@@ -159,7 +159,7 @@ services:
       #- RUST_BACKTRACE=1
     # Mount the test files directly, for faster editing cycle.
     volumes:
-      - ./compute_wrapper/var/db/postgres/specs/:/var/db/postgres/specs/
+      - ./compute_wrapper/var/db/postgres/configs/:/var/db/postgres/configs/
       - ./compute_wrapper/shell/:/shell/
     ports:
       - 55433:55433 # pg protocol handler
diff --git a/docs/storage_controller.md b/docs/storage_controller.md
index ac4aca4219ac..d761210033ba 100644
--- a/docs/storage_controller.md
+++ b/docs/storage_controller.md
@@ -151,7 +151,7 @@ Example body:
 ```
 {
   "tenant_id": "1f359dd625e519a1a4e8d7509690f6fc",
-  "stripe_size": 32768,
+  "stripe_size": 2048,
   "shards": [
       {"node_id": 344, "shard_number": 0},
       {"node_id": 722, "shard_number": 1},
diff --git a/libs/compute_api/src/requests.rs b/libs/compute_api/src/requests.rs
index d88451c5495e..98f2fc297cfe 100644
--- a/libs/compute_api/src/requests.rs
+++ b/libs/compute_api/src/requests.rs
@@ -5,6 +5,14 @@ use crate::privilege::Privilege;
 use crate::responses::ComputeCtlConfig;
 use crate::spec::{ComputeSpec, ExtVersion, PgIdent};
 
+/// When making requests to the `compute_ctl` external HTTP server, the client
+/// must specify a set of claims in `Authorization` header JWTs such that
+/// `compute_ctl` can authorize the request.
+#[derive(Clone, Debug, Deserialize, Serialize)]
+pub struct ComputeClaims {
+    pub compute_id: String,
+}
+
 /// Request of the /configure API
 ///
 /// We now pass only `spec` in the configuration request, but later we can
@@ -30,9 +38,3 @@ pub struct SetRoleGrantsRequest {
     pub privileges: Vec<Privilege>,
     pub role: PgIdent,
 }
-
-/// Request of the /configure_telemetry API
-#[derive(Debug, Deserialize, Serialize)]
-pub struct ConfigureTelemetryRequest {
-    pub logs_export_host: Option<String>,
-}
diff --git a/libs/compute_api/src/responses.rs b/libs/compute_api/src/responses.rs
index c8f6019c5cf1..353949736b4c 100644
--- a/libs/compute_api/src/responses.rs
+++ b/libs/compute_api/src/responses.rs
@@ -14,6 +14,32 @@ pub struct GenericAPIError {
     pub error: String,
 }
 
+/// All configuration parameters necessary for a compute. When
+/// [`ComputeConfig::spec`] is provided, it means that the compute is attached
+/// to a tenant. [`ComputeConfig::compute_ctl_config`] will always be provided
+/// and contains parameters necessary for operating `compute_ctl` independently
+/// of whether a tenant is attached to the compute or not.
+///
+/// This also happens to be the body of `compute_ctl`'s /configure request.
+#[derive(Debug, Deserialize, Serialize)]
+pub struct ComputeConfig {
+    /// The compute spec
+    pub spec: Option<ComputeSpec>,
+
+    /// The compute_ctl configuration
+    #[allow(dead_code)]
+    pub compute_ctl_config: ComputeCtlConfig,
+}
+
+impl From<ControlPlaneConfigResponse> for ComputeConfig {
+    fn from(value: ControlPlaneConfigResponse) -> Self {
+        Self {
+            spec: value.spec,
+            compute_ctl_config: value.compute_ctl_config,
+        }
+    }
+}
+
 #[derive(Debug, Clone, Serialize)]
 pub struct ExtensionInstallResponse {
     pub extension: PgIdent,
@@ -161,7 +187,7 @@ pub struct TlsConfig {
 
 /// Response of the `/computes/{compute_id}/spec` control-plane API.
 #[derive(Deserialize, Debug)]
-pub struct ControlPlaneSpecResponse {
+pub struct ControlPlaneConfigResponse {
     pub spec: Option<ComputeSpec>,
     pub status: ControlPlaneComputeStatus,
     pub compute_ctl_config: ComputeCtlConfig,
diff --git a/libs/compute_api/src/spec.rs b/libs/compute_api/src/spec.rs
index cff1f4c89a6d..5e67ccce0018 100644
--- a/libs/compute_api/src/spec.rs
+++ b/libs/compute_api/src/spec.rs
@@ -1,8 +1,8 @@
-//! `ComputeSpec` represents the contents of the spec.json file.
-//!
-//! The spec.json file is used to pass information to 'compute_ctl'. It contains
-//! all the information needed to start up the right version of PostgreSQL,
-//! and connect it to the storage nodes.
+//! The ComputeSpec contains all the information needed to start up
+//! the right version of PostgreSQL, and connect it to the storage nodes.
+//! It can be passed as part of the `config.json`, or the control plane can
+//! provide it by calling the compute_ctl's `/compute_ctl` endpoint, or
+//! compute_ctl can fetch it by calling the control plane's API.
 use std::collections::HashMap;
 
 use indexmap::IndexMap;
@@ -104,6 +104,12 @@ pub struct ComputeSpec {
     pub timeline_id: Option<TimelineId>,
     pub pageserver_connstring: Option<String>,
 
+    // More neon ids that we expose to the compute_ctl
+    // and to postgres as neon extension GUCs.
+    pub project_id: Option<String>,
+    pub branch_id: Option<String>,
+    pub endpoint_id: Option<String>,
+
     /// Safekeeper membership config generation. It is put in
     /// neon.safekeepers GUC and serves two purposes:
     /// 1) Non zero value forces walproposer to use membership configurations.
@@ -159,15 +165,13 @@ pub struct ComputeSpec {
     #[serde(default)] // Default false
     pub drop_subscriptions_before_start: bool,
 
-    /// Log level for audit logging:
-    ///
-    /// Disabled - no audit logging. This is the default.
-    /// log - log masked statements to the postgres log using pgaudit extension
-    /// hipaa - log unmasked statements to the file using pgaudit and pgauditlogtofile extension
-    ///
-    /// Extensions should be present in shared_preload_libraries
+    /// Log level for compute audit logging
     #[serde(default)]
     pub audit_log_level: ComputeAudit,
+
+    /// Hostname and the port of the otel collector. Leave empty to disable Postgres logs forwarding.
+    /// Example: config-shy-breeze-123-collector-monitoring.neon-telemetry.svc.cluster.local:10514
+    pub logs_export_host: Option<String>,
 }
 
 /// Feature flag to signal `compute_ctl` to enable certain experimental functionality.
@@ -179,9 +183,6 @@ pub enum ComputeFeature {
     /// track short-lived connections as user activity.
     ActivityMonitorExperimental,
 
-    /// Allow to configure rsyslog for Postgres logs export
-    PostgresLogsExport,
-
     /// This is a special feature flag that is used to represent unknown feature flags.
     /// Basically all unknown to enum flags are represented as this one. See unit test
     /// `parse_unknown_features()` for more details.
@@ -288,14 +289,25 @@ impl ComputeMode {
 }
 
 /// Log level for audit logging
-/// Disabled, log, hipaa
-/// Default is Disabled
 #[derive(Clone, Debug, Default, Eq, PartialEq, Deserialize, Serialize)]
 pub enum ComputeAudit {
     #[default]
     Disabled,
+    // Deprecated, use Base instead
     Log,
+    // (pgaudit.log = 'ddl', pgaudit.log_parameter='off')
+    // logged to the standard postgresql log stream
+    Base,
+    // Deprecated, use Full or Extended instead
     Hipaa,
+    // (pgaudit.log = 'all, -misc', pgaudit.log_parameter='off')
+    // logged to separate files collected by rsyslog
+    // into dedicated log storage with strict access
+    Extended,
+    // (pgaudit.log='all', pgaudit.log_parameter='on'),
+    // logged to separate files collected by rsyslog
+    // into dedicated log storage with strict access.
+    Full,
 }
 
 #[derive(Clone, Debug, Default, Deserialize, Serialize, PartialEq, Eq)]
diff --git a/libs/http-utils/Cargo.toml b/libs/http-utils/Cargo.toml
index 6d24ee352a13..5f6578f76e83 100644
--- a/libs/http-utils/Cargo.toml
+++ b/libs/http-utils/Cargo.toml
@@ -30,6 +30,7 @@ tokio.workspace = true
 tracing.workspace = true
 url.workspace = true
 uuid.workspace = true
+x509-cert.workspace = true
 
 # to use tokio channels as streams, this is faster to compile than async_stream
 # why is it only here? no other crate should use it, streams are rarely needed.
diff --git a/libs/http-utils/src/server.rs b/libs/http-utils/src/server.rs
index 07fd56ac0123..f93f71c9622d 100644
--- a/libs/http-utils/src/server.rs
+++ b/libs/http-utils/src/server.rs
@@ -4,6 +4,8 @@ use futures::StreamExt;
 use futures::stream::FuturesUnordered;
 use hyper0::Body;
 use hyper0::server::conn::Http;
+use metrics::{IntCounterVec, register_int_counter_vec};
+use once_cell::sync::Lazy;
 use routerify::{RequestService, RequestServiceBuilder};
 use tokio::io::{AsyncRead, AsyncWrite};
 use tokio_rustls::TlsAcceptor;
@@ -26,6 +28,24 @@ pub struct Server {
     tls_acceptor: Option<TlsAcceptor>,
 }
 
+static CONNECTION_STARTED_COUNT: Lazy<IntCounterVec> = Lazy::new(|| {
+    register_int_counter_vec!(
+        "http_server_connection_started_total",
+        "Number of established http/https connections",
+        &["scheme"]
+    )
+    .expect("failed to define a metric")
+});
+
+static CONNECTION_ERROR_COUNT: Lazy<IntCounterVec> = Lazy::new(|| {
+    register_int_counter_vec!(
+        "http_server_connection_errors_total",
+        "Number of occured connection errors by type",
+        &["type"]
+    )
+    .expect("failed to define a metric")
+});
+
 impl Server {
     pub fn new(
         request_service: Arc<RequestServiceBuilder<Body, ApiError>>,
@@ -60,6 +80,15 @@ impl Server {
             false
         }
 
+        let tcp_error_cnt = CONNECTION_ERROR_COUNT.with_label_values(&["tcp"]);
+        let tls_error_cnt = CONNECTION_ERROR_COUNT.with_label_values(&["tls"]);
+        let http_error_cnt = CONNECTION_ERROR_COUNT.with_label_values(&["http"]);
+        let https_error_cnt = CONNECTION_ERROR_COUNT.with_label_values(&["https"]);
+        let panic_error_cnt = CONNECTION_ERROR_COUNT.with_label_values(&["panic"]);
+
+        let http_connection_cnt = CONNECTION_STARTED_COUNT.with_label_values(&["http"]);
+        let https_connection_cnt = CONNECTION_STARTED_COUNT.with_label_values(&["https"]);
+
         let mut connections = FuturesUnordered::new();
         loop {
             tokio::select! {
@@ -67,6 +96,7 @@ impl Server {
                     let (tcp_stream, remote_addr) = match stream {
                         Ok(stream) => stream,
                         Err(err) => {
+                            tcp_error_cnt.inc();
                             if !suppress_io_error(&err) {
                                 info!("Failed to accept TCP connection: {err:#}");
                             }
@@ -78,11 +108,18 @@ impl Server {
                     let tls_acceptor = self.tls_acceptor.clone();
                     let cancel = cancel.clone();
 
+                    let tls_error_cnt = tls_error_cnt.clone();
+                    let http_error_cnt = http_error_cnt.clone();
+                    let https_error_cnt = https_error_cnt.clone();
+                    let http_connection_cnt = http_connection_cnt.clone();
+                    let https_connection_cnt = https_connection_cnt.clone();
+
                     connections.push(tokio::spawn(
                         async move {
                             match tls_acceptor {
                                 Some(tls_acceptor) => {
                                     // Handle HTTPS connection.
+                                    https_connection_cnt.inc();
                                     let tls_stream = tokio::select! {
                                         tls_stream = tls_acceptor.accept(tcp_stream) => tls_stream,
                                         _ = cancel.cancelled() => return,
@@ -90,6 +127,7 @@ impl Server {
                                     let tls_stream = match tls_stream {
                                         Ok(tls_stream) => tls_stream,
                                         Err(err) => {
+                                            tls_error_cnt.inc();
                                             if !suppress_io_error(&err) {
                                                 info!(%remote_addr, "Failed to accept TLS connection: {err:#}");
                                             }
@@ -97,6 +135,7 @@ impl Server {
                                         }
                                     };
                                     if let Err(err) = Self::serve_connection(tls_stream, service, cancel).await {
+                                        https_error_cnt.inc();
                                         if !suppress_hyper_error(&err) {
                                             info!(%remote_addr, "Failed to serve HTTPS connection: {err:#}");
                                         }
@@ -104,7 +143,9 @@ impl Server {
                                 }
                                 None => {
                                     // Handle HTTP connection.
+                                    http_connection_cnt.inc();
                                     if let Err(err) = Self::serve_connection(tcp_stream, service, cancel).await {
+                                        http_error_cnt.inc();
                                         if !suppress_hyper_error(&err) {
                                             info!(%remote_addr, "Failed to serve HTTP connection: {err:#}");
                                         }
@@ -115,6 +156,7 @@ impl Server {
                  }
                 Some(conn) = connections.next() => {
                     if let Err(err) = conn {
+                        panic_error_cnt.inc();
                         error!("Connection panicked: {err:#}");
                     }
                 }
@@ -122,6 +164,7 @@ impl Server {
                     // Wait for graceful shutdown of all connections.
                     while let Some(conn) = connections.next().await {
                         if let Err(err) = conn {
+                            panic_error_cnt.inc();
                             error!("Connection panicked: {err:#}");
                         }
                     }
diff --git a/libs/http-utils/src/tls_certs.rs b/libs/http-utils/src/tls_certs.rs
index 0c18d84d987d..2799db78a600 100644
--- a/libs/http-utils/src/tls_certs.rs
+++ b/libs/http-utils/src/tls_certs.rs
@@ -3,11 +3,14 @@ use std::{sync::Arc, time::Duration};
 use anyhow::Context;
 use arc_swap::ArcSwap;
 use camino::Utf8Path;
+use metrics::{IntCounterVec, UIntGaugeVec, register_int_counter_vec, register_uint_gauge_vec};
+use once_cell::sync::Lazy;
 use rustls::{
-    pki_types::{CertificateDer, PrivateKeyDer},
+    pki_types::{CertificateDer, PrivateKeyDer, UnixTime},
     server::{ClientHello, ResolvesServerCert},
     sign::CertifiedKey,
 };
+use x509_cert::der::Reader;
 
 pub async fn load_cert_chain(filename: &Utf8Path) -> anyhow::Result<Vec<CertificateDer<'static>>> {
     let cert_data = tokio::fs::read(filename)
@@ -53,6 +56,76 @@ pub async fn load_certified_key(
     Ok(certified_key)
 }
 
+/// rustls's CertifiedKey with extra parsed fields used for metrics.
+struct ParsedCertifiedKey {
+    certified_key: CertifiedKey,
+    expiration_time: UnixTime,
+}
+
+/// Parse expiration time from an X509 certificate.
+fn parse_expiration_time(cert: &CertificateDer<'_>) -> anyhow::Result<UnixTime> {
+    let parsed_cert = x509_cert::der::SliceReader::new(cert)
+        .context("Failed to parse cerficiate")?
+        .decode::<x509_cert::Certificate>()
+        .context("Failed to parse cerficiate")?;
+
+    Ok(UnixTime::since_unix_epoch(
+        parsed_cert
+            .tbs_certificate
+            .validity
+            .not_after
+            .to_unix_duration(),
+    ))
+}
+
+async fn load_and_parse_certified_key(
+    key_filename: &Utf8Path,
+    cert_filename: &Utf8Path,
+) -> anyhow::Result<ParsedCertifiedKey> {
+    let certified_key = load_certified_key(key_filename, cert_filename).await?;
+    let expiration_time = parse_expiration_time(certified_key.end_entity_cert()?)?;
+    Ok(ParsedCertifiedKey {
+        certified_key,
+        expiration_time,
+    })
+}
+
+static CERT_EXPIRATION_TIME: Lazy<UIntGaugeVec> = Lazy::new(|| {
+    register_uint_gauge_vec!(
+        "tls_certs_expiration_time_seconds",
+        "Expiration time of the loaded certificate since unix epoch in seconds",
+        &["resolver_name"]
+    )
+    .expect("failed to define a metric")
+});
+
+static CERT_RELOAD_STARTED_COUNTER: Lazy<IntCounterVec> = Lazy::new(|| {
+    register_int_counter_vec!(
+        "tls_certs_reload_started_total",
+        "Number of certificate reload loop iterations started",
+        &["resolver_name"]
+    )
+    .expect("failed to define a metric")
+});
+
+static CERT_RELOAD_UPDATED_COUNTER: Lazy<IntCounterVec> = Lazy::new(|| {
+    register_int_counter_vec!(
+        "tls_certs_reload_updated_total",
+        "Number of times the certificate was updated to the new one",
+        &["resolver_name"]
+    )
+    .expect("failed to define a metric")
+});
+
+static CERT_RELOAD_FAILED_COUNTER: Lazy<IntCounterVec> = Lazy::new(|| {
+    register_int_counter_vec!(
+        "tls_certs_reload_failed_total",
+        "Number of times the certificate reload failed",
+        &["resolver_name"]
+    )
+    .expect("failed to define a metric")
+});
+
 /// Implementation of [`rustls::server::ResolvesServerCert`] which reloads certificates from
 /// the disk periodically.
 #[derive(Debug)]
@@ -63,16 +136,28 @@ pub struct ReloadingCertificateResolver {
 impl ReloadingCertificateResolver {
     /// Creates a new Resolver by loading certificate and private key from FS and
     /// creating tokio::task to reload them with provided reload_period.
+    /// resolver_name is used as metric's label.
     pub async fn new(
+        resolver_name: &str,
         key_filename: &Utf8Path,
         cert_filename: &Utf8Path,
         reload_period: Duration,
     ) -> anyhow::Result<Arc<Self>> {
+        // Create metrics for current resolver.
+        let cert_expiration_time = CERT_EXPIRATION_TIME.with_label_values(&[resolver_name]);
+        let cert_reload_started_counter =
+            CERT_RELOAD_STARTED_COUNTER.with_label_values(&[resolver_name]);
+        let cert_reload_updated_counter =
+            CERT_RELOAD_UPDATED_COUNTER.with_label_values(&[resolver_name]);
+        let cert_reload_failed_counter =
+            CERT_RELOAD_FAILED_COUNTER.with_label_values(&[resolver_name]);
+
+        let parsed_key = load_and_parse_certified_key(key_filename, cert_filename).await?;
+
         let this = Arc::new(Self {
-            certified_key: ArcSwap::from_pointee(
-                load_certified_key(key_filename, cert_filename).await?,
-            ),
+            certified_key: ArcSwap::from_pointee(parsed_key.certified_key),
         });
+        cert_expiration_time.set(parsed_key.expiration_time.as_secs());
 
         tokio::spawn({
             let weak_this = Arc::downgrade(&this);
@@ -88,17 +173,22 @@ impl ReloadingCertificateResolver {
                         Some(this) => this,
                         None => break, // Resolver has been destroyed, exit.
                     };
-                    match load_certified_key(&key_filename, &cert_filename).await {
-                        Ok(new_certified_key) => {
-                            if new_certified_key.cert == this.certified_key.load().cert {
+                    cert_reload_started_counter.inc();
+
+                    match load_and_parse_certified_key(&key_filename, &cert_filename).await {
+                        Ok(parsed_key) => {
+                            if parsed_key.certified_key.cert == this.certified_key.load().cert {
                                 tracing::debug!("Certificate has not changed since last reloading");
                             } else {
                                 tracing::info!("Certificate has been reloaded");
-                                this.certified_key.store(Arc::new(new_certified_key));
+                                this.certified_key.store(Arc::new(parsed_key.certified_key));
+                                cert_expiration_time.set(parsed_key.expiration_time.as_secs());
+                                cert_reload_updated_counter.inc();
                             }
                             last_reload_failed = false;
                         }
                         Err(err) => {
+                            cert_reload_failed_counter.inc();
                             // Note: Reloading certs may fail if it conflicts with the script updating
                             // the files at the same time. Warn only if the error is persistent.
                             if last_reload_failed {
diff --git a/libs/pageserver_api/src/config.rs b/libs/pageserver_api/src/config.rs
index 8f56d60a4af9..53b68afb0f51 100644
--- a/libs/pageserver_api/src/config.rs
+++ b/libs/pageserver_api/src/config.rs
@@ -180,6 +180,7 @@ pub struct ConfigToml {
     #[serde(skip_serializing_if = "Option::is_none")]
     pub generate_unarchival_heatmap: Option<bool>,
     pub tracing: Option<Tracing>,
+    pub enable_tls_page_service_api: bool,
 }
 
 #[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
@@ -206,6 +207,10 @@ pub struct PageServicePipeliningConfigPipelined {
     /// Causes runtime errors if larger than max get_vectored batch size.
     pub max_batch_size: NonZeroUsize,
     pub execution: PageServiceProtocolPipelinedExecutionStrategy,
+    // The default below is such that new versions of the software can start
+    // with the old configuration.
+    #[serde(default)]
+    pub batching: PageServiceProtocolPipelinedBatchingStrategy,
 }
 
 #[derive(Debug, Clone, Copy, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
@@ -215,6 +220,19 @@ pub enum PageServiceProtocolPipelinedExecutionStrategy {
     Tasks,
 }
 
+#[derive(Default, Debug, Clone, Copy, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
+#[serde(rename_all = "kebab-case")]
+pub enum PageServiceProtocolPipelinedBatchingStrategy {
+    /// All get page requests in a batch will be at the same LSN
+    #[default]
+    UniformLsn,
+    /// Get page requests in a batch may be at different LSN
+    ///
+    /// One key cannot be present more than once at different LSNs in
+    /// the same batch.
+    ScatteredLsn,
+}
+
 #[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
 #[serde(tag = "mode", rename_all = "kebab-case")]
 pub enum GetVectoredConcurrentIo {
@@ -451,6 +469,8 @@ pub struct TenantConfigToml {
     // gc-compaction related configs
     /// Enable automatic gc-compaction trigger on this tenant.
     pub gc_compaction_enabled: bool,
+    /// Enable verification of gc-compaction results.
+    pub gc_compaction_verification: bool,
     /// The initial threshold for gc-compaction in KB. Once the total size of layers below the gc-horizon is above this threshold,
     /// gc-compaction will be triggered.
     pub gc_compaction_initial_threshold_kb: u64,
@@ -612,9 +632,12 @@ impl Default for ConfigToml {
             page_service_pipelining: if !cfg!(test) {
                 PageServicePipeliningConfig::Serial
             } else {
+                // Do not turn this into the default until scattered reads have been
+                // validated and rolled-out fully.
                 PageServicePipeliningConfig::Pipelined(PageServicePipeliningConfigPipelined {
                     max_batch_size: NonZeroUsize::new(32).unwrap(),
                     execution: PageServiceProtocolPipelinedExecutionStrategy::ConcurrentFutures,
+                    batching: PageServiceProtocolPipelinedBatchingStrategy::ScatteredLsn,
                 })
             },
             get_vectored_concurrent_io: if !cfg!(test) {
@@ -631,6 +654,7 @@ impl Default for ConfigToml {
             load_previous_heatmap: None,
             generate_unarchival_heatmap: None,
             tracing: None,
+            enable_tls_page_service_api: false,
         }
     }
 }
@@ -690,6 +714,7 @@ pub mod tenant_conf_defaults {
     // image layers should be created.
     pub const DEFAULT_IMAGE_LAYER_CREATION_CHECK_THRESHOLD: u8 = 2;
     pub const DEFAULT_GC_COMPACTION_ENABLED: bool = false;
+    pub const DEFAULT_GC_COMPACTION_VERIFICATION: bool = true;
     pub const DEFAULT_GC_COMPACTION_INITIAL_THRESHOLD_KB: u64 = 5 * 1024 * 1024; // 5GB
     pub const DEFAULT_GC_COMPACTION_RATIO_PERCENT: u64 = 100;
 }
@@ -744,6 +769,7 @@ impl Default for TenantConfigToml {
             wal_receiver_protocol_override: None,
             rel_size_v2_enabled: false,
             gc_compaction_enabled: DEFAULT_GC_COMPACTION_ENABLED,
+            gc_compaction_verification: DEFAULT_GC_COMPACTION_VERIFICATION,
             gc_compaction_initial_threshold_kb: DEFAULT_GC_COMPACTION_INITIAL_THRESHOLD_KB,
             gc_compaction_ratio_percent: DEFAULT_GC_COMPACTION_RATIO_PERCENT,
             sampling_ratio: None,
diff --git a/libs/pageserver_api/src/controller_api.rs b/libs/pageserver_api/src/controller_api.rs
index 3cb62f9d180b..91f9c03ba4f6 100644
--- a/libs/pageserver_api/src/controller_api.rs
+++ b/libs/pageserver_api/src/controller_api.rs
@@ -7,7 +7,8 @@ use std::time::{Duration, Instant};
 /// API (`/control/v1` prefix).  Implemented by the server
 /// in [`storage_controller::http`]
 use serde::{Deserialize, Serialize};
-use utils::id::{NodeId, TenantId};
+use utils::id::{NodeId, TenantId, TimelineId};
+use utils::lsn::Lsn;
 
 use crate::models::{PageserverUtilization, ShardParameters, TenantConfig};
 use crate::shard::{ShardStripeSize, TenantShardId};
@@ -499,6 +500,15 @@ pub struct SafekeeperSchedulingPolicyRequest {
     pub scheduling_policy: SkSchedulingPolicy,
 }
 
+/// Import request for safekeeper timelines.
+#[derive(Serialize, Deserialize, Clone)]
+pub struct TimelineImportRequest {
+    pub tenant_id: TenantId,
+    pub timeline_id: TimelineId,
+    pub start_lsn: Lsn,
+    pub sk_set: Vec<NodeId>,
+}
+
 #[cfg(test)]
 mod test {
     use serde_json;
diff --git a/libs/pageserver_api/src/key.rs b/libs/pageserver_api/src/key.rs
index 8836e7ec8729..0c4d7fd4cb70 100644
--- a/libs/pageserver_api/src/key.rs
+++ b/libs/pageserver_api/src/key.rs
@@ -927,7 +927,7 @@ impl Key {
 
     /// Guaranteed to return `Ok()` if [`Self::is_rel_block_key`] returns `true` for `key`.
     #[inline(always)]
-    pub fn to_rel_block(self) -> anyhow::Result<(RelTag, BlockNumber)> {
+    pub fn to_rel_block(self) -> Result<(RelTag, BlockNumber), ToRelBlockError> {
         Ok(match self.field1 {
             0x00 => (
                 RelTag {
@@ -938,7 +938,7 @@ impl Key {
                 },
                 self.field6,
             ),
-            _ => anyhow::bail!("unexpected value kind 0x{:02x}", self.field1),
+            _ => return Err(ToRelBlockError(self.field1)),
         })
     }
 }
@@ -951,6 +951,17 @@ impl std::str::FromStr for Key {
     }
 }
 
+#[derive(Debug)]
+pub struct ToRelBlockError(u8);
+
+impl fmt::Display for ToRelBlockError {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        write!(f, "unexpected value kind 0x{:02x}", self.0)
+    }
+}
+
+impl std::error::Error for ToRelBlockError {}
+
 #[cfg(test)]
 mod tests {
     use std::str::FromStr;
diff --git a/libs/pageserver_api/src/keyspace.rs b/libs/pageserver_api/src/keyspace.rs
index e505f23e49ed..79e3ef553b97 100644
--- a/libs/pageserver_api/src/keyspace.rs
+++ b/libs/pageserver_api/src/keyspace.rs
@@ -613,8 +613,7 @@ mod tests {
     use rand::{RngCore, SeedableRng};
 
     use super::*;
-    use crate::models::ShardParameters;
-    use crate::shard::{ShardCount, ShardNumber};
+    use crate::shard::{DEFAULT_STRIPE_SIZE, ShardCount, ShardNumber, ShardStripeSize};
 
     // Helper function to create a key range.
     //
@@ -964,12 +963,8 @@ mod tests {
     }
     #[test]
     fn sharded_range_relation_gap() {
-        let shard_identity = ShardIdentity::new(
-            ShardNumber(0),
-            ShardCount::new(4),
-            ShardParameters::DEFAULT_STRIPE_SIZE,
-        )
-        .unwrap();
+        let shard_identity =
+            ShardIdentity::new(ShardNumber(0), ShardCount::new(4), DEFAULT_STRIPE_SIZE).unwrap();
 
         let range = ShardedRange::new(
             Range {
@@ -985,12 +980,8 @@ mod tests {
 
     #[test]
     fn shard_identity_keyspaces_single_key() {
-        let shard_identity = ShardIdentity::new(
-            ShardNumber(1),
-            ShardCount::new(4),
-            ShardParameters::DEFAULT_STRIPE_SIZE,
-        )
-        .unwrap();
+        let shard_identity =
+            ShardIdentity::new(ShardNumber(1), ShardCount::new(4), DEFAULT_STRIPE_SIZE).unwrap();
 
         let range = ShardedRange::new(
             Range {
@@ -1034,12 +1025,8 @@ mod tests {
 
     #[test]
     fn shard_identity_keyspaces_forkno_gap() {
-        let shard_identity = ShardIdentity::new(
-            ShardNumber(1),
-            ShardCount::new(4),
-            ShardParameters::DEFAULT_STRIPE_SIZE,
-        )
-        .unwrap();
+        let shard_identity =
+            ShardIdentity::new(ShardNumber(1), ShardCount::new(4), DEFAULT_STRIPE_SIZE).unwrap();
 
         let range = ShardedRange::new(
             Range {
@@ -1061,7 +1048,7 @@ mod tests {
             let shard_identity = ShardIdentity::new(
                 ShardNumber(shard_number),
                 ShardCount::new(4),
-                ShardParameters::DEFAULT_STRIPE_SIZE,
+                DEFAULT_STRIPE_SIZE,
             )
             .unwrap();
 
@@ -1144,37 +1131,44 @@ mod tests {
     /// for a single tenant.
     #[test]
     fn sharded_range_fragment_simple() {
+        const SHARD_COUNT: u8 = 4;
+        const STRIPE_SIZE: u32 = DEFAULT_STRIPE_SIZE.0;
+
         let shard_identity = ShardIdentity::new(
             ShardNumber(0),
-            ShardCount::new(4),
-            ShardParameters::DEFAULT_STRIPE_SIZE,
+            ShardCount::new(SHARD_COUNT),
+            ShardStripeSize(STRIPE_SIZE),
         )
         .unwrap();
 
         // A range which we happen to know covers exactly one stripe which belongs to this shard
         let input_start = Key::from_hex("000000067f00000001000000ae0000000000").unwrap();
-        let input_end = Key::from_hex("000000067f00000001000000ae0000008000").unwrap();
+        let mut input_end = input_start;
+        input_end.field6 += STRIPE_SIZE; // field6 is block number
 
         // Ask for stripe_size blocks, we get the whole stripe
         assert_eq!(
-            do_fragment(input_start, input_end, &shard_identity, 32768),
-            (32768, vec![(32768, input_start..input_end)])
+            do_fragment(input_start, input_end, &shard_identity, STRIPE_SIZE),
+            (STRIPE_SIZE, vec![(STRIPE_SIZE, input_start..input_end)])
         );
 
         // Ask for more, we still get the whole stripe
         assert_eq!(
-            do_fragment(input_start, input_end, &shard_identity, 10000000),
-            (32768, vec![(32768, input_start..input_end)])
+            do_fragment(input_start, input_end, &shard_identity, 10 * STRIPE_SIZE),
+            (STRIPE_SIZE, vec![(STRIPE_SIZE, input_start..input_end)])
         );
 
         // Ask for target_nblocks of half the stripe size, we get two halves
         assert_eq!(
-            do_fragment(input_start, input_end, &shard_identity, 16384),
+            do_fragment(input_start, input_end, &shard_identity, STRIPE_SIZE / 2),
             (
-                32768,
+                STRIPE_SIZE,
                 vec![
-                    (16384, input_start..input_start.add(16384)),
-                    (16384, input_start.add(16384)..input_end)
+                    (
+                        STRIPE_SIZE / 2,
+                        input_start..input_start.add(STRIPE_SIZE / 2)
+                    ),
+                    (STRIPE_SIZE / 2, input_start.add(STRIPE_SIZE / 2)..input_end)
                 ]
             )
         );
@@ -1182,40 +1176,53 @@ mod tests {
 
     #[test]
     fn sharded_range_fragment_multi_stripe() {
+        const SHARD_COUNT: u8 = 4;
+        const STRIPE_SIZE: u32 = DEFAULT_STRIPE_SIZE.0;
+        const RANGE_SIZE: u32 = SHARD_COUNT as u32 * STRIPE_SIZE;
+
         let shard_identity = ShardIdentity::new(
             ShardNumber(0),
-            ShardCount::new(4),
-            ShardParameters::DEFAULT_STRIPE_SIZE,
+            ShardCount::new(SHARD_COUNT),
+            ShardStripeSize(STRIPE_SIZE),
         )
         .unwrap();
 
         // A range which covers multiple stripes, exactly one of which belongs to the current shard.
         let input_start = Key::from_hex("000000067f00000001000000ae0000000000").unwrap();
-        let input_end = Key::from_hex("000000067f00000001000000ae0000020000").unwrap();
+        let mut input_end = input_start;
+        input_end.field6 += RANGE_SIZE; // field6 is block number
+
         // Ask for all the blocks, get a fragment that covers the whole range but reports
         // its size to be just the blocks belonging to our shard.
         assert_eq!(
-            do_fragment(input_start, input_end, &shard_identity, 131072),
-            (32768, vec![(32768, input_start..input_end)])
+            do_fragment(input_start, input_end, &shard_identity, RANGE_SIZE),
+            (STRIPE_SIZE, vec![(STRIPE_SIZE, input_start..input_end)])
         );
 
-        // Ask for a sub-stripe quantity
+        // Ask for a sub-stripe quantity that results in 3 fragments.
+        let limit = STRIPE_SIZE / 3 + 1;
         assert_eq!(
-            do_fragment(input_start, input_end, &shard_identity, 16000),
+            do_fragment(input_start, input_end, &shard_identity, limit),
             (
-                32768,
+                STRIPE_SIZE,
                 vec![
-                    (16000, input_start..input_start.add(16000)),
-                    (16000, input_start.add(16000)..input_start.add(32000)),
-                    (768, input_start.add(32000)..input_end),
+                    (limit, input_start..input_start.add(limit)),
+                    (limit, input_start.add(limit)..input_start.add(2 * limit)),
+                    (
+                        STRIPE_SIZE - 2 * limit,
+                        input_start.add(2 * limit)..input_end
+                    ),
                 ]
             )
         );
 
         // Try on a range that starts slightly after our owned stripe
         assert_eq!(
-            do_fragment(input_start.add(1), input_end, &shard_identity, 131072),
-            (32767, vec![(32767, input_start.add(1)..input_end)])
+            do_fragment(input_start.add(1), input_end, &shard_identity, RANGE_SIZE),
+            (
+                STRIPE_SIZE - 1,
+                vec![(STRIPE_SIZE - 1, input_start.add(1)..input_end)]
+            )
         );
     }
 
@@ -1223,32 +1230,40 @@ mod tests {
     /// a previous relation.
     #[test]
     fn sharded_range_fragment_starting_from_logical_size() {
+        const SHARD_COUNT: u8 = 4;
+        const STRIPE_SIZE: u32 = DEFAULT_STRIPE_SIZE.0;
+        const RANGE_SIZE: u32 = SHARD_COUNT as u32 * STRIPE_SIZE;
+
         let input_start = Key::from_hex("000000067f00000001000000ae00ffffffff").unwrap();
-        let input_end = Key::from_hex("000000067f00000001000000ae0100008000").unwrap();
+        let mut input_end = Key::from_hex("000000067f00000001000000ae0100000000").unwrap();
+        input_end.field6 += RANGE_SIZE; // field6 is block number
 
         // Shard 0 owns the first stripe in the relation, and the preceding logical size is shard local too
         let shard_identity = ShardIdentity::new(
             ShardNumber(0),
-            ShardCount::new(4),
-            ShardParameters::DEFAULT_STRIPE_SIZE,
+            ShardCount::new(SHARD_COUNT),
+            ShardStripeSize(STRIPE_SIZE),
         )
         .unwrap();
         assert_eq!(
-            do_fragment(input_start, input_end, &shard_identity, 0x10000),
-            (0x8001, vec![(0x8001, input_start..input_end)])
+            do_fragment(input_start, input_end, &shard_identity, 2 * STRIPE_SIZE),
+            (
+                STRIPE_SIZE + 1,
+                vec![(STRIPE_SIZE + 1, input_start..input_end)]
+            )
         );
 
         // Shard 1 does not own the first stripe in the relation, but it does own the logical size (all shards
         // store all logical sizes)
         let shard_identity = ShardIdentity::new(
             ShardNumber(1),
-            ShardCount::new(4),
-            ShardParameters::DEFAULT_STRIPE_SIZE,
+            ShardCount::new(SHARD_COUNT),
+            ShardStripeSize(STRIPE_SIZE),
         )
         .unwrap();
         assert_eq!(
-            do_fragment(input_start, input_end, &shard_identity, 0x10000),
-            (0x1, vec![(0x1, input_start..input_end)])
+            do_fragment(input_start, input_end, &shard_identity, 2 * STRIPE_SIZE),
+            (1, vec![(1, input_start..input_end)])
         );
     }
 
@@ -1284,12 +1299,8 @@ mod tests {
         );
 
         // Same, but using a sharded identity
-        let shard_identity = ShardIdentity::new(
-            ShardNumber(0),
-            ShardCount::new(4),
-            ShardParameters::DEFAULT_STRIPE_SIZE,
-        )
-        .unwrap();
+        let shard_identity =
+            ShardIdentity::new(ShardNumber(0), ShardCount::new(4), DEFAULT_STRIPE_SIZE).unwrap();
         assert_eq!(
             do_fragment(input_start, input_end, &shard_identity, 0x8000),
             (u32::MAX, vec![(u32::MAX, input_start..input_end),])
@@ -1331,7 +1342,7 @@ mod tests {
                 ShardIdentity::new(
                     ShardNumber((prng.next_u32() % shard_count) as u8),
                     ShardCount::new(shard_count as u8),
-                    ShardParameters::DEFAULT_STRIPE_SIZE,
+                    DEFAULT_STRIPE_SIZE,
                 )
                 .unwrap()
             };
diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs
index 2ffff676882f..f491ed10e1a6 100644
--- a/libs/pageserver_api/src/models.rs
+++ b/libs/pageserver_api/src/models.rs
@@ -26,7 +26,7 @@ use utils::{completion, serde_system_time};
 use crate::config::Ratio;
 use crate::key::{CompactKey, Key};
 use crate::reltag::RelTag;
-use crate::shard::{ShardCount, ShardStripeSize, TenantShardId};
+use crate::shard::{DEFAULT_STRIPE_SIZE, ShardCount, ShardStripeSize, TenantShardId};
 
 /// The state of a tenant in this pageserver.
 ///
@@ -438,8 +438,6 @@ pub struct ShardParameters {
 }
 
 impl ShardParameters {
-    pub const DEFAULT_STRIPE_SIZE: ShardStripeSize = ShardStripeSize(256 * 1024 / 8);
-
     pub fn is_unsharded(&self) -> bool {
         self.count.is_unsharded()
     }
@@ -449,7 +447,7 @@ impl Default for ShardParameters {
     fn default() -> Self {
         Self {
             count: ShardCount::new(0),
-            stripe_size: Self::DEFAULT_STRIPE_SIZE,
+            stripe_size: DEFAULT_STRIPE_SIZE,
         }
     }
 }
@@ -578,6 +576,8 @@ pub struct TenantConfigPatch {
     #[serde(skip_serializing_if = "FieldPatch::is_noop")]
     pub gc_compaction_enabled: FieldPatch<bool>,
     #[serde(skip_serializing_if = "FieldPatch::is_noop")]
+    pub gc_compaction_verification: FieldPatch<bool>,
+    #[serde(skip_serializing_if = "FieldPatch::is_noop")]
     pub gc_compaction_initial_threshold_kb: FieldPatch<u64>,
     #[serde(skip_serializing_if = "FieldPatch::is_noop")]
     pub gc_compaction_ratio_percent: FieldPatch<u64>,
@@ -698,6 +698,9 @@ pub struct TenantConfig {
     #[serde(skip_serializing_if = "Option::is_none")]
     pub gc_compaction_enabled: Option<bool>,
 
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub gc_compaction_verification: Option<bool>,
+
     #[serde(skip_serializing_if = "Option::is_none")]
     pub gc_compaction_initial_threshold_kb: Option<u64>,
 
@@ -746,6 +749,7 @@ impl TenantConfig {
             mut wal_receiver_protocol_override,
             mut rel_size_v2_enabled,
             mut gc_compaction_enabled,
+            mut gc_compaction_verification,
             mut gc_compaction_initial_threshold_kb,
             mut gc_compaction_ratio_percent,
             mut sampling_ratio,
@@ -837,6 +841,9 @@ impl TenantConfig {
         patch
             .gc_compaction_enabled
             .apply(&mut gc_compaction_enabled);
+        patch
+            .gc_compaction_verification
+            .apply(&mut gc_compaction_verification);
         patch
             .gc_compaction_initial_threshold_kb
             .apply(&mut gc_compaction_initial_threshold_kb);
@@ -878,6 +885,7 @@ impl TenantConfig {
             wal_receiver_protocol_override,
             rel_size_v2_enabled,
             gc_compaction_enabled,
+            gc_compaction_verification,
             gc_compaction_initial_threshold_kb,
             gc_compaction_ratio_percent,
             sampling_ratio,
@@ -976,6 +984,9 @@ impl TenantConfig {
             gc_compaction_enabled: self
                 .gc_compaction_enabled
                 .unwrap_or(global_conf.gc_compaction_enabled),
+            gc_compaction_verification: self
+                .gc_compaction_verification
+                .unwrap_or(global_conf.gc_compaction_verification),
             gc_compaction_initial_threshold_kb: self
                 .gc_compaction_initial_threshold_kb
                 .unwrap_or(global_conf.gc_compaction_initial_threshold_kb),
@@ -1680,6 +1691,7 @@ pub struct SecondaryProgress {
 pub struct TenantScanRemoteStorageShard {
     pub tenant_shard_id: TenantShardId,
     pub generation: Option<u32>,
+    pub stripe_size: Option<ShardStripeSize>,
 }
 
 #[derive(Serialize, Deserialize, Debug, Default)]
diff --git a/libs/pageserver_api/src/record.rs b/libs/pageserver_api/src/record.rs
index fda504a26ef3..73516c52203e 100644
--- a/libs/pageserver_api/src/record.rs
+++ b/libs/pageserver_api/src/record.rs
@@ -58,6 +58,8 @@ pub enum NeonWalRecord {
         /// to true. This record does not need the history WALs to reconstruct. See [`NeonWalRecord::will_init`] and
         /// its references in `timeline.rs`.
         will_init: bool,
+        /// Only append the record if the current image is the same as the one specified in this field.
+        only_if: Option<String>,
     },
 }
 
@@ -81,6 +83,17 @@ impl NeonWalRecord {
             append: s.as_ref().to_string(),
             clear: false,
             will_init: false,
+            only_if: None,
+        }
+    }
+
+    #[cfg(feature = "testing")]
+    pub fn wal_append_conditional(s: impl AsRef<str>, only_if: impl AsRef<str>) -> Self {
+        Self::Test {
+            append: s.as_ref().to_string(),
+            clear: false,
+            will_init: false,
+            only_if: Some(only_if.as_ref().to_string()),
         }
     }
 
@@ -90,6 +103,7 @@ impl NeonWalRecord {
             append: s.as_ref().to_string(),
             clear: true,
             will_init: false,
+            only_if: None,
         }
     }
 
@@ -99,6 +113,7 @@ impl NeonWalRecord {
             append: s.as_ref().to_string(),
             clear: true,
             will_init: true,
+            only_if: None,
         }
     }
 }
diff --git a/libs/pageserver_api/src/shard.rs b/libs/pageserver_api/src/shard.rs
index 8386d6e586f6..feb59f5070cb 100644
--- a/libs/pageserver_api/src/shard.rs
+++ b/libs/pageserver_api/src/shard.rs
@@ -78,6 +78,12 @@ impl Default for ShardStripeSize {
     }
 }
 
+impl std::fmt::Display for ShardStripeSize {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        self.0.fmt(f)
+    }
+}
+
 /// Layout version: for future upgrades where we might change how the key->shard mapping works
 #[derive(Clone, Copy, Serialize, Deserialize, Eq, PartialEq, Hash, Debug)]
 pub struct ShardLayout(u8);
@@ -86,8 +92,11 @@ const LAYOUT_V1: ShardLayout = ShardLayout(1);
 /// ShardIdentity uses a magic layout value to indicate if it is unusable
 const LAYOUT_BROKEN: ShardLayout = ShardLayout(255);
 
-/// Default stripe size in pages: 256MiB divided by 8kiB page size.
-const DEFAULT_STRIPE_SIZE: ShardStripeSize = ShardStripeSize(256 * 1024 / 8);
+/// The default stripe size in pages. 16 MiB divided by 8 kiB page size.
+///
+/// A lower stripe size distributes ingest load better across shards, but reduces IO amortization.
+/// 16 MiB appears to be a reasonable balance: <https://github.com/neondatabase/neon/pull/10510>.
+pub const DEFAULT_STRIPE_SIZE: ShardStripeSize = ShardStripeSize(16 * 1024 / 8);
 
 #[derive(thiserror::Error, Debug, PartialEq, Eq)]
 pub enum ShardConfigError {
@@ -537,7 +546,7 @@ mod tests {
             field6: 0x7d06,
         };
 
-        let shard = key_to_shard_number(ShardCount(10), DEFAULT_STRIPE_SIZE, &key);
+        let shard = key_to_shard_number(ShardCount(10), ShardStripeSize(32768), &key);
         assert_eq!(shard, ShardNumber(8));
     }
 
diff --git a/libs/postgres_backend/src/lib.rs b/libs/postgres_backend/src/lib.rs
index a0a891f0dc1a..654dde8da642 100644
--- a/libs/postgres_backend/src/lib.rs
+++ b/libs/postgres_backend/src/lib.rs
@@ -5,7 +5,6 @@
 #![deny(unsafe_code)]
 #![deny(clippy::undocumented_unsafe_blocks)]
 use std::future::Future;
-use std::io::ErrorKind;
 use std::net::SocketAddr;
 use std::os::fd::{AsRawFd, RawFd};
 use std::pin::Pin;
@@ -227,7 +226,7 @@ impl<IO: AsyncRead + AsyncWrite + Unpin> MaybeWriteOnly<IO> {
         match self {
             MaybeWriteOnly::Full(framed) => framed.read_startup_message().await,
             MaybeWriteOnly::WriteOnly(_) => {
-                Err(io::Error::new(ErrorKind::Other, "reading from write only half").into())
+                Err(io::Error::other("reading from write only half").into())
             }
             MaybeWriteOnly::Broken => panic!("IO on invalid MaybeWriteOnly"),
         }
@@ -237,7 +236,7 @@ impl<IO: AsyncRead + AsyncWrite + Unpin> MaybeWriteOnly<IO> {
         match self {
             MaybeWriteOnly::Full(framed) => framed.read_message().await,
             MaybeWriteOnly::WriteOnly(_) => {
-                Err(io::Error::new(ErrorKind::Other, "reading from write only half").into())
+                Err(io::Error::other("reading from write only half").into())
             }
             MaybeWriteOnly::Broken => panic!("IO on invalid MaybeWriteOnly"),
         }
@@ -975,7 +974,7 @@ impl<IO: AsyncRead + AsyncWrite + Unpin> AsyncWrite for CopyDataWriter<'_, IO> {
             .write_message_noflush(&BeMessage::CopyData(buf))
             // write_message only writes to the buffer, so it can fail iff the
             // message is invaid, but CopyData can't be invalid.
-            .map_err(|_| io::Error::new(ErrorKind::Other, "failed to serialize CopyData"))?;
+            .map_err(|_| io::Error::other("failed to serialize CopyData"))?;
 
         Poll::Ready(Ok(buf.len()))
     }
diff --git a/libs/postgres_backend/tests/simple_select.rs b/libs/postgres_backend/tests/simple_select.rs
index 907ef9eed3b4..75ca12301463 100644
--- a/libs/postgres_backend/tests/simple_select.rs
+++ b/libs/postgres_backend/tests/simple_select.rs
@@ -85,8 +85,8 @@ static KEY: Lazy<rustls::pki_types::PrivateKeyDer<'static>> = Lazy::new(|| {
 
 static CERT: Lazy<rustls::pki_types::CertificateDer<'static>> = Lazy::new(|| {
     let mut cursor = Cursor::new(include_bytes!("cert.pem"));
-    let cert = rustls_pemfile::certs(&mut cursor).next().unwrap().unwrap();
-    cert
+
+    rustls_pemfile::certs(&mut cursor).next().unwrap().unwrap()
 });
 
 // test that basic select with ssl works
diff --git a/libs/pq_proto/src/framed.rs b/libs/pq_proto/src/framed.rs
index 8e216d0f44ad..4e5e48ecf585 100644
--- a/libs/pq_proto/src/framed.rs
+++ b/libs/pq_proto/src/framed.rs
@@ -35,7 +35,7 @@ impl ConnectionError {
     pub fn into_io_error(self) -> io::Error {
         match self {
             ConnectionError::Io(io) => io,
-            ConnectionError::Protocol(pe) => io::Error::new(io::ErrorKind::Other, pe.to_string()),
+            ConnectionError::Protocol(pe) => io::Error::other(pe.to_string()),
         }
     }
 }
diff --git a/libs/pq_proto/src/lib.rs b/libs/pq_proto/src/lib.rs
index e435ffbf7e05..e7afc6456401 100644
--- a/libs/pq_proto/src/lib.rs
+++ b/libs/pq_proto/src/lib.rs
@@ -257,7 +257,7 @@ pub enum ProtocolError {
 impl ProtocolError {
     /// Proxy stream.rs uses only io::Error; provide it.
     pub fn into_io_error(self) -> io::Error {
-        io::Error::new(io::ErrorKind::Other, self.to_string())
+        io::Error::other(self.to_string())
     }
 }
 
diff --git a/libs/proxy/postgres-protocol2/src/authentication/sasl.rs b/libs/proxy/postgres-protocol2/src/authentication/sasl.rs
index 27e05e24ec4a..2daf9a80d453 100644
--- a/libs/proxy/postgres-protocol2/src/authentication/sasl.rs
+++ b/libs/proxy/postgres-protocol2/src/authentication/sasl.rs
@@ -212,7 +212,7 @@ impl ScramSha256 {
                     password,
                     channel_binding,
                 } => (nonce, password, channel_binding),
-                _ => return Err(io::Error::new(io::ErrorKind::Other, "invalid SCRAM state")),
+                _ => return Err(io::Error::other("invalid SCRAM state")),
             };
 
         let message =
@@ -291,7 +291,7 @@ impl ScramSha256 {
                 server_key,
                 auth_message,
             } => (server_key, auth_message),
-            _ => return Err(io::Error::new(io::ErrorKind::Other, "invalid SCRAM state")),
+            _ => return Err(io::Error::other("invalid SCRAM state")),
         };
 
         let message =
@@ -301,10 +301,7 @@ impl ScramSha256 {
 
         let verifier = match parsed {
             ServerFinalMessage::Error(e) => {
-                return Err(io::Error::new(
-                    io::ErrorKind::Other,
-                    format!("SCRAM error: {}", e),
-                ));
+                return Err(io::Error::other(format!("SCRAM error: {}", e)));
             }
             ServerFinalMessage::Verifier(verifier) => verifier,
         };
diff --git a/libs/remote_storage/Cargo.toml b/libs/remote_storage/Cargo.toml
index 7bdf340f74b7..bd18d80915a7 100644
--- a/libs/remote_storage/Cargo.toml
+++ b/libs/remote_storage/Cargo.toml
@@ -28,7 +28,7 @@ toml_edit.workspace = true
 tracing.workspace = true
 scopeguard.workspace = true
 metrics.workspace = true
-utils.workspace = true
+utils = { path = "../utils", default-features = false }
 pin-project-lite.workspace = true
 
 azure_core.workspace = true
diff --git a/libs/remote_storage/src/azure_blob.rs b/libs/remote_storage/src/azure_blob.rs
index dee61a410d7d..18146c5464d5 100644
--- a/libs/remote_storage/src/azure_blob.rs
+++ b/libs/remote_storage/src/azure_blob.rs
@@ -801,8 +801,7 @@ where
             // that support needs to be hacked in.
             //
             // including {self:?} into the message would be useful, but unsure how to unproject.
-            _ => std::task::Poll::Ready(Err(std::io::Error::new(
-                std::io::ErrorKind::Other,
+            _ => std::task::Poll::Ready(Err(std::io::Error::other(
                 "cloned or initial values cannot be read",
             ))),
         }
@@ -855,7 +854,7 @@ where
         };
         Err(azure_core::error::Error::new(
             azure_core::error::ErrorKind::Io,
-            std::io::Error::new(std::io::ErrorKind::Other, msg),
+            std::io::Error::other(msg),
         ))
     }
 
diff --git a/libs/utils/Cargo.toml b/libs/utils/Cargo.toml
index 4180602ac78f..fd2fa63fd09c 100644
--- a/libs/utils/Cargo.toml
+++ b/libs/utils/Cargo.toml
@@ -5,7 +5,8 @@ edition.workspace = true
 license.workspace = true
 
 [features]
-default = []
+default = ["rename_noreplace"]
+rename_noreplace = []
 # Enables test-only APIs, incuding failpoints. In particular, enables the `fail_point!` macro,
 # which adds some runtime cost to run tests on outage conditions
 testing = ["fail/failpoints"]
@@ -35,7 +36,7 @@ serde_with.workspace = true
 serde_json.workspace = true
 signal-hook.workspace = true
 thiserror.workspace = true
-tokio.workspace = true
+tokio = { workspace = true, features = ["signal"] }
 tokio-tar.workspace = true
 tokio-util.workspace = true
 toml_edit = { workspace = true, features = ["serde"] }
diff --git a/libs/utils/src/auth.rs b/libs/utils/src/auth.rs
index cc5b0b1d1393..db4fc5685c10 100644
--- a/libs/utils/src/auth.rs
+++ b/libs/utils/src/auth.rs
@@ -173,7 +173,7 @@ impl std::fmt::Debug for JwtAuth {
 }
 
 // this function is used only for testing purposes in CLI e g generate tokens during init
-pub fn encode_from_key_file(claims: &Claims, key_data: &[u8]) -> Result<String> {
+pub fn encode_from_key_file<S: Serialize>(claims: &S, key_data: &[u8]) -> Result<String> {
     let key = EncodingKey::from_ed_pem(key_data)?;
     Ok(encode(&Header::new(STORAGE_TOKEN_ALGORITHM), claims, &key)?)
 }
diff --git a/libs/utils/src/crashsafe.rs b/libs/utils/src/crashsafe.rs
index 290a5b26863a..215fa36df49b 100644
--- a/libs/utils/src/crashsafe.rs
+++ b/libs/utils/src/crashsafe.rs
@@ -81,12 +81,9 @@ pub fn path_with_suffix_extension(
 }
 
 pub fn fsync_file_and_parent(file_path: &Utf8Path) -> io::Result<()> {
-    let parent = file_path.parent().ok_or_else(|| {
-        io::Error::new(
-            io::ErrorKind::Other,
-            format!("File {file_path:?} has no parent"),
-        )
-    })?;
+    let parent = file_path
+        .parent()
+        .ok_or_else(|| io::Error::other(format!("File {file_path:?} has no parent")))?;
 
     fsync(file_path)?;
     fsync(parent)?;
diff --git a/libs/utils/src/fs_ext.rs b/libs/utils/src/fs_ext.rs
index a406ab0378e7..e16edaaa9a96 100644
--- a/libs/utils/src/fs_ext.rs
+++ b/libs/utils/src/fs_ext.rs
@@ -3,7 +3,9 @@ use std::{fs, io, path::Path};
 
 use anyhow::Context;
 
+#[cfg(feature = "rename_noreplace")]
 mod rename_noreplace;
+#[cfg(feature = "rename_noreplace")]
 pub use rename_noreplace::rename_noreplace;
 
 pub trait PathExt {
diff --git a/libs/utils/src/fs_ext/rename_noreplace.rs b/libs/utils/src/fs_ext/rename_noreplace.rs
index fc6f794b57f8..d0c07353d022 100644
--- a/libs/utils/src/fs_ext/rename_noreplace.rs
+++ b/libs/utils/src/fs_ext/rename_noreplace.rs
@@ -8,7 +8,7 @@ pub fn rename_noreplace<P1: ?Sized + NixPath, P2: ?Sized + NixPath>(
     dst: &P2,
 ) -> nix::Result<()> {
     {
-        #[cfg(target_os = "linux")]
+        #[cfg(all(target_os = "linux", target_env = "gnu"))]
         {
             nix::fcntl::renameat2(
                 None,
@@ -29,7 +29,7 @@ pub fn rename_noreplace<P1: ?Sized + NixPath, P2: ?Sized + NixPath>(
             })??;
             nix::errno::Errno::result(res).map(drop)
         }
-        #[cfg(not(any(target_os = "linux", target_os = "macos")))]
+        #[cfg(not(any(all(target_os = "linux", target_env = "gnu"), target_os = "macos")))]
         {
             std::compile_error!("OS does not support no-replace renames");
         }
diff --git a/libs/utils/src/signals.rs b/libs/utils/src/signals.rs
index f2be1957c42a..426bb659167b 100644
--- a/libs/utils/src/signals.rs
+++ b/libs/utils/src/signals.rs
@@ -1,6 +1,8 @@
 pub use signal_hook::consts::TERM_SIGNALS;
 pub use signal_hook::consts::signal::*;
 use signal_hook::iterator::Signals;
+use tokio::signal::unix::{SignalKind, signal};
+use tracing::info;
 
 pub enum Signal {
     Quit,
@@ -36,3 +38,30 @@ impl ShutdownSignals {
         Ok(())
     }
 }
+
+/// Runs in a loop since we want to be responsive to multiple signals
+/// even after triggering shutdown (e.g. a SIGQUIT after a slow SIGTERM shutdown)
+/// <https://github.com/neondatabase/neon/issues/9740>
+pub async fn signal_handler(token: tokio_util::sync::CancellationToken) {
+    let mut sigint = signal(SignalKind::interrupt()).unwrap();
+    let mut sigterm = signal(SignalKind::terminate()).unwrap();
+    let mut sigquit = signal(SignalKind::quit()).unwrap();
+
+    loop {
+        let signal = tokio::select! {
+            _ = sigquit.recv() => {
+                info!("Got signal SIGQUIT. Terminating in immediate shutdown mode.");
+                std::process::exit(111);
+            }
+            _ = sigint.recv() => "SIGINT",
+            _ = sigterm.recv() => "SIGTERM",
+        };
+
+        if !token.is_cancelled() {
+            info!("Got signal {signal}. Terminating gracefully in fast shutdown mode.");
+            token.cancel();
+        } else {
+            info!("Got signal {signal}. Already shutting down.");
+        }
+    }
+}
diff --git a/object_storage/Cargo.toml b/object_storage/Cargo.toml
new file mode 100644
index 000000000000..17fbaefe6f37
--- /dev/null
+++ b/object_storage/Cargo.toml
@@ -0,0 +1,28 @@
+[package]
+name = "object_storage"
+version = "0.0.1"
+edition.workspace = true
+license.workspace = true
+[dependencies]
+anyhow.workspace = true
+axum-extra.workspace = true
+axum.workspace = true
+camino.workspace = true
+futures.workspace = true
+jsonwebtoken.workspace = true
+prometheus.workspace = true
+remote_storage.workspace = true
+serde.workspace = true
+serde_json.workspace = true
+tokio-util.workspace = true
+tokio.workspace = true
+tracing.workspace = true
+utils = { path = "../libs/utils", default-features = false }
+workspace_hack.workspace = true
+[dev-dependencies]
+camino-tempfile.workspace = true
+http-body-util.workspace = true
+itertools.workspace = true
+rand.workspace = true
+test-log.workspace = true
+tower.workspace = true
diff --git a/object_storage/src/app.rs b/object_storage/src/app.rs
new file mode 100644
index 000000000000..7b5627f0db95
--- /dev/null
+++ b/object_storage/src/app.rs
@@ -0,0 +1,561 @@
+use anyhow::anyhow;
+use axum::body::{Body, Bytes};
+use axum::response::{IntoResponse, Response};
+use axum::{Router, http::StatusCode};
+use object_storage::{PrefixS3Path, S3Path, Storage, bad_request, internal_error, not_found, ok};
+use remote_storage::TimeoutOrCancel;
+use remote_storage::{DownloadError, DownloadOpts, GenericRemoteStorage, RemotePath};
+use std::{sync::Arc, time::SystemTime, time::UNIX_EPOCH};
+use tokio_util::sync::CancellationToken;
+use tracing::{error, info};
+use utils::backoff::retry;
+
+pub fn app(state: Arc<Storage>) -> Router<()> {
+    use axum::routing::{delete as _delete, get as _get};
+    let delete_prefix = _delete(delete_prefix);
+    Router::new()
+        .route(
+            "/{tenant_id}/{timeline_id}/{endpoint_id}/{*path}",
+            _get(get).put(set).delete(delete),
+        )
+        .route(
+            "/{tenant_id}/{timeline_id}/{endpoint_id}",
+            delete_prefix.clone(),
+        )
+        .route("/{tenant_id}/{timeline_id}", delete_prefix.clone())
+        .route("/{tenant_id}", delete_prefix)
+        .route("/metrics", _get(metrics))
+        .route("/status", _get(async || StatusCode::OK.into_response()))
+        .with_state(state)
+}
+
+type Result = anyhow::Result<Response, Response>;
+type State = axum::extract::State<Arc<Storage>>;
+
+const CONTENT_TYPE: &str = "content-type";
+const APPLICATION_OCTET_STREAM: &str = "application/octet-stream";
+const WARN_THRESHOLD: u32 = 3;
+const MAX_RETRIES: u32 = 10;
+
+async fn metrics() -> Result {
+    prometheus::TextEncoder::new()
+        .encode_to_string(&prometheus::gather())
+        .map(|s| s.into_response())
+        .map_err(|e| internal_error(e, "/metrics", "collecting metrics"))
+}
+
+async fn get(S3Path { path }: S3Path, state: State) -> Result {
+    info!(%path, "downloading");
+    let download_err = |e| {
+        if let DownloadError::NotFound = e {
+            info!(%path, %e, "downloading"); // 404 is not an issue of _this_ service
+            return not_found(&path);
+        }
+        internal_error(e, &path, "downloading")
+    };
+    let cancel = state.cancel.clone();
+    let opts = &DownloadOpts::default();
+
+    let stream = retry(
+        async || state.storage.download(&path, opts, &cancel).await,
+        DownloadError::is_permanent,
+        WARN_THRESHOLD,
+        MAX_RETRIES,
+        "downloading",
+        &cancel,
+    )
+    .await
+    .unwrap_or(Err(DownloadError::Cancelled))
+    .map_err(download_err)?
+    .download_stream;
+
+    Response::builder()
+        .status(StatusCode::OK)
+        .header(CONTENT_TYPE, APPLICATION_OCTET_STREAM)
+        .body(Body::from_stream(stream))
+        .map_err(|e| internal_error(e, path, "reading response"))
+}
+
+// Best solution for files is multipart upload, but remote_storage doesn't support it,
+// so we can either read Bytes in memory and push at once or forward BodyDataStream to
+// remote_storage. The latter may seem more peformant, but BodyDataStream doesn't have a
+// guaranteed size() which may produce issues while uploading to s3.
+// So, currently we're going with an in-memory copy plus a boundary to prevent uploading
+// very large files.
+async fn set(S3Path { path }: S3Path, state: State, bytes: Bytes) -> Result {
+    info!(%path, "uploading");
+    let request_len = bytes.len();
+    let max_len = state.max_upload_file_limit;
+    if request_len > max_len {
+        return Err(bad_request(
+            anyhow!("File size {request_len} exceeds max {max_len}"),
+            "uploading",
+        ));
+    }
+
+    let cancel = state.cancel.clone();
+    let fun = async || {
+        let stream = bytes_to_stream(bytes.clone());
+        state
+            .storage
+            .upload(stream, request_len, &path, None, &cancel)
+            .await
+    };
+    retry(
+        fun,
+        TimeoutOrCancel::caused_by_cancel,
+        WARN_THRESHOLD,
+        MAX_RETRIES,
+        "uploading",
+        &cancel,
+    )
+    .await
+    .unwrap_or(Err(anyhow!("uploading cancelled")))
+    .map_err(|e| internal_error(e, path, "reading response"))?;
+    Ok(ok())
+}
+
+async fn delete(S3Path { path }: S3Path, state: State) -> Result {
+    info!(%path, "deleting");
+    let cancel = state.cancel.clone();
+    retry(
+        async || state.storage.delete(&path, &cancel).await,
+        TimeoutOrCancel::caused_by_cancel,
+        WARN_THRESHOLD,
+        MAX_RETRIES,
+        "deleting",
+        &cancel,
+    )
+    .await
+    .unwrap_or(Err(anyhow!("deleting cancelled")))
+    .map_err(|e| internal_error(e, path, "deleting"))?;
+    Ok(ok())
+}
+
+async fn delete_prefix(PrefixS3Path { path }: PrefixS3Path, state: State) -> Result {
+    info!(%path, "deleting prefix");
+    let cancel = state.cancel.clone();
+    retry(
+        async || state.storage.delete_prefix(&path, &cancel).await,
+        TimeoutOrCancel::caused_by_cancel,
+        WARN_THRESHOLD,
+        MAX_RETRIES,
+        "deleting prefix",
+        &cancel,
+    )
+    .await
+    .unwrap_or(Err(anyhow!("deleting prefix cancelled")))
+    .map_err(|e| internal_error(e, path, "deleting prefix"))?;
+    Ok(ok())
+}
+
+pub async fn check_storage_permissions(
+    client: &GenericRemoteStorage,
+    cancel: CancellationToken,
+) -> anyhow::Result<()> {
+    info!("storage permissions check");
+
+    // as_nanos() as multiple instances proxying same bucket may be started at once
+    let now = SystemTime::now()
+        .duration_since(UNIX_EPOCH)?
+        .as_nanos()
+        .to_string();
+
+    let path = RemotePath::from_string(&format!("write_access_{now}"))?;
+    info!(%path, "uploading");
+
+    let body = now.to_string();
+    let stream = bytes_to_stream(Bytes::from(body.clone()));
+    client
+        .upload(stream, body.len(), &path, None, &cancel)
+        .await?;
+
+    use tokio::io::AsyncReadExt;
+    info!(%path, "downloading");
+    let download_opts = DownloadOpts {
+        kind: remote_storage::DownloadKind::Small,
+        ..Default::default()
+    };
+    let mut body_read_buf = Vec::new();
+    let stream = client
+        .download(&path, &download_opts, &cancel)
+        .await?
+        .download_stream;
+    tokio_util::io::StreamReader::new(stream)
+        .read_to_end(&mut body_read_buf)
+        .await?;
+    let body_read = String::from_utf8(body_read_buf)?;
+    if body != body_read {
+        error!(%body, %body_read, "File contents do not match");
+        anyhow::bail!("Read back file doesn't match original")
+    }
+
+    info!(%path, "removing");
+    client.delete(&path, &cancel).await
+}
+
+fn bytes_to_stream(bytes: Bytes) -> impl futures::Stream<Item = std::io::Result<Bytes>> {
+    futures::stream::once(futures::future::ready(Ok(bytes)))
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use axum::{body::Body, extract::Request, response::Response};
+    use http_body_util::BodyExt;
+    use itertools::iproduct;
+    use std::env::var;
+    use std::sync::Arc;
+    use std::time::Duration;
+    use test_log::test as testlog;
+    use tower::{Service, util::ServiceExt};
+    use utils::id::{TenantId, TimelineId};
+
+    // see libs/remote_storage/tests/test_real_s3.rs
+    const REAL_S3_ENV: &str = "ENABLE_REAL_S3_REMOTE_STORAGE";
+    const REAL_S3_BUCKET: &str = "REMOTE_STORAGE_S3_BUCKET";
+    const REAL_S3_REGION: &str = "REMOTE_STORAGE_S3_REGION";
+
+    async fn proxy() -> (Storage, Option<camino_tempfile::Utf8TempDir>) {
+        let cancel = CancellationToken::new();
+        let (dir, storage) = if var(REAL_S3_ENV).is_err() {
+            // tests execute in parallel and we need a new directory for each of them
+            let dir = camino_tempfile::tempdir().unwrap();
+            let fs =
+                remote_storage::LocalFs::new(dir.path().into(), Duration::from_secs(5)).unwrap();
+            (Some(dir), GenericRemoteStorage::LocalFs(fs))
+        } else {
+            // test_real_s3::create_s3_client is hard to reference, reimplementing here
+            let millis = SystemTime::now()
+                .duration_since(UNIX_EPOCH)
+                .unwrap()
+                .as_millis();
+            use rand::Rng;
+            let random = rand::thread_rng().r#gen::<u32>();
+
+            let s3_config = remote_storage::S3Config {
+                bucket_name: var(REAL_S3_BUCKET).unwrap(),
+                bucket_region: var(REAL_S3_REGION).unwrap(),
+                prefix_in_bucket: Some(format!("test_{millis}_{random:08x}/")),
+                endpoint: None,
+                concurrency_limit: std::num::NonZeroUsize::new(100).unwrap(),
+                max_keys_per_list_response: None,
+                upload_storage_class: None,
+            };
+            let bucket = remote_storage::S3Bucket::new(&s3_config, Duration::from_secs(1))
+                .await
+                .unwrap();
+            (None, GenericRemoteStorage::AwsS3(Arc::new(bucket)))
+        };
+
+        let proxy = Storage {
+            auth: object_storage::JwtAuth::new(TEST_PUB_KEY_ED25519).unwrap(),
+            storage,
+            cancel: cancel.clone(),
+            max_upload_file_limit: usize::MAX,
+        };
+        check_storage_permissions(&proxy.storage, cancel)
+            .await
+            .unwrap();
+        (proxy, dir)
+    }
+
+    // see libs/utils/src/auth.rs
+    const TEST_PUB_KEY_ED25519: &[u8] = b"
+-----BEGIN PUBLIC KEY-----
+MCowBQYDK2VwAyEARYwaNBayR+eGI0iXB4s3QxE3Nl2g1iWbr6KtLWeVD/w=
+-----END PUBLIC KEY-----
+";
+
+    const TEST_PRIV_KEY_ED25519: &[u8] = br#"
+-----BEGIN PRIVATE KEY-----
+MC4CAQAwBQYDK2VwBCIEID/Drmc1AA6U/znNRWpF3zEGegOATQxfkdWxitcOMsIH
+-----END PRIVATE KEY-----
+"#;
+
+    async fn request(req: Request<Body>) -> Response<Body> {
+        let (proxy, _) = proxy().await;
+        app(Arc::new(proxy))
+            .into_service()
+            .oneshot(req)
+            .await
+            .unwrap()
+    }
+
+    #[testlog(tokio::test)]
+    async fn status() {
+        let res = Request::builder()
+            .uri("/status")
+            .body(Body::empty())
+            .map(request)
+            .unwrap()
+            .await;
+        assert_eq!(res.status(), StatusCode::OK);
+    }
+
+    fn routes() -> impl Iterator<Item = (&'static str, &'static str)> {
+        iproduct!(
+            vec!["/1", "/1/2", "/1/2/3", "/1/2/3/4"],
+            vec!["GET", "PUT", "DELETE"]
+        )
+    }
+
+    #[testlog(tokio::test)]
+    async fn no_token() {
+        for (uri, method) in routes() {
+            info!(%uri, %method);
+            let res = Request::builder()
+                .uri(uri)
+                .method(method)
+                .body(Body::empty())
+                .map(request)
+                .unwrap()
+                .await;
+            assert!(matches!(
+                res.status(),
+                StatusCode::METHOD_NOT_ALLOWED | StatusCode::BAD_REQUEST
+            ));
+        }
+    }
+
+    #[testlog(tokio::test)]
+    async fn invalid_token() {
+        for (uri, method) in routes() {
+            info!(%uri, %method);
+            let status = Request::builder()
+                .uri(uri)
+                .header("Authorization", "Bearer 123")
+                .method(method)
+                .body(Body::empty())
+                .map(request)
+                .unwrap()
+                .await;
+            assert!(matches!(
+                status.status(),
+                StatusCode::METHOD_NOT_ALLOWED | StatusCode::BAD_REQUEST
+            ));
+        }
+    }
+
+    const TENANT_ID: TenantId =
+        TenantId::from_array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 1, 2, 3, 4, 5, 6]);
+    const TIMELINE_ID: TimelineId =
+        TimelineId::from_array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 1, 2, 3, 4, 5, 7]);
+    const ENDPOINT_ID: &str = "ep-winter-frost-a662z3vg";
+    fn token() -> String {
+        let claims = object_storage::Claims {
+            tenant_id: TENANT_ID,
+            timeline_id: TIMELINE_ID,
+            endpoint_id: ENDPOINT_ID.into(),
+            exp: u64::MAX,
+        };
+        let key = jsonwebtoken::EncodingKey::from_ed_pem(TEST_PRIV_KEY_ED25519).unwrap();
+        let header = jsonwebtoken::Header::new(object_storage::VALIDATION_ALGO);
+        jsonwebtoken::encode(&header, &claims, &key).unwrap()
+    }
+
+    #[testlog(tokio::test)]
+    async fn unauthorized() {
+        let (proxy, _) = proxy().await;
+        let mut app = app(Arc::new(proxy)).into_service();
+        let token = token();
+        let args = itertools::iproduct!(
+            vec![TENANT_ID.to_string(), TenantId::generate().to_string()],
+            vec![TIMELINE_ID.to_string(), TimelineId::generate().to_string()],
+            vec![ENDPOINT_ID, "ep-ololo"]
+        )
+        .skip(1);
+
+        for ((uri, method), (tenant, timeline, endpoint)) in iproduct!(routes(), args) {
+            info!(%uri, %method, %tenant, %timeline, %endpoint);
+            let request = Request::builder()
+                .uri(format!("/{tenant}/{timeline}/{endpoint}/sub/path/key"))
+                .method(method)
+                .header("Authorization", format!("Bearer {}", token))
+                .body(Body::empty())
+                .unwrap();
+            let status = ServiceExt::ready(&mut app)
+                .await
+                .unwrap()
+                .call(request)
+                .await
+                .unwrap()
+                .status();
+            assert_eq!(status, StatusCode::UNAUTHORIZED);
+        }
+    }
+
+    #[testlog(tokio::test)]
+    async fn method_not_allowed() {
+        let token = token();
+        let iter = iproduct!(vec!["", "/.."], vec!["GET", "PUT"]);
+        for (key, method) in iter {
+            let status = Request::builder()
+                .uri(format!("/{TENANT_ID}/{TIMELINE_ID}/{ENDPOINT_ID}{key}"))
+                .method(method)
+                .header("Authorization", format!("Bearer {token}"))
+                .body(Body::empty())
+                .map(request)
+                .unwrap()
+                .await
+                .status();
+            assert!(matches!(
+                status,
+                StatusCode::BAD_REQUEST | StatusCode::METHOD_NOT_ALLOWED
+            ));
+        }
+    }
+
+    async fn requests_chain(
+        chain: impl Iterator<Item = (String, &str, &'static str, StatusCode, bool)>,
+        token: impl Fn(&str) -> String,
+    ) {
+        let (proxy, _) = proxy().await;
+        let mut app = app(Arc::new(proxy)).into_service();
+        for (uri, method, body, expected_status, compare_body) in chain {
+            info!(%uri, %method, %body, %expected_status);
+            let bearer = format!("Bearer {}", token(&uri));
+            let request = Request::builder()
+                .uri(uri)
+                .method(method)
+                .header("Authorization", &bearer)
+                .body(Body::from(body))
+                .unwrap();
+            let response = ServiceExt::ready(&mut app)
+                .await
+                .unwrap()
+                .call(request)
+                .await
+                .unwrap();
+            assert_eq!(response.status(), expected_status);
+            if !compare_body {
+                continue;
+            }
+            let read_body = response.into_body().collect().await.unwrap().to_bytes();
+            assert_eq!(body, read_body);
+        }
+    }
+
+    #[testlog(tokio::test)]
+    async fn metrics() {
+        let uri = format!("/{TENANT_ID}/{TIMELINE_ID}/{ENDPOINT_ID}/key");
+        let req = vec![
+            (uri.clone(), "PUT", "body", StatusCode::OK, false),
+            (uri.clone(), "DELETE", "", StatusCode::OK, false),
+        ];
+        requests_chain(req.into_iter(), |_| token()).await;
+
+        let res = Request::builder()
+            .uri("/metrics")
+            .body(Body::empty())
+            .map(request)
+            .unwrap()
+            .await;
+        assert_eq!(res.status(), StatusCode::OK);
+        let body = res.into_body().collect().await.unwrap().to_bytes();
+        let body = String::from_utf8_lossy(&body);
+        tracing::debug!(%body);
+        // Storage metrics are not gathered for LocalFs
+        if var(REAL_S3_ENV).is_ok() {
+            assert!(body.contains("remote_storage_s3_deleted_objects_total"));
+        }
+        assert!(body.contains("process_threads"));
+    }
+
+    #[testlog(tokio::test)]
+    async fn insert_retrieve_remove() {
+        let uri = format!("/{TENANT_ID}/{TIMELINE_ID}/{ENDPOINT_ID}/key");
+        let chain = vec![
+            (uri.clone(), "GET", "", StatusCode::NOT_FOUND, false),
+            (uri.clone(), "PUT", "пыщьпыщь", StatusCode::OK, false),
+            (uri.clone(), "GET", "пыщьпыщь", StatusCode::OK, true),
+            (uri.clone(), "DELETE", "", StatusCode::OK, false),
+            (uri, "GET", "", StatusCode::NOT_FOUND, false),
+        ];
+        requests_chain(chain.into_iter(), |_| token()).await;
+    }
+
+    fn delete_prefix_token(uri: &str) -> String {
+        use serde::Serialize;
+        let parts = uri.split("/").collect::<Vec<&str>>();
+        #[derive(Serialize)]
+        struct PrefixClaims {
+            tenant_id: TenantId,
+            timeline_id: Option<TimelineId>,
+            endpoint_id: Option<object_storage::EndpointId>,
+            exp: u64,
+        }
+        let claims = PrefixClaims {
+            tenant_id: parts.get(1).map(|c| c.parse().unwrap()).unwrap(),
+            timeline_id: parts.get(2).map(|c| c.parse().unwrap()),
+            endpoint_id: parts.get(3).map(ToString::to_string),
+            exp: u64::MAX,
+        };
+        let key = jsonwebtoken::EncodingKey::from_ed_pem(TEST_PRIV_KEY_ED25519).unwrap();
+        let header = jsonwebtoken::Header::new(object_storage::VALIDATION_ALGO);
+        jsonwebtoken::encode(&header, &claims, &key).unwrap()
+    }
+
+    // Can't use single digit numbers as they won't be validated as TimelineId and EndpointId
+    #[testlog(tokio::test)]
+    async fn delete_prefix() {
+        let tenant_id =
+            TenantId::from_array([1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]).to_string();
+        let t2 = TimelineId::from_array([2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]);
+        let t3 = TimelineId::from_array([3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]);
+        let t4 = TimelineId::from_array([4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]);
+        let f = |timeline, path| format!("/{tenant_id}/{timeline}{path}");
+        // Why extra slash in string literals? Axum is weird with URIs:
+        // /1/2 and 1/2/ match different routes, thus first yields OK and second NOT_FOUND
+        //  as it matches /tenant/timeline/endpoint, see https://stackoverflow.com/a/75355932
+        // The cost of removing trailing slash is suprisingly hard:
+        // * Add tower dependency with NormalizePath layer
+        // * wrap Router<()> in this layer https://github.com/tokio-rs/axum/discussions/2377
+        // * Rewrite make_service() -> into_make_service()
+        // * Rewrite oneshot() (not available for NormalizePath)
+        // I didn't manage to get it working correctly
+        let chain = vec![
+            // create 1/2/3/4, 1/2/3/5, delete prefix 1/2/3 -> empty
+            (f(t2, "/3/4"), "PUT", "", StatusCode::OK, false),
+            (f(t2, "/3/4"), "PUT", "", StatusCode::OK, false), // we can override file contents
+            (f(t2, "/3/5"), "PUT", "", StatusCode::OK, false),
+            (f(t2, "/3"), "DELETE", "", StatusCode::OK, false),
+            (f(t2, "/3/4"), "GET", "", StatusCode::NOT_FOUND, false),
+            (f(t2, "/3/5"), "GET", "", StatusCode::NOT_FOUND, false),
+            // create 1/2/3/4, 1/2/5/6, delete prefix 1/2/3 -> 1/2/5/6
+            (f(t2, "/3/4"), "PUT", "", StatusCode::OK, false),
+            (f(t2, "/5/6"), "PUT", "", StatusCode::OK, false),
+            (f(t2, "/3"), "DELETE", "", StatusCode::OK, false),
+            (f(t2, "/3/4"), "GET", "", StatusCode::NOT_FOUND, false),
+            (f(t2, "/5/6"), "GET", "", StatusCode::OK, false),
+            // create 1/2/3/4, 1/2/7/8, delete prefix 1/2 -> empty
+            (f(t2, "/3/4"), "PUT", "", StatusCode::OK, false),
+            (f(t2, "/7/8"), "PUT", "", StatusCode::OK, false),
+            (f(t2, ""), "DELETE", "", StatusCode::OK, false),
+            (f(t2, "/3/4"), "GET", "", StatusCode::NOT_FOUND, false),
+            (f(t2, "/7/8"), "GET", "", StatusCode::NOT_FOUND, false),
+            // create 1/2/3/4, 1/2/5/6, 1/3/8/9, delete prefix 1/2/3 -> 1/2/5/6, 1/3/8/9
+            (f(t2, "/3/4"), "PUT", "", StatusCode::OK, false),
+            (f(t2, "/5/6"), "PUT", "", StatusCode::OK, false),
+            (f(t3, "/8/9"), "PUT", "", StatusCode::OK, false),
+            (f(t2, "/3"), "DELETE", "", StatusCode::OK, false),
+            (f(t2, "/3/4"), "GET", "", StatusCode::NOT_FOUND, false),
+            (f(t2, "/5/6"), "GET", "", StatusCode::OK, false),
+            (f(t3, "/8/9"), "GET", "", StatusCode::OK, false),
+            // create 1/4/5/6, delete prefix 1/2 -> 1/3/8/9, 1/4/5/6
+            (f(t4, "/5/6"), "PUT", "", StatusCode::OK, false),
+            (f(t2, ""), "DELETE", "", StatusCode::OK, false),
+            (f(t2, "/3/4"), "GET", "", StatusCode::NOT_FOUND, false),
+            (f(t2, "/5/6"), "GET", "", StatusCode::NOT_FOUND, false),
+            (f(t3, "/8/9"), "GET", "", StatusCode::OK, false),
+            (f(t4, "/5/6"), "GET", "", StatusCode::OK, false),
+            // delete prefix 1 -> empty
+            (format!("/{tenant_id}"), "DELETE", "", StatusCode::OK, false),
+            (f(t2, "/3/4"), "GET", "", StatusCode::NOT_FOUND, false),
+            (f(t2, "/5/6"), "GET", "", StatusCode::NOT_FOUND, false),
+            (f(t3, "/8/9"), "GET", "", StatusCode::NOT_FOUND, false),
+            (f(t4, "/5/6"), "GET", "", StatusCode::NOT_FOUND, false),
+        ];
+        requests_chain(chain.into_iter(), delete_prefix_token).await;
+    }
+}
diff --git a/object_storage/src/lib.rs b/object_storage/src/lib.rs
new file mode 100644
index 000000000000..989afd4c25aa
--- /dev/null
+++ b/object_storage/src/lib.rs
@@ -0,0 +1,344 @@
+use anyhow::Result;
+use axum::extract::{FromRequestParts, Path};
+use axum::response::{IntoResponse, Response};
+use axum::{RequestPartsExt, http::StatusCode, http::request::Parts};
+use axum_extra::TypedHeader;
+use axum_extra::headers::{Authorization, authorization::Bearer};
+use camino::Utf8PathBuf;
+use jsonwebtoken::{DecodingKey, Validation};
+use remote_storage::{GenericRemoteStorage, RemotePath};
+use serde::{Deserialize, Serialize};
+use std::fmt::Display;
+use std::result::Result as StdResult;
+use std::sync::Arc;
+use tokio_util::sync::CancellationToken;
+use tracing::{debug, error};
+use utils::id::{TenantId, TimelineId};
+
+// simplified version of utils::auth::JwtAuth
+pub struct JwtAuth {
+    decoding_key: DecodingKey,
+    validation: Validation,
+}
+
+pub const VALIDATION_ALGO: jsonwebtoken::Algorithm = jsonwebtoken::Algorithm::EdDSA;
+impl JwtAuth {
+    pub fn new(key: &[u8]) -> Result<Self> {
+        Ok(Self {
+            decoding_key: DecodingKey::from_ed_pem(key)?,
+            validation: Validation::new(VALIDATION_ALGO),
+        })
+    }
+
+    pub fn decode<T: serde::de::DeserializeOwned>(&self, token: &str) -> Result<T> {
+        Ok(jsonwebtoken::decode(token, &self.decoding_key, &self.validation).map(|t| t.claims)?)
+    }
+}
+
+fn normalize_key(key: &str) -> StdResult<Utf8PathBuf, String> {
+    let key = clean_utf8(&Utf8PathBuf::from(key));
+    if key.starts_with("..") || key == "." || key == "/" {
+        return Err(format!("invalid key {key}"));
+    }
+    match key.strip_prefix("/").map(Utf8PathBuf::from) {
+        Ok(p) => Ok(p),
+        _ => Ok(key),
+    }
+}
+
+// Copied from path_clean crate with PathBuf->Utf8PathBuf
+fn clean_utf8(path: &camino::Utf8Path) -> Utf8PathBuf {
+    use camino::Utf8Component as Comp;
+    let mut out = Vec::new();
+    for comp in path.components() {
+        match comp {
+            Comp::CurDir => (),
+            Comp::ParentDir => match out.last() {
+                Some(Comp::RootDir) => (),
+                Some(Comp::Normal(_)) => {
+                    out.pop();
+                }
+                None | Some(Comp::CurDir) | Some(Comp::ParentDir) | Some(Comp::Prefix(_)) => {
+                    out.push(comp)
+                }
+            },
+            comp => out.push(comp),
+        }
+    }
+    if !out.is_empty() {
+        out.iter().collect()
+    } else {
+        Utf8PathBuf::from(".")
+    }
+}
+
+pub struct Storage {
+    pub auth: JwtAuth,
+    pub storage: GenericRemoteStorage,
+    pub cancel: CancellationToken,
+    pub max_upload_file_limit: usize,
+}
+
+pub type EndpointId = String; // If needed, reuse small string from proxy/src/types.rc
+
+#[derive(Deserialize, Serialize, PartialEq)]
+pub struct Claims {
+    pub tenant_id: TenantId,
+    pub timeline_id: TimelineId,
+    pub endpoint_id: EndpointId,
+    pub exp: u64,
+}
+
+impl Display for Claims {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        write!(
+            f,
+            "Claims(tenant_id {} timeline_id {} endpoint_id {} exp {})",
+            self.tenant_id, self.timeline_id, self.endpoint_id, self.exp
+        )
+    }
+}
+
+#[derive(Deserialize, Serialize)]
+struct KeyRequest {
+    tenant_id: TenantId,
+    timeline_id: TimelineId,
+    endpoint_id: EndpointId,
+    path: String,
+}
+
+#[derive(Debug, PartialEq)]
+pub struct S3Path {
+    pub path: RemotePath,
+}
+
+impl TryFrom<&KeyRequest> for S3Path {
+    type Error = String;
+    fn try_from(req: &KeyRequest) -> StdResult<Self, Self::Error> {
+        let KeyRequest {
+            tenant_id,
+            timeline_id,
+            endpoint_id,
+            path,
+        } = &req;
+        let prefix = format!("{tenant_id}/{timeline_id}/{endpoint_id}",);
+        let path = Utf8PathBuf::from(prefix).join(normalize_key(path)?);
+        let path = RemotePath::new(&path).unwrap(); // unwrap() because the path is already relative
+        Ok(S3Path { path })
+    }
+}
+
+fn unauthorized(route: impl Display, claims: impl Display) -> Response {
+    debug!(%route, %claims, "route doesn't match claims");
+    StatusCode::UNAUTHORIZED.into_response()
+}
+
+pub fn bad_request(err: impl Display, desc: &'static str) -> Response {
+    debug!(%err, desc);
+    (StatusCode::BAD_REQUEST, err.to_string()).into_response()
+}
+
+pub fn ok() -> Response {
+    StatusCode::OK.into_response()
+}
+
+pub fn internal_error(err: impl Display, path: impl Display, desc: &'static str) -> Response {
+    error!(%err, %path, desc);
+    StatusCode::INTERNAL_SERVER_ERROR.into_response()
+}
+
+pub fn not_found(key: impl ToString) -> Response {
+    (StatusCode::NOT_FOUND, key.to_string()).into_response()
+}
+
+impl FromRequestParts<Arc<Storage>> for S3Path {
+    type Rejection = Response;
+    async fn from_request_parts(
+        parts: &mut Parts,
+        state: &Arc<Storage>,
+    ) -> Result<Self, Self::Rejection> {
+        let Path(path): Path<KeyRequest> = parts
+            .extract()
+            .await
+            .map_err(|e| bad_request(e, "invalid route"))?;
+        let TypedHeader(Authorization(bearer)) = parts
+            .extract::<TypedHeader<Authorization<Bearer>>>()
+            .await
+            .map_err(|e| bad_request(e, "invalid token"))?;
+        let claims: Claims = state
+            .auth
+            .decode(bearer.token())
+            .map_err(|e| bad_request(e, "decoding token"))?;
+        let route = Claims {
+            tenant_id: path.tenant_id,
+            timeline_id: path.timeline_id,
+            endpoint_id: path.endpoint_id.clone(),
+            exp: claims.exp,
+        };
+        if route != claims {
+            return Err(unauthorized(route, claims));
+        }
+        (&path)
+            .try_into()
+            .map_err(|e| bad_request(e, "invalid route"))
+    }
+}
+
+#[derive(Deserialize, Serialize, PartialEq)]
+pub struct PrefixKeyPath {
+    pub tenant_id: TenantId,
+    pub timeline_id: Option<TimelineId>,
+    pub endpoint_id: Option<EndpointId>,
+}
+
+impl Display for PrefixKeyPath {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        write!(
+            f,
+            "PrefixKeyPath(tenant_id {} timeline_id {} endpoint_id {})",
+            self.tenant_id,
+            self.timeline_id
+                .as_ref()
+                .map(ToString::to_string)
+                .unwrap_or("".to_string()),
+            self.endpoint_id
+                .as_ref()
+                .map(ToString::to_string)
+                .unwrap_or("".to_string())
+        )
+    }
+}
+
+#[derive(Debug, PartialEq)]
+pub struct PrefixS3Path {
+    pub path: RemotePath,
+}
+
+impl From<&PrefixKeyPath> for PrefixS3Path {
+    fn from(path: &PrefixKeyPath) -> Self {
+        let timeline_id = path
+            .timeline_id
+            .as_ref()
+            .map(ToString::to_string)
+            .unwrap_or("".to_string());
+        let endpoint_id = path
+            .endpoint_id
+            .as_ref()
+            .map(ToString::to_string)
+            .unwrap_or("".to_string());
+        let path = Utf8PathBuf::from(path.tenant_id.to_string())
+            .join(timeline_id)
+            .join(endpoint_id);
+        let path = RemotePath::new(&path).unwrap(); // unwrap() because the path is already relative
+        PrefixS3Path { path }
+    }
+}
+
+impl FromRequestParts<Arc<Storage>> for PrefixS3Path {
+    type Rejection = Response;
+    async fn from_request_parts(
+        parts: &mut Parts,
+        state: &Arc<Storage>,
+    ) -> Result<Self, Self::Rejection> {
+        let Path(path) = parts
+            .extract::<Path<PrefixKeyPath>>()
+            .await
+            .map_err(|e| bad_request(e, "invalid route"))?;
+        let TypedHeader(Authorization(bearer)) = parts
+            .extract::<TypedHeader<Authorization<Bearer>>>()
+            .await
+            .map_err(|e| bad_request(e, "invalid token"))?;
+        let claims: PrefixKeyPath = state
+            .auth
+            .decode(bearer.token())
+            .map_err(|e| bad_request(e, "invalid token"))?;
+        if path != claims {
+            return Err(unauthorized(path, claims));
+        }
+        Ok((&path).into())
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn normalize_key() {
+        let f = super::normalize_key;
+        assert_eq!(f("hello/world/..").unwrap(), Utf8PathBuf::from("hello"));
+        assert_eq!(
+            f("ololo/1/../../not_ololo").unwrap(),
+            Utf8PathBuf::from("not_ololo")
+        );
+        assert!(f("ololo/1/../../../").is_err());
+        assert!(f(".").is_err());
+        assert!(f("../").is_err());
+        assert!(f("").is_err());
+        assert_eq!(f("/1/2/3").unwrap(), Utf8PathBuf::from("1/2/3"));
+        assert!(f("/1/2/3/../../../").is_err());
+        assert!(f("/1/2/3/../../../../").is_err());
+    }
+
+    const TENANT_ID: TenantId =
+        TenantId::from_array([1, 1, 2, 3, 4, 5, 6, 7, 8, 9, 1, 2, 3, 4, 5, 6]);
+    const TIMELINE_ID: TimelineId =
+        TimelineId::from_array([1, 1, 2, 3, 4, 5, 6, 7, 8, 9, 1, 2, 3, 4, 5, 7]);
+    const ENDPOINT_ID: &str = "ep-winter-frost-a662z3vg";
+
+    #[test]
+    fn s3_path() {
+        let auth = Claims {
+            tenant_id: TENANT_ID,
+            timeline_id: TIMELINE_ID,
+            endpoint_id: ENDPOINT_ID.into(),
+            exp: u64::MAX,
+        };
+        let s3_path = |key| {
+            let path = &format!("{TENANT_ID}/{TIMELINE_ID}/{ENDPOINT_ID}/{key}");
+            let path = RemotePath::from_string(path).unwrap();
+            S3Path { path }
+        };
+
+        let path = "cache_key".to_string();
+        let mut key_path = KeyRequest {
+            path,
+            tenant_id: auth.tenant_id,
+            timeline_id: auth.timeline_id,
+            endpoint_id: auth.endpoint_id,
+        };
+        assert_eq!(S3Path::try_from(&key_path).unwrap(), s3_path(key_path.path));
+
+        key_path.path = "we/can/have/nested/paths".to_string();
+        assert_eq!(S3Path::try_from(&key_path).unwrap(), s3_path(key_path.path));
+
+        key_path.path = "../error/hello/../".to_string();
+        assert!(S3Path::try_from(&key_path).is_err());
+    }
+
+    #[test]
+    fn prefix_s3_path() {
+        let mut path = PrefixKeyPath {
+            tenant_id: TENANT_ID,
+            timeline_id: None,
+            endpoint_id: None,
+        };
+        let prefix_path = |s: String| RemotePath::from_string(&s).unwrap();
+        assert_eq!(
+            PrefixS3Path::from(&path).path,
+            prefix_path(format!("{TENANT_ID}"))
+        );
+
+        path.timeline_id = Some(TIMELINE_ID);
+        assert_eq!(
+            PrefixS3Path::from(&path).path,
+            prefix_path(format!("{TENANT_ID}/{TIMELINE_ID}"))
+        );
+
+        path.endpoint_id = Some(ENDPOINT_ID.into());
+        assert_eq!(
+            PrefixS3Path::from(&path).path,
+            prefix_path(format!("{TENANT_ID}/{TIMELINE_ID}/{ENDPOINT_ID}"))
+        );
+    }
+}
diff --git a/object_storage/src/main.rs b/object_storage/src/main.rs
new file mode 100644
index 000000000000..40325db19de4
--- /dev/null
+++ b/object_storage/src/main.rs
@@ -0,0 +1,65 @@
+//! `object_storage` is a service which provides API for uploading and downloading
+//! files. It is used by compute and control plane for accessing LFC prewarm data.
+//! This service is deployed either as a separate component or as part of compute image
+//! for large computes.
+mod app;
+use anyhow::Context;
+use tracing::info;
+use utils::logging;
+
+//see set()
+const fn max_upload_file_limit() -> usize {
+    100 * 1024 * 1024
+}
+
+#[derive(serde::Deserialize)]
+#[serde(tag = "type")]
+struct Config {
+    listen: std::net::SocketAddr,
+    pemfile: camino::Utf8PathBuf,
+    #[serde(flatten)]
+    storage_config: remote_storage::RemoteStorageConfig,
+    #[serde(default = "max_upload_file_limit")]
+    max_upload_file_limit: usize,
+}
+
+#[tokio::main]
+async fn main() -> anyhow::Result<()> {
+    logging::init(
+        logging::LogFormat::Plain,
+        logging::TracingErrorLayerEnablement::EnableWithRustLogFilter,
+        logging::Output::Stdout,
+    )?;
+
+    let config: String = std::env::args().skip(1).take(1).collect();
+    if config.is_empty() {
+        anyhow::bail!("Usage: object_storage config.json")
+    }
+    info!("Reading config from {config}");
+    let config = std::fs::read_to_string(config.clone())?;
+    let config: Config = serde_json::from_str(&config).context("parsing config")?;
+    info!("Reading pemfile from {}", config.pemfile.clone());
+    let pemfile = std::fs::read(config.pemfile.clone())?;
+    info!("Loading public key from {}", config.pemfile.clone());
+    let auth = object_storage::JwtAuth::new(&pemfile)?;
+
+    let listener = tokio::net::TcpListener::bind(config.listen).await.unwrap();
+    info!("listening on {}", listener.local_addr().unwrap());
+
+    let storage = remote_storage::GenericRemoteStorage::from_config(&config.storage_config).await?;
+    let cancel = tokio_util::sync::CancellationToken::new();
+    app::check_storage_permissions(&storage, cancel.clone()).await?;
+
+    let proxy = std::sync::Arc::new(object_storage::Storage {
+        auth,
+        storage,
+        cancel: cancel.clone(),
+        max_upload_file_limit: config.max_upload_file_limit,
+    });
+
+    tokio::spawn(utils::signals::signal_handler(cancel.clone()));
+    axum::serve(listener, app::app(proxy))
+        .with_graceful_shutdown(async move { cancel.cancelled().await })
+        .await?;
+    Ok(())
+}
diff --git a/pageserver/Cargo.toml b/pageserver/Cargo.toml
index 56d97bf8a9d3..74f3fce6e5c3 100644
--- a/pageserver/Cargo.toml
+++ b/pageserver/Cargo.toml
@@ -10,6 +10,8 @@ default = []
 # which adds some runtime cost to run tests on outage conditions
 testing = ["fail/failpoints", "pageserver_api/testing", "wal_decoder/testing", "pageserver_client/testing"]
 
+fuzz-read-path = ["testing"]
+
 [dependencies]
 anyhow.workspace = true
 arc-swap.workspace = true
diff --git a/pageserver/benches/bench_ingest.rs b/pageserver/benches/bench_ingest.rs
index 000938b18917..3108b5351f75 100644
--- a/pageserver/benches/bench_ingest.rs
+++ b/pageserver/benches/bench_ingest.rs
@@ -126,7 +126,7 @@ async fn ingest(
             max_concurrency: NonZeroUsize::new(1).unwrap(),
         });
         let (_desc, path) = layer
-            .write_to_disk(&ctx, None, l0_flush_state.inner())
+            .write_to_disk(&ctx, None, l0_flush_state.inner(), &gate, cancel.clone())
             .await?
             .unwrap();
         tokio::fs::remove_file(path).await?;
diff --git a/pageserver/benches/bench_walredo.rs b/pageserver/benches/bench_walredo.rs
index 77b3f90b3ea7..215682d90c04 100644
--- a/pageserver/benches/bench_walredo.rs
+++ b/pageserver/benches/bench_walredo.rs
@@ -65,7 +65,7 @@ use bytes::{Buf, Bytes};
 use criterion::{BenchmarkId, Criterion};
 use once_cell::sync::Lazy;
 use pageserver::config::PageServerConf;
-use pageserver::walredo::PostgresRedoManager;
+use pageserver::walredo::{PostgresRedoManager, RedoAttemptType};
 use pageserver_api::key::Key;
 use pageserver_api::record::NeonWalRecord;
 use pageserver_api::shard::TenantShardId;
@@ -223,7 +223,14 @@ impl Request {
 
         // TODO: avoid these clones
         manager
-            .request_redo(*key, *lsn, base_img.clone(), records.clone(), *pg_version)
+            .request_redo(
+                *key,
+                *lsn,
+                base_img.clone(),
+                records.clone(),
+                *pg_version,
+                RedoAttemptType::ReadPage,
+            )
             .await
             .context("request_redo")
     }
diff --git a/pageserver/src/basebackup.rs b/pageserver/src/basebackup.rs
index de527e307b1d..3510ccb52915 100644
--- a/pageserver/src/basebackup.rs
+++ b/pageserver/src/basebackup.rs
@@ -34,7 +34,7 @@ use utils::lsn::Lsn;
 use crate::context::RequestContext;
 use crate::pgdatadir_mapping::Version;
 use crate::tenant::storage_layer::IoConcurrency;
-use crate::tenant::timeline::GetVectoredError;
+use crate::tenant::timeline::{GetVectoredError, VersionedKeySpaceQuery};
 use crate::tenant::{PageReconstructError, Timeline};
 
 #[derive(Debug, thiserror::Error)]
@@ -353,9 +353,10 @@ where
             let mut slru_builder = SlruSegmentsBuilder::new(&mut self.ar);
 
             for part in slru_partitions.parts {
+                let query = VersionedKeySpaceQuery::uniform(part, self.lsn);
                 let blocks = self
                     .timeline
-                    .get_vectored(part, self.lsn, self.io_concurrency.clone(), self.ctx)
+                    .get_vectored(query, self.io_concurrency.clone(), self.ctx)
                     .await?;
 
                 for (key, block) in blocks {
diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs
index 9a8494292d5b..250d4180f5ce 100644
--- a/pageserver/src/bin/pageserver.rs
+++ b/pageserver/src/bin/pageserver.rs
@@ -31,7 +31,6 @@ use pageserver::{
 };
 use postgres_backend::AuthType;
 use remote_storage::GenericRemoteStorage;
-use tokio::signal::unix::SignalKind;
 use tokio::time::Instant;
 use tokio_util::sync::CancellationToken;
 use tracing::*;
@@ -453,6 +452,24 @@ fn start_pageserver(
     info!("Using auth for http API: {:#?}", conf.http_auth_type);
     info!("Using auth for pg connections: {:#?}", conf.pg_auth_type);
 
+    let tls_server_config = if conf.listen_https_addr.is_some() || conf.enable_tls_page_service_api
+    {
+        let resolver = BACKGROUND_RUNTIME.block_on(ReloadingCertificateResolver::new(
+            "main",
+            &conf.ssl_key_file,
+            &conf.ssl_cert_file,
+            conf.ssl_cert_reload_period,
+        ))?;
+
+        let server_config = rustls::ServerConfig::builder()
+            .with_no_client_auth()
+            .with_cert_resolver(resolver);
+
+        Some(Arc::new(server_config))
+    } else {
+        None
+    };
+
     match var("NEON_AUTH_TOKEN") {
         Ok(v) => {
             info!("Loaded JWT token for authentication with Safekeeper");
@@ -671,17 +688,11 @@ fn start_pageserver(
 
         let https_task = match https_listener {
             Some(https_listener) => {
-                let resolver = MGMT_REQUEST_RUNTIME.block_on(ReloadingCertificateResolver::new(
-                    &conf.ssl_key_file,
-                    &conf.ssl_cert_file,
-                    conf.ssl_cert_reload_period,
-                ))?;
-
-                let server_config = rustls::ServerConfig::builder()
-                    .with_no_client_auth()
-                    .with_cert_resolver(resolver);
+                let tls_server_config = tls_server_config
+                    .clone()
+                    .expect("tls_server_config is set earlier if https is enabled");
 
-                let tls_acceptor = tokio_rustls::TlsAcceptor::from(Arc::new(server_config));
+                let tls_acceptor = tokio_rustls::TlsAcceptor::from(tls_server_config);
 
                 let server =
                     http_utils::server::Server::new(service, https_listener, Some(tls_acceptor))?;
@@ -737,6 +748,11 @@ fn start_pageserver(
             tokio::net::TcpListener::from_std(pageserver_listener)
                 .context("create tokio listener")?
         },
+        if conf.enable_tls_page_service_api {
+            tls_server_config
+        } else {
+            None
+        },
     );
 
     // All started up! Now just sit and wait for shutdown signal.
@@ -744,32 +760,7 @@ fn start_pageserver(
         let signal_token = CancellationToken::new();
         let signal_cancel = signal_token.child_token();
 
-        // Spawn signal handlers. Runs in a loop since we want to be responsive to multiple signals
-        // even after triggering shutdown (e.g. a SIGQUIT after a slow SIGTERM shutdown). See:
-        // https://github.com/neondatabase/neon/issues/9740.
-        tokio::spawn(async move {
-            let mut sigint = tokio::signal::unix::signal(SignalKind::interrupt()).unwrap();
-            let mut sigterm = tokio::signal::unix::signal(SignalKind::terminate()).unwrap();
-            let mut sigquit = tokio::signal::unix::signal(SignalKind::quit()).unwrap();
-
-            loop {
-                let signal = tokio::select! {
-                    _ = sigquit.recv() => {
-                        info!("Got signal SIGQUIT. Terminating in immediate shutdown mode.");
-                        std::process::exit(111);
-                    }
-                    _ = sigint.recv() => "SIGINT",
-                    _ = sigterm.recv() => "SIGTERM",
-                };
-
-                if !signal_token.is_cancelled() {
-                    info!("Got signal {signal}. Terminating gracefully in fast shutdown mode.");
-                    signal_token.cancel();
-                } else {
-                    info!("Got signal {signal}. Already shutting down.");
-                }
-            }
-        });
+        tokio::spawn(utils::signals::signal_handler(signal_token));
 
         // Wait for cancellation signal and shut down the pageserver.
         //
diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs
index ccc29e59d4ba..26ae6af70e61 100644
--- a/pageserver/src/config.rs
+++ b/pageserver/src/config.rs
@@ -219,6 +219,11 @@ pub struct PageServerConf {
     pub generate_unarchival_heatmap: bool,
 
     pub tracing: Option<pageserver_api::config::Tracing>,
+
+    /// Enable TLS in page service API.
+    /// Does not force TLS: the client negotiates TLS usage during the handshake.
+    /// Uses key and certificate from ssl_key_file/ssl_cert_file.
+    pub enable_tls_page_service_api: bool,
 }
 
 /// Token for authentication to safekeepers
@@ -391,6 +396,7 @@ impl PageServerConf {
             load_previous_heatmap,
             generate_unarchival_heatmap,
             tracing,
+            enable_tls_page_service_api,
         } = config_toml;
 
         let mut conf = PageServerConf {
@@ -441,6 +447,7 @@ impl PageServerConf {
             page_service_pipelining,
             get_vectored_concurrent_io,
             tracing,
+            enable_tls_page_service_api,
 
             // ------------------------------------------------------------
             // fields that require additional validation or custom handling
diff --git a/pageserver/src/http/openapi_spec.yml b/pageserver/src/http/openapi_spec.yml
index 566086c5270f..7ea148971f4a 100644
--- a/pageserver/src/http/openapi_spec.yml
+++ b/pageserver/src/http/openapi_spec.yml
@@ -212,6 +212,12 @@ paths:
               schema:
                 type: string
                 format: date-time
+        "412":
+          description: No timestamp is found for given LSN, e.g. if there had been no commits till LSN
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/PreconditionFailedError"
 
   /v1/tenant/{tenant_id}/timeline/{timeline_id}/get_lsn_by_timestamp:
     parameters:
diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs
index cf67dc596ada..bbc4bfae1b16 100644
--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -67,7 +67,7 @@ use crate::tenant::mgr::{
 };
 use crate::tenant::remote_timeline_client::index::GcCompactionState;
 use crate::tenant::remote_timeline_client::{
-    download_index_part, list_remote_tenant_shards, list_remote_timelines,
+    download_index_part, download_tenant_manifest, list_remote_tenant_shards, list_remote_timelines,
 };
 use crate::tenant::secondary::SecondaryController;
 use crate::tenant::size::ModelInputs;
@@ -989,7 +989,7 @@ async fn get_lsn_by_timestamp_handler(
     if !tenant_shard_id.is_shard_zero() {
         // Requires SLRU contents, which are only stored on shard zero
         return Err(ApiError::BadRequest(anyhow!(
-            "Size calculations are only available on shard zero"
+            "Lsn calculations by timestamp are only available on shard zero"
         )));
     }
 
@@ -1064,7 +1064,7 @@ async fn get_timestamp_of_lsn_handler(
     if !tenant_shard_id.is_shard_zero() {
         // Requires SLRU contents, which are only stored on shard zero
         return Err(ApiError::BadRequest(anyhow!(
-            "Size calculations are only available on shard zero"
+            "Timestamp calculations by lsn are only available on shard zero"
         )));
     }
 
@@ -1090,8 +1090,8 @@ async fn get_timestamp_of_lsn_handler(
             .to_string();
             json_response(StatusCode::OK, time)
         }
-        None => Err(ApiError::NotFound(
-            anyhow::anyhow!("Timestamp for lsn {} not found", lsn).into(),
+        None => Err(ApiError::PreconditionFailed(
+            format!("Timestamp for lsn {} not found", lsn).into(),
         )),
     }
 }
@@ -2274,6 +2274,7 @@ async fn timeline_compact_handler(
     if Some(true) == parse_query_param::<_, bool>(&request, "dry_run")? {
         flags |= CompactFlags::DryRun;
     }
+    // Manual compaction does not yield for L0.
 
     let wait_until_uploaded =
         parse_query_param::<_, bool>(&request, "wait_until_uploaded")?.unwrap_or(false);
@@ -2911,9 +2912,22 @@ async fn tenant_scan_remote_handler(
             };
         }
 
+        let result =
+            download_tenant_manifest(&state.remote_storage, &tenant_shard_id, generation, &cancel)
+                .instrument(info_span!("download_tenant_manifest",
+                            tenant_id=%tenant_shard_id.tenant_id,
+                            shard_id=%tenant_shard_id.shard_slug()))
+                .await;
+        let stripe_size = match result {
+            Ok((manifest, _, _)) => manifest.stripe_size,
+            Err(DownloadError::NotFound) => None,
+            Err(err) => return Err(ApiError::InternalServerError(anyhow!(err))),
+        };
+
         response.shards.push(TenantScanRemoteStorageShard {
             tenant_shard_id,
             generation: generation.into(),
+            stripe_size,
         });
     }
 
@@ -3239,7 +3253,7 @@ async fn ingest_aux_files(
         modification
             .put_file(&fname, content.as_bytes(), &ctx)
             .await
-            .map_err(ApiError::InternalServerError)?;
+            .map_err(|e| ApiError::InternalServerError(e.into()))?;
     }
     modification
         .commit(&ctx)
@@ -3368,11 +3382,11 @@ async fn put_tenant_timeline_import_basebackup(
 
         let broker_client = state.broker_client.clone();
 
-        let mut body = StreamReader::new(request.into_body().map(|res| {
-            res.map_err(|error| {
-                std::io::Error::new(std::io::ErrorKind::Other, anyhow::anyhow!(error))
-            })
-        }));
+        let mut body = StreamReader::new(
+            request
+                .into_body()
+                .map(|res| res.map_err(|error| std::io::Error::other(anyhow::anyhow!(error)))),
+        );
 
         tenant.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await?;
 
@@ -3446,7 +3460,7 @@ async fn put_tenant_timeline_import_wal(
 
         let mut body = StreamReader::new(request.into_body().map(|res| {
             res.map_err(|error| {
-                std::io::Error::new(std::io::ErrorKind::Other, anyhow::anyhow!(error))
+                std::io::Error::other( anyhow::anyhow!(error))
             })
         }));
 
diff --git a/pageserver/src/import_datadir.rs b/pageserver/src/import_datadir.rs
index 6dd005de5019..911449c7c503 100644
--- a/pageserver/src/import_datadir.rs
+++ b/pageserver/src/import_datadir.rs
@@ -27,7 +27,7 @@ use crate::context::RequestContext;
 use crate::metrics::WAL_INGEST;
 use crate::pgdatadir_mapping::*;
 use crate::tenant::Timeline;
-use crate::walingest::WalIngest;
+use crate::walingest::{WalIngest, WalIngestErrorKind};
 
 // Returns checkpoint LSN from controlfile
 pub fn get_lsn_from_controlfile(path: &Utf8Path) -> Result<Lsn> {
@@ -157,9 +157,9 @@ async fn import_rel(
         .put_rel_creation(rel, nblocks as u32, ctx)
         .await
     {
-        match e {
-            RelationError::AlreadyExists => {
-                debug!("Relation {} already exist. We must be extending it.", rel)
+        match e.kind {
+            WalIngestErrorKind::RelationAlreadyExists(rel) => {
+                debug!("Relation {rel} already exists. We must be extending it.")
             }
             _ => return Err(e.into()),
         }
diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs
index 1fe51021fdeb..2a779b0daaf9 100644
--- a/pageserver/src/metrics.rs
+++ b/pageserver/src/metrics.rs
@@ -17,7 +17,7 @@ use metrics::{
 use once_cell::sync::Lazy;
 use pageserver_api::config::{
     PageServicePipeliningConfig, PageServicePipeliningConfigPipelined,
-    PageServiceProtocolPipelinedExecutionStrategy,
+    PageServiceProtocolPipelinedBatchingStrategy, PageServiceProtocolPipelinedExecutionStrategy,
 };
 use pageserver_api::models::InMemoryLayerInfo;
 use pageserver_api::shard::TenantShardId;
@@ -1714,6 +1714,28 @@ pub enum SmgrQueryType {
     Test,
 }
 
+#[derive(
+    Debug,
+    Clone,
+    Copy,
+    IntoStaticStr,
+    strum_macros::EnumCount,
+    strum_macros::EnumIter,
+    strum_macros::FromRepr,
+    enum_map::Enum,
+)]
+#[strum(serialize_all = "snake_case")]
+pub enum GetPageBatchBreakReason {
+    BatchFull,
+    NonBatchableRequest,
+    NonUniformLsn,
+    SamePageAtDifferentLsn,
+    NonUniformTimeline,
+    ExecutorSteal,
+    #[cfg(feature = "testing")]
+    NonUniformKey,
+}
+
 pub(crate) struct SmgrQueryTimePerTimeline {
     global_started: [IntCounter; SmgrQueryType::COUNT],
     global_latency: [Histogram; SmgrQueryType::COUNT],
@@ -1725,6 +1747,8 @@ pub(crate) struct SmgrQueryTimePerTimeline {
     per_timeline_flush_in_progress_micros: IntCounter,
     global_batch_wait_time: Histogram,
     per_timeline_batch_wait_time: Histogram,
+    global_batch_break_reason: [IntCounter; GetPageBatchBreakReason::COUNT],
+    per_timeline_batch_break_reason: GetPageBatchBreakReasonTimelineMetrics,
     throttling: Arc<tenant_throttling::Pagestream>,
 }
 
@@ -1858,12 +1882,55 @@ static PAGE_SERVICE_BATCH_SIZE_PER_TENANT_TIMELINE: Lazy<HistogramVec> = Lazy::n
     .expect("failed to define a metric")
 });
 
+static PAGE_SERVICE_BATCH_BREAK_REASON_GLOBAL: Lazy<IntCounterVec> = Lazy::new(|| {
+    register_int_counter_vec!(
+        // it's a counter, but, name is prepared to extend it to a histogram of queue depth
+        "pageserver_page_service_batch_break_reason_global",
+        "Reason for breaking batches of get page requests",
+        &["reason"],
+    )
+    .expect("failed to define a metric")
+});
+
+struct GetPageBatchBreakReasonTimelineMetrics {
+    map: EnumMap<GetPageBatchBreakReason, IntCounter>,
+}
+
+impl GetPageBatchBreakReasonTimelineMetrics {
+    fn new(tenant_id: &str, shard_slug: &str, timeline_id: &str) -> Self {
+        GetPageBatchBreakReasonTimelineMetrics {
+            map: EnumMap::from_array(std::array::from_fn(|reason_idx| {
+                let reason = GetPageBatchBreakReason::from_usize(reason_idx);
+                PAGE_SERVICE_BATCH_BREAK_REASON_PER_TENANT_TIMELINE.with_label_values(&[
+                    tenant_id,
+                    shard_slug,
+                    timeline_id,
+                    reason.into(),
+                ])
+            })),
+        }
+    }
+
+    fn inc(&self, reason: GetPageBatchBreakReason) {
+        self.map[reason].inc()
+    }
+}
+
+static PAGE_SERVICE_BATCH_BREAK_REASON_PER_TENANT_TIMELINE: Lazy<IntCounterVec> = Lazy::new(|| {
+    register_int_counter_vec!(
+        "pageserver_page_service_batch_break_reason",
+        "Reason for breaking batches of get page requests",
+        &["tenant_id", "shard_id", "timeline_id", "reason"],
+    )
+    .expect("failed to define a metric")
+});
+
 pub(crate) static PAGE_SERVICE_CONFIG_MAX_BATCH_SIZE: Lazy<IntGaugeVec> = Lazy::new(|| {
     register_int_gauge_vec!(
         "pageserver_page_service_config_max_batch_size",
         "Configured maximum batch size for the server-side batching functionality of page_service. \
          Labels expose more of the configuration parameters.",
-        &["mode", "execution"]
+        &["mode", "execution", "batching"]
     )
     .expect("failed to define a metric")
 });
@@ -1871,10 +1938,11 @@ pub(crate) static PAGE_SERVICE_CONFIG_MAX_BATCH_SIZE: Lazy<IntGaugeVec> = Lazy::
 fn set_page_service_config_max_batch_size(conf: &PageServicePipeliningConfig) {
     PAGE_SERVICE_CONFIG_MAX_BATCH_SIZE.reset();
     let (label_values, value) = match conf {
-        PageServicePipeliningConfig::Serial => (["serial", "-"], 1),
+        PageServicePipeliningConfig::Serial => (["serial", "-", "-"], 1),
         PageServicePipeliningConfig::Pipelined(PageServicePipeliningConfigPipelined {
             max_batch_size,
             execution,
+            batching,
         }) => {
             let mode = "pipelined";
             let execution = match execution {
@@ -1883,7 +1951,12 @@ fn set_page_service_config_max_batch_size(conf: &PageServicePipeliningConfig) {
                 }
                 PageServiceProtocolPipelinedExecutionStrategy::Tasks => "tasks",
             };
-            ([mode, execution], max_batch_size.get())
+            let batching = match batching {
+                PageServiceProtocolPipelinedBatchingStrategy::UniformLsn => "uniform-lsn",
+                PageServiceProtocolPipelinedBatchingStrategy::ScatteredLsn => "scattered-lsn",
+            };
+
+            ([mode, execution, batching], max_batch_size.get())
         }
     };
     PAGE_SERVICE_CONFIG_MAX_BATCH_SIZE
@@ -1979,6 +2052,15 @@ impl SmgrQueryTimePerTimeline {
             .get_metric_with_label_values(&[&tenant_id, &shard_slug, &timeline_id])
             .unwrap();
 
+        let global_batch_break_reason = std::array::from_fn(|i| {
+            let reason = GetPageBatchBreakReason::from_usize(i);
+            PAGE_SERVICE_BATCH_BREAK_REASON_GLOBAL
+                .get_metric_with_label_values(&[reason.into()])
+                .unwrap()
+        });
+        let per_timeline_batch_break_reason =
+            GetPageBatchBreakReasonTimelineMetrics::new(&tenant_id, &shard_slug, &timeline_id);
+
         let global_flush_in_progress_micros =
             PAGE_SERVICE_SMGR_FLUSH_INPROGRESS_MICROS_GLOBAL.clone();
         let per_timeline_flush_in_progress_micros = PAGE_SERVICE_SMGR_FLUSH_INPROGRESS_MICROS
@@ -1996,6 +2078,8 @@ impl SmgrQueryTimePerTimeline {
             per_timeline_flush_in_progress_micros,
             global_batch_wait_time,
             per_timeline_batch_wait_time,
+            global_batch_break_reason,
+            per_timeline_batch_break_reason,
             throttling: pagestream_throttle_metrics,
         }
     }
@@ -2024,9 +2108,16 @@ impl SmgrQueryTimePerTimeline {
     }
 
     /// TODO: do something about this? seems odd, we have a similar call on SmgrOpTimer
-    pub(crate) fn observe_getpage_batch_start(&self, batch_size: usize) {
+    pub(crate) fn observe_getpage_batch_start(
+        &self,
+        batch_size: usize,
+        break_reason: GetPageBatchBreakReason,
+    ) {
         self.global_batch_size.observe(batch_size as f64);
         self.per_timeline_batch_size.observe(batch_size as f64);
+
+        self.global_batch_break_reason[break_reason.into_usize()].inc();
+        self.per_timeline_batch_break_reason.inc(break_reason);
     }
 }
 
@@ -3392,6 +3483,15 @@ impl TimelineMetrics {
             shard_id,
             timeline_id,
         ]);
+
+        for reason in GetPageBatchBreakReason::iter() {
+            let _ = PAGE_SERVICE_BATCH_BREAK_REASON_PER_TENANT_TIMELINE.remove_label_values(&[
+                tenant_id,
+                shard_id,
+                timeline_id,
+                reason.into(),
+            ]);
+        }
     }
 }
 
@@ -4270,6 +4370,7 @@ pub fn preinitialize_metrics(
     [
         &BACKGROUND_LOOP_PERIOD_OVERRUN_COUNT,
         &SMGR_QUERY_STARTED_GLOBAL,
+        &PAGE_SERVICE_BATCH_BREAK_REASON_GLOBAL,
     ]
     .into_iter()
     .for_each(|c| {
diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs
index 7e3991dbdce7..7a62d8049ba3 100644
--- a/pageserver/src/page_service.rs
+++ b/pageserver/src/page_service.rs
@@ -18,7 +18,7 @@ use itertools::Itertools;
 use once_cell::sync::OnceCell;
 use pageserver_api::config::{
     PageServicePipeliningConfig, PageServicePipeliningConfigPipelined,
-    PageServiceProtocolPipelinedExecutionStrategy,
+    PageServiceProtocolPipelinedBatchingStrategy, PageServiceProtocolPipelinedExecutionStrategy,
 };
 use pageserver_api::key::rel_block_to_key;
 use pageserver_api::models::{
@@ -58,8 +58,8 @@ use crate::context::{
     DownloadBehavior, PerfInstrumentFutureExt, RequestContext, RequestContextBuilder,
 };
 use crate::metrics::{
-    self, COMPUTE_COMMANDS_COUNTERS, ComputeCommandKind, LIVE_CONNECTIONS, SmgrOpTimer,
-    TimelineMetrics,
+    self, COMPUTE_COMMANDS_COUNTERS, ComputeCommandKind, GetPageBatchBreakReason, LIVE_CONNECTIONS,
+    SmgrOpTimer, TimelineMetrics,
 };
 use crate::pgdatadir_mapping::Version;
 use crate::span::{
@@ -105,6 +105,7 @@ pub fn spawn(
     pg_auth: Option<Arc<SwappableJwtAuth>>,
     perf_trace_dispatch: Option<Dispatch>,
     tcp_listener: tokio::net::TcpListener,
+    tls_config: Option<Arc<rustls::ServerConfig>>,
 ) -> Listener {
     let cancel = CancellationToken::new();
     let libpq_ctx = RequestContext::todo_child(
@@ -124,6 +125,7 @@ pub fn spawn(
             perf_trace_dispatch,
             tcp_listener,
             conf.pg_auth_type,
+            tls_config,
             conf.page_service_pipelining.clone(),
             libpq_ctx,
             cancel.clone(),
@@ -181,6 +183,7 @@ pub async fn libpq_listener_main(
     perf_trace_dispatch: Option<Dispatch>,
     listener: tokio::net::TcpListener,
     auth_type: AuthType,
+    tls_config: Option<Arc<rustls::ServerConfig>>,
     pipelining_config: PageServicePipeliningConfig,
     listener_ctx: RequestContext,
     listener_cancel: CancellationToken,
@@ -223,6 +226,7 @@ pub async fn libpq_listener_main(
                     local_auth,
                     socket,
                     auth_type,
+                    tls_config.clone(),
                     pipelining_config.clone(),
                     connection_ctx,
                     connections_cancel.child_token(),
@@ -264,6 +268,7 @@ async fn page_service_conn_main(
     auth: Option<Arc<SwappableJwtAuth>>,
     socket: tokio::net::TcpStream,
     auth_type: AuthType,
+    tls_config: Option<Arc<rustls::ServerConfig>>,
     pipelining_config: PageServicePipeliningConfig,
     connection_ctx: RequestContext,
     cancel: CancellationToken,
@@ -334,7 +339,8 @@ async fn page_service_conn_main(
         cancel.clone(),
         gate_guard,
     );
-    let pgbackend = PostgresBackend::new_from_io(socket_fd, socket, peer_addr, auth_type, None)?;
+    let pgbackend =
+        PostgresBackend::new_from_io(socket_fd, socket, peer_addr, auth_type, tls_config)?;
 
     match pgbackend.run(&mut conn_handler, &cancel).await {
         Ok(()) => {
@@ -635,6 +641,7 @@ impl std::fmt::Display for BatchedPageStreamError {
 struct BatchedGetPageRequest {
     req: PagestreamGetPageRequest,
     timer: SmgrOpTimer,
+    effective_request_lsn: Lsn,
     ctx: RequestContext,
 }
 
@@ -664,8 +671,8 @@ enum BatchedFeMessage {
     GetPage {
         span: Span,
         shard: timeline::handle::WeakHandle<TenantManagerTypes>,
-        effective_request_lsn: Lsn,
         pages: smallvec::SmallVec<[BatchedGetPageRequest; 1]>,
+        batch_break_reason: GetPageBatchBreakReason,
     },
     DbSize {
         span: Span,
@@ -718,6 +725,119 @@ impl BatchedFeMessage {
             BatchedFeMessage::RespondError { .. } => {}
         }
     }
+
+    fn should_break_batch(
+        &self,
+        other: &BatchedFeMessage,
+        max_batch_size: NonZeroUsize,
+        batching_strategy: PageServiceProtocolPipelinedBatchingStrategy,
+    ) -> Option<GetPageBatchBreakReason> {
+        match (self, other) {
+            (
+                BatchedFeMessage::GetPage {
+                    shard: accum_shard,
+                    pages: accum_pages,
+                    ..
+                },
+                BatchedFeMessage::GetPage {
+                    shard: this_shard,
+                    pages: this_pages,
+                    ..
+                },
+            ) => {
+                assert_eq!(this_pages.len(), 1);
+                if accum_pages.len() >= max_batch_size.get() {
+                    trace!(%max_batch_size, "stopping batching because of batch size");
+                    assert_eq!(accum_pages.len(), max_batch_size.get());
+
+                    return Some(GetPageBatchBreakReason::BatchFull);
+                }
+                if !accum_shard.is_same_handle_as(this_shard) {
+                    trace!("stopping batching because timeline object mismatch");
+                    // TODO: we _could_ batch & execute each shard seperately (and in parallel).
+                    // But the current logic for keeping responses in order does not support that.
+
+                    return Some(GetPageBatchBreakReason::NonUniformTimeline);
+                }
+
+                match batching_strategy {
+                    PageServiceProtocolPipelinedBatchingStrategy::UniformLsn => {
+                        if let Some(last_in_batch) = accum_pages.last() {
+                            if last_in_batch.effective_request_lsn
+                                != this_pages[0].effective_request_lsn
+                            {
+                                trace!(
+                                    accum_lsn = %last_in_batch.effective_request_lsn,
+                                    this_lsn = %this_pages[0].effective_request_lsn,
+                                    "stopping batching because LSN changed"
+                                );
+
+                                return Some(GetPageBatchBreakReason::NonUniformLsn);
+                            }
+                        }
+                    }
+                    PageServiceProtocolPipelinedBatchingStrategy::ScatteredLsn => {
+                        // The read path doesn't curently support serving the same page at different LSNs.
+                        // While technically possible, it's uncertain if the complexity is worth it.
+                        // Break the batch if such a case is encountered.
+                        let same_page_different_lsn = accum_pages.iter().any(|batched| {
+                            batched.req.rel == this_pages[0].req.rel
+                                && batched.req.blkno == this_pages[0].req.blkno
+                                && batched.effective_request_lsn
+                                    != this_pages[0].effective_request_lsn
+                        });
+
+                        if same_page_different_lsn {
+                            trace!(
+                                rel=%this_pages[0].req.rel,
+                                blkno=%this_pages[0].req.blkno,
+                                lsn=%this_pages[0].effective_request_lsn,
+                                "stopping batching because same page was requested at different LSNs"
+                            );
+
+                            return Some(GetPageBatchBreakReason::SamePageAtDifferentLsn);
+                        }
+                    }
+                }
+
+                None
+            }
+            #[cfg(feature = "testing")]
+            (
+                BatchedFeMessage::Test {
+                    shard: accum_shard,
+                    requests: accum_requests,
+                    ..
+                },
+                BatchedFeMessage::Test {
+                    shard: this_shard,
+                    requests: this_requests,
+                    ..
+                },
+            ) => {
+                assert!(this_requests.len() == 1);
+                if accum_requests.len() >= max_batch_size.get() {
+                    trace!(%max_batch_size, "stopping batching because of batch size");
+                    assert_eq!(accum_requests.len(), max_batch_size.get());
+                    return Some(GetPageBatchBreakReason::BatchFull);
+                }
+                if !accum_shard.is_same_handle_as(this_shard) {
+                    trace!("stopping batching because timeline object mismatch");
+                    // TODO: we _could_ batch & execute each shard seperately (and in parallel).
+                    // But the current logic for keeping responses in order does not support that.
+                    return Some(GetPageBatchBreakReason::NonUniformTimeline);
+                }
+                let this_batch_key = this_requests[0].req.batch_key;
+                let accum_batch_key = accum_requests[0].req.batch_key;
+                if this_requests[0].req.batch_key != accum_requests[0].req.batch_key {
+                    trace!(%accum_batch_key, %this_batch_key, "stopping batching because batch key changed");
+                    return Some(GetPageBatchBreakReason::NonUniformKey);
+                }
+                None
+            }
+            (_, _) => Some(GetPageBatchBreakReason::NonBatchableRequest),
+        }
+    }
 }
 
 impl PageServerHandler {
@@ -1019,34 +1139,32 @@ impl PageServerHandler {
                 .await?;
 
                 // We're holding the Handle
-                // TODO: if we actually need to wait for lsn here, it delays the entire batch which doesn't need to wait
-                let res = Self::wait_or_get_last_lsn(
+                let effective_request_lsn = match Self::effective_request_lsn(
                     &shard,
+                    shard.get_last_record_lsn(),
                     req.hdr.request_lsn,
                     req.hdr.not_modified_since,
                     &shard.get_applied_gc_cutoff_lsn(),
-                    &ctx,
-                )
-                .maybe_perf_instrument(&ctx, |current_perf_span| {
-                    info_span!(
-                        target: PERF_TRACE_TARGET,
-                        parent: current_perf_span,
-                        "WAIT_LSN",
-                    )
-                })
-                .await;
-
-                let effective_request_lsn = match res {
+                ) {
                     Ok(lsn) => lsn,
                     Err(e) => {
                         return respond_error!(span, e);
                     }
                 };
+
                 BatchedFeMessage::GetPage {
                     span,
                     shard: shard.downgrade(),
-                    effective_request_lsn,
-                    pages: smallvec::smallvec![BatchedGetPageRequest { req, timer, ctx }],
+                    pages: smallvec::smallvec![BatchedGetPageRequest {
+                        req,
+                        timer,
+                        effective_request_lsn,
+                        ctx,
+                    }],
+                    // The executor grabs the batch when it becomes idle.
+                    // Hence, [`GetPageBatchBreakReason::ExecutorSteal`] is the
+                    // default reason for breaking the batch.
+                    batch_break_reason: GetPageBatchBreakReason::ExecutorSteal,
                 }
             }
             #[cfg(feature = "testing")]
@@ -1072,6 +1190,7 @@ impl PageServerHandler {
     #[instrument(skip_all, level = tracing::Level::TRACE)]
     #[allow(clippy::boxed_local)]
     fn pagestream_do_batch(
+        batching_strategy: PageServiceProtocolPipelinedBatchingStrategy,
         max_batch_size: NonZeroUsize,
         batch: &mut Result<BatchedFeMessage, QueryError>,
         this_msg: Result<BatchedFeMessage, QueryError>,
@@ -1083,90 +1202,59 @@ impl PageServerHandler {
             Err(e) => return Err(Err(e)),
         };
 
-        match (&mut *batch, this_msg) {
-            // something batched already, let's see if we can add this message to the batch
-            (
-                Ok(BatchedFeMessage::GetPage {
-                    span: _,
-                    shard: accum_shard,
-                    pages: accum_pages,
-                    effective_request_lsn: accum_lsn,
-                }),
-                BatchedFeMessage::GetPage {
-                    span: _,
-                    shard: this_shard,
-                    pages: this_pages,
-                    effective_request_lsn: this_lsn,
-                },
-            ) if (|| {
-                assert_eq!(this_pages.len(), 1);
-                if accum_pages.len() >= max_batch_size.get() {
-                    trace!(%accum_lsn, %this_lsn, %max_batch_size, "stopping batching because of batch size");
-                    assert_eq!(accum_pages.len(), max_batch_size.get());
-                    return false;
-                }
-                if !accum_shard.is_same_handle_as(&this_shard) {
-                    trace!(%accum_lsn, %this_lsn, "stopping batching because timeline object mismatch");
-                    // TODO: we _could_ batch & execute each shard seperately (and in parallel).
-                    // But the current logic for keeping responses in order does not support that.
-                    return false;
-                }
-                // the vectored get currently only supports a single LSN, so, bounce as soon
-                // as the effective request_lsn changes
-                if *accum_lsn != this_lsn {
-                    trace!(%accum_lsn, %this_lsn, "stopping batching because LSN changed");
-                    return false;
-                }
-                true
-            })() =>
-            {
-                // ok to batch
-                accum_pages.extend(this_pages);
-                Ok(())
+        let eligible_batch = match batch {
+            Ok(b) => b,
+            Err(_) => {
+                return Err(Ok(this_msg));
             }
-            #[cfg(feature = "testing")]
-            (
-                Ok(BatchedFeMessage::Test {
-                    shard: accum_shard,
-                    requests: accum_requests,
-                    ..
-                }),
-                BatchedFeMessage::Test {
-                    shard: this_shard,
-                    requests: this_requests,
-                    ..
-                },
-            ) if (|| {
-                assert!(this_requests.len() == 1);
-                if accum_requests.len() >= max_batch_size.get() {
-                    trace!(%max_batch_size, "stopping batching because of batch size");
-                    assert_eq!(accum_requests.len(), max_batch_size.get());
-                    return false;
-                }
-                if !accum_shard.is_same_handle_as(&this_shard) {
-                    trace!("stopping batching because timeline object mismatch");
-                    // TODO: we _could_ batch & execute each shard seperately (and in parallel).
-                    // But the current logic for keeping responses in order does not support that.
-                    return false;
-                }
-                let this_batch_key = this_requests[0].req.batch_key;
-                let accum_batch_key = accum_requests[0].req.batch_key;
-                if this_requests[0].req.batch_key != accum_requests[0].req.batch_key {
-                    trace!(%accum_batch_key, %this_batch_key, "stopping batching because batch key changed");
-                    return false;
+        };
+
+        let batch_break =
+            eligible_batch.should_break_batch(&this_msg, max_batch_size, batching_strategy);
+
+        match batch_break {
+            Some(reason) => {
+                if let BatchedFeMessage::GetPage {
+                    batch_break_reason, ..
+                } = eligible_batch
+                {
+                    *batch_break_reason = reason;
                 }
-                true
-            })() =>
-            {
-                // ok to batch
-                accum_requests.extend(this_requests);
-                Ok(())
-            }
-            // something batched already but this message is unbatchable
-            (_, this_msg) => {
-                // by default, don't continue batching
+
                 Err(Ok(this_msg))
             }
+            None => {
+                // ok to batch
+                match (eligible_batch, this_msg) {
+                    (
+                        BatchedFeMessage::GetPage {
+                            pages: accum_pages, ..
+                        },
+                        BatchedFeMessage::GetPage {
+                            pages: this_pages, ..
+                        },
+                    ) => {
+                        accum_pages.extend(this_pages);
+                        Ok(())
+                    }
+                    #[cfg(feature = "testing")]
+                    (
+                        BatchedFeMessage::Test {
+                            requests: accum_requests,
+                            ..
+                        },
+                        BatchedFeMessage::Test {
+                            requests: this_requests,
+                            ..
+                        },
+                    ) => {
+                        accum_requests.extend(this_requests);
+                        Ok(())
+                    }
+                    // Shape guaranteed by [`BatchedFeMessage::should_break_batch`]
+                    _ => unreachable!(),
+                }
+            }
         }
     }
 
@@ -1387,8 +1475,8 @@ impl PageServerHandler {
             BatchedFeMessage::GetPage {
                 span,
                 shard,
-                effective_request_lsn,
                 pages,
+                batch_break_reason,
             } => {
                 fail::fail_point!("ps::handle-pagerequest-message::getpage");
                 let (shard, ctx) = upgrade_handle_and_set_context!(shard);
@@ -1399,9 +1487,9 @@ impl PageServerHandler {
                         let res = self
                             .handle_get_page_at_lsn_request_batched(
                                 &shard,
-                                effective_request_lsn,
                                 pages,
                                 io_concurrency,
+                                batch_break_reason,
                                 &ctx,
                             )
                             .instrument(span.clone())
@@ -1718,6 +1806,7 @@ impl PageServerHandler {
         let PageServicePipeliningConfigPipelined {
             max_batch_size,
             execution,
+            batching: batching_strategy,
         } = pipelining_config;
 
         // Macro to _define_ a pipeline stage.
@@ -1769,7 +1858,7 @@ impl PageServerHandler {
                     exit |= read_res.is_err();
                     let could_send = batch_tx
                         .send(read_res, |batch, res| {
-                            Self::pagestream_do_batch(max_batch_size, batch, res)
+                            Self::pagestream_do_batch(batching_strategy, max_batch_size, batch, res)
                         })
                         .await;
                     exit |= could_send.is_err();
@@ -1865,7 +1954,39 @@ impl PageServerHandler {
         ctx: &RequestContext,
     ) -> Result<Lsn, PageStreamError> {
         let last_record_lsn = timeline.get_last_record_lsn();
+        let effective_request_lsn = Self::effective_request_lsn(
+            timeline,
+            last_record_lsn,
+            request_lsn,
+            not_modified_since,
+            latest_gc_cutoff_lsn,
+        )?;
+
+        if effective_request_lsn > last_record_lsn {
+            timeline
+                .wait_lsn(
+                    not_modified_since,
+                    crate::tenant::timeline::WaitLsnWaiter::PageService,
+                    timeline::WaitLsnTimeout::Default,
+                    ctx,
+                )
+                .await?;
+
+            // Since we waited for 'effective_request_lsn' to arrive, that is now the last
+            // record LSN. (Or close enough for our purposes; the last-record LSN can
+            // advance immediately after we return anyway)
+        }
+
+        Ok(effective_request_lsn)
+    }
 
+    fn effective_request_lsn(
+        timeline: &Timeline,
+        last_record_lsn: Lsn,
+        request_lsn: Lsn,
+        not_modified_since: Lsn,
+        latest_gc_cutoff_lsn: &RcuReadGuard<Lsn>,
+    ) -> Result<Lsn, PageStreamError> {
         // Sanity check the request
         if request_lsn < not_modified_since {
             return Err(PageStreamError::BadRequest(
@@ -1900,19 +2021,7 @@ impl PageServerHandler {
             }
         }
 
-        // Wait for WAL up to 'not_modified_since' to arrive, if necessary
         if not_modified_since > last_record_lsn {
-            timeline
-                .wait_lsn(
-                    not_modified_since,
-                    crate::tenant::timeline::WaitLsnWaiter::PageService,
-                    timeline::WaitLsnTimeout::Default,
-                    ctx,
-                )
-                .await?;
-            // Since we waited for 'not_modified_since' to arrive, that is now the last
-            // record LSN. (Or close enough for our purposes; the last-record LSN can
-            // advance immediately after we return anyway)
             Ok(not_modified_since)
         } else {
             // It might be better to use max(not_modified_since, latest_gc_cutoff_lsn)
@@ -2067,16 +2176,16 @@ impl PageServerHandler {
     async fn handle_get_page_at_lsn_request_batched(
         &mut self,
         timeline: &Timeline,
-        effective_lsn: Lsn,
         requests: smallvec::SmallVec<[BatchedGetPageRequest; 1]>,
         io_concurrency: IoConcurrency,
+        batch_break_reason: GetPageBatchBreakReason,
         ctx: &RequestContext,
     ) -> Vec<Result<(PagestreamBeMessage, SmgrOpTimer), BatchedPageStreamError>> {
         debug_assert_current_span_has_tenant_and_timeline_id();
 
         timeline
             .query_metrics
-            .observe_getpage_batch_start(requests.len());
+            .observe_getpage_batch_start(requests.len(), batch_break_reason);
 
         // If a page trace is running, submit an event for this request.
         if let Some(page_trace) = timeline.page_trace.load().as_ref() {
@@ -2086,20 +2195,81 @@ impl PageServerHandler {
                 // Ignore error (trace buffer may be full or tracer may have disconnected).
                 _ = page_trace.try_send(PageTraceEvent {
                     key,
-                    effective_lsn,
+                    effective_lsn: batch.effective_request_lsn,
                     time,
                 });
             }
         }
 
+        // If any request in the batch needs to wait for LSN, then do so now.
+        let mut perf_instrument = false;
+        let max_effective_lsn = requests
+            .iter()
+            .map(|req| {
+                if req.ctx.has_perf_span() {
+                    perf_instrument = true;
+                }
+
+                req.effective_request_lsn
+            })
+            .max()
+            .expect("batch is never empty");
+
+        let ctx = match perf_instrument {
+            true => RequestContextBuilder::from(ctx)
+                .root_perf_span(|| {
+                    info_span!(
+                        target: PERF_TRACE_TARGET,
+                        "GET_VECTORED",
+                        tenant_id = %timeline.tenant_shard_id.tenant_id,
+                        timeline_id = %timeline.timeline_id,
+                        shard = %timeline.tenant_shard_id.shard_slug(),
+                        %max_effective_lsn
+                    )
+                })
+                .attached_child(),
+            false => ctx.attached_child(),
+        };
+
+        let last_record_lsn = timeline.get_last_record_lsn();
+        if max_effective_lsn > last_record_lsn {
+            if let Err(e) = timeline
+                .wait_lsn(
+                    max_effective_lsn,
+                    crate::tenant::timeline::WaitLsnWaiter::PageService,
+                    timeline::WaitLsnTimeout::Default,
+                    &ctx,
+                )
+                .maybe_perf_instrument(&ctx, |current_perf_span| {
+                    info_span!(
+                        target: PERF_TRACE_TARGET,
+                        parent: current_perf_span,
+                        "WAIT_LSN",
+                    )
+                })
+                .await
+            {
+                return Vec::from_iter(requests.into_iter().map(|req| {
+                    Err(BatchedPageStreamError {
+                        err: PageStreamError::from(e.clone()),
+                        req: req.req.hdr,
+                    })
+                }));
+            }
+        }
+
         let results = timeline
             .get_rel_page_at_lsn_batched(
-                requests
-                    .iter()
-                    .map(|p| (&p.req.rel, &p.req.blkno, p.ctx.attached_child())),
-                effective_lsn,
+                requests.iter().map(|p| {
+                    (
+                        &p.req.rel,
+                        &p.req.blkno,
+                        p.effective_request_lsn,
+                        p.ctx.attached_child(),
+                    )
+                }),
                 io_concurrency,
-                ctx,
+                &ctx,
             )
             .await;
         assert_eq!(results.len(), requests.len());
diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs
index e3e06ab91a6e..81e548a095dc 100644
--- a/pageserver/src/pgdatadir_mapping.rs
+++ b/pageserver/src/pgdatadir_mapping.rs
@@ -6,14 +6,14 @@
 //! walingest.rs handles a few things like implicit relation creation and extension.
 //! Clarify that)
 //!
-use std::collections::{BTreeMap, HashMap, HashSet, hash_map};
+use std::collections::{HashMap, HashSet, hash_map};
 use std::ops::{ControlFlow, Range};
 
-use crate::PERF_TRACE_TARGET;
-use anyhow::{Context, ensure};
+use crate::walingest::{WalIngestError, WalIngestErrorKind};
+use crate::{PERF_TRACE_TARGET, ensure_walingest};
+use anyhow::Context;
 use bytes::{Buf, Bytes, BytesMut};
 use enum_map::Enum;
-use itertools::Itertools;
 use pageserver_api::key::{
     AUX_FILES_KEY, CHECKPOINT_KEY, CONTROLFILE_KEY, CompactKey, DBDIR_KEY, Key, RelDirExists,
     TWOPHASEDIR_KEY, dbdir_key_range, rel_block_to_key, rel_dir_to_key, rel_key_range,
@@ -21,7 +21,7 @@ use pageserver_api::key::{
     repl_origin_key, repl_origin_key_range, slru_block_to_key, slru_dir_to_key,
     slru_segment_key_range, slru_segment_size_to_key, twophase_file_key, twophase_key_range,
 };
-use pageserver_api::keyspace::SparseKeySpace;
+use pageserver_api::keyspace::{KeySpaceRandomAccum, SparseKeySpace};
 use pageserver_api::models::RelSizeMigration;
 use pageserver_api::record::NeonWalRecord;
 use pageserver_api::reltag::{BlockNumber, RelTag, SlruKind};
@@ -40,7 +40,7 @@ use wal_decoder::serialized_batch::{SerializedValueBatch, ValueMeta};
 
 use super::tenant::{PageReconstructError, Timeline};
 use crate::aux_file;
-use crate::context::{PerfInstrumentFutureExt, RequestContext, RequestContextBuilder};
+use crate::context::{PerfInstrumentFutureExt, RequestContext};
 use crate::keyspace::{KeySpace, KeySpaceAccum};
 use crate::metrics::{
     RELSIZE_CACHE_ENTRIES, RELSIZE_CACHE_HITS, RELSIZE_CACHE_MISSES, RELSIZE_CACHE_MISSES_OLD,
@@ -50,7 +50,7 @@ use crate::span::{
     debug_assert_current_span_has_tenant_and_timeline_id_no_shard_id,
 };
 use crate::tenant::storage_layer::IoConcurrency;
-use crate::tenant::timeline::GetVectoredError;
+use crate::tenant::timeline::{GetVectoredError, VersionedKeySpaceQuery};
 
 /// Max delta records appended to the AUX_FILES_KEY (for aux v1). The write path will write a full image once this threshold is reached.
 pub const MAX_AUX_FILE_DELTAS: usize = 1024;
@@ -136,12 +136,8 @@ impl From<PageReconstructError> for CalculateLogicalSizeError {
 
 #[derive(Debug, thiserror::Error)]
 pub enum RelationError {
-    #[error("Relation Already Exists")]
-    AlreadyExists,
     #[error("invalid relnode")]
     InvalidRelnode,
-    #[error(transparent)]
-    Other(#[from] anyhow::Error),
 }
 
 ///
@@ -210,10 +206,9 @@ impl Timeline {
                 let pages: smallvec::SmallVec<[_; 1]> = smallvec::smallvec![(tag, blknum)];
                 let res = self
                     .get_rel_page_at_lsn_batched(
-                        pages
-                            .iter()
-                            .map(|(tag, blknum)| (tag, blknum, ctx.attached_child())),
-                        effective_lsn,
+                        pages.iter().map(|(tag, blknum)| {
+                            (tag, blknum, effective_lsn, ctx.attached_child())
+                        }),
                         io_concurrency.clone(),
                         ctx,
                     )
@@ -251,8 +246,7 @@ impl Timeline {
     /// The ordering of the returned vec corresponds to the ordering of `pages`.
     pub(crate) async fn get_rel_page_at_lsn_batched(
         &self,
-        pages: impl ExactSizeIterator<Item = (&RelTag, &BlockNumber, RequestContext)>,
-        effective_lsn: Lsn,
+        pages: impl ExactSizeIterator<Item = (&RelTag, &BlockNumber, Lsn, RequestContext)>,
         io_concurrency: IoConcurrency,
         ctx: &RequestContext,
     ) -> Vec<Result<Bytes, PageReconstructError>> {
@@ -265,11 +259,13 @@ impl Timeline {
         let mut result = Vec::with_capacity(pages.len());
         let result_slots = result.spare_capacity_mut();
 
-        let mut keys_slots: BTreeMap<Key, smallvec::SmallVec<[(usize, RequestContext); 1]>> =
-            BTreeMap::default();
+        let mut keys_slots: HashMap<Key, smallvec::SmallVec<[(usize, RequestContext); 1]>> =
+            HashMap::with_capacity(pages.len());
+
+        let mut req_keyspaces: HashMap<Lsn, KeySpaceRandomAccum> =
+            HashMap::with_capacity(pages.len());
 
-        let mut perf_instrument = false;
-        for (response_slot_idx, (tag, blknum, ctx)) in pages.enumerate() {
+        for (response_slot_idx, (tag, blknum, lsn, ctx)) in pages.enumerate() {
             if tag.relnode == 0 {
                 result_slots[response_slot_idx].write(Err(PageReconstructError::Other(
                     RelationError::InvalidRelnode.into(),
@@ -280,14 +276,14 @@ impl Timeline {
             }
 
             let nblocks = match self
-                .get_rel_size(*tag, Version::Lsn(effective_lsn), &ctx)
+                .get_rel_size(*tag, Version::Lsn(lsn), &ctx)
                 .maybe_perf_instrument(&ctx, |crnt_perf_span| {
                     info_span!(
                         target: PERF_TRACE_TARGET,
                         parent: crnt_perf_span,
                         "GET_REL_SIZE",
                         reltag=%tag,
-                        lsn=%effective_lsn,
+                        lsn=%lsn,
                     )
                 })
                 .await
@@ -303,7 +299,7 @@ impl Timeline {
             if *blknum >= nblocks {
                 debug!(
                     "read beyond EOF at {} blk {} at {}, size is {}: returning all-zeros page",
-                    tag, blknum, effective_lsn, nblocks
+                    tag, blknum, lsn, nblocks
                 );
                 result_slots[response_slot_idx].write(Ok(ZERO_PAGE.clone()));
                 slots_filled += 1;
@@ -312,46 +308,29 @@ impl Timeline {
 
             let key = rel_block_to_key(*tag, *blknum);
 
-            if ctx.has_perf_span() {
-                perf_instrument = true;
-            }
-
             let key_slots = keys_slots.entry(key).or_default();
             key_slots.push((response_slot_idx, ctx));
-        }
 
-        let keyspace = {
-            // add_key requires monotonicity
-            let mut acc = KeySpaceAccum::new();
-            for key in keys_slots
-                .keys()
-                // in fact it requires strong monotonicity
-                .dedup()
-            {
-                acc.add_key(*key);
-            }
-            acc.to_keyspace()
-        };
+            let acc = req_keyspaces.entry(lsn).or_default();
+            acc.add_key(key);
+        }
 
-        let ctx = match perf_instrument {
-            true => RequestContextBuilder::from(ctx)
-                .root_perf_span(|| {
-                    info_span!(
-                        target: PERF_TRACE_TARGET,
-                        "GET_VECTORED",
-                        tenant_id = %self.tenant_shard_id.tenant_id,
-                        timeline_id = %self.timeline_id,
-                        lsn = %effective_lsn,
-                        shard = %self.tenant_shard_id.shard_slug(),
-                    )
-                })
-                .attached_child(),
-            false => ctx.attached_child(),
-        };
+        let query: Vec<(Lsn, KeySpace)> = req_keyspaces
+            .into_iter()
+            .map(|(lsn, acc)| (lsn, acc.to_keyspace()))
+            .collect();
 
+        let query = VersionedKeySpaceQuery::scattered(query);
         let res = self
-            .get_vectored(keyspace, effective_lsn, io_concurrency, &ctx)
-            .maybe_perf_instrument(&ctx, |current_perf_span| current_perf_span.clone())
+            .get_vectored(query, io_concurrency, ctx)
+            .maybe_perf_instrument(ctx, |current_perf_span| {
+                info_span!(
+                    target: PERF_TRACE_TARGET,
+                    parent: current_perf_span,
+                    "GET_BATCH",
+                    batch_size = %page_count,
+                )
+            })
             .await;
 
         match res {
@@ -381,12 +360,12 @@ impl Timeline {
                         // There is no standardized way to express that the batched span followed from N request spans.
                         // So, abuse the system and mark the request contexts as follows_from the batch span, so we get
                         // some linkage in our trace viewer. It allows us to answer: which GET_VECTORED did this GET_PAGE wait for.
-                        req_ctx.perf_follows_from(&ctx);
+                        req_ctx.perf_follows_from(ctx);
                         slots_filled += 1;
                     }
 
                     result_slots[first_slot].write(res);
-                    first_req_ctx.perf_follows_from(&ctx);
+                    first_req_ctx.perf_follows_from(ctx);
                     slots_filled += 1;
                 }
             }
@@ -425,7 +404,7 @@ impl Timeline {
                         }
                     };
 
-                    req_ctx.perf_follows_from(&ctx);
+                    req_ctx.perf_follows_from(ctx);
                     result_slots[*slot].write(err);
                 }
 
@@ -664,8 +643,9 @@ impl Timeline {
 
         let mut segment = BytesMut::with_capacity(n_blocks as usize * BLCKSZ as usize);
         for batch in batches.parts {
+            let query = VersionedKeySpaceQuery::uniform(batch, lsn);
             let blocks = self
-                .get_vectored(batch, lsn, io_concurrency.clone(), ctx)
+                .get_vectored(query, io_concurrency.clone(), ctx)
                 .await?;
 
             for (_key, block) in blocks {
@@ -691,7 +671,7 @@ impl Timeline {
         Ok(buf.get_u32_le())
     }
 
-    /// Get size of an SLRU segment
+    /// Does the slru segment exist?
     pub(crate) async fn get_slru_segment_exists(
         &self,
         kind: SlruKind,
@@ -844,9 +824,9 @@ impl Timeline {
         .await
     }
 
-    /// Obtain the possible timestamp range for the given lsn.
+    /// Obtain the timestamp for the given lsn.
     ///
-    /// If the lsn has no timestamps, returns None. returns `(min, max, median)` if it has timestamps.
+    /// If the lsn has no timestamps (e.g. no commits), returns None.
     pub(crate) async fn get_timestamp_for_lsn(
         &self,
         probe_lsn: Lsn,
@@ -902,8 +882,9 @@ impl Timeline {
             );
 
             for batch in batches.parts.into_iter().rev() {
+                let query = VersionedKeySpaceQuery::uniform(batch, probe_lsn);
                 let blocks = self
-                    .get_vectored(batch, probe_lsn, io_concurrency.clone(), ctx)
+                    .get_vectored(query, io_concurrency.clone(), ctx)
                     .await?;
 
                 for (_key, clog_page) in blocks.into_iter().rev() {
@@ -1478,8 +1459,8 @@ impl DatadirModification<'_> {
     }
 
     /// Set the current lsn
-    pub(crate) fn set_lsn(&mut self, lsn: Lsn) -> anyhow::Result<()> {
-        ensure!(
+    pub(crate) fn set_lsn(&mut self, lsn: Lsn) -> Result<(), WalIngestError> {
+        ensure_walingest!(
             lsn >= self.lsn,
             "setting an older lsn {} than {} is not allowed",
             lsn,
@@ -1578,7 +1559,7 @@ impl DatadirModification<'_> {
         &mut self,
         rel: RelTag,
         ctx: &RequestContext,
-    ) -> Result<u32, PageReconstructError> {
+    ) -> Result<u32, WalIngestError> {
         // Get current size and put rel creation if rel doesn't exist
         //
         // NOTE: we check the cache first even though get_rel_exists and get_rel_size would
@@ -1593,14 +1574,13 @@ impl DatadirModification<'_> {
             .await?
         {
             // create it with 0 size initially, the logic below will extend it
-            self.put_rel_creation(rel, 0, ctx)
-                .await
-                .context("Relation Error")?;
+            self.put_rel_creation(rel, 0, ctx).await?;
             Ok(0)
         } else {
-            self.tline
+            Ok(self
+                .tline
                 .get_rel_size(rel, Version::Modified(self), ctx)
-                .await
+                .await?)
         }
     }
 
@@ -1637,11 +1617,14 @@ impl DatadirModification<'_> {
         // TODO(vlad): remove this argument and replace the shard check with is_key_local
         shard: &ShardIdentity,
         ctx: &RequestContext,
-    ) -> anyhow::Result<()> {
+    ) -> Result<(), WalIngestError> {
         let mut gaps_at_lsns = Vec::default();
 
         for meta in batch.metadata.iter() {
-            let (rel, blkno) = Key::from_compact(meta.key()).to_rel_block()?;
+            let key = Key::from_compact(meta.key());
+            let (rel, blkno) = key
+                .to_rel_block()
+                .map_err(|_| WalIngestErrorKind::InvalidKey(key, meta.lsn()))?;
             let new_nblocks = blkno + 1;
 
             let old_nblocks = self.create_relation_if_required(rel, ctx).await?;
@@ -1683,8 +1666,8 @@ impl DatadirModification<'_> {
         rel: RelTag,
         blknum: BlockNumber,
         rec: NeonWalRecord,
-    ) -> anyhow::Result<()> {
-        anyhow::ensure!(rel.relnode != 0, RelationError::InvalidRelnode);
+    ) -> Result<(), WalIngestError> {
+        ensure_walingest!(rel.relnode != 0, RelationError::InvalidRelnode);
         self.put(rel_block_to_key(rel, blknum), Value::WalRecord(rec));
         Ok(())
     }
@@ -1696,7 +1679,7 @@ impl DatadirModification<'_> {
         segno: u32,
         blknum: BlockNumber,
         rec: NeonWalRecord,
-    ) -> anyhow::Result<()> {
+    ) -> Result<(), WalIngestError> {
         if !self.tline.tenant_shard_id.is_shard_zero() {
             return Ok(());
         }
@@ -1714,14 +1697,11 @@ impl DatadirModification<'_> {
         rel: RelTag,
         blknum: BlockNumber,
         img: Bytes,
-    ) -> anyhow::Result<()> {
-        anyhow::ensure!(rel.relnode != 0, RelationError::InvalidRelnode);
+    ) -> Result<(), WalIngestError> {
+        ensure_walingest!(rel.relnode != 0, RelationError::InvalidRelnode);
         let key = rel_block_to_key(rel, blknum);
         if !key.is_valid_key_on_write_path() {
-            anyhow::bail!(
-                "the request contains data not supported by pageserver at {}",
-                key
-            );
+            Err(WalIngestErrorKind::InvalidKey(key, self.lsn))?;
         }
         self.put(rel_block_to_key(rel, blknum), Value::Image(img));
         Ok(())
@@ -1733,15 +1713,12 @@ impl DatadirModification<'_> {
         segno: u32,
         blknum: BlockNumber,
         img: Bytes,
-    ) -> anyhow::Result<()> {
+    ) -> Result<(), WalIngestError> {
         assert!(self.tline.tenant_shard_id.is_shard_zero());
 
         let key = slru_block_to_key(kind, segno, blknum);
         if !key.is_valid_key_on_write_path() {
-            anyhow::bail!(
-                "the request contains data not supported by pageserver at {}",
-                key
-            );
+            Err(WalIngestErrorKind::InvalidKey(key, self.lsn))?;
         }
         self.put(key, Value::Image(img));
         Ok(())
@@ -1751,15 +1728,11 @@ impl DatadirModification<'_> {
         &mut self,
         rel: RelTag,
         blknum: BlockNumber,
-    ) -> anyhow::Result<()> {
-        anyhow::ensure!(rel.relnode != 0, RelationError::InvalidRelnode);
+    ) -> Result<(), WalIngestError> {
+        ensure_walingest!(rel.relnode != 0, RelationError::InvalidRelnode);
         let key = rel_block_to_key(rel, blknum);
         if !key.is_valid_key_on_write_path() {
-            anyhow::bail!(
-                "the request contains data not supported by pageserver: {} @ {}",
-                key,
-                self.lsn
-            );
+            Err(WalIngestErrorKind::InvalidKey(key, self.lsn))?;
         }
 
         let batch = self
@@ -1776,15 +1749,11 @@ impl DatadirModification<'_> {
         kind: SlruKind,
         segno: u32,
         blknum: BlockNumber,
-    ) -> anyhow::Result<()> {
+    ) -> Result<(), WalIngestError> {
         assert!(self.tline.tenant_shard_id.is_shard_zero());
         let key = slru_block_to_key(kind, segno, blknum);
         if !key.is_valid_key_on_write_path() {
-            anyhow::bail!(
-                "the request contains data not supported by pageserver: {} @ {}",
-                key,
-                self.lsn
-            );
+            Err(WalIngestErrorKind::InvalidKey(key, self.lsn))?;
         }
 
         let batch = self
@@ -1832,8 +1801,10 @@ impl DatadirModification<'_> {
         dbnode: Oid,
         img: Bytes,
         ctx: &RequestContext,
-    ) -> anyhow::Result<()> {
-        let v2_enabled = self.maybe_enable_rel_size_v2()?;
+    ) -> Result<(), WalIngestError> {
+        let v2_enabled = self
+            .maybe_enable_rel_size_v2()
+            .map_err(WalIngestErrorKind::MaybeRelSizeV2Error)?;
 
         // Add it to the directory (if it doesn't exist already)
         let buf = self.get(DBDIR_KEY, ctx).await?;
@@ -1874,13 +1845,13 @@ impl DatadirModification<'_> {
         xid: u64,
         img: Bytes,
         ctx: &RequestContext,
-    ) -> anyhow::Result<()> {
+    ) -> Result<(), WalIngestError> {
         // Add it to the directory entry
         let dirbuf = self.get(TWOPHASEDIR_KEY, ctx).await?;
         let newdirbuf = if self.tline.pg_version >= 17 {
             let mut dir = TwoPhaseDirectoryV17::des(&dirbuf)?;
             if !dir.xids.insert(xid) {
-                anyhow::bail!("twophase file for xid {} already exists", xid);
+                Err(WalIngestErrorKind::FileAlreadyExists(xid))?;
             }
             self.pending_directory_entries.push((
                 DirectoryKind::TwoPhase,
@@ -1891,7 +1862,7 @@ impl DatadirModification<'_> {
             let xid = xid as u32;
             let mut dir = TwoPhaseDirectory::des(&dirbuf)?;
             if !dir.xids.insert(xid) {
-                anyhow::bail!("twophase file for xid {} already exists", xid);
+                Err(WalIngestErrorKind::FileAlreadyExists(xid.into()))?;
             }
             self.pending_directory_entries.push((
                 DirectoryKind::TwoPhase,
@@ -1909,22 +1880,22 @@ impl DatadirModification<'_> {
         &mut self,
         origin_id: RepOriginId,
         origin_lsn: Lsn,
-    ) -> anyhow::Result<()> {
+    ) -> Result<(), WalIngestError> {
         let key = repl_origin_key(origin_id);
         self.put(key, Value::Image(origin_lsn.ser().unwrap().into()));
         Ok(())
     }
 
-    pub async fn drop_replorigin(&mut self, origin_id: RepOriginId) -> anyhow::Result<()> {
+    pub async fn drop_replorigin(&mut self, origin_id: RepOriginId) -> Result<(), WalIngestError> {
         self.set_replorigin(origin_id, Lsn::INVALID).await
     }
 
-    pub fn put_control_file(&mut self, img: Bytes) -> anyhow::Result<()> {
+    pub fn put_control_file(&mut self, img: Bytes) -> Result<(), WalIngestError> {
         self.put(CONTROLFILE_KEY, Value::Image(img));
         Ok(())
     }
 
-    pub fn put_checkpoint(&mut self, img: Bytes) -> anyhow::Result<()> {
+    pub fn put_checkpoint(&mut self, img: Bytes) -> Result<(), WalIngestError> {
         self.put(CHECKPOINT_KEY, Value::Image(img));
         Ok(())
     }
@@ -1934,7 +1905,7 @@ impl DatadirModification<'_> {
         spcnode: Oid,
         dbnode: Oid,
         ctx: &RequestContext,
-    ) -> anyhow::Result<()> {
+    ) -> Result<(), WalIngestError> {
         let total_blocks = self
             .tline
             .get_db_size(spcnode, dbnode, Version::Modified(self), ctx)
@@ -1973,20 +1944,21 @@ impl DatadirModification<'_> {
         rel: RelTag,
         nblocks: BlockNumber,
         ctx: &RequestContext,
-    ) -> Result<(), RelationError> {
+    ) -> Result<(), WalIngestError> {
         if rel.relnode == 0 {
-            return Err(RelationError::InvalidRelnode);
+            Err(WalIngestErrorKind::LogicalError(anyhow::anyhow!(
+                "invalid relnode"
+            )))?;
         }
         // It's possible that this is the first rel for this db in this
         // tablespace.  Create the reldir entry for it if so.
-        let mut dbdir = DbDirectory::des(&self.get(DBDIR_KEY, ctx).await.context("read db")?)
-            .context("deserialize db")?;
+        let mut dbdir = DbDirectory::des(&self.get(DBDIR_KEY, ctx).await?)?;
 
         let dbdir_exists =
             if let hash_map::Entry::Vacant(e) = dbdir.dbdirs.entry((rel.spcnode, rel.dbnode)) {
                 // Didn't exist. Update dbdir
                 e.insert(false);
-                let buf = DbDirectory::ser(&dbdir).context("serialize db")?;
+                let buf = DbDirectory::ser(&dbdir)?;
                 self.pending_directory_entries.push((
                     DirectoryKind::Db,
                     MetricsUpdate::Set(dbdir.dbdirs.len() as u64),
@@ -2003,27 +1975,25 @@ impl DatadirModification<'_> {
             RelDirectory::default()
         } else {
             // reldir already exists, fetch it
-            RelDirectory::des(&self.get(rel_dir_key, ctx).await.context("read db")?)
-                .context("deserialize db")?
+            RelDirectory::des(&self.get(rel_dir_key, ctx).await?)?
         };
 
-        let v2_enabled = self.maybe_enable_rel_size_v2()?;
+        let v2_enabled = self
+            .maybe_enable_rel_size_v2()
+            .map_err(WalIngestErrorKind::MaybeRelSizeV2Error)?;
 
         if v2_enabled {
             if rel_dir.rels.contains(&(rel.relnode, rel.forknum)) {
-                return Err(RelationError::AlreadyExists);
+                Err(WalIngestErrorKind::RelationAlreadyExists(rel))?;
             }
             let sparse_rel_dir_key =
                 rel_tag_sparse_key(rel.spcnode, rel.dbnode, rel.relnode, rel.forknum);
             // check if the rel_dir_key exists in v2
-            let val = self
-                .sparse_get(sparse_rel_dir_key, ctx)
-                .await
-                .map_err(|e| RelationError::Other(e.into()))?;
+            let val = self.sparse_get(sparse_rel_dir_key, ctx).await?;
             let val = RelDirExists::decode_option(val)
-                .map_err(|_| RelationError::Other(anyhow::anyhow!("invalid reldir key")))?;
+                .map_err(|_| WalIngestErrorKind::InvalidRelDirKey(sparse_rel_dir_key))?;
             if val == RelDirExists::Exists {
-                return Err(RelationError::AlreadyExists);
+                Err(WalIngestErrorKind::RelationAlreadyExists(rel))?;
             }
             self.put(
                 sparse_rel_dir_key,
@@ -2039,9 +2009,7 @@ impl DatadirModification<'_> {
                 // will be key not found errors if we don't create an empty one for rel_size_v2.
                 self.put(
                     rel_dir_key,
-                    Value::Image(Bytes::from(
-                        RelDirectory::ser(&RelDirectory::default()).context("serialize")?,
-                    )),
+                    Value::Image(Bytes::from(RelDirectory::ser(&RelDirectory::default())?)),
                 );
             }
             self.pending_directory_entries
@@ -2049,7 +2017,7 @@ impl DatadirModification<'_> {
         } else {
             // Add the new relation to the rel directory entry, and write it back
             if !rel_dir.rels.insert((rel.relnode, rel.forknum)) {
-                return Err(RelationError::AlreadyExists);
+                Err(WalIngestErrorKind::RelationAlreadyExists(rel))?;
             }
             if !dbdir_exists {
                 self.pending_directory_entries
@@ -2059,9 +2027,7 @@ impl DatadirModification<'_> {
                 .push((DirectoryKind::Rel, MetricsUpdate::Add(1)));
             self.put(
                 rel_dir_key,
-                Value::Image(Bytes::from(
-                    RelDirectory::ser(&rel_dir).context("serialize")?,
-                )),
+                Value::Image(Bytes::from(RelDirectory::ser(&rel_dir)?)),
             );
         }
 
@@ -2086,8 +2052,8 @@ impl DatadirModification<'_> {
         rel: RelTag,
         nblocks: BlockNumber,
         ctx: &RequestContext,
-    ) -> anyhow::Result<()> {
-        anyhow::ensure!(rel.relnode != 0, RelationError::InvalidRelnode);
+    ) -> Result<(), WalIngestError> {
+        ensure_walingest!(rel.relnode != 0, RelationError::InvalidRelnode);
         if self
             .tline
             .get_rel_exists(rel, Version::Modified(self), ctx)
@@ -2117,8 +2083,8 @@ impl DatadirModification<'_> {
         rel: RelTag,
         nblocks: BlockNumber,
         ctx: &RequestContext,
-    ) -> anyhow::Result<()> {
-        anyhow::ensure!(rel.relnode != 0, RelationError::InvalidRelnode);
+    ) -> Result<(), WalIngestError> {
+        ensure_walingest!(rel.relnode != 0, RelationError::InvalidRelnode);
 
         // Put size
         let size_key = rel_size_to_key(rel);
@@ -2142,8 +2108,10 @@ impl DatadirModification<'_> {
         &mut self,
         drop_relations: HashMap<(u32, u32), Vec<RelTag>>,
         ctx: &RequestContext,
-    ) -> anyhow::Result<()> {
-        let v2_enabled = self.maybe_enable_rel_size_v2()?;
+    ) -> Result<(), WalIngestError> {
+        let v2_enabled = self
+            .maybe_enable_rel_size_v2()
+            .map_err(WalIngestErrorKind::MaybeRelSizeV2Error)?;
         for ((spc_node, db_node), rel_tags) in drop_relations {
             let dir_key = rel_dir_to_key(spc_node, db_node);
             let buf = self.get(dir_key, ctx).await?;
@@ -2163,7 +2131,7 @@ impl DatadirModification<'_> {
                     let key =
                         rel_tag_sparse_key(spc_node, db_node, rel_tag.relnode, rel_tag.forknum);
                     let val = RelDirExists::decode_option(self.sparse_get(key, ctx).await?)
-                        .map_err(|_| RelationError::Other(anyhow::anyhow!("invalid reldir key")))?;
+                        .map_err(|_| WalIngestErrorKind::InvalidKey(key, self.lsn))?;
                     if val == RelDirExists::Exists {
                         self.pending_directory_entries
                             .push((DirectoryKind::RelV2, MetricsUpdate::Sub(1)));
@@ -2206,7 +2174,7 @@ impl DatadirModification<'_> {
         segno: u32,
         nblocks: BlockNumber,
         ctx: &RequestContext,
-    ) -> anyhow::Result<()> {
+    ) -> Result<(), WalIngestError> {
         assert!(self.tline.tenant_shard_id.is_shard_zero());
 
         // Add it to the directory entry
@@ -2215,7 +2183,7 @@ impl DatadirModification<'_> {
         let mut dir = SlruSegmentDirectory::des(&buf)?;
 
         if !dir.segments.insert(segno) {
-            anyhow::bail!("slru segment {kind:?}/{segno} already exists");
+            Err(WalIngestErrorKind::SlruAlreadyExists(kind, segno))?;
         }
         self.pending_directory_entries.push((
             DirectoryKind::SlruSegment(kind),
@@ -2242,7 +2210,7 @@ impl DatadirModification<'_> {
         kind: SlruKind,
         segno: u32,
         nblocks: BlockNumber,
-    ) -> anyhow::Result<()> {
+    ) -> Result<(), WalIngestError> {
         assert!(self.tline.tenant_shard_id.is_shard_zero());
 
         // Put size
@@ -2258,7 +2226,7 @@ impl DatadirModification<'_> {
         kind: SlruKind,
         segno: u32,
         ctx: &RequestContext,
-    ) -> anyhow::Result<()> {
+    ) -> Result<(), WalIngestError> {
         // Remove it from the directory entry
         let dir_key = slru_dir_to_key(kind);
         let buf = self.get(dir_key, ctx).await?;
@@ -2283,7 +2251,7 @@ impl DatadirModification<'_> {
     }
 
     /// Drop a relmapper file (pg_filenode.map)
-    pub fn drop_relmap_file(&mut self, _spcnode: Oid, _dbnode: Oid) -> anyhow::Result<()> {
+    pub fn drop_relmap_file(&mut self, _spcnode: Oid, _dbnode: Oid) -> Result<(), WalIngestError> {
         // TODO
         Ok(())
     }
@@ -2293,7 +2261,7 @@ impl DatadirModification<'_> {
         &mut self,
         xid: u64,
         ctx: &RequestContext,
-    ) -> anyhow::Result<()> {
+    ) -> Result<(), WalIngestError> {
         // Remove it from the directory entry
         let buf = self.get(TWOPHASEDIR_KEY, ctx).await?;
         let newdirbuf = if self.tline.pg_version >= 17 {
@@ -2308,7 +2276,8 @@ impl DatadirModification<'_> {
             ));
             Bytes::from(TwoPhaseDirectoryV17::ser(&dir)?)
         } else {
-            let xid: u32 = u32::try_from(xid)?;
+            let xid: u32 = u32::try_from(xid)
+                .map_err(|e| WalIngestErrorKind::LogicalError(anyhow::Error::from(e)))?;
             let mut dir = TwoPhaseDirectory::des(&buf)?;
 
             if !dir.xids.remove(&xid) {
@@ -2333,7 +2302,7 @@ impl DatadirModification<'_> {
         path: &str,
         content: &[u8],
         ctx: &RequestContext,
-    ) -> anyhow::Result<()> {
+    ) -> Result<(), WalIngestError> {
         let key = aux_file::encode_aux_file_key(path);
         // retrieve the key from the engine
         let old_val = match self.get(key, ctx).await {
@@ -2342,7 +2311,7 @@ impl DatadirModification<'_> {
             Err(e) => return Err(e.into()),
         };
         let files: Vec<(&str, &[u8])> = if let Some(ref old_val) = old_val {
-            aux_file::decode_file_value(old_val)?
+            aux_file::decode_file_value(old_val).map_err(WalIngestErrorKind::EncodeAuxFileError)?
         } else {
             Vec::new()
         };
@@ -2387,7 +2356,8 @@ impl DatadirModification<'_> {
             }
             (None, true) => warn!("removing non-existing aux file: {}", path),
         }
-        let new_val = aux_file::encode_file_value(&new_files)?;
+        let new_val = aux_file::encode_file_value(&new_files)
+            .map_err(WalIngestErrorKind::EncodeAuxFileError)?;
         self.put(key, Value::Image(new_val.into()));
 
         Ok(())
diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs
index 0c399d4c913d..0ba70f45b2f4 100644
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -100,7 +100,7 @@ use crate::tenant::timeline::delete::DeleteTimelineFlow;
 use crate::tenant::timeline::uninit::cleanup_timeline_directory;
 use crate::virtual_file::VirtualFile;
 use crate::walingest::WalLagCooldown;
-use crate::walredo::PostgresRedoManager;
+use crate::walredo::{PostgresRedoManager, RedoAttemptType};
 use crate::{InitializationOrder, TEMP_FILE_SUFFIX, import_datadir, span, task_mgr, walredo};
 
 static INIT_DB_SEMAPHORE: Lazy<Semaphore> = Lazy::new(|| Semaphore::new(8));
@@ -473,15 +473,16 @@ impl WalRedoManager {
         base_img: Option<(Lsn, bytes::Bytes)>,
         records: Vec<(Lsn, pageserver_api::record::NeonWalRecord)>,
         pg_version: u32,
+        redo_attempt_type: RedoAttemptType,
     ) -> Result<bytes::Bytes, walredo::Error> {
         match self {
             Self::Prod(_, mgr) => {
-                mgr.request_redo(key, lsn, base_img, records, pg_version)
+                mgr.request_redo(key, lsn, base_img, records, pg_version, redo_attempt_type)
                     .await
             }
             #[cfg(test)]
             Self::Test(mgr) => {
-                mgr.request_redo(key, lsn, base_img, records, pg_version)
+                mgr.request_redo(key, lsn, base_img, records, pg_version, redo_attempt_type)
                     .await
             }
         }
@@ -920,6 +921,7 @@ enum StartCreatingTimelineResult {
     Idempotent(Arc<Timeline>),
 }
 
+#[allow(clippy::large_enum_variant, reason = "TODO")]
 enum TimelineInitAndSyncResult {
     ReadyToActivate(Arc<Timeline>),
     NeedsSpawnImportPgdata(TimelineInitAndSyncNeedsSpawnImportPgdata),
@@ -1006,6 +1008,7 @@ enum CreateTimelineCause {
     Delete,
 }
 
+#[allow(clippy::large_enum_variant, reason = "TODO")]
 enum LoadTimelineCause {
     Attach,
     Unoffload,
@@ -4079,6 +4082,7 @@ impl Tenant {
 
         TenantManifest {
             version: LATEST_TENANT_MANIFEST_VERSION,
+            stripe_size: Some(self.get_shard_stripe_size()),
             offloaded_timelines,
         }
     }
@@ -4398,10 +4402,7 @@ impl Tenant {
         .to_string();
 
         fail::fail_point!("tenant-config-before-write", |_| {
-            Err(std::io::Error::new(
-                std::io::ErrorKind::Other,
-                "tenant-config-before-write",
-            ))
+            Err(std::io::Error::other("tenant-config-before-write"))
         });
 
         // Convert the config to a toml file.
@@ -5879,6 +5880,7 @@ pub(crate) mod harness {
             base_img: Option<(Lsn, Bytes)>,
             records: Vec<(Lsn, NeonWalRecord)>,
             _pg_version: u32,
+            _redo_attempt_type: RedoAttemptType,
         ) -> Result<Bytes, walredo::Error> {
             let records_neon = records.iter().all(|r| apply_neon::can_apply_in_neon(&r.1));
             if records_neon {
@@ -5931,12 +5933,20 @@ mod tests {
     use models::CompactLsnRange;
     use pageserver_api::key::{AUX_KEY_PREFIX, Key, NON_INHERITED_RANGE, RELATION_SIZE_PREFIX};
     use pageserver_api::keyspace::KeySpace;
+    #[cfg(feature = "testing")]
+    use pageserver_api::keyspace::KeySpaceRandomAccum;
     use pageserver_api::models::{CompactionAlgorithm, CompactionAlgorithmSettings};
     #[cfg(feature = "testing")]
     use pageserver_api::record::NeonWalRecord;
     use pageserver_api::value::Value;
     use pageserver_compaction::helpers::overlaps_with;
+    #[cfg(feature = "testing")]
+    use rand::SeedableRng;
+    #[cfg(feature = "testing")]
+    use rand::rngs::StdRng;
     use rand::{Rng, thread_rng};
+    #[cfg(feature = "testing")]
+    use std::ops::Range;
     use storage_layer::{IoConcurrency, PersistentLayerKey};
     use tests::storage_layer::ValuesReconstructState;
     use tests::timeline::{GetVectoredError, ShutdownMode};
@@ -5946,7 +5956,7 @@ mod tests {
     use timeline::InMemoryLayerTestDesc;
     #[cfg(feature = "testing")]
     use timeline::compaction::{KeyHistoryRetention, KeyLogAtLsn};
-    use timeline::{CompactOptions, DeltaLayerTestDesc};
+    use timeline::{CompactOptions, DeltaLayerTestDesc, VersionedKeySpaceQuery};
     use utils::id::TenantId;
 
     use super::*;
@@ -5958,6 +5968,318 @@ mod tests {
     static TEST_KEY: Lazy<Key> =
         Lazy::new(|| Key::from_slice(&hex!("010000000033333333444444445500000001")));
 
+    #[cfg(feature = "testing")]
+    struct TestTimelineSpecification {
+        start_lsn: Lsn,
+        last_record_lsn: Lsn,
+
+        in_memory_layers_shape: Vec<(Range<Key>, Range<Lsn>)>,
+        delta_layers_shape: Vec<(Range<Key>, Range<Lsn>)>,
+        image_layers_shape: Vec<(Range<Key>, Lsn)>,
+
+        gap_chance: u8,
+        will_init_chance: u8,
+    }
+
+    #[cfg(feature = "testing")]
+    struct Storage {
+        storage: HashMap<(Key, Lsn), Value>,
+        start_lsn: Lsn,
+    }
+
+    #[cfg(feature = "testing")]
+    impl Storage {
+        fn get(&self, key: Key, lsn: Lsn) -> Bytes {
+            use bytes::BufMut;
+
+            let mut crnt_lsn = lsn;
+            let mut got_base = false;
+
+            let mut acc = Vec::new();
+
+            while crnt_lsn >= self.start_lsn {
+                if let Some(value) = self.storage.get(&(key, crnt_lsn)) {
+                    acc.push(value.clone());
+
+                    match value {
+                        Value::WalRecord(NeonWalRecord::Test { will_init, .. }) => {
+                            if *will_init {
+                                got_base = true;
+                                break;
+                            }
+                        }
+                        Value::Image(_) => {
+                            got_base = true;
+                            break;
+                        }
+                        _ => unreachable!(),
+                    }
+                }
+
+                crnt_lsn = crnt_lsn.checked_sub(1u64).unwrap();
+            }
+
+            assert!(
+                got_base,
+                "Input data was incorrect. No base image for {key}@{lsn}"
+            );
+
+            tracing::debug!("Wal redo depth for {key}@{lsn} is {}", acc.len());
+
+            let mut blob = BytesMut::new();
+            for value in acc.into_iter().rev() {
+                match value {
+                    Value::WalRecord(NeonWalRecord::Test { append, .. }) => {
+                        blob.extend_from_slice(append.as_bytes());
+                    }
+                    Value::Image(img) => {
+                        blob.put(img);
+                    }
+                    _ => unreachable!(),
+                }
+            }
+
+            blob.into()
+        }
+    }
+
+    #[cfg(feature = "testing")]
+    #[allow(clippy::too_many_arguments)]
+    async fn randomize_timeline(
+        tenant: &Arc<Tenant>,
+        new_timeline_id: TimelineId,
+        pg_version: u32,
+        spec: TestTimelineSpecification,
+        random: &mut rand::rngs::StdRng,
+        ctx: &RequestContext,
+    ) -> anyhow::Result<(Arc<Timeline>, Storage, Vec<Lsn>)> {
+        let mut storage: HashMap<(Key, Lsn), Value> = HashMap::default();
+        let mut interesting_lsns = vec![spec.last_record_lsn];
+
+        for (key_range, lsn_range) in spec.in_memory_layers_shape.iter() {
+            let mut lsn = lsn_range.start;
+            while lsn < lsn_range.end {
+                let mut key = key_range.start;
+                while key < key_range.end {
+                    let gap = random.gen_range(1..=100) <= spec.gap_chance;
+                    let will_init = random.gen_range(1..=100) <= spec.will_init_chance;
+
+                    if gap {
+                        continue;
+                    }
+
+                    let record = if will_init {
+                        Value::WalRecord(NeonWalRecord::wal_init(format!("[wil_init {key}@{lsn}]")))
+                    } else {
+                        Value::WalRecord(NeonWalRecord::wal_append(format!("[delta {key}@{lsn}]")))
+                    };
+
+                    storage.insert((key, lsn), record);
+
+                    key = key.next();
+                }
+                lsn = Lsn(lsn.0 + 1);
+            }
+
+            // Stash some interesting LSN for future use
+            for offset in [0, 5, 100].iter() {
+                if *offset == 0 {
+                    interesting_lsns.push(lsn_range.start);
+                } else {
+                    let below = lsn_range.start.checked_sub(*offset);
+                    match below {
+                        Some(v) if v >= spec.start_lsn => {
+                            interesting_lsns.push(v);
+                        }
+                        _ => {}
+                    }
+
+                    let above = Lsn(lsn_range.start.0 + offset);
+                    interesting_lsns.push(above);
+                }
+            }
+        }
+
+        for (key_range, lsn_range) in spec.delta_layers_shape.iter() {
+            let mut lsn = lsn_range.start;
+            while lsn < lsn_range.end {
+                let mut key = key_range.start;
+                while key < key_range.end {
+                    let gap = random.gen_range(1..=100) <= spec.gap_chance;
+                    let will_init = random.gen_range(1..=100) <= spec.will_init_chance;
+
+                    if gap {
+                        continue;
+                    }
+
+                    let record = if will_init {
+                        Value::WalRecord(NeonWalRecord::wal_init(format!("[wil_init {key}@{lsn}]")))
+                    } else {
+                        Value::WalRecord(NeonWalRecord::wal_append(format!("[delta {key}@{lsn}]")))
+                    };
+
+                    storage.insert((key, lsn), record);
+
+                    key = key.next();
+                }
+                lsn = Lsn(lsn.0 + 1);
+            }
+
+            // Stash some interesting LSN for future use
+            for offset in [0, 5, 100].iter() {
+                if *offset == 0 {
+                    interesting_lsns.push(lsn_range.start);
+                } else {
+                    let below = lsn_range.start.checked_sub(*offset);
+                    match below {
+                        Some(v) if v >= spec.start_lsn => {
+                            interesting_lsns.push(v);
+                        }
+                        _ => {}
+                    }
+
+                    let above = Lsn(lsn_range.start.0 + offset);
+                    interesting_lsns.push(above);
+                }
+            }
+        }
+
+        for (key_range, lsn) in spec.image_layers_shape.iter() {
+            let mut key = key_range.start;
+            while key < key_range.end {
+                let blob = Bytes::from(format!("[image {key}@{lsn}]"));
+                let record = Value::Image(blob.clone());
+                storage.insert((key, *lsn), record);
+
+                key = key.next();
+            }
+
+            // Stash some interesting LSN for future use
+            for offset in [0, 5, 100].iter() {
+                if *offset == 0 {
+                    interesting_lsns.push(*lsn);
+                } else {
+                    let below = lsn.checked_sub(*offset);
+                    match below {
+                        Some(v) if v >= spec.start_lsn => {
+                            interesting_lsns.push(v);
+                        }
+                        _ => {}
+                    }
+
+                    let above = Lsn(lsn.0 + offset);
+                    interesting_lsns.push(above);
+                }
+            }
+        }
+
+        let in_memory_test_layers = {
+            let mut acc = Vec::new();
+
+            for (key_range, lsn_range) in spec.in_memory_layers_shape.iter() {
+                let mut data = Vec::new();
+
+                let mut lsn = lsn_range.start;
+                while lsn < lsn_range.end {
+                    let mut key = key_range.start;
+                    while key < key_range.end {
+                        if let Some(record) = storage.get(&(key, lsn)) {
+                            data.push((key, lsn, record.clone()));
+                        }
+
+                        key = key.next();
+                    }
+                    lsn = Lsn(lsn.0 + 1);
+                }
+
+                acc.push(InMemoryLayerTestDesc {
+                    data,
+                    lsn_range: lsn_range.clone(),
+                    is_open: false,
+                })
+            }
+
+            acc
+        };
+
+        let delta_test_layers = {
+            let mut acc = Vec::new();
+
+            for (key_range, lsn_range) in spec.delta_layers_shape.iter() {
+                let mut data = Vec::new();
+
+                let mut lsn = lsn_range.start;
+                while lsn < lsn_range.end {
+                    let mut key = key_range.start;
+                    while key < key_range.end {
+                        if let Some(record) = storage.get(&(key, lsn)) {
+                            data.push((key, lsn, record.clone()));
+                        }
+
+                        key = key.next();
+                    }
+                    lsn = Lsn(lsn.0 + 1);
+                }
+
+                acc.push(DeltaLayerTestDesc {
+                    data,
+                    lsn_range: lsn_range.clone(),
+                    key_range: key_range.clone(),
+                })
+            }
+
+            acc
+        };
+
+        let image_test_layers = {
+            let mut acc = Vec::new();
+
+            for (key_range, lsn) in spec.image_layers_shape.iter() {
+                let mut data = Vec::new();
+
+                let mut key = key_range.start;
+                while key < key_range.end {
+                    if let Some(record) = storage.get(&(key, *lsn)) {
+                        let blob = match record {
+                            Value::Image(blob) => blob.clone(),
+                            _ => unreachable!(),
+                        };
+
+                        data.push((key, blob));
+                    }
+
+                    key = key.next();
+                }
+
+                acc.push((*lsn, data));
+            }
+
+            acc
+        };
+
+        let tline = tenant
+            .create_test_timeline_with_layers(
+                new_timeline_id,
+                spec.start_lsn,
+                pg_version,
+                ctx,
+                in_memory_test_layers,
+                delta_test_layers,
+                image_test_layers,
+                spec.last_record_lsn,
+            )
+            .await?;
+
+        Ok((
+            tline,
+            Storage {
+                storage,
+                start_lsn: spec.start_lsn,
+            },
+            interesting_lsns,
+        ))
+    }
+
     #[tokio::test]
     async fn test_basic() -> anyhow::Result<()> {
         let (tenant, ctx) = TenantHarness::create("test_basic").await?.load().await;
@@ -6784,10 +7106,11 @@ mod tests {
         for read in reads {
             info!("Doing vectored read on {:?}", read);
 
+            let query = VersionedKeySpaceQuery::uniform(read.clone(), reads_lsn);
+
             let vectored_res = tline
                 .get_vectored_impl(
-                    read.clone(),
-                    reads_lsn,
+                    query,
                     &mut ValuesReconstructState::new(io_concurrency.clone()),
                     &ctx,
                 )
@@ -6866,10 +7189,11 @@ mod tests {
         };
         let read_lsn = child_timeline.get_last_record_lsn();
 
+        let query = VersionedKeySpaceQuery::uniform(aux_keyspace.clone(), read_lsn);
+
         let vectored_res = child_timeline
             .get_vectored_impl(
-                aux_keyspace.clone(),
-                read_lsn,
+                query,
                 &mut ValuesReconstructState::new(io_concurrency.clone()),
                 &ctx,
             )
@@ -7015,10 +7339,12 @@ mod tests {
         let read = KeySpace {
             ranges: vec![key_near_gap..gap_at_key.next(), key_near_end..current_key],
         };
+
+        let query = VersionedKeySpaceQuery::uniform(read.clone(), current_lsn);
+
         let results = child_timeline
             .get_vectored_impl(
-                read.clone(),
-                current_lsn,
+                query,
                 &mut ValuesReconstructState::new(io_concurrency.clone()),
                 &ctx,
             )
@@ -7149,12 +7475,16 @@ mod tests {
         }
 
         for query_lsn in query_lsns {
+            let query = VersionedKeySpaceQuery::uniform(
+                KeySpace {
+                    ranges: vec![child_gap_at_key..child_gap_at_key.next()],
+                },
+                query_lsn,
+            );
+
             let results = child_timeline
                 .get_vectored_impl(
-                    KeySpace {
-                        ranges: vec![child_gap_at_key..child_gap_at_key.next()],
-                    },
-                    query_lsn,
+                    query,
                     &mut ValuesReconstructState::new(io_concurrency.clone()),
                     &ctx,
                 )
@@ -7653,10 +7983,11 @@ mod tests {
             }
 
             let mut cnt = 0;
+            let query = VersionedKeySpaceQuery::uniform(keyspace.clone(), lsn);
+
             for (key, value) in tline
                 .get_vectored_impl(
-                    keyspace.clone(),
-                    lsn,
+                    query,
                     &mut ValuesReconstructState::new(io_concurrency.clone()),
                     &ctx,
                 )
@@ -7863,8 +8194,9 @@ mod tests {
             io_concurrency: IoConcurrency,
         ) -> anyhow::Result<(BTreeMap<Key, Result<Bytes, PageReconstructError>>, usize)> {
             let mut reconstruct_state = ValuesReconstructState::new(io_concurrency);
+            let query = VersionedKeySpaceQuery::uniform(keyspace.clone(), lsn);
             let res = tline
-                .get_vectored_impl(keyspace.clone(), lsn, &mut reconstruct_state, ctx)
+                .get_vectored_impl(query, &mut reconstruct_state, ctx)
                 .await?;
             Ok((res, reconstruct_state.get_delta_layers_visited() as usize))
         }
@@ -8161,13 +8493,10 @@ mod tests {
 
         // test vectored scan on parent timeline
         let mut reconstruct_state = ValuesReconstructState::new(io_concurrency.clone());
+        let query =
+            VersionedKeySpaceQuery::uniform(KeySpace::single(Key::metadata_key_range()), lsn);
         let res = tline
-            .get_vectored_impl(
-                KeySpace::single(Key::metadata_key_range()),
-                lsn,
-                &mut reconstruct_state,
-                &ctx,
-            )
+            .get_vectored_impl(query, &mut reconstruct_state, &ctx)
             .await?;
 
         assert_eq!(
@@ -8187,13 +8516,10 @@ mod tests {
 
         // test vectored scan on child timeline
         let mut reconstruct_state = ValuesReconstructState::new(io_concurrency.clone());
+        let query =
+            VersionedKeySpaceQuery::uniform(KeySpace::single(Key::metadata_key_range()), lsn);
         let res = child
-            .get_vectored_impl(
-                KeySpace::single(Key::metadata_key_range()),
-                lsn,
-                &mut reconstruct_state,
-                &ctx,
-            )
+            .get_vectored_impl(query, &mut reconstruct_state, &ctx)
             .await?;
 
         assert_eq!(
@@ -8227,13 +8553,9 @@ mod tests {
         let io_concurrency =
             IoConcurrency::spawn_from_conf(tline.conf, tline.gate.enter().unwrap());
         let mut reconstruct_state = ValuesReconstructState::new(io_concurrency);
+        let query = VersionedKeySpaceQuery::uniform(KeySpace::single(key..key.next()), lsn);
         let mut res = tline
-            .get_vectored_impl(
-                KeySpace::single(key..key.next()),
-                lsn,
-                &mut reconstruct_state,
-                ctx,
-            )
+            .get_vectored_impl(query, &mut reconstruct_state, ctx)
             .await?;
         Ok(res.pop_last().map(|(k, v)| {
             assert_eq!(k, key);
@@ -8733,6 +9055,21 @@ mod tests {
                 Lsn(0x20),
                 Value::WalRecord(NeonWalRecord::wal_init("i")),
             ),
+            (
+                get_key(4),
+                Lsn(0x30),
+                Value::WalRecord(NeonWalRecord::wal_append_conditional("j", "i")),
+            ),
+            (
+                get_key(5),
+                Lsn(0x20),
+                Value::WalRecord(NeonWalRecord::wal_init("1")),
+            ),
+            (
+                get_key(5),
+                Lsn(0x30),
+                Value::WalRecord(NeonWalRecord::wal_append_conditional("j", "2")),
+            ),
         ];
         let image1 = vec![(get_key(1), "0x10".into())];
 
@@ -8763,8 +9100,18 @@ mod tests {
 
         // Need to remove the limit of "Neon WAL redo requires base image".
 
-        // assert_eq!(tline.get(get_key(3), Lsn(0x50), &ctx).await?, Bytes::new());
-        // assert_eq!(tline.get(get_key(4), Lsn(0x50), &ctx).await?, Bytes::new());
+        assert_eq!(
+            tline.get(get_key(3), Lsn(0x50), &ctx).await?,
+            Bytes::from_static(b"c")
+        );
+        assert_eq!(
+            tline.get(get_key(4), Lsn(0x50), &ctx).await?,
+            Bytes::from_static(b"ij")
+        );
+
+        // Manual testing required: currently, read errors will panic the process in debug mode. So we
+        // cannot enable this assertion in the unit test.
+        // assert!(tline.get(get_key(5), Lsn(0x50), &ctx).await.is_err());
 
         Ok(())
     }
@@ -9230,6 +9577,7 @@ mod tests {
                 &[Lsn(0x20), Lsn(0x40), Lsn(0x50)],
                 3,
                 None,
+                true,
             )
             .await
             .unwrap();
@@ -9354,7 +9702,15 @@ mod tests {
             ),
         ];
         let res = tline
-            .generate_key_retention(key, &history, Lsn(0x60), &[Lsn(0x40), Lsn(0x50)], 3, None)
+            .generate_key_retention(
+                key,
+                &history,
+                Lsn(0x60),
+                &[Lsn(0x40), Lsn(0x50)],
+                3,
+                None,
+                true,
+            )
             .await
             .unwrap();
         let expected_res = KeyHistoryRetention {
@@ -9433,6 +9789,7 @@ mod tests {
                 &[],
                 3,
                 Some((key, Lsn(0x10), Bytes::copy_from_slice(b"0x10"))),
+                true,
             )
             .await
             .unwrap();
@@ -9481,6 +9838,7 @@ mod tests {
                 &[Lsn(0x30)],
                 3,
                 Some((key, Lsn(0x10), Bytes::copy_from_slice(b"0x10"))),
+                true,
             )
             .await
             .unwrap();
@@ -10331,14 +10689,13 @@ mod tests {
             )
             .await?;
 
-        let keyspace = KeySpace::single(get_key(0)..get_key(10));
+        let query = VersionedKeySpaceQuery::uniform(
+            KeySpace::single(get_key(0)..get_key(10)),
+            delta_layer_end_lsn,
+        );
+
         let results = tline
-            .get_vectored(
-                keyspace,
-                delta_layer_end_lsn,
-                IoConcurrency::sequential(),
-                &ctx,
-            )
+            .get_vectored(query, IoConcurrency::sequential(), &ctx)
             .await
             .expect("No vectored errors");
         for (key, res) in results {
@@ -10486,9 +10843,13 @@ mod tests {
             )
             .await?;
 
-        let keyspace = KeySpace::single(get_key(0)..get_key(10));
+        let query = VersionedKeySpaceQuery::uniform(
+            KeySpace::single(get_key(0)..get_key(10)),
+            last_record_lsn,
+        );
+
         let results = tline
-            .get_vectored(keyspace, last_record_lsn, IoConcurrency::sequential(), &ctx)
+            .get_vectored(query, IoConcurrency::sequential(), &ctx)
             .await
             .expect("No vectored errors");
         for (key, res) in results {
@@ -10502,6 +10863,214 @@ mod tests {
         Ok(())
     }
 
+    // A randomized read path test. Generates a layer map according to a deterministic
+    // specification. Fills the (key, LSN) space in random manner and then performs
+    // random scattered queries validating the results against in-memory storage.
+    //
+    // See this internal Notion page for a diagram of the layer map:
+    // https://www.notion.so/neondatabase/Read-Path-Unit-Testing-Fuzzing-1d1f189e0047806c8e5cd37781b0a350?pvs=4
+    //
+    // A fuzzing mode is also supported. In this mode, the test will use a random
+    // seed instead of a hardcoded one. Use it in conjunction with `cargo stress`
+    // to run multiple instances in parallel:
+    //
+    // $ RUST_BACKTRACE=1 RUST_LOG=INFO \
+    //   cargo stress --package=pageserver --features=testing,fuzz-read-path --release -- test_read_path
+    #[cfg(feature = "testing")]
+    #[tokio::test]
+    async fn test_read_path() -> anyhow::Result<()> {
+        use rand::seq::SliceRandom;
+
+        let seed = if cfg!(feature = "fuzz-read-path") {
+            let seed: u64 = thread_rng().r#gen();
+            seed
+        } else {
+            // Use a hard-coded seed when not in fuzzing mode.
+            // Note that with the current approach results are not reproducible
+            // accross platforms and Rust releases.
+            const SEED: u64 = 0;
+            SEED
+        };
+
+        let mut random = StdRng::seed_from_u64(seed);
+
+        let (queries, will_init_chance, gap_chance) = if cfg!(feature = "fuzz-read-path") {
+            const QUERIES: u64 = 5000;
+            let will_init_chance: u8 = random.gen_range(0..=10);
+            let gap_chance: u8 = random.gen_range(0..=50);
+
+            (QUERIES, will_init_chance, gap_chance)
+        } else {
+            const QUERIES: u64 = 1000;
+            const WILL_INIT_CHANCE: u8 = 1;
+            const GAP_CHANCE: u8 = 5;
+
+            (QUERIES, WILL_INIT_CHANCE, GAP_CHANCE)
+        };
+
+        let harness = TenantHarness::create("test_read_path").await?;
+        let (tenant, ctx) = harness.load().await;
+
+        tracing::info!("Using random seed: {seed}");
+        tracing::info!(%will_init_chance, %gap_chance, "Fill params");
+
+        // Define the layer map shape. Note that this part is not randomized.
+
+        const KEY_DIMENSION_SIZE: u32 = 99;
+        let start_key = Key::from_hex("110000000033333333444444445500000000").unwrap();
+        let end_key = start_key.add(KEY_DIMENSION_SIZE);
+        let total_key_range = start_key..end_key;
+        let total_key_range_size = end_key.to_i128() - start_key.to_i128();
+        let total_start_lsn = Lsn(104);
+        let last_record_lsn = Lsn(504);
+
+        assert!(total_key_range_size % 3 == 0);
+
+        let in_memory_layers_shape = vec![
+            (total_key_range.clone(), Lsn(304)..Lsn(400)),
+            (total_key_range.clone(), Lsn(400)..last_record_lsn),
+        ];
+
+        let delta_layers_shape = vec![
+            (
+                start_key..(start_key.add((total_key_range_size / 3) as u32)),
+                Lsn(200)..Lsn(304),
+            ),
+            (
+                (start_key.add((total_key_range_size / 3) as u32))
+                    ..(start_key.add((total_key_range_size * 2 / 3) as u32)),
+                Lsn(200)..Lsn(304),
+            ),
+            (
+                (start_key.add((total_key_range_size * 2 / 3) as u32))
+                    ..(start_key.add(total_key_range_size as u32)),
+                Lsn(200)..Lsn(304),
+            ),
+        ];
+
+        let image_layers_shape = vec![
+            (
+                start_key.add((total_key_range_size * 2 / 3 - 10) as u32)
+                    ..start_key.add((total_key_range_size * 2 / 3 + 10) as u32),
+                Lsn(456),
+            ),
+            (
+                start_key.add((total_key_range_size / 3 - 10) as u32)
+                    ..start_key.add((total_key_range_size / 3 + 10) as u32),
+                Lsn(256),
+            ),
+            (total_key_range.clone(), total_start_lsn),
+        ];
+
+        let specification = TestTimelineSpecification {
+            start_lsn: total_start_lsn,
+            last_record_lsn,
+            in_memory_layers_shape,
+            delta_layers_shape,
+            image_layers_shape,
+            gap_chance,
+            will_init_chance,
+        };
+
+        // Create and randomly fill in the layers according to the specification
+        let (tline, storage, interesting_lsns) = randomize_timeline(
+            &tenant,
+            TIMELINE_ID,
+            DEFAULT_PG_VERSION,
+            specification,
+            &mut random,
+            &ctx,
+        )
+        .await?;
+
+        // Now generate queries based on the interesting lsns that we've collected.
+        //
+        // While there's still room in the query, pick and interesting LSN and a random
+        // key. Then roll the dice to see if the next key should also be included in
+        // the query. When the roll fails, break the "batch" and pick another point in the
+        // (key, LSN) space.
+
+        const PICK_NEXT_CHANCE: u8 = 50;
+        for _ in 0..queries {
+            let query = {
+                let mut keyspaces_at_lsn: HashMap<Lsn, KeySpaceRandomAccum> = HashMap::default();
+                let mut used_keys: HashSet<Key> = HashSet::default();
+
+                while used_keys.len() < Timeline::MAX_GET_VECTORED_KEYS as usize {
+                    let selected_lsn = interesting_lsns.choose(&mut random).expect("not empty");
+                    let mut selected_key = start_key.add(random.gen_range(0..KEY_DIMENSION_SIZE));
+
+                    while used_keys.len() < Timeline::MAX_GET_VECTORED_KEYS as usize {
+                        if used_keys.contains(&selected_key)
+                            || selected_key >= start_key.add(KEY_DIMENSION_SIZE)
+                        {
+                            break;
+                        }
+
+                        keyspaces_at_lsn
+                            .entry(*selected_lsn)
+                            .or_default()
+                            .add_key(selected_key);
+                        used_keys.insert(selected_key);
+
+                        let pick_next = random.gen_range(0..=100) <= PICK_NEXT_CHANCE;
+                        if pick_next {
+                            selected_key = selected_key.next();
+                        } else {
+                            break;
+                        }
+                    }
+                }
+
+                VersionedKeySpaceQuery::scattered(
+                    keyspaces_at_lsn
+                        .into_iter()
+                        .map(|(lsn, acc)| (lsn, acc.to_keyspace()))
+                        .collect(),
+                )
+            };
+
+            // Run the query and validate the results
+
+            let results = tline
+                .get_vectored(query.clone(), IoConcurrency::Sequential, &ctx)
+                .await;
+
+            let blobs = match results {
+                Ok(ok) => ok,
+                Err(err) => {
+                    panic!("seed={seed} Error returned for query {query}: {err}");
+                }
+            };
+
+            for (key, key_res) in blobs.into_iter() {
+                match key_res {
+                    Ok(blob) => {
+                        let requested_at_lsn = query.map_key_to_lsn(&key);
+                        let expected = storage.get(key, requested_at_lsn);
+
+                        if blob != expected {
+                            tracing::error!(
+                                "seed={seed} Mismatch for {key}@{requested_at_lsn} from query: {query}"
+                            );
+                        }
+
+                        assert_eq!(blob, expected);
+                    }
+                    Err(err) => {
+                        let requested_at_lsn = query.map_key_to_lsn(&key);
+
+                        panic!(
+                            "seed={seed} Error returned for {key}@{requested_at_lsn} from query {query}: {err}"
+                        );
+                    }
+                }
+            }
+        }
+
+        Ok(())
+    }
+
     fn sort_layer_key(k1: &PersistentLayerKey, k2: &PersistentLayerKey) -> std::cmp::Ordering {
         (
             k1.is_delta,
@@ -11544,6 +12113,99 @@ mod tests {
         Ok(())
     }
 
+    #[cfg(feature = "testing")]
+    #[tokio::test]
+    async fn test_bottom_most_compation_redo_failure() -> anyhow::Result<()> {
+        let harness = TenantHarness::create("test_bottom_most_compation_redo_failure").await?;
+        let (tenant, ctx) = harness.load().await;
+
+        fn get_key(id: u32) -> Key {
+            // using aux key here b/c they are guaranteed to be inside `collect_keyspace`.
+            let mut key = Key::from_hex("620000000033333333444444445500000000").unwrap();
+            key.field6 = id;
+            key
+        }
+
+        let img_layer = (0..10)
+            .map(|id| (get_key(id), Bytes::from(format!("value {id}@0x10"))))
+            .collect_vec();
+
+        let delta1 = vec![
+            (
+                get_key(1),
+                Lsn(0x20),
+                Value::WalRecord(NeonWalRecord::wal_append("@0x20")),
+            ),
+            (
+                get_key(1),
+                Lsn(0x24),
+                Value::WalRecord(NeonWalRecord::wal_append("@0x24")),
+            ),
+            (
+                get_key(1),
+                Lsn(0x28),
+                // This record will fail to redo
+                Value::WalRecord(NeonWalRecord::wal_append_conditional("@0x28", "???")),
+            ),
+        ];
+
+        let tline = tenant
+            .create_test_timeline_with_layers(
+                TIMELINE_ID,
+                Lsn(0x10),
+                DEFAULT_PG_VERSION,
+                &ctx,
+                vec![], // in-memory layers
+                vec![DeltaLayerTestDesc::new_with_inferred_key_range(
+                    Lsn(0x20)..Lsn(0x30),
+                    delta1,
+                )], // delta layers
+                vec![(Lsn(0x10), img_layer)], // image layers
+                Lsn(0x50),
+            )
+            .await?;
+        {
+            tline
+                .applied_gc_cutoff_lsn
+                .lock_for_write()
+                .store_and_unlock(Lsn(0x30))
+                .wait()
+                .await;
+            // Update GC info
+            let mut guard = tline.gc_info.write().unwrap();
+            *guard = GcInfo {
+                retain_lsns: vec![],
+                cutoffs: GcCutoffs {
+                    time: Lsn(0x30),
+                    space: Lsn(0x30),
+                },
+                leases: Default::default(),
+                within_ancestor_pitr: false,
+            };
+        }
+
+        let cancel = CancellationToken::new();
+
+        // Compaction will fail, but should not fire any critical error.
+        // Gc-compaction currently cannot figure out what keys are not in the keyspace during the compaction
+        // process. It will always try to redo the logs it reads and if it doesn't work, fail the entire
+        // compaction job. Tracked in <https://github.com/neondatabase/neon/issues/10395>.
+        let res = tline
+            .compact_with_gc(
+                &cancel,
+                CompactOptions {
+                    compact_key_range: None,
+                    compact_lsn_range: None,
+                    ..Default::default()
+                },
+                &ctx,
+            )
+            .await;
+        assert!(res.is_err());
+
+        Ok(())
+    }
+
     #[cfg(feature = "testing")]
     #[tokio::test]
     async fn test_synthetic_size_calculation_with_invisible_branches() -> anyhow::Result<()> {
diff --git a/pageserver/src/tenant/blob_io.rs b/pageserver/src/tenant/blob_io.rs
index ff9a7e57b61c..abeaa166a40b 100644
--- a/pageserver/src/tenant/blob_io.rs
+++ b/pageserver/src/tenant/blob_io.rs
@@ -15,13 +15,14 @@
 //! len >= 128: 1CCCXXXX XXXXXXXX XXXXXXXX XXXXXXXX
 //!
 use std::cmp::min;
-use std::io::{Error, ErrorKind};
+use std::io::Error;
 
 use async_compression::Level;
 use bytes::{BufMut, BytesMut};
 use pageserver_api::models::ImageCompressionAlgorithm;
 use tokio::io::AsyncWriteExt;
 use tokio_epoll_uring::{BoundedBuf, IoBuf, Slice};
+use tokio_util::sync::CancellationToken;
 use tracing::warn;
 
 use crate::context::RequestContext;
@@ -169,7 +170,13 @@ pub struct BlobWriter<const BUFFERED: bool> {
 }
 
 impl<const BUFFERED: bool> BlobWriter<BUFFERED> {
-    pub fn new(inner: VirtualFile, start_offset: u64) -> Self {
+    pub fn new(
+        inner: VirtualFile,
+        start_offset: u64,
+        _gate: &utils::sync::gate::Gate,
+        _cancel: CancellationToken,
+        _ctx: &RequestContext,
+    ) -> Self {
         Self {
             inner,
             offset: start_offset,
@@ -331,10 +338,7 @@ impl<const BUFFERED: bool> BlobWriter<BUFFERED> {
                     return (
                         (
                             io_buf.slice_len(),
-                            Err(Error::new(
-                                ErrorKind::Other,
-                                format!("blob too large ({len} bytes)"),
-                            )),
+                            Err(Error::other(format!("blob too large ({len} bytes)"))),
                         ),
                         srcbuf,
                     );
@@ -435,12 +439,14 @@ pub(crate) mod tests {
     ) -> Result<(Utf8TempDir, Utf8PathBuf, Vec<u64>), Error> {
         let temp_dir = camino_tempfile::tempdir()?;
         let pathbuf = temp_dir.path().join("file");
+        let gate = utils::sync::gate::Gate::default();
+        let cancel = CancellationToken::new();
 
         // Write part (in block to drop the file)
         let mut offsets = Vec::new();
         {
             let file = VirtualFile::create(pathbuf.as_path(), ctx).await?;
-            let mut wtr = BlobWriter::<BUFFERED>::new(file, 0);
+            let mut wtr = BlobWriter::<BUFFERED>::new(file, 0, &gate, cancel.clone(), ctx);
             for blob in blobs.iter() {
                 let (_, res) = if compression {
                     let res = wtr
diff --git a/pageserver/src/tenant/block_io.rs b/pageserver/src/tenant/block_io.rs
index 66c586daffdc..67231556262f 100644
--- a/pageserver/src/tenant/block_io.rs
+++ b/pageserver/src/tenant/block_io.rs
@@ -216,12 +216,8 @@ impl<'a> FileBlockReader<'a> {
         match cache
             .read_immutable_buf(self.file_id, blknum, ctx)
             .await
-            .map_err(|e| {
-                std::io::Error::new(
-                    std::io::ErrorKind::Other,
-                    format!("Failed to read immutable buf: {e:#}"),
-                )
-            })? {
+            .map_err(|e| std::io::Error::other(format!("Failed to read immutable buf: {e:#}")))?
+        {
             ReadBufResult::Found(guard) => Ok(guard.into()),
             ReadBufResult::NotFound(write_guard) => {
                 // Read the page from disk into the buffer
diff --git a/pageserver/src/tenant/remote_timeline_client/manifest.rs b/pageserver/src/tenant/remote_timeline_client/manifest.rs
index 0e07acfbc8c5..7dba4508e23f 100644
--- a/pageserver/src/tenant/remote_timeline_client/manifest.rs
+++ b/pageserver/src/tenant/remote_timeline_client/manifest.rs
@@ -1,4 +1,5 @@
 use chrono::NaiveDateTime;
+use pageserver_api::shard::ShardStripeSize;
 use serde::{Deserialize, Serialize};
 use utils::id::TimelineId;
 use utils::lsn::Lsn;
@@ -14,6 +15,12 @@ pub struct TenantManifest {
     /// allow release rollbacks.
     pub version: usize,
 
+    /// This tenant's stripe size. This is only advisory, and used to recover tenant data from
+    /// remote storage. The autoritative source is the storage controller. If None, assume the
+    /// original default value of 32768 blocks (256 MB).
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub stripe_size: Option<ShardStripeSize>,
+
     /// The list of offloaded timelines together with enough information
     /// to not have to actually load them.
     ///
@@ -42,7 +49,12 @@ pub struct OffloadedTimelineManifest {
 
 /// The newest manifest version. This should be incremented on changes, even non-breaking ones. We
 /// do not use deny_unknown_fields, so new fields are not breaking.
-pub const LATEST_TENANT_MANIFEST_VERSION: usize = 1;
+///
+/// 1: initial version
+/// 2: +stripe_size
+///
+/// When adding new versions, also add a parse_vX test case below.
+pub const LATEST_TENANT_MANIFEST_VERSION: usize = 2;
 
 impl TenantManifest {
     /// Returns true if the manifests are equal, ignoring the version number. This avoids
@@ -56,10 +68,11 @@ impl TenantManifest {
         // We could alternatively just clone and modify the version here.
         let Self {
             version: _, // ignore version
+            stripe_size,
             offloaded_timelines,
         } = self;
 
-        offloaded_timelines == &other.offloaded_timelines
+        stripe_size == &other.stripe_size && offloaded_timelines == &other.offloaded_timelines
     }
 
     /// Decodes a manifest from JSON.
@@ -89,6 +102,7 @@ mod tests {
          }"#;
         let expected = TenantManifest {
             version: 0,
+            stripe_size: None,
             offloaded_timelines: Vec::new(),
         };
         assert_eq!(expected, TenantManifest::from_json_bytes(json.as_bytes())?);
@@ -104,6 +118,7 @@ mod tests {
          }"#;
         let expected = TenantManifest {
             version: 1,
+            stripe_size: None,
             offloaded_timelines: Vec::new(),
         };
         assert_eq!(expected, TenantManifest::from_json_bytes(json.as_bytes())?);
@@ -130,6 +145,50 @@ mod tests {
          }"#;
         let expected = TenantManifest {
             version: 1,
+            stripe_size: None,
+            offloaded_timelines: vec![
+                OffloadedTimelineManifest {
+                    timeline_id: TimelineId::from_str("5c4df612fd159e63c1b7853fe94d97da")?,
+                    ancestor_timeline_id: None,
+                    ancestor_retain_lsn: None,
+                    archived_at: NaiveDateTime::from_str("2025-03-07T11:07:11.373105434")?,
+                },
+                OffloadedTimelineManifest {
+                    timeline_id: TimelineId::from_str("f3def5823ad7080d2ea538d8e12163fa")?,
+                    ancestor_timeline_id: Some(TimelineId::from_str(
+                        "5c4df612fd159e63c1b7853fe94d97da",
+                    )?),
+                    ancestor_retain_lsn: Some(Lsn::from_str("0/1F79038")?),
+                    archived_at: NaiveDateTime::from_str("2025-03-05T11:10:22.257901390")?,
+                },
+            ],
+        };
+        assert_eq!(expected, TenantManifest::from_json_bytes(json.as_bytes())?);
+        Ok(())
+    }
+
+    /// v2 manifests should be parsed, for backwards compatibility.
+    #[test]
+    fn parse_v2() -> anyhow::Result<()> {
+        let json = r#"{
+             "version": 2,
+             "stripe_size": 32768,
+             "offloaded_timelines": [
+                 {
+                     "timeline_id": "5c4df612fd159e63c1b7853fe94d97da",
+                     "archived_at": "2025-03-07T11:07:11.373105434"
+                 },
+                 {
+                     "timeline_id": "f3def5823ad7080d2ea538d8e12163fa",
+                     "ancestor_timeline_id": "5c4df612fd159e63c1b7853fe94d97da",
+                     "ancestor_retain_lsn": "0/1F79038",
+                     "archived_at": "2025-03-05T11:10:22.257901390"
+                 }
+             ]
+         }"#;
+        let expected = TenantManifest {
+            version: 2,
+            stripe_size: Some(ShardStripeSize(32768)),
             offloaded_timelines: vec![
                 OffloadedTimelineManifest {
                     timeline_id: TimelineId::from_str("5c4df612fd159e63c1b7853fe94d97da")?,
diff --git a/pageserver/src/tenant/storage_layer.rs b/pageserver/src/tenant/storage_layer.rs
index 2ea0c1b97902..796ad01e5452 100644
--- a/pageserver/src/tenant/storage_layer.rs
+++ b/pageserver/src/tenant/storage_layer.rs
@@ -715,13 +715,34 @@ pub(crate) enum LayerId {
 }
 
 /// Uniquely identify a layer visit by the layer
-/// and LSN floor (or start LSN) of the reads.
-/// The layer itself is not enough since we may
-/// have different LSN lower bounds for delta layer reads.
+/// and LSN range of the reads. Note that the end of the range is exclusive.
+///
+/// The layer itself is not enough since we may have different LSN lower
+/// bounds for delta layer reads. Scenarios where this can happen are:
+///
+/// 1. Layer overlaps: imagine an image layer inside and in-memory layer
+///    and a query that only partially hits the image layer. Part of the query
+///    needs to read the whole in-memory layer and the other part needs to read
+///    only up to the image layer. Hence, they'll have different LSN floor values
+///    for the read.
+///
+/// 2. Scattered reads: the read path supports starting at different LSNs. Imagine
+///    The start LSN for one range is inside a layer and the start LSN for another range
+///    Is above the layer (includes all of it). Both ranges need to read the layer all the
+///    Way to the end but starting at different points. Hence, they'll have different LSN
+///    Ceil values.
+///
+/// The implication is that we might visit the same layer multiple times
+/// in order to read different LSN ranges from it. In practice, this isn't very concerning
+/// because:
+/// 1. Layer overlaps are rare and generally not intended
+/// 2. Scattered reads will stabilise after the first few layers provided their starting LSNs
+///    are grouped tightly enough (likely the case).
 #[derive(Debug, PartialEq, Eq, Clone, Hash)]
 struct LayerToVisitId {
     layer_id: LayerId,
     lsn_floor: Lsn,
+    lsn_ceil: Lsn,
 }
 
 #[derive(Debug, PartialEq, Eq, Hash)]
@@ -805,6 +826,7 @@ impl LayerFringe {
         let layer_to_visit_id = LayerToVisitId {
             layer_id: layer.id(),
             lsn_floor: lsn_range.start,
+            lsn_ceil: lsn_range.end,
         };
 
         let entry = self.visit_reads.entry(layer_to_visit_id.clone());
diff --git a/pageserver/src/tenant/storage_layer/batch_split_writer.rs b/pageserver/src/tenant/storage_layer/batch_split_writer.rs
index fd50e4805de4..39cd02d101b4 100644
--- a/pageserver/src/tenant/storage_layer/batch_split_writer.rs
+++ b/pageserver/src/tenant/storage_layer/batch_split_writer.rs
@@ -5,6 +5,7 @@ use std::sync::Arc;
 use bytes::Bytes;
 use pageserver_api::key::{KEY_SIZE, Key};
 use pageserver_api::value::Value;
+use tokio_util::sync::CancellationToken;
 use utils::id::TimelineId;
 use utils::lsn::Lsn;
 use utils::shard::TenantShardId;
@@ -179,7 +180,7 @@ impl BatchLayerWriter {
 
 /// An image writer that takes images and produces multiple image layers.
 #[must_use]
-pub struct SplitImageLayerWriter {
+pub struct SplitImageLayerWriter<'a> {
     inner: ImageLayerWriter,
     target_layer_size: u64,
     lsn: Lsn,
@@ -188,9 +189,12 @@ pub struct SplitImageLayerWriter {
     tenant_shard_id: TenantShardId,
     batches: BatchLayerWriter,
     start_key: Key,
+    gate: &'a utils::sync::gate::Gate,
+    cancel: CancellationToken,
 }
 
-impl SplitImageLayerWriter {
+impl<'a> SplitImageLayerWriter<'a> {
+    #[allow(clippy::too_many_arguments)]
     pub async fn new(
         conf: &'static PageServerConf,
         timeline_id: TimelineId,
@@ -198,6 +202,8 @@ impl SplitImageLayerWriter {
         start_key: Key,
         lsn: Lsn,
         target_layer_size: u64,
+        gate: &'a utils::sync::gate::Gate,
+        cancel: CancellationToken,
         ctx: &RequestContext,
     ) -> anyhow::Result<Self> {
         Ok(Self {
@@ -208,6 +214,8 @@ impl SplitImageLayerWriter {
                 tenant_shard_id,
                 &(start_key..Key::MAX),
                 lsn,
+                gate,
+                cancel.clone(),
                 ctx,
             )
             .await?,
@@ -217,6 +225,8 @@ impl SplitImageLayerWriter {
             batches: BatchLayerWriter::new(conf).await?,
             lsn,
             start_key,
+            gate,
+            cancel,
         })
     }
 
@@ -239,6 +249,8 @@ impl SplitImageLayerWriter {
                 self.tenant_shard_id,
                 &(key..Key::MAX),
                 self.lsn,
+                self.gate,
+                self.cancel.clone(),
                 ctx,
             )
             .await?;
@@ -291,7 +303,7 @@ impl SplitImageLayerWriter {
 /// into a single file. This behavior might change in the future. For reference, the legacy compaction algorithm
 /// will split them into multiple files based on size.
 #[must_use]
-pub struct SplitDeltaLayerWriter {
+pub struct SplitDeltaLayerWriter<'a> {
     inner: Option<(Key, DeltaLayerWriter)>,
     target_layer_size: u64,
     conf: &'static PageServerConf,
@@ -300,15 +312,19 @@ pub struct SplitDeltaLayerWriter {
     lsn_range: Range<Lsn>,
     last_key_written: Key,
     batches: BatchLayerWriter,
+    gate: &'a utils::sync::gate::Gate,
+    cancel: CancellationToken,
 }
 
-impl SplitDeltaLayerWriter {
+impl<'a> SplitDeltaLayerWriter<'a> {
     pub async fn new(
         conf: &'static PageServerConf,
         timeline_id: TimelineId,
         tenant_shard_id: TenantShardId,
         lsn_range: Range<Lsn>,
         target_layer_size: u64,
+        gate: &'a utils::sync::gate::Gate,
+        cancel: CancellationToken,
     ) -> anyhow::Result<Self> {
         Ok(Self {
             target_layer_size,
@@ -319,6 +335,8 @@ impl SplitDeltaLayerWriter {
             lsn_range,
             last_key_written: Key::MIN,
             batches: BatchLayerWriter::new(conf).await?,
+            gate,
+            cancel,
         })
     }
 
@@ -344,6 +362,8 @@ impl SplitDeltaLayerWriter {
                     self.tenant_shard_id,
                     key,
                     self.lsn_range.clone(),
+                    self.gate,
+                    self.cancel.clone(),
                     ctx,
                 )
                 .await?,
@@ -362,11 +382,13 @@ impl SplitDeltaLayerWriter {
                     self.tenant_shard_id,
                     key,
                     self.lsn_range.clone(),
+                    self.gate,
+                    self.cancel.clone(),
                     ctx,
                 )
                 .await?;
                 let (start_key, prev_delta_writer) =
-                    std::mem::replace(&mut self.inner, Some((key, next_delta_writer))).unwrap();
+                    self.inner.replace((key, next_delta_writer)).unwrap();
                 self.batches.add_unfinished_delta_writer(
                     prev_delta_writer,
                     start_key..key,
@@ -469,6 +491,8 @@ mod tests {
             get_key(0),
             Lsn(0x18),
             4 * 1024 * 1024,
+            &tline.gate,
+            tline.cancel.clone(),
             &ctx,
         )
         .await
@@ -480,6 +504,8 @@ mod tests {
             tenant.tenant_shard_id,
             Lsn(0x18)..Lsn(0x20),
             4 * 1024 * 1024,
+            &tline.gate,
+            tline.cancel.clone(),
         )
         .await
         .unwrap();
@@ -546,6 +572,8 @@ mod tests {
             get_key(0),
             Lsn(0x18),
             4 * 1024 * 1024,
+            &tline.gate,
+            tline.cancel.clone(),
             &ctx,
         )
         .await
@@ -556,6 +584,8 @@ mod tests {
             tenant.tenant_shard_id,
             Lsn(0x18)..Lsn(0x20),
             4 * 1024 * 1024,
+            &tline.gate,
+            tline.cancel.clone(),
         )
         .await
         .unwrap();
@@ -643,6 +673,8 @@ mod tests {
             get_key(0),
             Lsn(0x18),
             4 * 1024,
+            &tline.gate,
+            tline.cancel.clone(),
             &ctx,
         )
         .await
@@ -654,6 +686,8 @@ mod tests {
             tenant.tenant_shard_id,
             Lsn(0x18)..Lsn(0x20),
             4 * 1024,
+            &tline.gate,
+            tline.cancel.clone(),
         )
         .await
         .unwrap();
@@ -730,6 +764,8 @@ mod tests {
             tenant.tenant_shard_id,
             Lsn(0x10)..Lsn(N as u64 * 16 + 0x10),
             4 * 1024 * 1024,
+            &tline.gate,
+            tline.cancel.clone(),
         )
         .await
         .unwrap();
diff --git a/pageserver/src/tenant/storage_layer/delta_layer.rs b/pageserver/src/tenant/storage_layer/delta_layer.rs
index 05b0bc1a5c6c..4417b8aa5135 100644
--- a/pageserver/src/tenant/storage_layer/delta_layer.rs
+++ b/pageserver/src/tenant/storage_layer/delta_layer.rs
@@ -50,6 +50,7 @@ use rand::distributions::Alphanumeric;
 use serde::{Deserialize, Serialize};
 use tokio::sync::OnceCell;
 use tokio_epoll_uring::IoBuf;
+use tokio_util::sync::CancellationToken;
 use tracing::*;
 use utils::bin_ser::BeSer;
 use utils::id::{TenantId, TimelineId};
@@ -400,12 +401,15 @@ impl DeltaLayerWriterInner {
     ///
     /// Start building a new delta layer.
     ///
+    #[allow(clippy::too_many_arguments)]
     async fn new(
         conf: &'static PageServerConf,
         timeline_id: TimelineId,
         tenant_shard_id: TenantShardId,
         key_start: Key,
         lsn_range: Range<Lsn>,
+        gate: &utils::sync::gate::Gate,
+        cancel: CancellationToken,
         ctx: &RequestContext,
     ) -> anyhow::Result<Self> {
         // Create the file initially with a temporary filename. We don't know
@@ -420,7 +424,7 @@ impl DeltaLayerWriterInner {
         let mut file = VirtualFile::create(&path, ctx).await?;
         // make room for the header block
         file.seek(SeekFrom::Start(PAGE_SZ as u64)).await?;
-        let blob_writer = BlobWriter::new(file, PAGE_SZ as u64);
+        let blob_writer = BlobWriter::new(file, PAGE_SZ as u64, gate, cancel, ctx);
 
         // Initialize the b-tree index builder
         let block_buf = BlockBuf::new();
@@ -628,12 +632,15 @@ impl DeltaLayerWriter {
     ///
     /// Start building a new delta layer.
     ///
+    #[allow(clippy::too_many_arguments)]
     pub async fn new(
         conf: &'static PageServerConf,
         timeline_id: TimelineId,
         tenant_shard_id: TenantShardId,
         key_start: Key,
         lsn_range: Range<Lsn>,
+        gate: &utils::sync::gate::Gate,
+        cancel: CancellationToken,
         ctx: &RequestContext,
     ) -> anyhow::Result<Self> {
         Ok(Self {
@@ -644,6 +651,8 @@ impl DeltaLayerWriter {
                     tenant_shard_id,
                     key_start,
                     lsn_range,
+                    gate,
+                    cancel,
                     ctx,
                 )
                 .await?,
@@ -1885,6 +1894,8 @@ pub(crate) mod test {
             harness.tenant_shard_id,
             entries_meta.key_range.start,
             entries_meta.lsn_range.clone(),
+            &timeline.gate,
+            timeline.cancel.clone(),
             &ctx,
         )
         .await?;
@@ -2079,6 +2090,8 @@ pub(crate) mod test {
                 tenant.tenant_shard_id,
                 Key::MIN,
                 Lsn(0x11)..truncate_at,
+                &branch.gate,
+                branch.cancel.clone(),
                 ctx,
             )
             .await
@@ -2213,6 +2226,8 @@ pub(crate) mod test {
             tenant.tenant_shard_id,
             *key_start,
             (*lsn_min)..lsn_end,
+            &tline.gate,
+            tline.cancel.clone(),
             ctx,
         )
         .await?;
diff --git a/pageserver/src/tenant/storage_layer/image_layer.rs b/pageserver/src/tenant/storage_layer/image_layer.rs
index 3243b7394257..3744d615f24e 100644
--- a/pageserver/src/tenant/storage_layer/image_layer.rs
+++ b/pageserver/src/tenant/storage_layer/image_layer.rs
@@ -48,6 +48,7 @@ use rand::distributions::Alphanumeric;
 use serde::{Deserialize, Serialize};
 use tokio::sync::OnceCell;
 use tokio_stream::StreamExt;
+use tokio_util::sync::CancellationToken;
 use tracing::*;
 use utils::bin_ser::BeSer;
 use utils::id::{TenantId, TimelineId};
@@ -748,12 +749,15 @@ impl ImageLayerWriterInner {
     ///
     /// Start building a new image layer.
     ///
+    #[allow(clippy::too_many_arguments)]
     async fn new(
         conf: &'static PageServerConf,
         timeline_id: TimelineId,
         tenant_shard_id: TenantShardId,
         key_range: &Range<Key>,
         lsn: Lsn,
+        gate: &utils::sync::gate::Gate,
+        cancel: CancellationToken,
         ctx: &RequestContext,
     ) -> anyhow::Result<Self> {
         // Create the file initially with a temporary filename.
@@ -780,7 +784,7 @@ impl ImageLayerWriterInner {
         };
         // make room for the header block
         file.seek(SeekFrom::Start(PAGE_SZ as u64)).await?;
-        let blob_writer = BlobWriter::new(file, PAGE_SZ as u64);
+        let blob_writer = BlobWriter::new(file, PAGE_SZ as u64, gate, cancel, ctx);
 
         // Initialize the b-tree index builder
         let block_buf = BlockBuf::new();
@@ -988,18 +992,30 @@ impl ImageLayerWriter {
     ///
     /// Start building a new image layer.
     ///
+    #[allow(clippy::too_many_arguments)]
     pub async fn new(
         conf: &'static PageServerConf,
         timeline_id: TimelineId,
         tenant_shard_id: TenantShardId,
         key_range: &Range<Key>,
         lsn: Lsn,
+        gate: &utils::sync::gate::Gate,
+        cancel: CancellationToken,
         ctx: &RequestContext,
     ) -> anyhow::Result<ImageLayerWriter> {
         Ok(Self {
             inner: Some(
-                ImageLayerWriterInner::new(conf, timeline_id, tenant_shard_id, key_range, lsn, ctx)
-                    .await?,
+                ImageLayerWriterInner::new(
+                    conf,
+                    timeline_id,
+                    tenant_shard_id,
+                    key_range,
+                    lsn,
+                    gate,
+                    cancel,
+                    ctx,
+                )
+                .await?,
             ),
         })
     }
@@ -1192,7 +1208,7 @@ mod test {
 
         // This key range contains several 0x8000 page stripes, only one of which belongs to shard zero
         let input_start = Key::from_hex("000000067f00000001000000ae0000000000").unwrap();
-        let input_end = Key::from_hex("000000067f00000001000000ae0000020000").unwrap();
+        let input_end = Key::from_hex("000000067f00000001000000ae0000002000").unwrap();
         let range = input_start..input_end;
 
         // Build an image layer to filter
@@ -1203,6 +1219,8 @@ mod test {
                 harness.tenant_shard_id,
                 &range,
                 lsn,
+                &timeline.gate,
+                timeline.cancel.clone(),
                 &ctx,
             )
             .await
@@ -1235,7 +1253,7 @@ mod test {
             let shard_identity = ShardIdentity::new(
                 ShardNumber(shard_number),
                 shard_count,
-                ShardStripeSize(0x8000),
+                ShardStripeSize(0x800),
             )
             .unwrap();
             let harness = TenantHarness::create_custom(
@@ -1268,6 +1286,8 @@ mod test {
                 harness.tenant_shard_id,
                 &range,
                 lsn,
+                &timeline.gate,
+                timeline.cancel.clone(),
                 &ctx,
             )
             .await
@@ -1287,12 +1307,12 @@ mod test {
 
             // This exact size and those below will need updating as/when the layer encoding changes, but
             // should be deterministic for a given version of the format, as we used no randomness generating the input.
-            assert_eq!(original_size, 1597440);
+            assert_eq!(original_size, 122880);
 
             match shard_number {
                 0 => {
                     // We should have written out just one stripe for our shard identity
-                    assert_eq!(wrote_keys, 0x8000);
+                    assert_eq!(wrote_keys, 0x800);
                     let replacement = replacement.unwrap();
 
                     // We should have dropped some of the data
@@ -1300,7 +1320,7 @@ mod test {
                     assert!(replacement.metadata().file_size > 0);
 
                     // Assert that we dropped ~3/4 of the data.
-                    assert_eq!(replacement.metadata().file_size, 417792);
+                    assert_eq!(replacement.metadata().file_size, 49152);
                 }
                 1 => {
                     // Shard 1 has no keys in our input range
@@ -1309,19 +1329,19 @@ mod test {
                 }
                 2 => {
                     // Shard 2 has one stripes in the input range
-                    assert_eq!(wrote_keys, 0x8000);
+                    assert_eq!(wrote_keys, 0x800);
                     let replacement = replacement.unwrap();
                     assert!(replacement.metadata().file_size < original_size);
                     assert!(replacement.metadata().file_size > 0);
-                    assert_eq!(replacement.metadata().file_size, 417792);
+                    assert_eq!(replacement.metadata().file_size, 49152);
                 }
                 3 => {
                     // Shard 3 has two stripes in the input range
-                    assert_eq!(wrote_keys, 0x10000);
+                    assert_eq!(wrote_keys, 0x1000);
                     let replacement = replacement.unwrap();
                     assert!(replacement.metadata().file_size < original_size);
                     assert!(replacement.metadata().file_size > 0);
-                    assert_eq!(replacement.metadata().file_size, 811008);
+                    assert_eq!(replacement.metadata().file_size, 73728);
                 }
                 _ => unreachable!(),
             }
@@ -1346,6 +1366,8 @@ mod test {
             tenant.tenant_shard_id,
             &key_range,
             lsn,
+            &tline.gate,
+            tline.cancel.clone(),
             ctx,
         )
         .await?;
diff --git a/pageserver/src/tenant/storage_layer/inmemory_layer.rs b/pageserver/src/tenant/storage_layer/inmemory_layer.rs
index 388ed3201c20..5d558e66cc7e 100644
--- a/pageserver/src/tenant/storage_layer/inmemory_layer.rs
+++ b/pageserver/src/tenant/storage_layer/inmemory_layer.rs
@@ -719,6 +719,8 @@ impl InMemoryLayer {
         ctx: &RequestContext,
         key_range: Option<Range<Key>>,
         l0_flush_global_state: &l0_flush::Inner,
+        gate: &utils::sync::gate::Gate,
+        cancel: CancellationToken,
     ) -> Result<Option<(PersistentLayerDesc, Utf8PathBuf)>> {
         // Grab the lock in read-mode. We hold it over the I/O, but because this
         // layer is not writeable anymore, no one should be trying to acquire the
@@ -759,6 +761,8 @@ impl InMemoryLayer {
             self.tenant_shard_id,
             Key::MIN,
             self.start_lsn..end_lsn,
+            gate,
+            cancel,
             ctx,
         )
         .await?;
diff --git a/pageserver/src/tenant/storage_layer/inmemory_layer/vectored_dio_read.rs b/pageserver/src/tenant/storage_layer/inmemory_layer/vectored_dio_read.rs
index 90455fd0cabd..ea354fc716d6 100644
--- a/pageserver/src/tenant/storage_layer/inmemory_layer/vectored_dio_read.rs
+++ b/pageserver/src/tenant/storage_layer/inmemory_layer/vectored_dio_read.rs
@@ -766,7 +766,7 @@ mod tests {
                     rand::Rng::fill(&mut rand::thread_rng(), &mut dst_slice[len..]); // to discover bugs
                     Ok((dst, len))
                 }
-                Err(e) => Err(std::io::Error::new(std::io::ErrorKind::Other, e)),
+                Err(e) => Err(std::io::Error::other(e)),
             }
         }
     }
diff --git a/pageserver/src/tenant/storage_layer/merge_iterator.rs b/pageserver/src/tenant/storage_layer/merge_iterator.rs
index 76cdddd06a8b..55db9fe06a32 100644
--- a/pageserver/src/tenant/storage_layer/merge_iterator.rs
+++ b/pageserver/src/tenant/storage_layer/merge_iterator.rs
@@ -59,6 +59,7 @@ impl LayerIterRef<'_> {
 /// 1. Unified iterator for image and delta layers.
 /// 2. `Ord` for use in [`MergeIterator::heap`] (for the k-merge).
 /// 3. Lazy creation of the real delta/image iterator.
+#[allow(clippy::large_enum_variant, reason = "TODO")]
 pub(crate) enum IteratorWrapper<'a> {
     NotLoaded {
         ctx: &'a RequestContext,
diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs
index 5174da0f4384..c27a4b62da9d 100644
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -24,6 +24,7 @@ use std::sync::{Arc, Mutex, OnceLock, RwLock, Weak};
 use std::time::{Duration, Instant, SystemTime};
 
 use crate::PERF_TRACE_TARGET;
+use crate::walredo::RedoAttemptType;
 use anyhow::{Context, Result, anyhow, bail, ensure};
 use arc_swap::{ArcSwap, ArcSwapOption};
 use bytes::Bytes;
@@ -115,7 +116,7 @@ use crate::pgdatadir_mapping::{
 use crate::task_mgr::TaskKind;
 use crate::tenant::config::AttachmentMode;
 use crate::tenant::gc_result::GcResult;
-use crate::tenant::layer_map::{LayerMap, SearchResult};
+use crate::tenant::layer_map::LayerMap;
 use crate::tenant::metadata::TimelineMetadata;
 use crate::tenant::storage_layer::delta_layer::DeltaEntry;
 use crate::tenant::storage_layer::inmemory_layer::IndexEntry;
@@ -584,7 +585,7 @@ pub(crate) enum PageReconstructError {
     WalRedo(anyhow::Error),
 
     #[error("{0}")]
-    MissingKey(MissingKeyError),
+    MissingKey(Box<MissingKeyError>),
 }
 
 impl From<anyhow::Error> for PageReconstructError {
@@ -689,16 +690,23 @@ impl std::fmt::Display for ReadPath {
 
 #[derive(thiserror::Error)]
 pub struct MissingKeyError {
-    key: Key,
+    keyspace: KeySpace,
     shard: ShardNumber,
-    cont_lsn: Lsn,
-    request_lsn: Lsn,
+    query: Option<VersionedKeySpaceQuery>,
+    // This is largest request LSN from the get page request batch
+    original_hwm_lsn: Lsn,
     ancestor_lsn: Option<Lsn>,
     /// Debug information about the read path if there's an error
     read_path: Option<ReadPath>,
     backtrace: Option<std::backtrace::Backtrace>,
 }
 
+impl MissingKeyError {
+    fn enrich(&mut self, query: VersionedKeySpaceQuery) {
+        self.query = Some(query);
+    }
+}
+
 impl std::fmt::Debug for MissingKeyError {
     fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
         write!(f, "{}", self)
@@ -709,14 +717,18 @@ impl std::fmt::Display for MissingKeyError {
     fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
         write!(
             f,
-            "could not find data for key {} (shard {:?}) at LSN {}, request LSN {}",
-            self.key, self.shard, self.cont_lsn, self.request_lsn
+            "could not find data for key {} (shard {:?}), original HWM LSN {}",
+            self.keyspace, self.shard, self.original_hwm_lsn
         )?;
 
         if let Some(ref ancestor_lsn) = self.ancestor_lsn {
             write!(f, ", ancestor {}", ancestor_lsn)?;
         }
 
+        if let Some(ref query) = self.query {
+            write!(f, ", query {}", query)?;
+        }
+
         if let Some(ref read_path) = self.read_path {
             write!(f, "\n{}", read_path)?;
         }
@@ -816,7 +828,7 @@ pub(crate) enum GetVectoredError {
     InvalidLsn(Lsn),
 
     #[error("requested key not found: {0}")]
-    MissingKey(MissingKeyError),
+    MissingKey(Box<MissingKeyError>),
 
     #[error("ancestry walk")]
     GetReadyAncestorError(#[source] GetReadyAncestorError),
@@ -927,7 +939,7 @@ impl std::fmt::Debug for Timeline {
     }
 }
 
-#[derive(thiserror::Error, Debug)]
+#[derive(thiserror::Error, Debug, Clone)]
 pub(crate) enum WaitLsnError {
     // Called on a timeline which is shutting down
     #[error("Shutdown")]
@@ -1039,6 +1051,7 @@ pub(crate) enum ShutdownMode {
     Hard,
 }
 
+#[allow(clippy::large_enum_variant, reason = "TODO")]
 enum ImageLayerCreationOutcome {
     /// We generated an image layer
     Generated {
@@ -1126,14 +1139,12 @@ impl Timeline {
         // page_service.
         debug_assert!(!self.shard_identity.is_key_disposable(&key));
 
-        let keyspace = KeySpace {
-            ranges: vec![key..key.next()],
-        };
-
         let mut reconstruct_state = ValuesReconstructState::new(IoConcurrency::sequential());
 
+        let query = VersionedKeySpaceQuery::uniform(KeySpace::single(key..key.next()), lsn);
+
         let vectored_res = self
-            .get_vectored_impl(keyspace.clone(), lsn, &mut reconstruct_state, ctx)
+            .get_vectored_impl(query, &mut reconstruct_state, ctx)
             .await;
 
         let key_value = vectored_res?.pop_first();
@@ -1151,15 +1162,17 @@ impl Timeline {
                     value
                 }
             }
-            None => Err(PageReconstructError::MissingKey(MissingKeyError {
-                key,
-                shard: self.shard_identity.get_shard_number(&key),
-                cont_lsn: Lsn(0),
-                request_lsn: lsn,
-                ancestor_lsn: None,
-                backtrace: None,
-                read_path: None,
-            })),
+            None => Err(PageReconstructError::MissingKey(Box::new(
+                MissingKeyError {
+                    keyspace: KeySpace::single(key..key.next()),
+                    shard: self.shard_identity.get_shard_number(&key),
+                    original_hwm_lsn: lsn,
+                    ancestor_lsn: None,
+                    backtrace: None,
+                    read_path: None,
+                    query: None,
+                },
+            ))),
         }
     }
 
@@ -1172,21 +1185,18 @@ impl Timeline {
     /// which actually vectorizes the read path.
     pub(crate) async fn get_vectored(
         &self,
-        keyspace: KeySpace,
-        lsn: Lsn,
+        query: VersionedKeySpaceQuery,
         io_concurrency: super::storage_layer::IoConcurrency,
         ctx: &RequestContext,
     ) -> Result<BTreeMap<Key, Result<Bytes, PageReconstructError>>, GetVectoredError> {
-        if !lsn.is_valid() {
-            return Err(GetVectoredError::InvalidLsn(lsn));
-        }
+        let total_keyspace = query.total_keyspace();
 
-        let key_count = keyspace.total_raw_size().try_into().unwrap();
+        let key_count = total_keyspace.total_raw_size().try_into().unwrap();
         if key_count > Timeline::MAX_GET_VECTORED_KEYS {
             return Err(GetVectoredError::Oversized(key_count));
         }
 
-        for range in &keyspace.ranges {
+        for range in &total_keyspace.ranges {
             let mut key = range.start;
             while key != range.end {
                 assert!(!self.shard_identity.is_key_disposable(&key));
@@ -1195,9 +1205,8 @@ impl Timeline {
         }
 
         trace!(
-            "get vectored request for {:?}@{} from task kind {:?}",
-            keyspace,
-            lsn,
+            "get vectored query {} from task kind {:?}",
+            query,
             ctx.task_kind(),
         );
 
@@ -1206,12 +1215,7 @@ impl Timeline {
             .map(|metric| (metric, Instant::now()));
 
         let res = self
-            .get_vectored_impl(
-                keyspace.clone(),
-                lsn,
-                &mut ValuesReconstructState::new(io_concurrency),
-                ctx,
-            )
+            .get_vectored_impl(query, &mut ValuesReconstructState::new(io_concurrency), ctx)
             .await;
 
         if let Some((metric, start)) = start {
@@ -1262,13 +1266,10 @@ impl Timeline {
             .for_task_kind(ctx.task_kind())
             .map(ScanLatencyOngoingRecording::start_recording);
 
+        let query = VersionedKeySpaceQuery::uniform(keyspace, lsn);
+
         let vectored_res = self
-            .get_vectored_impl(
-                keyspace.clone(),
-                lsn,
-                &mut ValuesReconstructState::new(io_concurrency),
-                ctx,
-            )
+            .get_vectored_impl(query, &mut ValuesReconstructState::new(io_concurrency), ctx)
             .await;
 
         if let Some(recording) = start {
@@ -1280,18 +1281,27 @@ impl Timeline {
 
     pub(super) async fn get_vectored_impl(
         &self,
-        keyspace: KeySpace,
-        lsn: Lsn,
+        query: VersionedKeySpaceQuery,
         reconstruct_state: &mut ValuesReconstructState,
         ctx: &RequestContext,
     ) -> Result<BTreeMap<Key, Result<Bytes, PageReconstructError>>, GetVectoredError> {
         let read_path = if self.conf.enable_read_path_debugging || ctx.read_path_debug() {
-            Some(ReadPath::new(keyspace.clone(), lsn))
+            Some(ReadPath::new(
+                query.total_keyspace(),
+                query.high_watermark_lsn()?,
+            ))
         } else {
             None
         };
+
         reconstruct_state.read_path = read_path;
 
+        let redo_attempt_type = if ctx.task_kind() == TaskKind::Compaction {
+            RedoAttemptType::LegacyCompaction
+        } else {
+            RedoAttemptType::ReadPage
+        };
+
         let traversal_res: Result<(), _> = {
             let ctx = RequestContextBuilder::from(ctx)
                 .perf_span(|crnt_perf_span| {
@@ -1303,7 +1313,7 @@ impl Timeline {
                 })
                 .attached_child();
 
-            self.get_vectored_reconstruct_data(keyspace.clone(), lsn, reconstruct_state, &ctx)
+            self.get_vectored_reconstruct_data(query.clone(), reconstruct_state, &ctx)
                 .maybe_perf_instrument(&ctx, |crnt_perf_span| crnt_perf_span.clone())
                 .await
         };
@@ -1316,6 +1326,13 @@ impl Timeline {
                 .map(|state| state.collect_pending_ios())
                 .collect::<FuturesUnordered<_>>();
             while collect_futs.next().await.is_some() {}
+
+            // Enrich the missing key error with the original query.
+            if let GetVectoredError::MissingKey(mut missing_err) = err {
+                missing_err.enrich(query.clone());
+                return Err(GetVectoredError::MissingKey(missing_err));
+            }
+
             return Err(err);
         };
 
@@ -1333,6 +1350,8 @@ impl Timeline {
 
         let futs = FuturesUnordered::new();
         for (key, state) in std::mem::take(&mut reconstruct_state.keys) {
+            let req_lsn_for_key = query.map_key_to_lsn(&key);
+
             futs.push({
                 let walredo_self = self.myself.upgrade().expect("&self method holds the arc");
                 let ctx = RequestContextBuilder::from(&ctx)
@@ -1379,7 +1398,7 @@ impl Timeline {
 
                     let walredo_deltas = converted.num_deltas();
                     let walredo_res = walredo_self
-                        .reconstruct_value(key, lsn, converted)
+                        .reconstruct_value(key, req_lsn_for_key, converted, redo_attempt_type)
                         .maybe_perf_instrument(&ctx, |crnt_perf_span| {
                             info_span!(
                                 target: PERF_TRACE_TARGET,
@@ -1406,15 +1425,18 @@ impl Timeline {
         // to avoid infinite results.
         if !results.is_empty() {
             if layers_visited >= Self::LAYERS_VISITED_WARN_THRESHOLD {
+                let total_keyspace = query.total_keyspace();
+                let max_request_lsn = query.high_watermark_lsn().expect("Validated previously");
+
                 static LOG_PACER: Lazy<Mutex<RateLimit>> =
                     Lazy::new(|| Mutex::new(RateLimit::new(Duration::from_secs(60))));
                 LOG_PACER.lock().unwrap().call(|| {
-                    let num_keys = keyspace.total_raw_size();
+                    let num_keys = total_keyspace.total_raw_size();
                     let num_pages = results.len();
                     tracing::info!(
                       shard_id = %self.tenant_shard_id.shard_slug(),
-                      lsn = %lsn,
-                      "Vectored read for {keyspace} visited {layers_visited} layers. Returned {num_pages}/{num_keys} pages.",
+                      lsn = %max_request_lsn,
+                      "Vectored read for {total_keyspace} visited {layers_visited} layers. Returned {num_pages}/{num_keys} pages.",
                     );
                 });
             }
@@ -2715,6 +2737,10 @@ impl Timeline {
             .tenant_conf
             .gc_compaction_enabled
             .unwrap_or(self.conf.default_tenant_conf.gc_compaction_enabled);
+        let gc_compaction_verification = tenant_conf
+            .tenant_conf
+            .gc_compaction_verification
+            .unwrap_or(self.conf.default_tenant_conf.gc_compaction_verification);
         let gc_compaction_initial_threshold_kb = tenant_conf
             .tenant_conf
             .gc_compaction_initial_threshold_kb
@@ -2729,6 +2755,7 @@ impl Timeline {
             .unwrap_or(self.conf.default_tenant_conf.gc_compaction_ratio_percent);
         GcCompactionCombinedSettings {
             gc_compaction_enabled,
+            gc_compaction_verification,
             gc_compaction_initial_threshold_kb,
             gc_compaction_ratio_percent,
         }
@@ -3927,6 +3954,154 @@ impl Timeline {
     }
 }
 
+#[derive(Clone)]
+/// Type representing a query in the ([`Lsn`], [`Key`]) space.
+/// In other words, a set of segments in a 2D space.
+///
+/// This representation has the advatange of avoiding hash map
+/// allocations for uniform queries.
+pub(crate) enum VersionedKeySpaceQuery {
+    /// Variant for queries at a single [`Lsn`]
+    Uniform { keyspace: KeySpace, lsn: Lsn },
+    /// Variant for queries at multiple [`Lsn`]s
+    Scattered {
+        keyspaces_at_lsn: Vec<(Lsn, KeySpace)>,
+    },
+}
+
+impl VersionedKeySpaceQuery {
+    pub(crate) fn uniform(keyspace: KeySpace, lsn: Lsn) -> Self {
+        Self::Uniform { keyspace, lsn }
+    }
+
+    pub(crate) fn scattered(keyspaces_at_lsn: Vec<(Lsn, KeySpace)>) -> Self {
+        Self::Scattered { keyspaces_at_lsn }
+    }
+
+    /// Returns the most recent (largest) LSN included in the query.
+    /// If any of the LSNs included in the query are invalid, returns
+    /// an error instead.
+    fn high_watermark_lsn(&self) -> Result<Lsn, GetVectoredError> {
+        match self {
+            Self::Uniform { lsn, .. } => {
+                if !lsn.is_valid() {
+                    return Err(GetVectoredError::InvalidLsn(*lsn));
+                }
+
+                Ok(*lsn)
+            }
+            Self::Scattered { keyspaces_at_lsn } => {
+                let mut max_lsn = None;
+                for (lsn, _keyspace) in keyspaces_at_lsn.iter() {
+                    if !lsn.is_valid() {
+                        return Err(GetVectoredError::InvalidLsn(*lsn));
+                    }
+                    max_lsn = std::cmp::max(max_lsn, Some(lsn));
+                }
+
+                if let Some(computed) = max_lsn {
+                    Ok(*computed)
+                } else {
+                    Err(GetVectoredError::Other(anyhow!("empty input")))
+                }
+            }
+        }
+    }
+
+    /// Returns the total keyspace being queried: the result of projecting
+    /// everything in the key dimensions onto the key axis.
+    fn total_keyspace(&self) -> KeySpace {
+        match self {
+            Self::Uniform { keyspace, .. } => keyspace.clone(),
+            Self::Scattered { keyspaces_at_lsn } => keyspaces_at_lsn
+                .iter()
+                .map(|(_lsn, keyspace)| keyspace)
+                .fold(KeySpace::default(), |mut acc, v| {
+                    acc.merge(v);
+                    acc
+                }),
+        }
+    }
+
+    /// Returns LSN for a specific key.
+    ///
+    /// Invariant: requested key must be part of [`Self::total_keyspace`]
+    pub(super) fn map_key_to_lsn(&self, key: &Key) -> Lsn {
+        match self {
+            Self::Uniform { lsn, .. } => *lsn,
+            Self::Scattered { keyspaces_at_lsn } => {
+                keyspaces_at_lsn
+                    .iter()
+                    .find(|(_lsn, keyspace)| keyspace.contains(key))
+                    .expect("Returned key was requested")
+                    .0
+            }
+        }
+    }
+
+    /// Remove any parts of the query (segments) which overlap with the provided
+    /// key space (also segments).
+    fn remove_overlapping_with(&mut self, to_remove: &KeySpace) -> KeySpace {
+        match self {
+            Self::Uniform { keyspace, .. } => keyspace.remove_overlapping_with(to_remove),
+            Self::Scattered { keyspaces_at_lsn } => {
+                let mut removed_accum = KeySpaceRandomAccum::new();
+                keyspaces_at_lsn.iter_mut().for_each(|(_lsn, keyspace)| {
+                    let removed = keyspace.remove_overlapping_with(to_remove);
+                    removed_accum.add_keyspace(removed);
+                });
+
+                removed_accum.to_keyspace()
+            }
+        }
+    }
+
+    fn is_empty(&self) -> bool {
+        match self {
+            Self::Uniform { keyspace, .. } => keyspace.is_empty(),
+            Self::Scattered { keyspaces_at_lsn } => keyspaces_at_lsn
+                .iter()
+                .all(|(_lsn, keyspace)| keyspace.is_empty()),
+        }
+    }
+
+    /// "Lower" the query on the LSN dimension
+    fn lower(&mut self, to: Lsn) {
+        match self {
+            Self::Uniform { lsn, .. } => {
+                // If the originally requested LSN is smaller than the starting
+                // LSN of the ancestor we are descending into, we need to respect that.
+                // Hence the min.
+                *lsn = std::cmp::min(*lsn, to);
+            }
+            Self::Scattered { keyspaces_at_lsn } => {
+                keyspaces_at_lsn.iter_mut().for_each(|(lsn, _keyspace)| {
+                    *lsn = std::cmp::min(*lsn, to);
+                });
+            }
+        }
+    }
+}
+
+impl std::fmt::Display for VersionedKeySpaceQuery {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        write!(f, "[")?;
+
+        match self {
+            VersionedKeySpaceQuery::Uniform { keyspace, lsn } => {
+                write!(f, "{keyspace} @ {lsn}")?;
+            }
+            VersionedKeySpaceQuery::Scattered { keyspaces_at_lsn } => {
+                for (lsn, keyspace) in keyspaces_at_lsn.iter() {
+                    write!(f, "{keyspace} @ {lsn},")?;
+                }
+            }
+        }
+
+        write!(f, "]")
+    }
+}
+
 impl Timeline {
     #[allow(clippy::doc_lazy_continuation)]
     /// Get the data needed to reconstruct all keys in the provided keyspace
@@ -3941,16 +4116,15 @@ impl Timeline {
     /// 2.4. If the fringe is empty, go back to 1
     async fn get_vectored_reconstruct_data(
         &self,
-        mut keyspace: KeySpace,
-        request_lsn: Lsn,
+        mut query: VersionedKeySpaceQuery,
         reconstruct_state: &mut ValuesReconstructState,
         ctx: &RequestContext,
     ) -> Result<(), GetVectoredError> {
+        let original_hwm_lsn = query.high_watermark_lsn().unwrap();
+
         let mut timeline_owned: Arc<Timeline>;
         let mut timeline = self;
 
-        let mut cont_lsn = Lsn(request_lsn.0 + 1);
-
         let missing_keyspace = loop {
             if self.cancel.is_cancelled() {
                 return Err(GetVectoredError::Cancelled);
@@ -3967,15 +4141,14 @@ impl Timeline {
                             parent: crnt_perf_span,
                             "PLAN_IO_TIMELINE",
                             timeline = %timeline.timeline_id,
-                            lsn = %cont_lsn,
+                            high_watermark_lsn = %query.high_watermark_lsn().unwrap(),
                         )
                     })
                     .attached_child();
 
                 Self::get_vectored_reconstruct_data_timeline(
                     timeline,
-                    keyspace.clone(),
-                    cont_lsn,
+                    &query,
                     reconstruct_state,
                     &self.cancel,
                     &ctx,
@@ -3984,23 +4157,23 @@ impl Timeline {
                 .await?
             };
 
-            keyspace.remove_overlapping_with(&completed);
+            query.remove_overlapping_with(&completed);
 
             // Do not descend into the ancestor timeline for aux files.
             // We don't return a blanket [`GetVectoredError::MissingKey`] to avoid
             // stalling compaction.
-            keyspace.remove_overlapping_with(&KeySpace {
+            query.remove_overlapping_with(&KeySpace {
                 ranges: vec![NON_INHERITED_RANGE, Key::sparse_non_inherited_keyspace()],
             });
 
             // Keyspace is fully retrieved
-            if keyspace.is_empty() {
+            if query.is_empty() {
                 break None;
             }
 
             let Some(ancestor_timeline) = timeline.ancestor_timeline.as_ref() else {
                 // Not fully retrieved but no ancestor timeline.
-                break Some(keyspace);
+                break Some(query.total_keyspace());
             };
 
             // Now we see if there are keys covered by the image layer but does not exist in the
@@ -4011,7 +4184,7 @@ impl Timeline {
             // keys from `keyspace`, we expect there to be no overlap between it and the image covered key
             // space. If that's not the case, we had at least one key encounter a gap in the image layer
             // and stop the search as a result of that.
-            let mut removed = keyspace.remove_overlapping_with(&image_covered_keyspace);
+            let mut removed = query.remove_overlapping_with(&image_covered_keyspace);
             // Do not fire missing key error and end early for sparse keys. Note that we hava already removed
             // non-inherited keyspaces before, so we can safely do a full `SPARSE_RANGE` remove instead of
             // figuring out what is the inherited key range and do a fine-grained pruning.
@@ -4021,11 +4194,11 @@ impl Timeline {
             if !removed.is_empty() {
                 break Some(removed);
             }
-            // If we reached this point, `remove_overlapping_with` should not have made any change to the
-            // keyspace.
 
-            // Take the min to avoid reconstructing a page with data newer than request Lsn.
-            cont_lsn = std::cmp::min(Lsn(request_lsn.0 + 1), Lsn(timeline.ancestor_lsn.0 + 1));
+            // Each key range in the original query is at some point in the LSN space.
+            // When descending into the ancestor, lower all ranges in the LSN space
+            // such that new changes on the parent timeline are not visible.
+            query.lower(timeline.ancestor_lsn);
 
             let ctx = RequestContextBuilder::from(ctx)
                 .perf_span(|crnt_perf_span| {
@@ -4034,7 +4207,6 @@ impl Timeline {
                         parent: crnt_perf_span,
                         "GET_ANCESTOR",
                         timeline = %timeline.timeline_id,
-                        lsn = %cont_lsn,
                         ancestor = %ancestor_timeline.timeline_id,
                         ancestor_lsn = %timeline.ancestor_lsn
                     )
@@ -4064,22 +4236,47 @@ impl Timeline {
         };
 
         if let Some(missing_keyspace) = missing_keyspace {
-            return Err(GetVectoredError::MissingKey(MissingKeyError {
-                key: missing_keyspace.start().unwrap(), /* better if we can store the full keyspace */
-                shard: self
-                    .shard_identity
-                    .get_shard_number(&missing_keyspace.start().unwrap()),
-                cont_lsn,
-                request_lsn,
+            return Err(GetVectoredError::MissingKey(Box::new(MissingKeyError {
+                keyspace: missing_keyspace, /* better if we can store the full keyspace */
+                shard: self.shard_identity.number,
+                original_hwm_lsn,
                 ancestor_lsn: Some(timeline.ancestor_lsn),
                 backtrace: None,
                 read_path: std::mem::take(&mut reconstruct_state.read_path),
-            }));
+                query: None,
+            })));
         }
 
         Ok(())
     }
 
+    async fn get_vectored_init_fringe(
+        &self,
+        query: &VersionedKeySpaceQuery,
+    ) -> Result<LayerFringe, GetVectoredError> {
+        let mut fringe = LayerFringe::new();
+        let guard = self.layers.read().await;
+
+        match query {
+            VersionedKeySpaceQuery::Uniform { keyspace, lsn } => {
+                // LSNs requested by the compute or determined by the pageserver
+                // are inclusive. Queries to the layer map use exclusive LSNs.
+                // Hence, bump the value before the query - same in the other
+                // match arm.
+                let cont_lsn = Lsn(lsn.0 + 1);
+                guard.update_search_fringe(keyspace, cont_lsn, &mut fringe)?;
+            }
+            VersionedKeySpaceQuery::Scattered { keyspaces_at_lsn } => {
+                for (lsn, keyspace) in keyspaces_at_lsn.iter() {
+                    let cont_lsn_for_keyspace = Lsn(lsn.0 + 1);
+                    guard.update_search_fringe(keyspace, cont_lsn_for_keyspace, &mut fringe)?;
+                }
+            }
+        }
+
+        Ok(fringe)
+    }
+
     /// Collect the reconstruct data for a keyspace from the specified timeline.
     ///
     /// Maintain a fringe [`LayerFringe`] which tracks all the layers that intersect
@@ -4098,18 +4295,11 @@ impl Timeline {
     /// decides how to deal with these two keyspaces.
     async fn get_vectored_reconstruct_data_timeline(
         timeline: &Timeline,
-        keyspace: KeySpace,
-        mut cont_lsn: Lsn,
+        query: &VersionedKeySpaceQuery,
         reconstruct_state: &mut ValuesReconstructState,
         cancel: &CancellationToken,
         ctx: &RequestContext,
     ) -> Result<TimelineVisitOutcome, GetVectoredError> {
-        let mut unmapped_keyspace = keyspace.clone();
-        let mut fringe = LayerFringe::new();
-
-        let mut completed_keyspace = KeySpace::default();
-        let mut image_covered_keyspace = KeySpaceRandomAccum::new();
-
         // Prevent GC from progressing while visiting the current timeline.
         // If we are GC-ing because a new image layer was added while traversing
         // the timeline, then it will remove layers that are required for fulfilling
@@ -4120,11 +4310,37 @@ impl Timeline {
         // See `compaction::compact_with_gc` for why we need this.
         let _guard = timeline.gc_compaction_layer_update_lock.read().await;
 
-        loop {
+        // Initialize the fringe
+        let mut fringe = timeline.get_vectored_init_fringe(query).await?;
+
+        let mut completed_keyspace = KeySpace::default();
+        let mut image_covered_keyspace = KeySpaceRandomAccum::new();
+
+        while let Some((layer_to_read, keyspace_to_read, lsn_range)) = fringe.next_layer() {
             if cancel.is_cancelled() {
                 return Err(GetVectoredError::Cancelled);
             }
 
+            if let Some(ref mut read_path) = reconstruct_state.read_path {
+                read_path.record_layer_visit(&layer_to_read, &keyspace_to_read, &lsn_range);
+            }
+
+            // Visit the layer and plan IOs for it
+            let next_cont_lsn = lsn_range.start;
+            layer_to_read
+                .get_values_reconstruct_data(
+                    keyspace_to_read.clone(),
+                    lsn_range,
+                    reconstruct_state,
+                    ctx,
+                )
+                .await?;
+
+            let mut unmapped_keyspace = keyspace_to_read;
+            let cont_lsn = next_cont_lsn;
+
+            reconstruct_state.on_layer_visited(&layer_to_read);
+
             let (keys_done_last_step, keys_with_image_coverage) =
                 reconstruct_state.consume_done_keys();
             unmapped_keyspace.remove_overlapping_with(&keys_done_last_step);
@@ -4135,31 +4351,15 @@ impl Timeline {
                 image_covered_keyspace.add_range(keys_with_image_coverage);
             }
 
+            // Query the layer map for the next layers to read.
+            //
             // Do not descent any further if the last layer we visited
             // completed all keys in the keyspace it inspected. This is not
             // required for correctness, but avoids visiting extra layers
             // which turns out to be a perf bottleneck in some cases.
             if !unmapped_keyspace.is_empty() {
                 let guard = timeline.layers.read().await;
-                let layers = guard.layer_map()?;
-
-                for range in unmapped_keyspace.ranges.iter() {
-                    let results = layers.range_search(range.clone(), cont_lsn);
-
-                    results
-                        .found
-                        .into_iter()
-                        .map(|(SearchResult { layer, lsn_floor }, keyspace_accum)| {
-                            (
-                                guard.upgrade(layer),
-                                keyspace_accum.to_keyspace(),
-                                lsn_floor..cont_lsn,
-                            )
-                        })
-                        .for_each(|(layer, keyspace, lsn_range)| {
-                            fringe.update(layer, keyspace, lsn_range)
-                        });
-                }
+                guard.update_search_fringe(&unmapped_keyspace, cont_lsn, &mut fringe)?;
 
                 // It's safe to drop the layer map lock after planning the next round of reads.
                 // The fringe keeps readable handles for the layers which are safe to read even
@@ -4173,28 +4373,6 @@ impl Timeline {
                 // at two different time points.
                 drop(guard);
             }
-
-            if let Some((layer_to_read, keyspace_to_read, lsn_range)) = fringe.next_layer() {
-                if let Some(ref mut read_path) = reconstruct_state.read_path {
-                    read_path.record_layer_visit(&layer_to_read, &keyspace_to_read, &lsn_range);
-                }
-                let next_cont_lsn = lsn_range.start;
-                layer_to_read
-                    .get_values_reconstruct_data(
-                        keyspace_to_read.clone(),
-                        lsn_range,
-                        reconstruct_state,
-                        ctx,
-                    )
-                    .await?;
-
-                unmapped_keyspace = keyspace_to_read;
-                cont_lsn = next_cont_lsn;
-
-                reconstruct_state.on_layer_visited(&layer_to_read);
-            } else {
-                break;
-            }
         }
 
         Ok(TimelineVisitOutcome {
@@ -4808,7 +4986,13 @@ impl Timeline {
         let ctx = ctx.attached_child();
         let work = async move {
             let Some((desc, path)) = frozen_layer
-                .write_to_disk(&ctx, key_range, self_clone.l0_flush_global_state.inner())
+                .write_to_disk(
+                    &ctx,
+                    key_range,
+                    self_clone.l0_flush_global_state.inner(),
+                    &self_clone.gate,
+                    self_clone.cancel.clone(),
+                )
                 .await?
             else {
                 return Ok(None);
@@ -4994,13 +5178,11 @@ impl Timeline {
                 if key_request_accum.raw_size() >= Timeline::MAX_GET_VECTORED_KEYS
                     || (last_key_in_range && key_request_accum.raw_size() > 0)
                 {
+                    let query =
+                        VersionedKeySpaceQuery::uniform(key_request_accum.consume_keyspace(), lsn);
+
                     let results = self
-                        .get_vectored(
-                            key_request_accum.consume_keyspace(),
-                            lsn,
-                            io_concurrency.clone(),
-                            ctx,
-                        )
+                        .get_vectored(query, io_concurrency.clone(), ctx)
                         .await?;
 
                     if self.cancel.is_cancelled() {
@@ -5089,7 +5271,11 @@ impl Timeline {
         // Directly use `get_vectored_impl` to skip the max_vectored_read_key limit check. Note that the keyspace should
         // not contain too many keys, otherwise this takes a lot of memory.
         let data = self
-            .get_vectored_impl(partition.clone(), lsn, &mut reconstruct_state, ctx)
+            .get_vectored_impl(
+                VersionedKeySpaceQuery::uniform(partition.clone(), lsn),
+                &mut reconstruct_state,
+                ctx,
+            )
             .await?;
         let (data, total_kb_retrieved, total_keys_retrieved) = {
             let mut new_data = BTreeMap::new();
@@ -5346,6 +5532,8 @@ impl Timeline {
                 self.tenant_shard_id,
                 &img_range,
                 lsn,
+                &self.gate,
+                self.cancel.clone(),
                 ctx,
             )
             .await?;
@@ -6353,37 +6541,21 @@ impl Timeline {
 
     /// Reconstruct a value, using the given base image and WAL records in 'data'.
     async fn reconstruct_value(
-        &self,
-        key: Key,
-        request_lsn: Lsn,
-        data: ValueReconstructState,
-    ) -> Result<Bytes, PageReconstructError> {
-        self.reconstruct_value_inner(key, request_lsn, data, false)
-            .await
-    }
-
-    /// Reconstruct a value, using the given base image and WAL records in 'data'. It does not fire critical errors because
-    /// sometimes it is expected to fail due to unreplayable history described in <https://github.com/neondatabase/neon/issues/10395>.
-    async fn reconstruct_value_wo_critical_error(
-        &self,
-        key: Key,
-        request_lsn: Lsn,
-        data: ValueReconstructState,
-    ) -> Result<Bytes, PageReconstructError> {
-        self.reconstruct_value_inner(key, request_lsn, data, true)
-            .await
-    }
-
-    async fn reconstruct_value_inner(
         &self,
         key: Key,
         request_lsn: Lsn,
         mut data: ValueReconstructState,
-        no_critical_error: bool,
+        redo_attempt_type: RedoAttemptType,
     ) -> Result<Bytes, PageReconstructError> {
         // Perform WAL redo if needed
         data.records.reverse();
 
+        let fire_critical_error = match redo_attempt_type {
+            RedoAttemptType::ReadPage => true,
+            RedoAttemptType::LegacyCompaction => true,
+            RedoAttemptType::GcCompaction => false,
+        };
+
         // If we have a page image, and no WAL, we're all set
         if data.records.is_empty() {
             if let Some((img_lsn, img)) = &data.img {
@@ -6430,13 +6602,20 @@ impl Timeline {
                     .as_ref()
                     .context("timeline has no walredo manager")
                     .map_err(PageReconstructError::WalRedo)?
-                    .request_redo(key, request_lsn, data.img, data.records, self.pg_version)
+                    .request_redo(
+                        key,
+                        request_lsn,
+                        data.img,
+                        data.records,
+                        self.pg_version,
+                        redo_attempt_type,
+                    )
                     .await;
                 let img = match res {
                     Ok(img) => img,
                     Err(walredo::Error::Cancelled) => return Err(PageReconstructError::Cancelled),
                     Err(walredo::Error::Other(err)) => {
-                        if !no_critical_error {
+                        if fire_critical_error {
                             critical!("walredo failure during page reconstruction: {err:?}");
                         }
                         return Err(PageReconstructError::WalRedo(
@@ -6719,6 +6898,8 @@ impl Timeline {
             self.tenant_shard_id,
             &(min_key..end_key),
             lsn,
+            &self.gate,
+            self.cancel.clone(),
             ctx,
         )
         .await?;
@@ -6780,6 +6961,8 @@ impl Timeline {
             self.tenant_shard_id,
             deltas.key_range.start,
             deltas.lsn_range,
+            &self.gate,
+            self.cancel.clone(),
             ctx,
         )
         .await?;
diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs
index 8403c0a7d9c2..91cc8ca10c9f 100644
--- a/pageserver/src/tenant/timeline/compaction.rs
+++ b/pageserver/src/tenant/timeline/compaction.rs
@@ -7,7 +7,7 @@
 use std::collections::{BinaryHeap, HashMap, HashSet, VecDeque};
 use std::ops::{Deref, Range};
 use std::sync::Arc;
-use std::time::Instant;
+use std::time::{Duration, Instant};
 
 use super::layer_manager::LayerManager;
 use super::{
@@ -16,6 +16,8 @@ use super::{
     Timeline,
 };
 
+use crate::tenant::timeline::DeltaEntry;
+use crate::walredo::RedoAttemptType;
 use anyhow::{Context, anyhow};
 use bytes::Bytes;
 use enumset::EnumSet;
@@ -78,6 +80,7 @@ impl std::fmt::Display for GcCompactionJobId {
 
 pub struct GcCompactionCombinedSettings {
     pub gc_compaction_enabled: bool,
+    pub gc_compaction_verification: bool,
     pub gc_compaction_initial_threshold_kb: u64,
     pub gc_compaction_ratio_percent: u64,
 }
@@ -223,6 +226,7 @@ impl GcCompactionQueue {
             gc_compaction_enabled,
             gc_compaction_initial_threshold_kb,
             gc_compaction_ratio_percent,
+            ..
         } = timeline.get_gc_compaction_settings();
         if !gc_compaction_enabled {
             return Ok(());
@@ -315,6 +319,9 @@ impl GcCompactionQueue {
                     flags: {
                         let mut flags = EnumSet::new();
                         flags |= CompactFlags::EnhancedGcBottomMostCompaction;
+                        if timeline.get_compaction_l0_first() {
+                            flags |= CompactFlags::YieldForL0;
+                        }
                         flags
                     },
                     sub_compaction: true,
@@ -742,8 +749,8 @@ impl KeyHistoryRetention {
     async fn pipe_to(
         self,
         key: Key,
-        delta_writer: &mut SplitDeltaLayerWriter,
-        mut image_writer: Option<&mut SplitImageLayerWriter>,
+        delta_writer: &mut SplitDeltaLayerWriter<'_>,
+        mut image_writer: Option<&mut SplitImageLayerWriter<'_>>,
         stat: &mut CompactionStatistics,
         ctx: &RequestContext,
     ) -> anyhow::Result<()> {
@@ -783,6 +790,114 @@ impl KeyHistoryRetention {
         }
         Ok(())
     }
+
+    /// Verify if every key in the retention is readable by replaying the logs.
+    async fn verify(
+        &self,
+        key: Key,
+        base_img_from_ancestor: &Option<(Key, Lsn, Bytes)>,
+        full_history: &[(Key, Lsn, Value)],
+        tline: &Arc<Timeline>,
+    ) -> anyhow::Result<()> {
+        // Usually the min_lsn should be the first record but we do a full iteration to be safe.
+        let Some(min_lsn) = full_history.iter().map(|(_, lsn, _)| *lsn).min() else {
+            // This should never happen b/c if we don't have any history of a key, we won't even do `generate_key_retention`.
+            return Ok(());
+        };
+        let Some(max_lsn) = full_history.iter().map(|(_, lsn, _)| *lsn).max() else {
+            // This should never happen b/c if we don't have any history of a key, we won't even do `generate_key_retention`.
+            return Ok(());
+        };
+        let mut base_img = base_img_from_ancestor
+            .as_ref()
+            .map(|(_, lsn, img)| (*lsn, img));
+        let mut history = Vec::new();
+
+        async fn collect_and_verify(
+            key: Key,
+            lsn: Lsn,
+            base_img: &Option<(Lsn, &Bytes)>,
+            history: &[(Lsn, &NeonWalRecord)],
+            tline: &Arc<Timeline>,
+        ) -> anyhow::Result<()> {
+            let mut records = history
+                .iter()
+                .map(|(lsn, val)| (*lsn, (*val).clone()))
+                .collect::<Vec<_>>();
+
+            // WAL redo requires records in the reverse LSN order
+            records.reverse();
+            let data = ValueReconstructState {
+                img: base_img.as_ref().map(|(lsn, img)| (*lsn, (*img).clone())),
+                records,
+            };
+
+            tline
+                .reconstruct_value(key, lsn, data, RedoAttemptType::GcCompaction)
+                .await
+                .with_context(|| format!("verification failed for key {} at lsn {}", key, lsn))?;
+
+            Ok(())
+        }
+
+        for (retain_lsn, KeyLogAtLsn(logs)) in &self.below_horizon {
+            for (lsn, val) in logs {
+                match val {
+                    Value::Image(img) => {
+                        base_img = Some((*lsn, img));
+                        history.clear();
+                    }
+                    Value::WalRecord(rec) if val.will_init() => {
+                        base_img = None;
+                        history.clear();
+                        history.push((*lsn, rec));
+                    }
+                    Value::WalRecord(rec) => {
+                        history.push((*lsn, rec));
+                    }
+                }
+            }
+            if *retain_lsn >= min_lsn {
+                // Only verify after the key appears in the full history for the first time.
+
+                if base_img.is_none() && history.is_empty() {
+                    anyhow::bail!(
+                        "verificatoin failed: key {} has no history at {}",
+                        key,
+                        retain_lsn
+                    );
+                };
+                // We don't modify history: in theory, we could replace the history with a single
+                // image as in `generate_key_retention` to make redos at later LSNs faster. But we
+                // want to verify everything as if they are read from the real layer map.
+                collect_and_verify(key, *retain_lsn, &base_img, &history, tline).await?;
+            }
+        }
+
+        for (lsn, val) in &self.above_horizon.0 {
+            match val {
+                Value::Image(img) => {
+                    // Above the GC horizon, we verify every time we see an image.
+                    collect_and_verify(key, *lsn, &base_img, &history, tline).await?;
+                    base_img = Some((*lsn, img));
+                    history.clear();
+                }
+                Value::WalRecord(rec) if val.will_init() => {
+                    // Above the GC horizon, we verify every time we see an init record.
+                    collect_and_verify(key, *lsn, &base_img, &history, tline).await?;
+                    base_img = None;
+                    history.clear();
+                    history.push((*lsn, rec));
+                }
+                Value::WalRecord(rec) => {
+                    history.push((*lsn, rec));
+                }
+            }
+        }
+        // Ensure the latest record is readable.
+        collect_and_verify(key, max_lsn, &base_img, &history, tline).await?;
+        Ok(())
+    }
 }
 
 #[derive(Debug, Serialize, Default)]
@@ -819,15 +934,16 @@ pub struct CompactionStatistics {
     time_acquire_lock_secs: f64,
     time_analyze_secs: f64,
     time_download_layer_secs: f64,
+    time_to_first_kv_pair_secs: f64,
     time_main_loop_secs: f64,
     time_final_phase_secs: f64,
     time_total_secs: f64,
 
     // Summary
-    /// Ratio of the key-value size before/after gc-compaction.
-    uncompressed_size_ratio: f64,
-    /// Ratio of the physical size before/after gc-compaction.
-    physical_size_ratio: f64,
+    /// Ratio of the key-value size after/before gc-compaction.
+    uncompressed_retention_ratio: f64,
+    /// Ratio of the physical size after/before gc-compaction.
+    compressed_retention_ratio: f64,
 }
 
 impl CompactionStatistics {
@@ -896,15 +1012,15 @@ impl CompactionStatistics {
     fn finalize(&mut self) {
         let original_key_value_size = self.image_keys_visited.size + self.wal_keys_visited.size;
         let produced_key_value_size = self.image_produced.size + self.wal_produced.size;
-        self.uncompressed_size_ratio =
-            original_key_value_size as f64 / (produced_key_value_size as f64 + 1.0); // avoid div by 0
+        self.uncompressed_retention_ratio =
+            produced_key_value_size as f64 / (original_key_value_size as f64 + 1.0); // avoid div by 0
         let original_physical_size = self.image_layer_visited.size + self.delta_layer_visited.size;
         let produced_physical_size = self.image_layer_produced.size
             + self.delta_layer_produced.size
             + self.image_layer_discarded.size
             + self.delta_layer_discarded.size; // Also include the discarded layers to make the ratio accurate
-        self.physical_size_ratio =
-            original_physical_size as f64 / (produced_physical_size as f64 + 1.0); // avoid div by 0
+        self.compressed_retention_ratio =
+            produced_physical_size as f64 / (original_physical_size as f64 + 1.0); // avoid div by 0
     }
 }
 
@@ -1113,7 +1229,17 @@ impl Timeline {
             // being potentially much longer.
             let rewrite_max = partition_count;
 
-            self.compact_shard_ancestors(rewrite_max, ctx).await?;
+            let outcome = self
+                .compact_shard_ancestors(
+                    rewrite_max,
+                    options.flags.contains(CompactFlags::YieldForL0),
+                    ctx,
+                )
+                .await?;
+            match outcome {
+                CompactionOutcome::Pending | CompactionOutcome::YieldForL0 => return Ok(outcome),
+                CompactionOutcome::Done | CompactionOutcome::Skipped => {}
+            }
         }
 
         Ok(CompactionOutcome::Done)
@@ -1130,8 +1256,10 @@ impl Timeline {
     async fn compact_shard_ancestors(
         self: &Arc<Self>,
         rewrite_max: usize,
+        yield_for_l0: bool,
         ctx: &RequestContext,
-    ) -> Result<(), CompactionError> {
+    ) -> Result<CompactionOutcome, CompactionError> {
+        let mut outcome = CompactionOutcome::Done;
         let mut drop_layers = Vec::new();
         let mut layers_to_rewrite: Vec<Layer> = Vec::new();
 
@@ -1142,12 +1270,7 @@ impl Timeline {
         // Holding this read guard also blocks [`Self::gc_timeline`] from entering while we
         // are rewriting layers.
         let latest_gc_cutoff = self.get_applied_gc_cutoff_lsn();
-
-        tracing::info!(
-            "starting shard ancestor compaction, latest_gc_cutoff: {}, pitr cutoff {}",
-            *latest_gc_cutoff,
-            self.gc_info.read().unwrap().cutoffs.time
-        );
+        let pitr_cutoff = self.gc_info.read().unwrap().cutoffs.time;
 
         let layers = self.layers.read().await;
         for layer_desc in layers.layer_map()?.iter_historic_layers() {
@@ -1165,8 +1288,8 @@ impl Timeline {
                 // This ancestral layer only covers keys that belong to other shards.
                 // We include the full metadata in the log: if we had some critical bug that caused
                 // us to incorrectly drop layers, this would simplify manually debugging + reinstating those layers.
-                info!(%layer, old_metadata=?layer.metadata(),
-                    "dropping layer after shard split, contains no keys for this shard.",
+                debug!(%layer, old_metadata=?layer.metadata(),
+                    "dropping layer after shard split, contains no keys for this shard",
                 );
 
                 if cfg!(debug_assertions) {
@@ -1228,19 +1351,35 @@ impl Timeline {
             }
 
             if layers_to_rewrite.len() >= rewrite_max {
-                tracing::info!(%layer, "Will rewrite layer on a future compaction, already rewrote {}",
+                debug!(%layer, "Will rewrite layer on a future compaction, already rewrote {}",
                     layers_to_rewrite.len()
                 );
-                continue;
+                outcome = CompactionOutcome::Pending;
+                break;
             }
 
             // Fall through: all our conditions for doing a rewrite passed.
             layers_to_rewrite.push(layer);
         }
 
-        // Drop read lock on layer map before we start doing time-consuming I/O
+        // Drop read lock on layer map before we start doing time-consuming I/O.
         drop(layers);
 
+        // Drop out early if there's nothing to do.
+        if layers_to_rewrite.is_empty() && drop_layers.is_empty() {
+            return Ok(CompactionOutcome::Done);
+        }
+
+        info!(
+            "starting shard ancestor compaction, rewriting {} layers and dropping {} layers \
+                (latest_gc_cutoff={} pitr_cutoff={})",
+            layers_to_rewrite.len(),
+            drop_layers.len(),
+            *latest_gc_cutoff,
+            pitr_cutoff,
+        );
+        let started = Instant::now();
+
         let mut replace_image_layers = Vec::new();
 
         for layer in layers_to_rewrite {
@@ -1248,13 +1387,15 @@ impl Timeline {
                 return Err(CompactionError::ShuttingDown);
             }
 
-            tracing::info!(layer=%layer, "Rewriting layer after shard split...");
+            info!(layer=%layer, "rewriting layer after shard split");
             let mut image_layer_writer = ImageLayerWriter::new(
                 self.conf,
                 self.timeline_id,
                 self.tenant_shard_id,
                 &layer.layer_desc().key_range,
                 layer.layer_desc().image_layer_lsn(),
+                &self.gate,
+                self.cancel.clone(),
                 ctx,
             )
             .await
@@ -1286,7 +1427,7 @@ impl Timeline {
                     .map_err(CompactionError::Other)?;
                 let new_layer = Layer::finish_creating(self.conf, self, desc, &path)
                     .map_err(CompactionError::Other)?;
-                tracing::info!(layer=%new_layer, "Rewrote layer, {} -> {} bytes",
+                info!(layer=%new_layer, "rewrote layer, {} -> {} bytes",
                     layer.metadata().file_size,
                     new_layer.metadata().file_size);
 
@@ -1296,6 +1437,26 @@ impl Timeline {
                 // the layer has no data for us with the ShardedRange check above, but
                 drop_layers.push(layer);
             }
+
+            // Yield for L0 compaction if necessary, but make sure we update the layer map below
+            // with the work we've already done.
+            if yield_for_l0
+                && self
+                    .l0_compaction_trigger
+                    .notified()
+                    .now_or_never()
+                    .is_some()
+            {
+                info!("shard ancestor compaction yielding for L0 compaction");
+                outcome = CompactionOutcome::YieldForL0;
+                break;
+            }
+        }
+
+        for layer in &drop_layers {
+            info!(%layer, old_metadata=?layer.metadata(),
+                "dropping layer after shard split (no keys for this shard)",
+            );
         }
 
         // At this point, we have replaced local layer files with their rewritten form, but not yet uploaded
@@ -1313,17 +1474,36 @@ impl Timeline {
         // necessary for correctness, but it simplifies testing, and avoids proceeding with another
         // Timeline's compaction while this timeline's uploads may be generating lots of disk I/O
         // load.
-        match self.remote_client.wait_completion().await {
-            Ok(()) => (),
-            Err(WaitCompletionError::NotInitialized(ni)) => return Err(CompactionError::from(ni)),
-            Err(WaitCompletionError::UploadQueueShutDownOrStopped) => {
-                return Err(CompactionError::ShuttingDown);
+        if outcome != CompactionOutcome::YieldForL0 {
+            info!("shard ancestor compaction waiting for uploads");
+            tokio::select! {
+                result = self.remote_client.wait_completion() => match result {
+                    Ok(()) => {},
+                    Err(WaitCompletionError::NotInitialized(ni)) => return Err(CompactionError::from(ni)),
+                    Err(WaitCompletionError::UploadQueueShutDownOrStopped) => {
+                        return Err(CompactionError::ShuttingDown);
+                    }
+                },
+                // Don't wait if there's L0 compaction to do. We don't need to update the outcome
+                // here, because we've already done the actual work.
+                _ = self.l0_compaction_trigger.notified(), if yield_for_l0 => {},
             }
         }
 
+        info!(
+            "shard ancestor compaction done in {:.3}s{}",
+            started.elapsed().as_secs_f64(),
+            match outcome {
+                CompactionOutcome::Pending =>
+                    format!(", with pending work (rewrite_max={rewrite_max})"),
+                CompactionOutcome::YieldForL0 => String::from(", yielding for L0 compaction"),
+                CompactionOutcome::Skipped | CompactionOutcome::Done => String::new(),
+            }
+        );
+
         fail::fail_point!("compact-shard-ancestors-persistent");
 
-        Ok(())
+        Ok(outcome)
     }
 
     /// Update the LayerVisibilityHint of layers covered by image layers, based on whether there is
@@ -1855,6 +2035,8 @@ impl Timeline {
                                 debug!("Create new layer {}..{}", lsn_range.start, lsn_range.end);
                                 lsn_range.clone()
                             },
+                            &self.gate,
+                            self.cancel.clone(),
                             ctx,
                         )
                         .await
@@ -2142,6 +2324,7 @@ impl Timeline {
     /// ```
     ///
     /// Note that `accumulated_values` must be sorted by LSN and should belong to a single key.
+    #[allow(clippy::too_many_arguments)]
     pub(crate) async fn generate_key_retention(
         self: &Arc<Timeline>,
         key: Key,
@@ -2150,6 +2333,7 @@ impl Timeline {
         retain_lsn_below_horizon: &[Lsn],
         delta_threshold_cnt: usize,
         base_img_from_ancestor: Option<(Key, Lsn, Bytes)>,
+        verification: bool,
     ) -> anyhow::Result<KeyHistoryRetention> {
         // Pre-checks for the invariants
 
@@ -2236,8 +2420,8 @@ impl Timeline {
             "should have at least below + above horizon batches"
         );
         let mut replay_history: Vec<(Key, Lsn, Value)> = Vec::new();
-        if let Some((key, lsn, img)) = base_img_from_ancestor {
-            replay_history.push((key, lsn, Value::Image(img)));
+        if let Some((key, lsn, ref img)) = base_img_from_ancestor {
+            replay_history.push((key, lsn, Value::Image(img.clone())));
         }
 
         /// Generate debug information for the replay history
@@ -2351,22 +2535,15 @@ impl Timeline {
             // Whether to reconstruct the image. In debug mode, we will generate an image
             // at every retain_lsn to ensure data is not corrupted, but we won't put the
             // image into the final layer.
-            let generate_image = produce_image || debug_mode;
-            if produce_image {
+            let img_and_lsn = if produce_image {
                 records_since_last_image = 0;
-            }
-            let img_and_lsn = if generate_image {
                 let replay_history_for_debug = if debug_mode {
                     Some(replay_history.clone())
                 } else {
                     None
                 };
                 let replay_history_for_debug_ref = replay_history_for_debug.as_deref();
-                let history = if produce_image {
-                    std::mem::take(&mut replay_history)
-                } else {
-                    replay_history.clone()
-                };
+                let history = std::mem::take(&mut replay_history);
                 let mut img = None;
                 let mut records = Vec::with_capacity(history.len());
                 if let (_, lsn, Value::Image(val)) = history.first().as_ref().unwrap() {
@@ -2401,6 +2578,7 @@ impl Timeline {
                         records.push((lsn, rec));
                     }
                 }
+                // WAL redo requires records in the reverse LSN order
                 records.reverse();
                 let state = ValueReconstructState { img, records };
                 // last batch does not generate image so i is always in range, unless we force generate
@@ -2411,7 +2589,7 @@ impl Timeline {
                     lsn_split_points[i]
                 };
                 let img = self
-                    .reconstruct_value_wo_critical_error(key, request_lsn, state)
+                    .reconstruct_value(key, request_lsn, state, RedoAttemptType::GcCompaction)
                     .await?;
                 Some((request_lsn, img))
             } else {
@@ -2433,10 +2611,16 @@ impl Timeline {
         assert_eq!(retention.len(), lsn_split_points.len() + 1);
         for (idx, logs) in retention.into_iter().enumerate() {
             if idx == lsn_split_points.len() {
-                return Ok(KeyHistoryRetention {
+                let retention = KeyHistoryRetention {
                     below_horizon: result,
                     above_horizon: KeyLogAtLsn(logs),
-                });
+                };
+                if verification {
+                    retention
+                        .verify(key, &base_img_from_ancestor, full_history, self)
+                        .await?;
+                }
+                return Ok(retention);
             } else {
                 result.push((lsn_split_points[idx], KeyLogAtLsn(logs)));
             }
@@ -2903,6 +3087,9 @@ impl Timeline {
             }
             (false, res)
         };
+
+        let verification = self.get_gc_compaction_settings().gc_compaction_verification;
+
         info!(
             "picked {} layers for compaction ({} layers need rewriting) with max_layer_lsn={} min_layer_lsn={} gc_cutoff={} lowest_retain_lsn={}, key_range={}..{}, has_data_below={}",
             job_desc.selected_layers.len(),
@@ -3032,7 +3219,7 @@ impl Timeline {
         .map_err(CompactionError::Other)?;
 
         let time_download_layer = timer.elapsed();
-        let timer = Instant::now();
+        let mut timer = Instant::now();
 
         // Step 2: Produce images+deltas.
         let mut accumulated_values = Vec::new();
@@ -3049,6 +3236,8 @@ impl Timeline {
                     job_desc.compaction_key_range.start,
                     lowest_retain_lsn,
                     self.get_compaction_target_size(),
+                    &self.gate,
+                    self.cancel.clone(),
                     ctx,
                 )
                 .await
@@ -3065,6 +3254,8 @@ impl Timeline {
             self.tenant_shard_id,
             lowest_retain_lsn..end_lsn,
             self.get_compaction_target_size(),
+            &self.gate,
+            self.cancel.clone(),
         )
         .await
         .context("failed to create delta layer writer")
@@ -3107,6 +3298,7 @@ impl Timeline {
         // Actually, we can decide not to write to the image layer at all at this point because
         // the key and LSN range are determined. However, to keep things simple here, we still
         // create this writer, and discard the writer in the end.
+        let mut time_to_first_kv_pair = None;
 
         while let Some(((key, lsn, val), desc)) = merge_iter
             .next_with_trace()
@@ -3114,6 +3306,11 @@ impl Timeline {
             .context("failed to get next key-value pair")
             .map_err(CompactionError::Other)?
         {
+            if time_to_first_kv_pair.is_none() {
+                time_to_first_kv_pair = Some(timer.elapsed());
+                timer = Instant::now();
+            }
+
             if cancel.is_cancelled() {
                 return Err(CompactionError::ShuttingDown);
             }
@@ -3155,6 +3352,8 @@ impl Timeline {
                                 self.tenant_shard_id,
                                 desc.key_range.start,
                                 desc.lsn_range.clone(),
+                                &self.gate,
+                                self.cancel.clone(),
                                 ctx,
                             )
                             .await
@@ -3172,6 +3371,8 @@ impl Timeline {
                                 self.tenant_shard_id,
                                 job_desc.compaction_key_range.end,
                                 desc.lsn_range.clone(),
+                                &self.gate,
+                                self.cancel.clone(),
                                 ctx,
                             )
                             .await
@@ -3213,6 +3414,7 @@ impl Timeline {
                             .await
                             .context("failed to get ancestor image")
                             .map_err(CompactionError::Other)?,
+                        verification,
                     )
                     .await
                     .context("failed to generate key retention")
@@ -3253,6 +3455,7 @@ impl Timeline {
                     .await
                     .context("failed to get ancestor image")
                     .map_err(CompactionError::Other)?,
+                verification,
             )
             .await
             .context("failed to generate key retention")
@@ -3449,6 +3652,9 @@ impl Timeline {
         let time_final_phase = timer.elapsed();
 
         stat.time_final_phase_secs = time_final_phase.as_secs_f64();
+        stat.time_to_first_kv_pair_secs = time_to_first_kv_pair
+            .unwrap_or(Duration::ZERO)
+            .as_secs_f64();
         stat.time_main_loop_secs = time_main_loop.as_secs_f64();
         stat.time_acquire_lock_secs = time_acquire_lock.as_secs_f64();
         stat.time_download_layer_secs = time_download_layer.as_secs_f64();
@@ -3738,6 +3944,8 @@ impl CompactionJobExecutor for TimelineAdaptor {
             self.timeline.tenant_shard_id,
             key_range.start,
             lsn_range.clone(),
+            &self.timeline.gate,
+            self.timeline.cancel.clone(),
             ctx,
         )
         .await?;
@@ -3813,6 +4021,8 @@ impl TimelineAdaptor {
             self.timeline.tenant_shard_id,
             key_range,
             lsn,
+            &self.timeline.gate,
+            self.timeline.cancel.clone(),
             ctx,
         )
         .await?;
@@ -3909,8 +4119,6 @@ impl CompactionLayer<Key> for OwnArc<DeltaLayer> {
     }
 }
 
-use crate::tenant::timeline::DeltaEntry;
-
 impl CompactionLayer<Key> for ResidentDeltaLayer {
     fn key_range(&self) -> &Range<Key> {
         &self.0.layer_desc().key_range
diff --git a/pageserver/src/tenant/timeline/detach_ancestor.rs b/pageserver/src/tenant/timeline/detach_ancestor.rs
index 1b0d22dc82b1..a841cc55f011 100644
--- a/pageserver/src/tenant/timeline/detach_ancestor.rs
+++ b/pageserver/src/tenant/timeline/detach_ancestor.rs
@@ -30,6 +30,7 @@ use crate::tenant::storage_layer::{
     AsLayerDesc as _, DeltaLayerWriter, ImageLayerWriter, IoConcurrency, Layer, ResidentLayer,
     ValuesReconstructState,
 };
+use crate::tenant::timeline::VersionedKeySpaceQuery;
 use crate::virtual_file::{MaybeFatalIo, VirtualFile};
 
 #[derive(Debug, thiserror::Error)]
@@ -212,13 +213,9 @@ async fn generate_tombstone_image_layer(
         }
     }
 
+    let query = VersionedKeySpaceQuery::uniform(KeySpace::single(key_range.clone()), image_lsn);
     let data = ancestor
-        .get_vectored_impl(
-            KeySpace::single(key_range.clone()),
-            image_lsn,
-            &mut reconstruct_state,
-            ctx,
-        )
+        .get_vectored_impl(query, &mut reconstruct_state, ctx)
         .await
         .context("failed to retrieve aux keys")
         .map_err(|e| Error::launder(e, Error::Prepare))?;
@@ -231,6 +228,8 @@ async fn generate_tombstone_image_layer(
             detached.tenant_shard_id,
             &key_range,
             image_lsn,
+            &detached.gate,
+            detached.cancel.clone(),
             ctx,
         )
         .await
@@ -779,6 +778,8 @@ async fn copy_lsn_prefix(
         target_timeline.tenant_shard_id,
         layer.layer_desc().key_range.start,
         layer.layer_desc().lsn_range.start..end_lsn,
+        &target_timeline.gate,
+        target_timeline.cancel.clone(),
         ctx,
     )
     .await
diff --git a/pageserver/src/tenant/timeline/import_pgdata/flow.rs b/pageserver/src/tenant/timeline/import_pgdata/flow.rs
index 3ef82b36588a..c6d2944769fe 100644
--- a/pageserver/src/tenant/timeline/import_pgdata/flow.rs
+++ b/pageserver/src/tenant/timeline/import_pgdata/flow.rs
@@ -738,6 +738,8 @@ impl ChunkProcessingJob {
             self.timeline.tenant_shard_id,
             &self.range,
             self.pgdata_lsn,
+            &self.timeline.gate,
+            self.timeline.cancel.clone(),
             ctx,
         )
         .await?;
diff --git a/pageserver/src/tenant/timeline/layer_manager.rs b/pageserver/src/tenant/timeline/layer_manager.rs
index ed92ea28ce20..ae898260d2df 100644
--- a/pageserver/src/tenant/timeline/layer_manager.rs
+++ b/pageserver/src/tenant/timeline/layer_manager.rs
@@ -3,17 +3,18 @@ use std::sync::Arc;
 
 use anyhow::{Context, bail, ensure};
 use itertools::Itertools;
+use pageserver_api::keyspace::KeySpace;
 use pageserver_api::shard::TenantShardId;
 use tokio_util::sync::CancellationToken;
 use tracing::trace;
 use utils::id::TimelineId;
 use utils::lsn::{AtomicLsn, Lsn};
 
-use super::{ReadableLayer, TimelineWriterState};
+use super::{LayerFringe, ReadableLayer, TimelineWriterState};
 use crate::config::PageServerConf;
 use crate::context::RequestContext;
 use crate::metrics::TimelineMetrics;
-use crate::tenant::layer_map::{BatchedUpdates, LayerMap};
+use crate::tenant::layer_map::{BatchedUpdates, LayerMap, SearchResult};
 use crate::tenant::storage_layer::{
     AsLayerDesc, InMemoryLayer, Layer, LayerVisibilityHint, PersistentLayerDesc,
     PersistentLayerKey, ReadableLayerWeak, ResidentLayer,
@@ -38,7 +39,7 @@ impl Default for LayerManager {
 }
 
 impl LayerManager {
-    pub(crate) fn upgrade(&self, weak: ReadableLayerWeak) -> ReadableLayer {
+    fn upgrade(&self, weak: ReadableLayerWeak) -> ReadableLayer {
         match weak {
             ReadableLayerWeak::PersistentLayer(desc) => {
                 ReadableLayer::PersistentLayer(self.get_from_desc(&desc))
@@ -147,6 +148,36 @@ impl LayerManager {
         self.layers().keys().cloned().collect_vec()
     }
 
+    /// Update the [`LayerFringe`] of a read request
+    ///
+    /// Take a key space at a given LSN and query the layer map below each range
+    /// of the key space to find the next layers to visit.
+    pub(crate) fn update_search_fringe(
+        &self,
+        keyspace: &KeySpace,
+        cont_lsn: Lsn,
+        fringe: &mut LayerFringe,
+    ) -> Result<(), Shutdown> {
+        let map = self.layer_map()?;
+
+        for range in keyspace.ranges.iter() {
+            let results = map.range_search(range.clone(), cont_lsn);
+            results
+                .found
+                .into_iter()
+                .map(|(SearchResult { layer, lsn_floor }, keyspace_accum)| {
+                    (
+                        self.upgrade(layer),
+                        keyspace_accum.to_keyspace(),
+                        lsn_floor..cont_lsn,
+                    )
+                })
+                .for_each(|(layer, keyspace, lsn_range)| fringe.update(layer, keyspace, lsn_range));
+        }
+
+        Ok(())
+    }
+
     fn layers(&self) -> &HashMap<PersistentLayerKey, Layer> {
         use LayerManager::*;
         match self {
diff --git a/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs b/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs
index df2663f6bb04..3c3608d1bd1f 100644
--- a/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs
+++ b/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs
@@ -580,6 +580,7 @@ impl ConnectionManagerState {
                                 );
                                 Ok(())
                             }
+                            WalReceiverError::Cancelled => Ok(()),
                             WalReceiverError::Other(e) => {
                                 // give out an error to have task_mgr give it a really verbose logging
                                 if cancellation.is_cancelled() {
diff --git a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs
index f41a9cfe82b9..52259f205bb8 100644
--- a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs
+++ b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs
@@ -73,6 +73,7 @@ pub(super) enum WalReceiverError {
     /// Generic error
     Other(anyhow::Error),
     ClosedGate,
+    Cancelled,
 }
 
 impl From<tokio_postgres::Error> for WalReceiverError {
@@ -200,6 +201,9 @@ pub(super) async fn handle_walreceiver_connection(
                                 // with a similar error.
                             },
                             WalReceiverError::SuccessfulCompletion(_) => {}
+                            WalReceiverError::Cancelled => {
+                                debug!("Connection cancelled")
+                            }
                             WalReceiverError::ClosedGate => {
                                 // doesn't happen at runtime
                             }
@@ -273,7 +277,12 @@ pub(super) async fn handle_walreceiver_connection(
 
     let mut waldecoder = WalStreamDecoder::new(startpoint, timeline.pg_version);
 
-    let mut walingest = WalIngest::new(timeline.as_ref(), startpoint, &ctx).await?;
+    let mut walingest = WalIngest::new(timeline.as_ref(), startpoint, &ctx)
+        .await
+        .map_err(|e| match e.kind {
+            crate::walingest::WalIngestErrorKind::Cancelled => WalReceiverError::Cancelled,
+            _ => WalReceiverError::Other(e.into()),
+        })?;
 
     let shard = vec![*timeline.get_shard_identity()];
 
@@ -445,7 +454,7 @@ pub(super) async fn handle_walreceiver_connection(
                         .inspect_err(|err| {
                             // TODO: we can't differentiate cancellation errors with
                             // anyhow::Error, so just ignore it if we're cancelled.
-                            if !cancellation.is_cancelled() {
+                            if !cancellation.is_cancelled() && !timeline.is_stopping() {
                                 critical!("{err:?}")
                             }
                         })?;
@@ -577,7 +586,7 @@ pub(super) async fn handle_walreceiver_connection(
                             .inspect_err(|err| {
                                 // TODO: we can't differentiate cancellation errors with
                                 // anyhow::Error, so just ignore it if we're cancelled.
-                                if !cancellation.is_cancelled() {
+                                if !cancellation.is_cancelled() && !timeline.is_stopping() {
                                     critical!("{err:?}")
                                 }
                             })?;
diff --git a/pageserver/src/tenant/upload_queue.rs b/pageserver/src/tenant/upload_queue.rs
index d5dc9666ce6b..be1b55ffa3ef 100644
--- a/pageserver/src/tenant/upload_queue.rs
+++ b/pageserver/src/tenant/upload_queue.rs
@@ -302,6 +302,7 @@ pub struct UploadQueueStoppedDeletable {
     pub(super) deleted_at: SetDeletedFlagProgress,
 }
 
+#[allow(clippy::large_enum_variant, reason = "TODO")]
 pub enum UploadQueueStopped {
     Deletable(UploadQueueStoppedDeletable),
     Uninitialized,
diff --git a/pageserver/src/walingest.rs b/pageserver/src/walingest.rs
index 18df065f7646..e60c590f876c 100644
--- a/pageserver/src/walingest.rs
+++ b/pageserver/src/walingest.rs
@@ -21,13 +21,13 @@
 //! redo Postgres process, but some records it can handle directly with
 //! bespoken Rust code.
 
+use std::backtrace::Backtrace;
 use std::collections::HashMap;
 use std::sync::{Arc, OnceLock};
 use std::time::{Duration, Instant, SystemTime};
 
-use anyhow::{Result, bail};
 use bytes::{Buf, Bytes};
-use pageserver_api::key::rel_block_to_key;
+use pageserver_api::key::{Key, rel_block_to_key};
 use pageserver_api::record::NeonWalRecord;
 use pageserver_api::reltag::{BlockNumber, RelTag, SlruKind};
 use pageserver_api::shard::ShardIdentity;
@@ -38,7 +38,7 @@ use postgres_ffi::{
     fsm_logical_to_physical, pg_constants,
 };
 use tracing::*;
-use utils::bin_ser::SerializeError;
+use utils::bin_ser::{DeserializeError, SerializeError};
 use utils::lsn::Lsn;
 use utils::rate_limit::RateLimit;
 use utils::{critical, failpoint_support};
@@ -104,12 +104,101 @@ struct WarnIngestLag {
     timestamp_invalid_msg_ratelimit: RateLimit,
 }
 
+pub struct WalIngestError {
+    pub backtrace: std::backtrace::Backtrace,
+    pub kind: WalIngestErrorKind,
+}
+
+#[derive(thiserror::Error, Debug)]
+pub enum WalIngestErrorKind {
+    #[error(transparent)]
+    #[allow(private_interfaces)]
+    PageReconstructError(#[from] PageReconstructError),
+    #[error(transparent)]
+    DeserializationFailure(#[from] DeserializeError),
+    #[error(transparent)]
+    SerializationFailure(#[from] SerializeError),
+    #[error("the request contains data not supported by pageserver: {0} @ {1}")]
+    InvalidKey(Key, Lsn),
+    #[error("twophase file for xid {0} already exists")]
+    FileAlreadyExists(u64),
+    #[error("slru segment {0:?}/{1} already exists")]
+    SlruAlreadyExists(SlruKind, u32),
+    #[error("relation already exists")]
+    RelationAlreadyExists(RelTag),
+    #[error("invalid reldir key {0}")]
+    InvalidRelDirKey(Key),
+
+    #[error(transparent)]
+    LogicalError(anyhow::Error),
+    #[error(transparent)]
+    EncodeAuxFileError(anyhow::Error),
+    #[error(transparent)]
+    MaybeRelSizeV2Error(anyhow::Error),
+
+    #[error("timeline shutting down")]
+    Cancelled,
+}
+
+impl<T> From<T> for WalIngestError
+where
+    WalIngestErrorKind: From<T>,
+{
+    fn from(value: T) -> Self {
+        WalIngestError {
+            backtrace: Backtrace::capture(),
+            kind: WalIngestErrorKind::from(value),
+        }
+    }
+}
+
+impl std::error::Error for WalIngestError {
+    fn source(&self) -> Option<&(dyn std::error::Error + 'static)> {
+        self.kind.source()
+    }
+}
+
+impl core::fmt::Display for WalIngestError {
+    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
+        self.kind.fmt(f)
+    }
+}
+
+impl core::fmt::Debug for WalIngestError {
+    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
+        if f.alternate() {
+            f.debug_map()
+                .key(&"backtrace")
+                .value(&self.backtrace)
+                .key(&"kind")
+                .value(&self.kind)
+                .finish()
+        } else {
+            writeln!(f, "Error: {:?}", self.kind)?;
+            if self.backtrace.status() == std::backtrace::BacktraceStatus::Captured {
+                writeln!(f, "Stack backtrace: {:?}", self.backtrace)?;
+            }
+            Ok(())
+        }
+    }
+}
+
+#[macro_export]
+macro_rules! ensure_walingest {
+    ($($t:tt)*) => {
+        _ = || -> Result<(), anyhow::Error> {
+            anyhow::ensure!($($t)*);
+            Ok(())
+        }().map_err(WalIngestErrorKind::LogicalError)?;
+    };
+}
+
 impl WalIngest {
     pub async fn new(
         timeline: &Timeline,
         startpoint: Lsn,
         ctx: &RequestContext,
-    ) -> anyhow::Result<WalIngest> {
+    ) -> Result<WalIngest, WalIngestError> {
         // Fetch the latest checkpoint into memory, so that we can compare with it
         // quickly in `ingest_record` and update it when it changes.
         let checkpoint_bytes = timeline.get_checkpoint(startpoint, ctx).await?;
@@ -145,7 +234,7 @@ impl WalIngest {
         interpreted: InterpretedWalRecord,
         modification: &mut DatadirModification<'_>,
         ctx: &RequestContext,
-    ) -> anyhow::Result<bool> {
+    ) -> Result<bool, WalIngestError> {
         WAL_INGEST.records_received.inc();
         let prev_len = modification.len();
 
@@ -288,7 +377,7 @@ impl WalIngest {
     }
 
     /// This is the same as AdjustToFullTransactionId(xid) in PostgreSQL
-    fn adjust_to_full_transaction_id(&self, xid: TransactionId) -> Result<u64> {
+    fn adjust_to_full_transaction_id(&self, xid: TransactionId) -> Result<u64, WalIngestError> {
         let next_full_xid =
             enum_pgversion_dispatch!(&self.checkpoint, CheckPoint, cp, { cp.nextXid.value });
 
@@ -298,9 +387,9 @@ impl WalIngest {
         if xid > next_xid {
             // Wraparound occurred, must be from a prev epoch.
             if epoch == 0 {
-                bail!(
+                Err(WalIngestErrorKind::LogicalError(anyhow::anyhow!(
                     "apparent XID wraparound with prepared transaction XID {xid}, nextXid is {next_full_xid}"
-                );
+                )))?;
             }
             epoch -= 1;
         }
@@ -313,7 +402,7 @@ impl WalIngest {
         clear_vm_bits: ClearVmBits,
         modification: &mut DatadirModification<'_>,
         ctx: &RequestContext,
-    ) -> anyhow::Result<()> {
+    ) -> Result<(), WalIngestError> {
         let ClearVmBits {
             new_heap_blkno,
             old_heap_blkno,
@@ -402,7 +491,7 @@ impl WalIngest {
         create: DbaseCreate,
         modification: &mut DatadirModification<'_>,
         ctx: &RequestContext,
-    ) -> anyhow::Result<()> {
+    ) -> Result<(), WalIngestError> {
         let DbaseCreate {
             db_id,
             tablespace_id,
@@ -505,7 +594,7 @@ impl WalIngest {
         dbase_drop: DbaseDrop,
         modification: &mut DatadirModification<'_>,
         ctx: &RequestContext,
-    ) -> anyhow::Result<()> {
+    ) -> Result<(), WalIngestError> {
         let DbaseDrop {
             db_id,
             tablespace_ids,
@@ -523,7 +612,7 @@ impl WalIngest {
         create: SmgrCreate,
         modification: &mut DatadirModification<'_>,
         ctx: &RequestContext,
-    ) -> anyhow::Result<()> {
+    ) -> Result<(), WalIngestError> {
         let SmgrCreate { rel } = create;
         self.put_rel_creation(modification, rel, ctx).await?;
         Ok(())
@@ -537,7 +626,7 @@ impl WalIngest {
         truncate: XlSmgrTruncate,
         modification: &mut DatadirModification<'_>,
         ctx: &RequestContext,
-    ) -> anyhow::Result<()> {
+    ) -> Result<(), WalIngestError> {
         let XlSmgrTruncate {
             blkno,
             rnode,
@@ -689,7 +778,7 @@ impl WalIngest {
         record: XactRecord,
         modification: &mut DatadirModification<'_>,
         ctx: &RequestContext,
-    ) -> anyhow::Result<()> {
+    ) -> Result<(), WalIngestError> {
         let (xact_common, is_commit, is_prepared) = match record {
             XactRecord::Prepare(XactPrepare { xl_xid, data }) => {
                 let xid: u64 = if modification.tline.pg_version >= 17 {
@@ -813,7 +902,7 @@ impl WalIngest {
         truncate: ClogTruncate,
         modification: &mut DatadirModification<'_>,
         ctx: &RequestContext,
-    ) -> anyhow::Result<()> {
+    ) -> Result<(), WalIngestError> {
         let ClogTruncate {
             pageno,
             oldest_xid,
@@ -889,7 +978,7 @@ impl WalIngest {
         zero_page: ClogZeroPage,
         modification: &mut DatadirModification<'_>,
         ctx: &RequestContext,
-    ) -> anyhow::Result<()> {
+    ) -> Result<(), WalIngestError> {
         let ClogZeroPage { segno, rpageno } = zero_page;
 
         self.put_slru_page_image(
@@ -907,7 +996,7 @@ impl WalIngest {
         &mut self,
         modification: &mut DatadirModification,
         xlrec: &XlMultiXactCreate,
-    ) -> Result<()> {
+    ) -> Result<(), WalIngestError> {
         // Create WAL record for updating the multixact-offsets page
         let pageno = xlrec.mid / pg_constants::MULTIXACT_OFFSETS_PER_PAGE as u32;
         let segno = pageno / pg_constants::SLRU_PAGES_PER_SEGMENT;
@@ -1010,7 +1099,7 @@ impl WalIngest {
         modification: &mut DatadirModification<'_>,
         xlrec: &XlMultiXactTruncate,
         ctx: &RequestContext,
-    ) -> Result<()> {
+    ) -> Result<(), WalIngestError> {
         let (maxsegment, startsegment, endsegment) =
             enum_pgversion_dispatch!(&mut self.checkpoint, CheckPoint, cp, {
                 cp.oldestMulti = xlrec.end_trunc_off;
@@ -1058,7 +1147,7 @@ impl WalIngest {
         zero_page: MultiXactZeroPage,
         modification: &mut DatadirModification<'_>,
         ctx: &RequestContext,
-    ) -> Result<()> {
+    ) -> Result<(), WalIngestError> {
         let MultiXactZeroPage {
             slru_kind,
             segno,
@@ -1080,7 +1169,7 @@ impl WalIngest {
         update: RelmapUpdate,
         modification: &mut DatadirModification<'_>,
         ctx: &RequestContext,
-    ) -> Result<()> {
+    ) -> Result<(), WalIngestError> {
         let RelmapUpdate { update, buf } = update;
 
         modification
@@ -1093,7 +1182,7 @@ impl WalIngest {
         raw_record: RawXlogRecord,
         modification: &mut DatadirModification<'_>,
         ctx: &RequestContext,
-    ) -> Result<()> {
+    ) -> Result<(), WalIngestError> {
         let RawXlogRecord { info, lsn, mut buf } = raw_record;
         let pg_version = modification.tline.pg_version;
 
@@ -1235,12 +1324,12 @@ impl WalIngest {
         put: PutLogicalMessage,
         modification: &mut DatadirModification<'_>,
         ctx: &RequestContext,
-    ) -> Result<()> {
+    ) -> Result<(), WalIngestError> {
         let PutLogicalMessage { path, buf } = put;
         modification.put_file(path.as_str(), &buf, ctx).await
     }
 
-    fn ingest_standby_record(&mut self, record: StandbyRecord) -> Result<()> {
+    fn ingest_standby_record(&mut self, record: StandbyRecord) -> Result<(), WalIngestError> {
         match record {
             StandbyRecord::RunningXacts(running_xacts) => {
                 enum_pgversion_dispatch!(&mut self.checkpoint, CheckPoint, cp, {
@@ -1258,7 +1347,7 @@ impl WalIngest {
         &mut self,
         record: ReploriginRecord,
         modification: &mut DatadirModification<'_>,
-    ) -> Result<()> {
+    ) -> Result<(), WalIngestError> {
         match record {
             ReploriginRecord::Set(set) => {
                 modification
@@ -1278,7 +1367,7 @@ impl WalIngest {
         modification: &mut DatadirModification<'_>,
         rel: RelTag,
         ctx: &RequestContext,
-    ) -> Result<()> {
+    ) -> Result<(), WalIngestError> {
         modification.put_rel_creation(rel, 0, ctx).await?;
         Ok(())
     }
@@ -1291,7 +1380,7 @@ impl WalIngest {
         blknum: BlockNumber,
         img: Bytes,
         ctx: &RequestContext,
-    ) -> Result<(), PageReconstructError> {
+    ) -> Result<(), WalIngestError> {
         self.handle_rel_extend(modification, rel, blknum, ctx)
             .await?;
         modification.put_rel_page_image(rel, blknum, img)?;
@@ -1305,7 +1394,7 @@ impl WalIngest {
         blknum: BlockNumber,
         rec: NeonWalRecord,
         ctx: &RequestContext,
-    ) -> Result<()> {
+    ) -> Result<(), WalIngestError> {
         self.handle_rel_extend(modification, rel, blknum, ctx)
             .await?;
         modification.put_rel_wal_record(rel, blknum, rec)?;
@@ -1318,7 +1407,7 @@ impl WalIngest {
         rel: RelTag,
         nblocks: BlockNumber,
         ctx: &RequestContext,
-    ) -> anyhow::Result<()> {
+    ) -> Result<(), WalIngestError> {
         modification.put_rel_truncation(rel, nblocks, ctx).await?;
         Ok(())
     }
@@ -1329,7 +1418,7 @@ impl WalIngest {
         rel: RelTag,
         blknum: BlockNumber,
         ctx: &RequestContext,
-    ) -> Result<(), PageReconstructError> {
+    ) -> Result<(), WalIngestError> {
         let new_nblocks = blknum + 1;
         // Check if the relation exists. We implicitly create relations on first
         // record.
@@ -1423,7 +1512,7 @@ impl WalIngest {
         blknum: BlockNumber,
         img: Bytes,
         ctx: &RequestContext,
-    ) -> Result<()> {
+    ) -> Result<(), WalIngestError> {
         if !self.shard.is_shard_zero() {
             return Ok(());
         }
@@ -1441,7 +1530,7 @@ impl WalIngest {
         segno: u32,
         blknum: BlockNumber,
         ctx: &RequestContext,
-    ) -> anyhow::Result<()> {
+    ) -> Result<(), WalIngestError> {
         // we don't use a cache for this like we do for relations. SLRUS are explcitly
         // extended with ZEROPAGE records, not with commit records, so it happens
         // a lot less frequently.
@@ -1509,6 +1598,7 @@ async fn get_relsize(
 #[allow(clippy::bool_assert_comparison)]
 #[cfg(test)]
 mod tests {
+    use anyhow::Result;
     use postgres_ffi::RELSEG_SIZE;
 
     use super::*;
@@ -1530,7 +1620,7 @@ mod tests {
     }
 
     #[tokio::test]
-    async fn test_zeroed_checkpoint_decodes_correctly() -> Result<()> {
+    async fn test_zeroed_checkpoint_decodes_correctly() -> Result<(), anyhow::Error> {
         for i in 14..=16 {
             dispatch_pgversion!(i, {
                 pgv::CheckPoint::decode(&pgv::ZERO_CHECKPOINT)?;
diff --git a/pageserver/src/walredo.rs b/pageserver/src/walredo.rs
index 22d8d8381128..ed8a95436902 100644
--- a/pageserver/src/walredo.rs
+++ b/pageserver/src/walredo.rs
@@ -136,6 +136,16 @@ macro_rules! bail {
     }
 }
 
+#[derive(Debug, Clone, Copy)]
+pub enum RedoAttemptType {
+    /// Used for the read path. Will fire critical errors and retry twice if failure.
+    ReadPage,
+    // Used for legacy compaction (only used in image compaction). Will fire critical errors and retry once if failure.
+    LegacyCompaction,
+    // Used for gc compaction. Will not fire critical errors and not retry.
+    GcCompaction,
+}
+
 ///
 /// Public interface of WAL redo manager
 ///
@@ -156,11 +166,18 @@ impl PostgresRedoManager {
         base_img: Option<(Lsn, Bytes)>,
         records: Vec<(Lsn, NeonWalRecord)>,
         pg_version: u32,
+        redo_attempt_type: RedoAttemptType,
     ) -> Result<Bytes, Error> {
         if records.is_empty() {
             bail!("invalid WAL redo request with no records");
         }
 
+        let max_retry_attempts = match redo_attempt_type {
+            RedoAttemptType::ReadPage => 2,
+            RedoAttemptType::LegacyCompaction => 1,
+            RedoAttemptType::GcCompaction => 0,
+        };
+
         let base_img_lsn = base_img.as_ref().map(|p| p.0).unwrap_or(Lsn::INVALID);
         let mut img = base_img.map(|p| p.1);
         let mut batch_neon = apply_neon::can_apply_in_neon(&records[0].1);
@@ -180,6 +197,7 @@ impl PostgresRedoManager {
                         &records[batch_start..i],
                         self.conf.wal_redo_timeout,
                         pg_version,
+                        max_retry_attempts,
                     )
                     .await
                 };
@@ -201,6 +219,7 @@ impl PostgresRedoManager {
                 &records[batch_start..],
                 self.conf.wal_redo_timeout,
                 pg_version,
+                max_retry_attempts,
             )
             .await
         }
@@ -424,11 +443,11 @@ impl PostgresRedoManager {
         records: &[(Lsn, NeonWalRecord)],
         wal_redo_timeout: Duration,
         pg_version: u32,
+        max_retry_attempts: u32,
     ) -> Result<Bytes, Error> {
         *(self.last_redo_at.lock().unwrap()) = Some(Instant::now());
 
         let (rel, blknum) = key.to_rel_block().context("invalid record")?;
-        const MAX_RETRY_ATTEMPTS: u32 = 1;
         let mut n_attempts = 0u32;
         loop {
             let base_img = &base_img;
@@ -486,7 +505,7 @@ impl PostgresRedoManager {
                 info!(n_attempts, "retried walredo succeeded");
             }
             n_attempts += 1;
-            if n_attempts > MAX_RETRY_ATTEMPTS || result.is_ok() {
+            if n_attempts > max_retry_attempts || result.is_ok() {
                 return result;
             }
         }
@@ -560,6 +579,7 @@ mod tests {
 
     use super::PostgresRedoManager;
     use crate::config::PageServerConf;
+    use crate::walredo::RedoAttemptType;
 
     #[tokio::test]
     async fn test_ping() {
@@ -593,6 +613,7 @@ mod tests {
                 None,
                 short_records(),
                 14,
+                RedoAttemptType::ReadPage,
             )
             .instrument(h.span())
             .await
@@ -621,6 +642,7 @@ mod tests {
                 None,
                 short_records(),
                 14,
+                RedoAttemptType::ReadPage,
             )
             .instrument(h.span())
             .await
@@ -642,6 +664,7 @@ mod tests {
                 None,
                 short_records(),
                 16, /* 16 currently produces stderr output on startup, which adds a nice extra edge */
+                RedoAttemptType::ReadPage,
             )
             .instrument(h.span())
             .await
diff --git a/pageserver/src/walredo/apply_neon.rs b/pageserver/src/walredo/apply_neon.rs
index 61ae1eb97078..a3840f1f6f72 100644
--- a/pageserver/src/walredo/apply_neon.rs
+++ b/pageserver/src/walredo/apply_neon.rs
@@ -276,6 +276,7 @@ pub(crate) fn apply_in_neon(
             append,
             clear,
             will_init,
+            only_if,
         } => {
             use bytes::BufMut;
             if *will_init {
@@ -288,6 +289,13 @@ pub(crate) fn apply_in_neon(
             if *clear {
                 page.clear();
             }
+            if let Some(only_if) = only_if {
+                if page != only_if.as_bytes() {
+                    return Err(anyhow::anyhow!(
+                        "the current image does not match the expected image, cannot append"
+                    ));
+                }
+            }
             page.put_slice(append.as_bytes());
         }
     }
diff --git a/pgxn/neon/Makefile b/pgxn/neon/Makefile
index 8259d24359f6..426b176af94d 100644
--- a/pgxn/neon/Makefile
+++ b/pgxn/neon/Makefile
@@ -4,6 +4,7 @@
 MODULE_big = neon
 OBJS = \
 	$(WIN32RES) \
+	communicator.o \
 	extension_server.o \
 	file_cache.o \
 	hll.o \
diff --git a/pgxn/neon/communicator.c b/pgxn/neon/communicator.c
new file mode 100644
index 000000000000..932034e22e14
--- /dev/null
+++ b/pgxn/neon/communicator.c
@@ -0,0 +1,2504 @@
+/*-------------------------------------------------------------------------
+ *
+ * communicator.c
+ *	  Functions for communicating with remote pageservers.
+ *
+ * This is the so-called "legacy" communicator. It consists of functions that
+ * are called from the smgr implementation, in pagestore_smgr.c. There are
+ * plans to replace this with a different implementation, see RFC.
+ *
+ * The communicator is a collection of functions that are called in each
+ * backend, when the backend needs to read a page or other information. It
+ * does not spawn background threads or anything like that. To process
+ * responses to prefetch requests in a timely fashion, however, it registers
+ * a ProcessInterrupts hook that gets called periodically from any
+ * CHECK_FOR_INTERRUPTS() point in the backend.
+ *
+ * By the time the functions in this file are called, the caller has already
+ * established that a request to the pageserver is necessary. The functions
+ * are only called for permanent relations (i.e. not temp or unlogged tables).
+ * Before making a call to the communicator, the caller has already checked
+ * the relation size or local file cache.
+ *
+ * However, when processing responses to getpage requests, the communicator
+ * writes pages directly to the LFC.
+ *
+ * The communicator functions take request LSNs as arguments; the caller is
+ * responsible for determining the correct LSNs to use. There's one exception
+ * to that, in prefetch_do_request(); it sometimes calls back to
+ * neon_get_request_lsns().  That's because sometimes a suitable response is
+ * found in the prefetch buffer and the request LSns are not needed, and the
+ * caller doesn't know whether it's needed or not.
+ *
+ * The main interface consists of the following "synchronous" calls:
+ *
+ * communicator_exists			- Returns true if a relation file exists
+ * communicator_nblocks			- Returns a relation's size
+ * communicator_dbsize			- Returns a databases's total size
+ * communicator_read_at_lsnv	- Read contents of one relation block
+ * communicator_read_slru_segment - Read contents of one SLRU segment
+ *
+ * In addition, there functions related to prefetching:
+ * communicator_prefetch_register_bufferv - Start prefetching a page
+ * communicator_prefetch_lookupv - Check if a page is already in prefetch queue
+ *
+ * Misc other functions:
+ * - communicator_init			- Initialize the module at startup
+ * - communicator_prefetch_pump_state - Called periodically to advance the state
+ *
+ *
+ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include "access/xlog.h"
+#include "access/xlogdefs.h"
+#include "access/xlog_internal.h"
+#include "access/xlogutils.h"
+#include "common/hashfn.h"
+#include "executor/instrument.h"
+#include "libpq/pqformat.h"
+#include "miscadmin.h"
+#include "port/pg_iovec.h"
+#include "postmaster/interrupt.h"
+#include "replication/walsender.h"
+#include "utils/timeout.h"
+
+#include "bitmap.h"
+#include "communicator.h"
+#include "file_cache.h"
+#include "neon.h"
+#include "neon_perf_counters.h"
+#include "pagestore_client.h"
+
+#if PG_VERSION_NUM >= 150000
+#include "access/xlogrecovery.h"
+#endif
+
+#if PG_VERSION_NUM < 160000
+typedef PGAlignedBlock PGIOAlignedBlock;
+#endif
+
+#define NEON_PANIC_CONNECTION_STATE(shard_no, elvl, message, ...) \
+	neon_shard_log(shard_no, elvl, "Broken connection state: " message, \
+				   ##__VA_ARGS__)
+
+page_server_api *page_server;
+
+static uint32 local_request_counter;
+#define GENERATE_REQUEST_ID() (((NeonRequestId)MyProcPid << 32) | ++local_request_counter)
+
+/*
+ * Various settings related to prompt (fast) handling of PageStream responses
+ * at any CHECK_FOR_INTERRUPTS point.
+ */
+int				readahead_getpage_pull_timeout_ms = 0;
+static int		PS_TIMEOUT_ID = 0;
+static bool		timeout_set = false;
+static bool		timeout_signaled = false;
+
+/*
+ * We have a CHECK_FOR_INTERRUPTS in page_server->receive(), and we don't want
+ * that to handle any getpage responses if we're already working on the
+ * backlog of those, as we'd hit issues with determining which prefetch slot
+ * we just got a response for.
+ *
+ * To protect against that, we have this variable that's set whenever we start
+ * receiving data for prefetch slots, so that we don't get confused.
+ *
+ * Note that in certain error cases during readpage we may leak r_r_g=true,
+ * which results in a failure to pick up further responses until we first
+ * actively try to receive new getpage responses.
+ */
+static bool		readpage_reentrant_guard = false;
+
+static void pagestore_timeout_handler(void);
+
+#define START_PREFETCH_RECEIVE_WORK() \
+	do { \
+		readpage_reentrant_guard = true; \
+	} while (false)
+
+#define END_PREFETCH_RECEIVE_WORK() \
+	do { \
+		readpage_reentrant_guard = false; \
+		if (unlikely(timeout_signaled && !InterruptPending)) \
+			InterruptPending = true; \
+	} while (false)
+
+/*
+ * Prefetch implementation:
+ *
+ * Prefetch is performed locally by each backend.
+ *
+ * There can be up to readahead_buffer_size active IO requests registered at
+ * any time. Requests using smgr_prefetch are sent to the pageserver, but we
+ * don't wait on the response. Requests using smgr_read are either read from
+ * the buffer, or (if that's not possible) we wait on the response to arrive -
+ * this also will allow us to receive other prefetched pages.
+ * Each request is immediately written to the output buffer of the pageserver
+ * connection, but may not be flushed if smgr_prefetch is used: pageserver
+ * flushes sent requests on manual flush, or every neon.flush_output_after
+ * unflushed requests; which is not necessarily always and all the time.
+ *
+ * Once we have received a response, this value will be stored in the response
+ * buffer, indexed in a hash table. This allows us to retain our buffered
+ * prefetch responses even when we have cache misses.
+ *
+ * Reading of prefetch responses is delayed until them are actually needed
+ * (smgr_read). In case of prefetch miss or any other SMGR request other than
+ * smgr_read, all prefetch responses in the pipeline will need to be read from
+ * the connection; the responses are stored for later use.
+ *
+ * NOTE: The current implementation of the prefetch system implements a ring
+ * buffer of up to readahead_buffer_size requests. If there are more _read and
+ * _prefetch requests between the initial _prefetch and the _read of a buffer,
+ * the prefetch request will have been dropped from this prefetch buffer, and
+ * your prefetch was wasted.
+ */
+
+/*
+ * State machine:
+ *
+ * not in hash : in hash
+ *             :
+ * UNUSED ------> REQUESTED --> RECEIVED
+ *   ^         :      |            |
+ *   |         :      v            |
+ *   |         : TAG_REMAINS       |
+ *   |         :      |            |
+ *   +----------------+------------+
+ *             :
+ */
+typedef enum PrefetchStatus
+{
+	PRFS_UNUSED = 0,			/* unused slot */
+	PRFS_REQUESTED,				/* request was written to the sendbuffer to
+								 * PS, but not necessarily flushed. all fields
+								 * except response valid */
+	PRFS_RECEIVED,				/* all fields valid */
+	PRFS_TAG_REMAINS,			/* only buftag and my_ring_index are still
+								 * valid */
+} PrefetchStatus;
+
+/* must fit in uint8; bits 0x1 are used */
+typedef enum {
+	PRFSF_NONE	= 0x0,
+	PRFSF_LFC	= 0x1  /* received prefetch result is stored in LFC */
+} PrefetchRequestFlags;
+
+typedef struct PrefetchRequest
+{
+	BufferTag	buftag;			/* must be first entry in the struct */
+	shardno_t	shard_no;
+	uint8		status;		/* see PrefetchStatus for valid values */
+	uint8		flags;		/* see PrefetchRequestFlags */
+	neon_request_lsns request_lsns;
+	NeonRequestId reqid;
+	NeonResponse *response;		/* may be null */
+	uint64		my_ring_index;
+} PrefetchRequest;
+
+/* prefetch buffer lookup hash table */
+
+typedef struct PrfHashEntry
+{
+	PrefetchRequest *slot;
+	uint32		status;
+	uint32		hash;
+} PrfHashEntry;
+
+#define SH_PREFIX			prfh
+#define SH_ELEMENT_TYPE		PrfHashEntry
+#define SH_KEY_TYPE			PrefetchRequest *
+#define SH_KEY				slot
+#define SH_STORE_HASH
+#define SH_GET_HASH(tb, a)	((a)->hash)
+#define SH_HASH_KEY(tb, key) hash_bytes( \
+	((const unsigned char *) &(key)->buftag), \
+	sizeof(BufferTag) \
+)
+
+#define SH_EQUAL(tb, a, b)	(BufferTagsEqual(&(a)->buftag, &(b)->buftag))
+#define SH_SCOPE			static inline
+#define SH_DEFINE
+#define SH_DECLARE
+#include "lib/simplehash.h"
+
+/*
+ * PrefetchState maintains the state of (prefetch) getPage@LSN requests.
+ * It maintains a (ring) buffer of in-flight requests and responses.
+ *
+ * We maintain several indexes into the ring buffer:
+ * ring_unused >= ring_flush >= ring_receive >= ring_last >= 0
+ *
+ * ring_unused points to the first unused slot of the buffer
+ * ring_receive is the next request that is to be received
+ * ring_last is the oldest received entry in the buffer
+ *
+ * Apart from being an entry in the ring buffer of prefetch requests, each
+ * PrefetchRequest that is not UNUSED is indexed in prf_hash by buftag.
+ */
+typedef struct PrefetchState
+{
+	MemoryContext bufctx;		/* context for prf_buffer[].response
+								 * allocations */
+	MemoryContext errctx;		/* context for prf_buffer[].response
+								 * allocations */
+	MemoryContext hashctx;		/* context for prf_buffer */
+
+	/* buffer indexes */
+	uint64		ring_unused;	/* first unused slot */
+	uint64		ring_flush;		/* next request to flush */
+	uint64		ring_receive;	/* next slot that is to receive a response */
+	uint64		ring_last;		/* min slot with a response value */
+
+	/* metrics / statistics  */
+	int			n_responses_buffered;	/* count of PS responses not yet in
+										 * buffers */
+	int			n_requests_inflight;	/* count of PS requests considered in
+										 * flight */
+	int			n_unused;		/* count of buffers < unused, > last, that are
+								 * also unused */
+
+	/* the buffers */
+	prfh_hash	*prf_hash;
+	int			max_shard_no;
+	/* Mark shards involved in prefetch */
+	uint8		shard_bitmap[(MAX_SHARDS + 7)/8];
+	PrefetchRequest prf_buffer[];	/* prefetch buffers */
+} PrefetchState;
+
+static PrefetchState *MyPState;
+
+#define GetPrfSlotNoCheck(ring_index) ( \
+	&MyPState->prf_buffer[((ring_index) % readahead_buffer_size)] \
+)
+
+#define GetPrfSlot(ring_index) ( \
+	( \
+		AssertMacro((ring_index) < MyPState->ring_unused && \
+					(ring_index) >= MyPState->ring_last), \
+		GetPrfSlotNoCheck(ring_index) \
+	) \
+)
+
+#define ReceiveBufferNeedsCompaction() (\
+	(MyPState->n_responses_buffered / 8) < ( \
+		MyPState->ring_receive - \
+			MyPState->ring_last - \
+			MyPState->n_responses_buffered \
+	) \
+)
+
+static process_interrupts_callback_t prev_interrupt_cb;
+
+static bool compact_prefetch_buffers(void);
+static void consume_prefetch_responses(void);
+static uint64 prefetch_register_bufferv(BufferTag tag, neon_request_lsns *frlsns,
+										BlockNumber nblocks, const bits8 *mask,
+										bool is_prefetch);
+static bool prefetch_read(PrefetchRequest *slot);
+static void prefetch_do_request(PrefetchRequest *slot, neon_request_lsns *force_request_lsns);
+static bool prefetch_wait_for(uint64 ring_index);
+static void prefetch_cleanup_trailing_unused(void);
+static inline void prefetch_set_unused(uint64 ring_index);
+
+static bool neon_prefetch_response_usable(neon_request_lsns *request_lsns,
+										  PrefetchRequest *slot);
+static bool communicator_processinterrupts(void);
+
+void
+pg_init_communicator(void)
+{
+	prev_interrupt_cb = ProcessInterruptsCallback;
+	ProcessInterruptsCallback = communicator_processinterrupts;
+}
+
+static bool
+compact_prefetch_buffers(void)
+{
+	uint64		empty_ring_index = MyPState->ring_last;
+	uint64		search_ring_index = MyPState->ring_receive;
+	int			n_moved = 0;
+
+	if (MyPState->ring_receive == MyPState->ring_last)
+		return false;
+
+	while (search_ring_index > MyPState->ring_last)
+	{
+		search_ring_index--;
+		if (GetPrfSlot(search_ring_index)->status == PRFS_UNUSED)
+		{
+			empty_ring_index = search_ring_index;
+			break;
+		}
+	}
+
+	/*
+	 * Here we have established: slots < search_ring_index have an unknown
+	 * state (not scanned) slots >= search_ring_index and <= empty_ring_index
+	 * are unused slots > empty_ring_index are in use, or outside our buffer's
+	 * range. ... unless search_ring_index <= ring_last
+	 *
+	 * Therefore, there is a gap of at least one unused items between
+	 * search_ring_index and empty_ring_index (both inclusive), which grows as
+	 * we hit more unused items while moving backwards through the array.
+	 */
+
+	while (search_ring_index > MyPState->ring_last)
+	{
+		PrefetchRequest *source_slot;
+		PrefetchRequest *target_slot;
+		bool		found;
+
+		/* update search index to an unprocessed entry */
+		search_ring_index--;
+
+		source_slot = GetPrfSlot(search_ring_index);
+
+		if (source_slot->status == PRFS_UNUSED)
+			continue;
+
+		/* slot is used -- start moving slot */
+		target_slot = GetPrfSlot(empty_ring_index);
+
+		Assert(source_slot->status == PRFS_RECEIVED);
+		Assert(target_slot->status == PRFS_UNUSED);
+
+		target_slot->buftag = source_slot->buftag;
+		target_slot->shard_no = source_slot->shard_no;
+		target_slot->status = source_slot->status;
+		target_slot->flags = source_slot->flags;
+		target_slot->response = source_slot->response;
+		target_slot->reqid = source_slot->reqid;
+		target_slot->request_lsns = source_slot->request_lsns;
+		target_slot->my_ring_index = empty_ring_index;
+
+		prfh_delete(MyPState->prf_hash, source_slot);
+		prfh_insert(MyPState->prf_hash, target_slot, &found);
+
+		Assert(!found);
+
+		/* Adjust the location of our known-empty slot */
+		empty_ring_index--;
+
+		/* empty the moved slot */
+		source_slot->status = PRFS_UNUSED;
+		source_slot->buftag = (BufferTag)
+		{
+			0
+		};
+		source_slot->response = NULL;
+		source_slot->my_ring_index = 0;
+		source_slot->request_lsns = (neon_request_lsns) {
+			InvalidXLogRecPtr, InvalidXLogRecPtr, InvalidXLogRecPtr
+		};
+
+		/* update bookkeeping */
+		n_moved++;
+	}
+
+	/*
+	 * Only when we've moved slots we can expect trailing unused slots, so
+	 * only then we clean up trailing unused slots.
+	 */
+	if (n_moved > 0)
+	{
+		prefetch_cleanup_trailing_unused();
+		return true;
+	}
+
+	return false;
+}
+
+/*
+ * If there might be responses still in the TCP buffer, then we should try to
+ * use those, to reduce any TCP backpressure on the OS/PS side.
+ *
+ * This procedure handles that.
+ *
+ * Note that this works because we don't pipeline non-getPage requests.
+ *
+ * NOTE: This procedure is not allowed to throw errors that should be handled
+ * by SMGR-related code, as this can be called from every CHECK_FOR_INTERRUPTS
+ * point inside and outside PostgreSQL.
+ *
+ * This still does throw errors when it receives malformed responses from PS.
+ *
+ * When we're not called from CHECK_FOR_INTERRUPTS (indicated by
+ * IsHandlingInterrupts) we also report we've ended prefetch receive work,
+ * just in case state tracking was lost due to an error in the sync getPage
+ * response code.
+ */
+void
+communicator_prefetch_pump_state(bool IsHandlingInterrupts)
+{
+	while (MyPState->ring_receive != MyPState->ring_flush)
+	{
+		NeonResponse   *response;
+		PrefetchRequest *slot;
+		MemoryContext	old;
+
+		slot = GetPrfSlot(MyPState->ring_receive);
+
+		old = MemoryContextSwitchTo(MyPState->errctx);
+		response = page_server->try_receive(slot->shard_no);
+		MemoryContextSwitchTo(old);
+
+		if (response == NULL)
+			break;
+
+		/* The slot should still be valid */
+		if (slot->status != PRFS_REQUESTED ||
+			slot->response != NULL ||
+			slot->my_ring_index != MyPState->ring_receive)
+			neon_shard_log(slot->shard_no, ERROR,
+						   "Incorrect prefetch slot state after receive: status=%d response=%p my=%lu receive=%lu",
+						   slot->status, slot->response,
+						   (long) slot->my_ring_index, (long) MyPState->ring_receive);
+
+		/* update prefetch state */
+		MyPState->n_responses_buffered += 1;
+		MyPState->n_requests_inflight -= 1;
+		MyPState->ring_receive += 1;
+		MyNeonCounters->getpage_prefetches_buffered =
+			MyPState->n_responses_buffered;
+
+		/* update slot state */
+		slot->status = PRFS_RECEIVED;
+		slot->response = response;
+
+		if (response->tag == T_NeonGetPageResponse && !(slot->flags & PRFSF_LFC) && lfc_store_prefetch_result)
+		{
+			/*
+			 * Store prefetched result in LFC (please read comments to lfc_prefetch
+			 * explaining why it can be done without holding shared buffer lock
+			 */
+			if (lfc_prefetch(BufTagGetNRelFileInfo(slot->buftag), slot->buftag.forkNum, slot->buftag.blockNum, ((NeonGetPageResponse*)response)->page, slot->request_lsns.not_modified_since))
+			{
+				slot->flags |= PRFSF_LFC;
+			}
+		}
+	}
+
+	/* We never pump the prefetch state while handling other pages */
+	if (!IsHandlingInterrupts)
+		END_PREFETCH_RECEIVE_WORK();
+
+	communicator_reconfigure_timeout_if_needed();
+}
+
+void
+readahead_buffer_resize(int newsize, void *extra)
+{
+	uint64		end,
+				nfree = newsize;
+	PrefetchState *newPState;
+	Size		newprfs_size = offsetof(PrefetchState, prf_buffer) +
+		(sizeof(PrefetchRequest) * newsize);
+
+	/* don't try to re-initialize if we haven't initialized yet */
+	if (MyPState == NULL)
+		return;
+
+	/*
+	 * Make sure that we don't lose track of active prefetch requests by
+	 * ensuring we have received all but the last n requests (n = newsize).
+	 */
+	if (MyPState->n_requests_inflight > newsize)
+	{
+		prefetch_wait_for(MyPState->ring_unused - newsize - 1);
+		Assert(MyPState->n_requests_inflight <= newsize);
+	}
+
+	/* construct the new PrefetchState, and copy over the memory contexts */
+	newPState = MemoryContextAllocZero(TopMemoryContext, newprfs_size);
+
+	newPState->bufctx = MyPState->bufctx;
+	newPState->errctx = MyPState->errctx;
+	newPState->hashctx = MyPState->hashctx;
+	newPState->prf_hash = prfh_create(MyPState->hashctx, newsize, NULL);
+	newPState->n_unused = newsize;
+	newPState->n_requests_inflight = 0;
+	newPState->n_responses_buffered = 0;
+	newPState->ring_last = newsize;
+	newPState->ring_unused = newsize;
+	newPState->ring_receive = newsize;
+	newPState->max_shard_no = MyPState->max_shard_no;
+	memcpy(newPState->shard_bitmap, MyPState->shard_bitmap, sizeof(MyPState->shard_bitmap));
+
+	/*
+	 * Copy over the prefetches.
+	 *
+	 * We populate the prefetch array from the end; to retain the most recent
+	 * prefetches, but this has the benefit of only needing to do one
+	 * iteration on the dataset, and trivial compaction.
+	 */
+	for (end = MyPState->ring_unused - 1;
+		 end >= MyPState->ring_last && end != UINT64_MAX && nfree != 0;
+		 end -= 1)
+	{
+		PrefetchRequest *slot = GetPrfSlot(end);
+		PrefetchRequest *newslot;
+		bool		found;
+
+		if (slot->status == PRFS_UNUSED)
+			continue;
+
+		nfree -= 1;
+
+		newslot = &newPState->prf_buffer[nfree];
+		*newslot = *slot;
+		newslot->my_ring_index = nfree;
+
+		prfh_insert(newPState->prf_hash, newslot, &found);
+
+		Assert(!found);
+
+		switch (newslot->status)
+		{
+			case PRFS_UNUSED:
+				pg_unreachable();
+			case PRFS_REQUESTED:
+				newPState->n_requests_inflight += 1;
+				newPState->ring_receive -= 1;
+				newPState->ring_last -= 1;
+				break;
+			case PRFS_RECEIVED:
+				newPState->n_responses_buffered += 1;
+				newPState->ring_last -= 1;
+				break;
+			case PRFS_TAG_REMAINS:
+				newPState->ring_last -= 1;
+				break;
+		}
+		newPState->n_unused -= 1;
+	}
+	newPState->ring_flush = newPState->ring_receive;
+
+	MyNeonCounters->getpage_prefetches_buffered =
+		MyPState->n_responses_buffered;
+	MyNeonCounters->pageserver_open_requests =
+		MyPState->n_requests_inflight;
+
+	for (; end >= MyPState->ring_last && end != UINT64_MAX; end -= 1)
+	{
+		PrefetchRequest *slot = GetPrfSlot(end);
+		Assert(slot->status != PRFS_REQUESTED);
+		if (slot->status == PRFS_RECEIVED)
+		{
+			pfree(slot->response);
+		}
+	}
+
+	prfh_destroy(MyPState->prf_hash);
+	pfree(MyPState);
+	MyPState = newPState;
+}
+
+
+
+/*
+ * Make sure that there are no responses still in the buffer.
+ *
+ * This function may indirectly update MyPState->pfs_hash; which invalidates
+ * any active pointers into the hash table.
+ */
+static void
+consume_prefetch_responses(void)
+{
+	if (MyPState->ring_receive < MyPState->ring_unused)
+		prefetch_wait_for(MyPState->ring_unused - 1);
+}
+
+static void
+prefetch_cleanup_trailing_unused(void)
+{
+	uint64		ring_index;
+	PrefetchRequest *slot;
+
+	while (MyPState->ring_last < MyPState->ring_receive)
+	{
+		ring_index = MyPState->ring_last;
+		slot = GetPrfSlot(ring_index);
+
+		if (slot->status == PRFS_UNUSED)
+			MyPState->ring_last += 1;
+		else
+			break;
+	}
+}
+
+
+static bool
+prefetch_flush_requests(void)
+{
+	for (shardno_t shard_no = 0; shard_no < MyPState->max_shard_no; shard_no++)
+	{
+		if (BITMAP_ISSET(MyPState->shard_bitmap, shard_no))
+		{
+			if (!page_server->flush(shard_no))
+				return false;
+			BITMAP_CLR(MyPState->shard_bitmap, shard_no);
+		}
+	}
+	MyPState->max_shard_no = 0;
+	return true;
+}
+
+/*
+ * Wait for slot of ring_index to have received its response.
+ * The caller is responsible for making sure the request buffer is flushed.
+ *
+ * NOTE: this function may indirectly update MyPState->pfs_hash; which
+ * invalidates any active pointers into the hash table.
+ * NOTE: callers should make sure they can handle query cancellations in this
+ * function's call path.
+ */
+static bool
+prefetch_wait_for(uint64 ring_index)
+{
+	PrefetchRequest *entry;
+	bool		result = true;
+
+	if (MyPState->ring_flush <= ring_index &&
+		MyPState->ring_unused > MyPState->ring_flush)
+	{
+		if (!prefetch_flush_requests())
+			return false;
+		MyPState->ring_flush = MyPState->ring_unused;
+	}
+
+	Assert(MyPState->ring_unused > ring_index);
+
+	while (MyPState->ring_receive <= ring_index)
+	{
+		START_PREFETCH_RECEIVE_WORK();
+		entry = GetPrfSlot(MyPState->ring_receive);
+
+		Assert(entry->status == PRFS_REQUESTED);
+		if (!prefetch_read(entry))
+		{
+			result = false;
+			break;
+		}
+
+		END_PREFETCH_RECEIVE_WORK();
+		CHECK_FOR_INTERRUPTS();
+	}
+
+	return result;
+}
+
+/*
+ * Read the response of a prefetch request into its slot.
+ *
+ * The caller is responsible for making sure that the request for this buffer
+ * was flushed to the PageServer.
+ *
+ * NOTE: this function may indirectly update MyPState->pfs_hash; which
+ * invalidates any active pointers into the hash table.
+ *
+ * NOTE: this does IO, and can get canceled out-of-line.
+ */
+static bool
+prefetch_read(PrefetchRequest *slot)
+{
+	NeonResponse *response;
+	MemoryContext old;
+	BufferTag	buftag;
+	shardno_t	shard_no;
+	uint64		my_ring_index;
+
+	Assert(slot->status == PRFS_REQUESTED);
+	Assert(slot->response == NULL);
+	Assert(slot->my_ring_index == MyPState->ring_receive);
+
+	if (slot->status != PRFS_REQUESTED ||
+		slot->response != NULL ||
+		slot->my_ring_index != MyPState->ring_receive)
+		neon_shard_log(slot->shard_no, ERROR,
+					   "Incorrect prefetch read: status=%d response=%p my=%lu receive=%lu",
+					   slot->status, slot->response,
+					   (long)slot->my_ring_index, (long)MyPState->ring_receive);
+
+	/*
+	 * Copy the request info so that if an error happens and the prefetch
+	 * queue is flushed during the receive call, we can print the original
+	 * values in the error message
+	 */
+	buftag = slot->buftag;
+	shard_no = slot->shard_no;
+	my_ring_index = slot->my_ring_index;
+
+	old = MemoryContextSwitchTo(MyPState->errctx);
+	response = (NeonResponse *) page_server->receive(shard_no);
+	MemoryContextSwitchTo(old);
+	if (response)
+	{
+		/* The slot should still be valid */
+		if (slot->status != PRFS_REQUESTED ||
+			slot->response != NULL ||
+			slot->my_ring_index != MyPState->ring_receive)
+			neon_shard_log(shard_no, ERROR,
+						   "Incorrect prefetch slot state after receive: status=%d response=%p my=%lu receive=%lu",
+						   slot->status, slot->response,
+						   (long) slot->my_ring_index, (long) MyPState->ring_receive);
+
+		/* update prefetch state */
+		MyPState->n_responses_buffered += 1;
+		MyPState->n_requests_inflight -= 1;
+		MyPState->ring_receive += 1;
+		MyNeonCounters->getpage_prefetches_buffered =
+			MyPState->n_responses_buffered;
+
+		/* update slot state */
+		slot->status = PRFS_RECEIVED;
+		slot->response = response;
+
+		if (response->tag == T_NeonGetPageResponse && !(slot->flags & PRFSF_LFC) && lfc_store_prefetch_result)
+		{
+			/*
+			 * Store prefetched result in LFC (please read comments to lfc_prefetch
+			 * explaining why it can be done without holding shared buffer lock
+			 */
+			if (lfc_prefetch(BufTagGetNRelFileInfo(buftag), buftag.forkNum, buftag.blockNum, ((NeonGetPageResponse*)response)->page, slot->request_lsns.not_modified_since))
+			{
+				slot->flags |= PRFSF_LFC;
+			}
+		}
+		return true;
+	}
+	else
+	{
+		/*
+		 * Note: The slot might no longer be valid, if the connection was lost
+		 * and the prefetch queue was flushed during the receive call
+		 */
+		neon_shard_log(shard_no, LOG,
+					   "No response from reading prefetch entry %lu: %u/%u/%u.%u block %u. This can be caused by a concurrent disconnect",
+					   (long) my_ring_index,
+					   RelFileInfoFmt(BufTagGetNRelFileInfo(buftag)),
+					   buftag.forkNum, buftag.blockNum);
+		return false;
+	}
+}
+
+/*
+ * Disconnect hook - drop prefetches when the connection drops
+ *
+ * If we don't remove the failed prefetches, we'd be serving incorrect
+ * data to the smgr.
+ */
+void
+prefetch_on_ps_disconnect(void)
+{
+	MyPState->ring_flush = MyPState->ring_unused;
+
+	while (MyPState->ring_receive < MyPState->ring_unused)
+	{
+		PrefetchRequest *slot;
+		uint64		ring_index = MyPState->ring_receive;
+
+		slot = GetPrfSlot(ring_index);
+
+		Assert(slot->status == PRFS_REQUESTED);
+		Assert(slot->my_ring_index == ring_index);
+
+		/*
+		 * Drop connection to all shards which have prefetch requests.
+		 * It is not a problem to call disconnect multiple times on the same connection
+		 * because disconnect implementation in libpagestore.c will check if connection
+		 * is alive and do nothing of connection was already dropped.
+		 */
+		page_server->disconnect(slot->shard_no);
+
+		/* clean up the request */
+		slot->status = PRFS_TAG_REMAINS;
+		MyPState->n_requests_inflight -= 1;
+		MyPState->ring_receive += 1;
+
+		prefetch_set_unused(ring_index);
+		pgBufferUsage.prefetch.expired += 1;
+		MyNeonCounters->getpage_prefetch_discards_total += 1;
+	}
+
+	/*
+	 * We can have gone into retry due to network error, so update stats with
+	 * the latest available
+	 */
+	MyNeonCounters->pageserver_open_requests =
+		MyPState->n_requests_inflight;
+	MyNeonCounters->getpage_prefetches_buffered =
+		MyPState->n_responses_buffered;
+}
+
+/*
+ * prefetch_set_unused() - clear a received prefetch slot
+ *
+ * The slot at ring_index must be a current member of the ring buffer,
+ * and may not be in the PRFS_REQUESTED state.
+ *
+ * NOTE: this function will update MyPState->pfs_hash; which invalidates any
+ * active pointers into the hash table.
+ */
+static inline void
+prefetch_set_unused(uint64 ring_index)
+{
+	PrefetchRequest *slot;
+
+	if (ring_index < MyPState->ring_last)
+		return;					/* Should already be unused */
+
+	slot = GetPrfSlot(ring_index);
+	if (slot->status == PRFS_UNUSED)
+		return;
+
+	Assert(slot->status == PRFS_RECEIVED || slot->status == PRFS_TAG_REMAINS);
+
+	if (slot->status == PRFS_RECEIVED)
+	{
+		pfree(slot->response);
+		slot->response = NULL;
+
+		MyPState->n_responses_buffered -= 1;
+		MyPState->n_unused += 1;
+
+		MyNeonCounters->getpage_prefetches_buffered =
+			MyPState->n_responses_buffered;
+	}
+	else
+	{
+		Assert(slot->response == NULL);
+	}
+
+	prfh_delete(MyPState->prf_hash, slot);
+
+	/* clear all fields */
+	MemSet(slot, 0, sizeof(PrefetchRequest));
+	slot->status = PRFS_UNUSED;
+
+	/* run cleanup if we're holding back ring_last */
+	if (MyPState->ring_last == ring_index)
+		prefetch_cleanup_trailing_unused();
+
+	/*
+	 * ... and try to store the buffered responses more compactly if > 12.5%
+	 * of the buffer is gaps
+	 */
+	else if (ReceiveBufferNeedsCompaction())
+		compact_prefetch_buffers();
+}
+
+/*
+ * Send one prefetch request to the pageserver. To wait for the response, call
+ * prefetch_wait_for().
+ */
+static void
+prefetch_do_request(PrefetchRequest *slot, neon_request_lsns *force_request_lsns)
+{
+	bool		found;
+	uint64		mySlotNo PG_USED_FOR_ASSERTS_ONLY = slot->my_ring_index;
+
+	NeonGetPageRequest request = {
+		.hdr.tag = T_NeonGetPageRequest,
+		.hdr.reqid = GENERATE_REQUEST_ID(),
+		/* lsn and not_modified_since are filled in below */
+		.rinfo = BufTagGetNRelFileInfo(slot->buftag),
+		.forknum = slot->buftag.forkNum,
+		.blkno = slot->buftag.blockNum,
+	};
+
+	Assert(mySlotNo == MyPState->ring_unused);
+
+	slot->reqid = request.hdr.reqid;
+
+	if (force_request_lsns)
+		slot->request_lsns = *force_request_lsns;
+	else
+		neon_get_request_lsns(BufTagGetNRelFileInfo(slot->buftag),
+							  slot->buftag.forkNum, slot->buftag.blockNum,
+							  &slot->request_lsns, 1);
+	request.hdr.lsn = slot->request_lsns.request_lsn;
+	request.hdr.not_modified_since = slot->request_lsns.not_modified_since;
+
+	Assert(slot->response == NULL);
+	Assert(slot->my_ring_index == MyPState->ring_unused);
+
+	while (!page_server->send(slot->shard_no, (NeonRequest *) &request))
+	{
+		Assert(mySlotNo == MyPState->ring_unused);
+		/* loop */
+	}
+
+	/* update prefetch state */
+	MyPState->n_requests_inflight += 1;
+	MyPState->n_unused -= 1;
+	MyPState->ring_unused += 1;
+	BITMAP_SET(MyPState->shard_bitmap, slot->shard_no);
+	MyPState->max_shard_no = Max(slot->shard_no+1, MyPState->max_shard_no);
+
+	/* update slot state */
+	slot->status = PRFS_REQUESTED;
+	prfh_insert(MyPState->prf_hash, slot, &found);
+	Assert(!found);
+}
+
+/*
+ * Lookup of already received prefetch requests. Only already received responses matching required LSNs are accepted.
+ * Present pages are marked in "mask" bitmap and total number of such pages is returned.
+ */
+int
+communicator_prefetch_lookupv(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blocknum,
+							  neon_request_lsns *lsns, BlockNumber nblocks,
+							  void **buffers, bits8 *mask)
+{
+	int hits = 0;
+	PrefetchRequest hashkey;
+
+	/*
+	 * Use an intermediate PrefetchRequest struct as the hash key to ensure
+	 * correct alignment and that the padding bytes are cleared.
+	 */
+	memset(&hashkey.buftag, 0, sizeof(BufferTag));
+	CopyNRelFileInfoToBufTag(hashkey.buftag, rinfo);
+	hashkey.buftag.forkNum = forknum;
+
+	for (int i = 0; i < nblocks; i++)
+	{
+		PrfHashEntry *entry;
+
+		hashkey.buftag.blockNum = blocknum + i;
+		entry = prfh_lookup(MyPState->prf_hash, &hashkey);
+
+		if (entry != NULL)
+		{
+			PrefetchRequest *slot = entry->slot;
+			uint64 ring_index = slot->my_ring_index;
+			Assert(slot == GetPrfSlot(ring_index));
+
+			Assert(slot->status != PRFS_UNUSED);
+			Assert(MyPState->ring_last <= ring_index &&
+				   ring_index < MyPState->ring_unused);
+			Assert(BufferTagsEqual(&slot->buftag, &hashkey.buftag));
+
+			if (slot->status != PRFS_RECEIVED)
+				continue;
+
+			/*
+			 * If the caller specified a request LSN to use, only accept
+			 * prefetch responses that satisfy that request.
+			 */
+			if (!neon_prefetch_response_usable(&lsns[i], slot))
+				continue;
+
+			/*
+			 * Ignore errors
+			 */
+			if (slot->response->tag != T_NeonGetPageResponse)
+			{
+				if (slot->response->tag != T_NeonErrorResponse)
+				{
+					NEON_PANIC_CONNECTION_STATE(slot->shard_no, PANIC,
+											"Expected GetPage (0x%02x) or Error (0x%02x) response to GetPageRequest, but got 0x%02x",
+											T_NeonGetPageResponse, T_NeonErrorResponse, slot->response->tag);
+				}
+				continue;
+			}
+			memcpy(buffers[i], ((NeonGetPageResponse*)slot->response)->page, BLCKSZ);
+
+
+			/*
+			 * With lfc_store_prefetch_result=true prefetch result is stored in LFC in prefetch_pump_state when response is received
+			 * from page server. But if lfc_store_prefetch_result=false then it is not yet stored in LFC and we have to do it here
+			 * under buffer lock.
+			 */
+			if (!lfc_store_prefetch_result)
+				lfc_write(rinfo, forknum, blocknum + i, buffers[i]);
+
+			prefetch_set_unused(ring_index);
+			BITMAP_SET(mask, i);
+
+			hits += 1;
+			inc_getpage_wait(0);
+		}
+	}
+	pgBufferUsage.prefetch.hits += hits;
+	return hits;
+}
+
+/*
+ * prefetch_register_bufferv() - register and prefetch buffers
+ *
+ * Register that we may want the contents of BufferTag in the near future.
+ * This is used when issuing a speculative prefetch request, but also when
+ * performing a synchronous request and need the buffer right now.
+ *
+ * If force_request_lsns is not NULL, those values are sent to the
+ * pageserver. If NULL, we utilize the lastWrittenLsn -infrastructure
+ * to calculate the LSNs to send.
+ *
+ * Bits set in *mask (if present) indicate pages already read; i.e. pages we
+ * can skip in this process.
+ *
+ * When performing a prefetch rather than a synchronous request,
+ * is_prefetch==true. Currently, it only affects how the request is accounted
+ * in the perf counters.
+ *
+ * NOTE: this function may indirectly update MyPState->pfs_hash; which
+ * invalidates any active pointers into the hash table.
+ */
+void
+communicator_prefetch_register_bufferv(BufferTag tag, neon_request_lsns *frlsns,
+									   BlockNumber nblocks, const bits8 *mask)
+{
+	uint64		ring_index PG_USED_FOR_ASSERTS_ONLY;
+
+	ring_index = prefetch_register_bufferv(tag, frlsns, nblocks, mask, true);
+
+	Assert(ring_index < MyPState->ring_unused &&
+		   MyPState->ring_last <= ring_index);
+}
+
+/* internal version. Returns the ring index */
+static uint64
+prefetch_register_bufferv(BufferTag tag, neon_request_lsns *frlsns,
+						  BlockNumber nblocks, const bits8 *mask,
+						  bool is_prefetch)
+{
+	uint64		min_ring_index;
+	PrefetchRequest hashkey;
+#ifdef USE_ASSERT_CHECKING
+	bool		any_hits = false;
+#endif
+	/* We will never read further ahead than our buffer can store. */
+	nblocks = Max(1, Min(nblocks, readahead_buffer_size));
+
+	/*
+	 * Use an intermediate PrefetchRequest struct as the hash key to ensure
+	 * correct alignment and that the padding bytes are cleared.
+	 */
+	memset(&hashkey.buftag, 0, sizeof(BufferTag));
+	hashkey.buftag = tag;
+
+Retry:
+	/*
+	 * We can have gone into retry due to network error, so update stats with
+	 * the latest available
+	 */
+	MyNeonCounters->pageserver_open_requests =
+		MyPState->ring_unused - MyPState->ring_receive;
+	MyNeonCounters->getpage_prefetches_buffered =
+		MyPState->n_responses_buffered;
+
+	min_ring_index = UINT64_MAX;
+	for (int i = 0; i < nblocks; i++)
+	{
+		PrefetchRequest *slot = NULL;
+		PrfHashEntry *entry = NULL;
+		uint64		ring_index;
+		neon_request_lsns *lsns;
+
+		if (PointerIsValid(mask) && BITMAP_ISSET(mask, i))
+			continue;
+
+		if (frlsns)
+			lsns = &frlsns[i];
+		else
+			lsns = NULL;
+
+#ifdef USE_ASSERT_CHECKING
+		any_hits = true;
+#endif
+
+		slot = NULL;
+		entry = NULL;
+
+		hashkey.buftag.blockNum = tag.blockNum + i;
+		entry = prfh_lookup(MyPState->prf_hash, &hashkey);
+
+		if (entry != NULL)
+		{
+			slot = entry->slot;
+			ring_index = slot->my_ring_index;
+			Assert(slot == GetPrfSlot(ring_index));
+
+			Assert(slot->status != PRFS_UNUSED);
+			Assert(MyPState->ring_last <= ring_index &&
+				   ring_index < MyPState->ring_unused);
+			Assert(BufferTagsEqual(&slot->buftag, &hashkey.buftag));
+
+			/*
+			 * If the caller specified a request LSN to use, only accept
+			 * prefetch responses that satisfy that request.
+			 */
+			if (!is_prefetch)
+			{
+				if (!neon_prefetch_response_usable(lsns, slot))
+				{
+					/* Wait for the old request to finish and discard it */
+					if (!prefetch_wait_for(ring_index))
+						goto Retry;
+					prefetch_set_unused(ring_index);
+					entry = NULL;
+					slot = NULL;
+					pgBufferUsage.prefetch.expired += 1;
+					MyNeonCounters->getpage_prefetch_discards_total += 1;
+				}
+			}
+
+			if (entry != NULL)
+			{
+				/*
+				 * We received a prefetch for a page that was recently read
+				 * and removed from the buffers. Remove that request from the
+				 * buffers.
+				 */
+				if (slot->status == PRFS_TAG_REMAINS)
+				{
+					prefetch_set_unused(ring_index);
+					entry = NULL;
+					slot = NULL;
+				}
+				else
+				{
+					min_ring_index = Min(min_ring_index, ring_index);
+					/* The buffered request is good enough, return that index */
+					if (is_prefetch)
+						pgBufferUsage.prefetch.duplicates++;
+					continue;
+				}
+			}
+		}
+		else if (!is_prefetch)
+		{
+			pgBufferUsage.prefetch.misses += 1;
+			MyNeonCounters->getpage_prefetch_misses_total++;
+		}
+		/*
+		 * We can only leave the block above by finding that there's
+		 * no entry that can satisfy this request, either because there
+		 * was no entry, or because the entry was invalid or didn't satisfy
+		 * the LSNs provided.
+		 *
+		 * The code should've made sure to clear up the data.
+		 */
+		Assert(entry == NULL);
+		Assert(slot == NULL);
+
+		/* There should be no buffer overflow */
+		Assert(MyPState->ring_last + readahead_buffer_size >= MyPState->ring_unused);
+
+		/*
+		 * If the prefetch queue is full, we need to make room by clearing the
+		 * oldest slot. If the oldest slot holds a buffer that was already
+		 * received, we can just throw it away; we fetched the page
+		 * unnecessarily in that case. If the oldest slot holds a request that
+		 * we haven't received a response for yet, we have to wait for the
+		 * response to that before we can continue. We might not have even
+		 * flushed the request to the pageserver yet, it might be just sitting
+		 * in the output buffer. In that case, we flush it and wait for the
+		 * response. (We could decide not to send it, but it's hard to abort
+		 * when the request is already in the output buffer, and 'not sending'
+		 * a prefetch request kind of goes against the principles of
+		 * prefetching)
+		 */
+		if (MyPState->ring_last + readahead_buffer_size == MyPState->ring_unused)
+		{
+			uint64		cleanup_index = MyPState->ring_last;
+
+			slot = GetPrfSlot(cleanup_index);
+
+			Assert(slot->status != PRFS_UNUSED);
+
+			/*
+			 * If there is good reason to run compaction on the prefetch buffers,
+			 * try to do that.
+			 */
+			if (ReceiveBufferNeedsCompaction() && compact_prefetch_buffers())
+			{
+				Assert(slot->status == PRFS_UNUSED);
+			}
+			else
+			{
+				/*
+				 * We have the slot for ring_last, so that must still be in
+				 * progress
+				 */
+				switch (slot->status)
+				{
+					case PRFS_REQUESTED:
+						Assert(MyPState->ring_receive == cleanup_index);
+						if (!prefetch_wait_for(cleanup_index))
+							goto Retry;
+						prefetch_set_unused(cleanup_index);
+						pgBufferUsage.prefetch.expired += 1;
+						MyNeonCounters->getpage_prefetch_discards_total += 1;
+						break;
+					case PRFS_RECEIVED:
+					case PRFS_TAG_REMAINS:
+						prefetch_set_unused(cleanup_index);
+						pgBufferUsage.prefetch.expired += 1;
+						MyNeonCounters->getpage_prefetch_discards_total += 1;
+						break;
+					default:
+						pg_unreachable();
+				}
+			}
+		}
+
+		/*
+		 * The next buffer pointed to by `ring_unused` is now definitely empty, so
+		 * we can insert the new request to it.
+		 */
+		ring_index = MyPState->ring_unused;
+
+		Assert(MyPState->ring_last <= ring_index &&
+			   ring_index <= MyPState->ring_unused);
+
+		slot = GetPrfSlotNoCheck(ring_index);
+
+		Assert(slot->status == PRFS_UNUSED);
+
+		/*
+		 * We must update the slot data before insertion, because the hash
+		 * function reads the buffer tag from the slot.
+		 */
+		slot->buftag = hashkey.buftag;
+		slot->shard_no = get_shard_number(&tag);
+		slot->my_ring_index = ring_index;
+		slot->flags = 0;
+
+		min_ring_index = Min(min_ring_index, ring_index);
+
+		if (is_prefetch)
+			MyNeonCounters->getpage_prefetch_requests_total++;
+		else
+			MyNeonCounters->getpage_sync_requests_total++;
+
+		prefetch_do_request(slot, lsns);
+	}
+
+	MyNeonCounters->pageserver_open_requests =
+		MyPState->ring_unused - MyPState->ring_receive;
+
+	Assert(any_hits);
+
+	Assert(GetPrfSlot(min_ring_index)->status == PRFS_REQUESTED ||
+		   GetPrfSlot(min_ring_index)->status == PRFS_RECEIVED);
+	Assert(MyPState->ring_last <= min_ring_index &&
+		   min_ring_index < MyPState->ring_unused);
+
+	if (flush_every_n_requests > 0 &&
+		MyPState->ring_unused - MyPState->ring_flush >= flush_every_n_requests)
+	{
+		if (!prefetch_flush_requests())
+		{
+			/*
+			 * Prefetch set is reset in case of error, so we should try to
+			 * register our request once again
+			 */
+			goto Retry;
+		}
+		MyPState->ring_flush = MyPState->ring_unused;
+	}
+
+	return min_ring_index;
+}
+
+static bool
+equal_requests(NeonRequest* a, NeonRequest* b)
+{
+	return a->reqid == b->reqid && a->lsn == b->lsn && a->not_modified_since == b->not_modified_since;
+}
+
+
+/*
+ * Note: this function can get canceled and use a long jump to the next catch
+ * context. Take care.
+ */
+static NeonResponse *
+page_server_request(void const *req)
+{
+	NeonResponse *resp;
+	BufferTag tag = {0};
+	shardno_t shard_no;
+
+	switch (messageTag(req))
+	{
+		case T_NeonExistsRequest:
+			CopyNRelFileInfoToBufTag(tag, ((NeonExistsRequest *) req)->rinfo);
+			break;
+		case T_NeonNblocksRequest:
+			CopyNRelFileInfoToBufTag(tag, ((NeonNblocksRequest *) req)->rinfo);
+			break;
+		case T_NeonDbSizeRequest:
+			NInfoGetDbOid(BufTagGetNRelFileInfo(tag)) = ((NeonDbSizeRequest *) req)->dbNode;
+			break;
+		case T_NeonGetPageRequest:
+			CopyNRelFileInfoToBufTag(tag, ((NeonGetPageRequest *) req)->rinfo);
+			tag.blockNum = ((NeonGetPageRequest *) req)->blkno;
+			break;
+		default:
+			neon_log(ERROR, "Unexpected request tag: %d", messageTag(req));
+	}
+	shard_no = get_shard_number(&tag);
+
+	/*
+	 * Current sharding model assumes that all metadata is present only at shard 0.
+	 * We still need to call get_shard_no() to check if shard map is up-to-date.
+	 */
+	if (((NeonRequest *) req)->tag != T_NeonGetPageRequest)
+	{
+		shard_no = 0;
+	}
+
+	do
+	{
+		PG_TRY();
+		{
+			while (!page_server->send(shard_no, (NeonRequest *) req)
+				   || !page_server->flush(shard_no))
+			{
+				/* do nothing */
+			}
+			MyNeonCounters->pageserver_open_requests++;
+			consume_prefetch_responses();
+			resp = page_server->receive(shard_no);
+			MyNeonCounters->pageserver_open_requests--;
+		}
+		PG_CATCH();
+		{
+			/*
+			 * Cancellation in this code needs to be handled better at some
+			 * point, but this currently seems fine for now.
+			 */
+			page_server->disconnect(shard_no);
+			MyNeonCounters->pageserver_open_requests = 0;
+
+			/*
+			 * We know for sure we're not working on any prefetch pages after
+			 * this.
+			 */
+			END_PREFETCH_RECEIVE_WORK();
+
+			PG_RE_THROW();
+		}
+		PG_END_TRY();
+
+	} while (resp == NULL);
+
+	return resp;
+}
+
+
+StringInfoData
+nm_pack_request(NeonRequest *msg)
+{
+	StringInfoData s;
+
+	initStringInfo(&s);
+
+	pq_sendbyte(&s, msg->tag);
+	if (neon_protocol_version >= 3)
+	{
+		pq_sendint64(&s, msg->reqid);
+	}
+	pq_sendint64(&s, msg->lsn);
+	pq_sendint64(&s, msg->not_modified_since);
+
+	switch (messageTag(msg))
+	{
+			/* pagestore_client -> pagestore */
+		case T_NeonExistsRequest:
+			{
+				NeonExistsRequest *msg_req = (NeonExistsRequest *) msg;
+
+				pq_sendint32(&s, NInfoGetSpcOid(msg_req->rinfo));
+				pq_sendint32(&s, NInfoGetDbOid(msg_req->rinfo));
+				pq_sendint32(&s, NInfoGetRelNumber(msg_req->rinfo));
+				pq_sendbyte(&s, msg_req->forknum);
+
+				break;
+			}
+		case T_NeonNblocksRequest:
+			{
+				NeonNblocksRequest *msg_req = (NeonNblocksRequest *) msg;
+
+				pq_sendint32(&s, NInfoGetSpcOid(msg_req->rinfo));
+				pq_sendint32(&s, NInfoGetDbOid(msg_req->rinfo));
+				pq_sendint32(&s, NInfoGetRelNumber(msg_req->rinfo));
+				pq_sendbyte(&s, msg_req->forknum);
+
+				break;
+			}
+		case T_NeonDbSizeRequest:
+			{
+				NeonDbSizeRequest *msg_req = (NeonDbSizeRequest *) msg;
+
+				pq_sendint32(&s, msg_req->dbNode);
+
+				break;
+			}
+		case T_NeonGetPageRequest:
+			{
+				NeonGetPageRequest *msg_req = (NeonGetPageRequest *) msg;
+
+				pq_sendint32(&s, NInfoGetSpcOid(msg_req->rinfo));
+				pq_sendint32(&s, NInfoGetDbOid(msg_req->rinfo));
+				pq_sendint32(&s, NInfoGetRelNumber(msg_req->rinfo));
+				pq_sendbyte(&s, msg_req->forknum);
+				pq_sendint32(&s, msg_req->blkno);
+
+				break;
+			}
+
+		case T_NeonGetSlruSegmentRequest:
+			{
+				NeonGetSlruSegmentRequest *msg_req = (NeonGetSlruSegmentRequest *) msg;
+
+				pq_sendbyte(&s, msg_req->kind);
+				pq_sendint32(&s, msg_req->segno);
+
+				break;
+			}
+
+			/* pagestore -> pagestore_client. We never need to create these. */
+		case T_NeonExistsResponse:
+		case T_NeonNblocksResponse:
+		case T_NeonGetPageResponse:
+		case T_NeonErrorResponse:
+		case T_NeonDbSizeResponse:
+		case T_NeonGetSlruSegmentResponse:
+		default:
+			neon_log(ERROR, "unexpected neon message tag 0x%02x", msg->tag);
+			break;
+	}
+	return s;
+}
+
+NeonResponse *
+nm_unpack_response(StringInfo s)
+{
+	NeonMessageTag tag = pq_getmsgbyte(s);
+	NeonResponse resp_hdr = {0}; /* make valgrind happy */
+	NeonResponse *resp = NULL;
+
+	resp_hdr.tag = tag;
+	if (neon_protocol_version >= 3)
+	{
+		resp_hdr.reqid = pq_getmsgint64(s);
+		resp_hdr.lsn = pq_getmsgint64(s);
+		resp_hdr.not_modified_since = pq_getmsgint64(s);
+	}
+	switch (tag)
+	{
+			/* pagestore -> pagestore_client */
+		case T_NeonExistsResponse:
+			{
+				NeonExistsResponse *msg_resp = palloc0(sizeof(NeonExistsResponse));
+
+				if (neon_protocol_version >= 3)
+				{
+					NInfoGetSpcOid(msg_resp->req.rinfo) = pq_getmsgint(s, 4);
+					NInfoGetDbOid(msg_resp->req.rinfo) = pq_getmsgint(s, 4);
+					NInfoGetRelNumber(msg_resp->req.rinfo) = pq_getmsgint(s, 4);
+					msg_resp->req.forknum = pq_getmsgbyte(s);
+				}
+				msg_resp->req.hdr = resp_hdr;
+				msg_resp->exists = pq_getmsgbyte(s);
+				pq_getmsgend(s);
+
+				resp = (NeonResponse *) msg_resp;
+				break;
+			}
+
+		case T_NeonNblocksResponse:
+			{
+				NeonNblocksResponse *msg_resp = palloc0(sizeof(NeonNblocksResponse));
+
+				if (neon_protocol_version >= 3)
+				{
+					NInfoGetSpcOid(msg_resp->req.rinfo) = pq_getmsgint(s, 4);
+					NInfoGetDbOid(msg_resp->req.rinfo) = pq_getmsgint(s, 4);
+					NInfoGetRelNumber(msg_resp->req.rinfo) = pq_getmsgint(s, 4);
+					msg_resp->req.forknum = pq_getmsgbyte(s);
+				}
+				msg_resp->req.hdr = resp_hdr;
+				msg_resp->n_blocks = pq_getmsgint(s, 4);
+				pq_getmsgend(s);
+
+				resp = (NeonResponse *) msg_resp;
+				break;
+			}
+
+		case T_NeonGetPageResponse:
+			{
+				NeonGetPageResponse *msg_resp;
+
+				msg_resp = MemoryContextAllocZero(MyPState->bufctx, PS_GETPAGERESPONSE_SIZE);
+				if (neon_protocol_version >= 3)
+				{
+					NInfoGetSpcOid(msg_resp->req.rinfo) = pq_getmsgint(s, 4);
+					NInfoGetDbOid(msg_resp->req.rinfo) = pq_getmsgint(s, 4);
+					NInfoGetRelNumber(msg_resp->req.rinfo) = pq_getmsgint(s, 4);
+					msg_resp->req.forknum = pq_getmsgbyte(s);
+					msg_resp->req.blkno = pq_getmsgint(s, 4);
+				}
+				msg_resp->req.hdr = resp_hdr;
+				/* XXX:	should be varlena */
+				memcpy(msg_resp->page, pq_getmsgbytes(s, BLCKSZ), BLCKSZ);
+				pq_getmsgend(s);
+
+				Assert(msg_resp->req.hdr.tag == T_NeonGetPageResponse);
+
+				resp = (NeonResponse *) msg_resp;
+				break;
+			}
+
+		case T_NeonDbSizeResponse:
+			{
+				NeonDbSizeResponse *msg_resp = palloc0(sizeof(NeonDbSizeResponse));
+
+				if (neon_protocol_version >= 3)
+				{
+					msg_resp->req.dbNode = pq_getmsgint(s, 4);
+				}
+				msg_resp->req.hdr = resp_hdr;
+				msg_resp->db_size = pq_getmsgint64(s);
+				pq_getmsgend(s);
+
+				resp = (NeonResponse *) msg_resp;
+				break;
+			}
+
+		case T_NeonErrorResponse:
+			{
+				NeonErrorResponse *msg_resp;
+				size_t		msglen;
+				const char *msgtext;
+
+				msgtext = pq_getmsgrawstring(s);
+				msglen = strlen(msgtext);
+
+				msg_resp = palloc0(sizeof(NeonErrorResponse) + msglen + 1);
+				msg_resp->req = resp_hdr;
+				memcpy(msg_resp->message, msgtext, msglen + 1);
+				pq_getmsgend(s);
+
+				resp = (NeonResponse *) msg_resp;
+				break;
+			}
+
+		case T_NeonGetSlruSegmentResponse:
+		    {
+				NeonGetSlruSegmentResponse *msg_resp;
+				int n_blocks;
+				msg_resp = palloc0(sizeof(NeonGetSlruSegmentResponse));
+
+				if (neon_protocol_version >= 3)
+				{
+					msg_resp->req.kind = pq_getmsgbyte(s);
+					msg_resp->req.segno = pq_getmsgint(s, 4);
+				}
+				msg_resp->req.hdr = resp_hdr;
+
+				n_blocks = pq_getmsgint(s, 4);
+				msg_resp->n_blocks = n_blocks;
+				memcpy(msg_resp->data, pq_getmsgbytes(s, n_blocks * BLCKSZ), n_blocks * BLCKSZ);
+				pq_getmsgend(s);
+
+				resp = (NeonResponse *) msg_resp;
+				break;
+			}
+
+			/*
+			 * pagestore_client -> pagestore
+			 *
+			 * We create these ourselves, and don't need to decode them.
+			 */
+		case T_NeonExistsRequest:
+		case T_NeonNblocksRequest:
+		case T_NeonGetPageRequest:
+		case T_NeonDbSizeRequest:
+		case T_NeonGetSlruSegmentRequest:
+		default:
+			neon_log(ERROR, "unexpected neon message tag 0x%02x", tag);
+			break;
+	}
+
+	return resp;
+}
+
+/* dump to json for debugging / error reporting purposes */
+char *
+nm_to_string(NeonMessage *msg)
+{
+	StringInfoData s;
+
+	initStringInfo(&s);
+
+	switch (messageTag(msg))
+	{
+			/* pagestore_client -> pagestore */
+		case T_NeonExistsRequest:
+			{
+				NeonExistsRequest *msg_req = (NeonExistsRequest *) msg;
+
+				appendStringInfoString(&s, "{\"type\": \"NeonExistsRequest\"");
+				appendStringInfo(&s, ", \"rinfo\": \"%u/%u/%u\"", RelFileInfoFmt(msg_req->rinfo));
+				appendStringInfo(&s, ", \"forknum\": %d", msg_req->forknum);
+				appendStringInfo(&s, ", \"lsn\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->hdr.lsn));
+				appendStringInfo(&s, ", \"not_modified_since\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->hdr.not_modified_since));
+				appendStringInfoChar(&s, '}');
+				break;
+			}
+
+		case T_NeonNblocksRequest:
+			{
+				NeonNblocksRequest *msg_req = (NeonNblocksRequest *) msg;
+
+				appendStringInfoString(&s, "{\"type\": \"NeonNblocksRequest\"");
+				appendStringInfo(&s, ", \"rinfo\": \"%u/%u/%u\"", RelFileInfoFmt(msg_req->rinfo));
+				appendStringInfo(&s, ", \"forknum\": %d", msg_req->forknum);
+				appendStringInfo(&s, ", \"lsn\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->hdr.lsn));
+				appendStringInfo(&s, ", \"not_modified_since\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->hdr.not_modified_since));
+				appendStringInfoChar(&s, '}');
+				break;
+			}
+
+		case T_NeonGetPageRequest:
+			{
+				NeonGetPageRequest *msg_req = (NeonGetPageRequest *) msg;
+
+				appendStringInfoString(&s, "{\"type\": \"NeonGetPageRequest\"");
+				appendStringInfo(&s, ", \"rinfo\": \"%u/%u/%u\"", RelFileInfoFmt(msg_req->rinfo));
+				appendStringInfo(&s, ", \"forknum\": %d", msg_req->forknum);
+				appendStringInfo(&s, ", \"blkno\": %u", msg_req->blkno);
+				appendStringInfo(&s, ", \"lsn\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->hdr.lsn));
+				appendStringInfo(&s, ", \"not_modified_since\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->hdr.not_modified_since));
+				appendStringInfoChar(&s, '}');
+				break;
+			}
+		case T_NeonDbSizeRequest:
+			{
+				NeonDbSizeRequest *msg_req = (NeonDbSizeRequest *) msg;
+
+				appendStringInfoString(&s, "{\"type\": \"NeonDbSizeRequest\"");
+				appendStringInfo(&s, ", \"dbnode\": \"%u\"", msg_req->dbNode);
+				appendStringInfo(&s, ", \"lsn\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->hdr.lsn));
+				appendStringInfo(&s, ", \"not_modified_since\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->hdr.not_modified_since));
+				appendStringInfoChar(&s, '}');
+				break;
+			}
+		case T_NeonGetSlruSegmentRequest:
+			{
+				NeonGetSlruSegmentRequest *msg_req = (NeonGetSlruSegmentRequest *) msg;
+
+				appendStringInfoString(&s, "{\"type\": \"NeonGetSlruSegmentRequest\"");
+				appendStringInfo(&s, ", \"kind\": %u", msg_req->kind);
+				appendStringInfo(&s, ", \"segno\": %u", msg_req->segno);
+				appendStringInfo(&s, ", \"lsn\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->hdr.lsn));
+				appendStringInfo(&s, ", \"not_modified_since\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->hdr.not_modified_since));
+				appendStringInfoChar(&s, '}');
+				break;
+			}
+			/* pagestore -> pagestore_client */
+		case T_NeonExistsResponse:
+			{
+				NeonExistsResponse *msg_resp = (NeonExistsResponse *) msg;
+
+				appendStringInfoString(&s, "{\"type\": \"NeonExistsResponse\"");
+				appendStringInfo(&s, ", \"exists\": %d}",
+								 msg_resp->exists);
+				appendStringInfoChar(&s, '}');
+
+				break;
+			}
+		case T_NeonNblocksResponse:
+			{
+				NeonNblocksResponse *msg_resp = (NeonNblocksResponse *) msg;
+
+				appendStringInfoString(&s, "{\"type\": \"NeonNblocksResponse\"");
+				appendStringInfo(&s, ", \"n_blocks\": %u}",
+								 msg_resp->n_blocks);
+				appendStringInfoChar(&s, '}');
+
+				break;
+			}
+		case T_NeonGetPageResponse:
+			{
+#if 0
+				NeonGetPageResponse *msg_resp = (NeonGetPageResponse *) msg;
+#endif
+
+				appendStringInfoString(&s, "{\"type\": \"NeonGetPageResponse\"");
+				appendStringInfo(&s, ", \"page\": \"XXX\"}");
+				appendStringInfoChar(&s, '}');
+				break;
+			}
+		case T_NeonErrorResponse:
+			{
+				NeonErrorResponse *msg_resp = (NeonErrorResponse *) msg;
+
+				/* FIXME: escape double-quotes in the message */
+				appendStringInfoString(&s, "{\"type\": \"NeonErrorResponse\"");
+				appendStringInfo(&s, ", \"message\": \"%s\"}", msg_resp->message);
+				appendStringInfoChar(&s, '}');
+				break;
+			}
+		case T_NeonDbSizeResponse:
+			{
+				NeonDbSizeResponse *msg_resp = (NeonDbSizeResponse *) msg;
+
+				appendStringInfoString(&s, "{\"type\": \"NeonDbSizeResponse\"");
+				appendStringInfo(&s, ", \"db_size\": %ld}",
+								 msg_resp->db_size);
+				appendStringInfoChar(&s, '}');
+
+				break;
+			}
+		case T_NeonGetSlruSegmentResponse:
+			{
+				NeonGetSlruSegmentResponse *msg_resp = (NeonGetSlruSegmentResponse *) msg;
+
+				appendStringInfoString(&s, "{\"type\": \"NeonGetSlruSegmentResponse\"");
+				appendStringInfo(&s, ", \"n_blocks\": %u}",
+								 msg_resp->n_blocks);
+				appendStringInfoChar(&s, '}');
+
+				break;
+			}
+
+		default:
+			appendStringInfo(&s, "{\"type\": \"unknown 0x%02x\"", msg->tag);
+	}
+	return s.data;
+}
+
+/*
+ *	communicator_init() -- Initialize per-backend private state
+ */
+void
+communicator_init(void)
+{
+	Size		prfs_size;
+
+	if (MyPState != NULL)
+		return;
+
+	/*
+	 * Sanity check that theperf counters array is sized correctly. We got
+	 * this wrong once, and the formula for max number of backends and aux
+	 * processes might well change in the future, so better safe than sorry.
+	 * This is a very cheap check so we do it even without assertions.  On
+	 * v14, this gets called before initializing MyProc, so we cannot perform
+	 * the check here. That's OK, we don't expect the logic to change in old
+	 * releases.
+	 */
+#if PG_VERSION_NUM>=150000
+	if (MyNeonCounters >= &neon_per_backend_counters_shared[NUM_NEON_PERF_COUNTER_SLOTS])
+		elog(ERROR, "MyNeonCounters points past end of array");
+#endif
+
+	prfs_size = offsetof(PrefetchState, prf_buffer) +
+		sizeof(PrefetchRequest) * readahead_buffer_size;
+
+	MyPState = MemoryContextAllocZero(TopMemoryContext, prfs_size);
+
+	MyPState->n_unused = readahead_buffer_size;
+
+	MyPState->bufctx = SlabContextCreate(TopMemoryContext,
+										 "NeonSMGR/prefetch",
+										 SLAB_DEFAULT_BLOCK_SIZE * 17,
+										 PS_GETPAGERESPONSE_SIZE);
+	MyPState->errctx = AllocSetContextCreate(TopMemoryContext,
+											 "NeonSMGR/errors",
+											 ALLOCSET_DEFAULT_SIZES);
+	MyPState->hashctx = AllocSetContextCreate(TopMemoryContext,
+											  "NeonSMGR/prefetch",
+											  ALLOCSET_DEFAULT_SIZES);
+
+	MyPState->prf_hash = prfh_create(MyPState->hashctx,
+									 readahead_buffer_size, NULL);
+}
+
+/*
+ *  neon_prefetch_response_usable -- Can a new request be satisfied by old one?
+ *
+ * This is used to check if the response to a prefetch request can be used to
+ * satisfy a page read now.
+ */
+static bool
+neon_prefetch_response_usable(neon_request_lsns *request_lsns,
+							  PrefetchRequest *slot)
+{
+	/* sanity check the LSN's on the old and the new request */
+	Assert(request_lsns->request_lsn >= request_lsns->not_modified_since);
+	Assert(request_lsns->effective_request_lsn >= request_lsns->not_modified_since);
+	Assert(request_lsns->effective_request_lsn <= request_lsns->request_lsn);
+	Assert(slot->request_lsns.request_lsn >= slot->request_lsns.not_modified_since);
+	Assert(slot->request_lsns.effective_request_lsn >= slot->request_lsns.not_modified_since);
+	Assert(slot->request_lsns.effective_request_lsn <= slot->request_lsns.request_lsn);
+	Assert(slot->status != PRFS_UNUSED);
+
+	/*
+	 * The new request's LSN should never be older than the old one.  This
+	 * could be an Assert, except that for testing purposes, we do provide an
+	 * interface in neon_test_utils to fetch pages at arbitary LSNs, which
+	 * violates this.
+	 *
+	 * Similarly, the not_modified_since value calculated for a page should
+	 * never move backwards. This assumption is a bit fragile; if we updated
+	 * the last-written cache when we read in a page, for example, then it
+	 * might. But as the code stands, it should not.
+	 *
+	 * (If two backends issue a request at the same time, they might race and
+	 * calculate LSNs "out of order" with each other, but the prefetch queue
+	 * is backend-private at the moment.)
+	 */
+	if (request_lsns->effective_request_lsn < slot->request_lsns.effective_request_lsn ||
+		request_lsns->not_modified_since < slot->request_lsns.not_modified_since)
+	{
+		ereport(LOG,
+				(errcode(ERRCODE_IO_ERROR),
+				 errmsg(NEON_TAG "request with unexpected LSN after prefetch"),
+				 errdetail("Request %X/%X not_modified_since %X/%X, prefetch %X/%X not_modified_since %X/%X)",
+						   LSN_FORMAT_ARGS(request_lsns->effective_request_lsn),
+						   LSN_FORMAT_ARGS(request_lsns->not_modified_since),
+						   LSN_FORMAT_ARGS(slot->request_lsns.effective_request_lsn),
+						   LSN_FORMAT_ARGS(slot->request_lsns.not_modified_since))));
+		return false;
+	}
+
+	/*---
+	 * Each request to the pageserver has three LSN values associated with it:
+	 * `not_modified_since`, `request_lsn`, and 'effective_request_lsn'.
+	 * `not_modified_since` and `request_lsn` are sent to the pageserver, but
+	 * in the primary node, we always use UINT64_MAX as the `request_lsn`, so
+	 * we remember `effective_request_lsn` separately. In a primary,
+	 * `effective_request_lsn` is the same as  `not_modified_since`.
+	 * See comments in neon_get_request_lsns why we can not use last flush WAL position here.
+	 *
+	 * To determine whether a response to a GetPage request issued earlier is
+	 * still valid to satisfy a new page read, we look at the
+	 * (not_modified_since, effective_request_lsn] range of the request. It is
+	 * effectively a claim that the page has not been modified between those
+	 * LSNs.  If the range of the old request in the queue overlaps with the
+	 * new request, we know that the page hasn't been modified in the union of
+	 * the ranges. We can use the response to old request to satisfy the new
+	 * request in that case. For example:
+	 *
+	 *              100      500
+	 * Old request:  +--------+
+	 *
+	 *                     400      800
+	 * New request:         +--------+
+	 *
+	 * The old request claims that the page was not modified between LSNs 100
+	 * and 500, and the second claims that it was not modified between 400 and
+	 * 800. Together they mean that the page was not modified between 100 and
+	 * 800. Therefore the response to the old request is also valid for the
+	 * new request.
+	 *
+	 * This logic also holds at the boundary case that the old request's LSN
+	 * matches the new request's not_modified_since LSN exactly:
+	 *
+	 *              100      500
+	 * Old request:  +--------+
+	 *
+	 *                       500      900
+	 * New request:           +--------+
+	 *
+	 * The response to the old request is the page as it was at LSN 500, and
+	 * the page hasn't been changed in the range (500, 900], therefore the
+	 * response is valid also for the new request.
+	 */
+
+	/* this follows from the checks above */
+	Assert(request_lsns->effective_request_lsn >= slot->request_lsns.not_modified_since);
+
+	return request_lsns->not_modified_since <= slot->request_lsns.effective_request_lsn;
+}
+
+/*
+ *	Does the physical file exist?
+ */
+bool
+communicator_exists(NRelFileInfo rinfo, ForkNumber forkNum, neon_request_lsns *request_lsns)
+{
+	bool		exists;
+	NeonResponse *resp;
+
+	{
+		NeonExistsRequest request = {
+			.hdr.tag = T_NeonExistsRequest,
+			.hdr.reqid = GENERATE_REQUEST_ID(),
+			.hdr.lsn = request_lsns->request_lsn,
+			.hdr.not_modified_since = request_lsns->not_modified_since,
+			.rinfo = rinfo,
+			.forknum = forkNum
+		};
+
+		resp = page_server_request(&request);
+
+		switch (resp->tag)
+		{
+			case T_NeonExistsResponse:
+			{
+				NeonExistsResponse* exists_resp = (NeonExistsResponse *) resp;
+				if (neon_protocol_version >= 3)
+				{
+					if (!equal_requests(resp, &request.hdr) ||
+						!RelFileInfoEquals(exists_resp->req.rinfo, request.rinfo) ||
+						exists_resp->req.forknum != request.forknum)
+					{
+						NEON_PANIC_CONNECTION_STATE(-1, PANIC,
+													"Unexpect response {reqid=%lx,lsn=%X/%08X, since=%X/%08X, rel=%u/%u/%u.%u} to exits request {reqid=%lx,lsn=%X/%08X, since=%X/%08X, rel=%u/%u/%u.%u}",
+													resp->reqid, LSN_FORMAT_ARGS(resp->lsn), LSN_FORMAT_ARGS(resp->not_modified_since), RelFileInfoFmt(exists_resp->req.rinfo), exists_resp->req.forknum,
+													request.hdr.reqid, LSN_FORMAT_ARGS(request.hdr.lsn), LSN_FORMAT_ARGS(request.hdr.not_modified_since), RelFileInfoFmt(request.rinfo), request.forknum);
+					}
+				}
+				exists = exists_resp->exists;
+				break;
+			}
+			case T_NeonErrorResponse:
+				if (neon_protocol_version >= 3)
+				{
+					if (!equal_requests(resp, &request.hdr))
+					{
+						elog(WARNING, NEON_TAG "Error message {reqid=%lx,lsn=%X/%08X, since=%X/%08X} doesn't match exists request {reqid=%lx,lsn=%X/%08X, since=%X/%08X}",
+							 resp->reqid, LSN_FORMAT_ARGS(resp->lsn), LSN_FORMAT_ARGS(resp->not_modified_since),
+							 request.hdr.reqid, LSN_FORMAT_ARGS(request.hdr.lsn), LSN_FORMAT_ARGS(request.hdr.not_modified_since));
+					}
+				}
+				ereport(ERROR,
+						(errcode(ERRCODE_IO_ERROR),
+						 errmsg(NEON_TAG "[reqid %lx] could not read relation existence of rel %u/%u/%u.%u from page server at lsn %X/%08X",
+								resp->reqid,
+								RelFileInfoFmt(rinfo),
+								forkNum,
+								LSN_FORMAT_ARGS(request_lsns->effective_request_lsn)),
+						 errdetail("page server returned error: %s",
+								   ((NeonErrorResponse *) resp)->message)));
+				break;
+
+			default:
+				NEON_PANIC_CONNECTION_STATE(-1, PANIC,
+											"Expected Exists (0x%02x) or Error (0x%02x) response to ExistsRequest, but got 0x%02x",
+											T_NeonExistsResponse, T_NeonErrorResponse, resp->tag);
+		}
+		pfree(resp);
+	}
+	return exists;
+}
+
+/*
+ * Read N pages at a specific LSN.
+ *
+ * *mask is set for pages read at a previous point in time, and which we
+ * should not touch, nor overwrite.
+ * New bits should be set in *mask for the pages we'successfully read.
+ *
+ * The offsets in request_lsns, buffers, and mask are linked.
+ */
+void
+communicator_read_at_lsnv(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber base_blockno,
+						  neon_request_lsns *request_lsns,
+						  void **buffers, BlockNumber nblocks, const bits8 *mask)
+{
+	NeonResponse *resp;
+	uint64		ring_index;
+	PrfHashEntry *entry;
+	PrefetchRequest *slot;
+	PrefetchRequest hashkey;
+
+	Assert(PointerIsValid(request_lsns));
+	Assert(nblocks >= 1);
+
+	/*
+	 * Use an intermediate PrefetchRequest struct as the hash key to ensure
+	 * correct alignment and that the padding bytes are cleared.
+	 */
+	memset(&hashkey.buftag, 0, sizeof(BufferTag));
+	CopyNRelFileInfoToBufTag(hashkey.buftag, rinfo);
+	hashkey.buftag.forkNum = forkNum;
+	hashkey.buftag.blockNum = base_blockno;
+
+	/*
+	 * The redo process does not lock pages that it needs to replay but are
+	 * not in the shared buffers, so a concurrent process may request the page
+	 * after redo has decided it won't redo that page and updated the LwLSN
+	 * for that page. If we're in hot standby we need to take care that we
+	 * don't return until after REDO has finished replaying up to that LwLSN,
+	 * as the page should have been locked up to that point.
+	 *
+	 * See also the description on neon_redo_read_buffer_filter below.
+	 *
+	 * NOTE: It is possible that the WAL redo process will still do IO due to
+	 * concurrent failed read IOs. Those IOs should never have a request_lsn
+	 * that is as large as the WAL record we're currently replaying, if it
+	 * weren't for the behaviour of the LwLsn cache that uses the highest
+	 * value of the LwLsn cache when the entry is not found.
+	 */
+	(void) prefetch_register_bufferv(hashkey.buftag, request_lsns, nblocks, mask, false);
+
+	for (int i = 0; i < nblocks; i++)
+	{
+		void	   *buffer = buffers[i];
+		BlockNumber blockno = base_blockno + i;
+		neon_request_lsns *reqlsns = &request_lsns[i];
+		TimestampTz		start_ts, end_ts;
+
+		if (PointerIsValid(mask) && BITMAP_ISSET(mask, i))
+			continue;
+
+		start_ts = GetCurrentTimestamp();
+
+		if (RecoveryInProgress() && MyBackendType != B_STARTUP)
+			XLogWaitForReplayOf(reqlsns->request_lsn);
+
+		/*
+		 * Try to find prefetched page in the list of received pages.
+		 */
+Retry:
+		hashkey.buftag.blockNum = blockno;
+		entry = prfh_lookup(MyPState->prf_hash, &hashkey);
+
+		if (entry != NULL)
+		{
+			slot = entry->slot;
+			if (neon_prefetch_response_usable(reqlsns, slot))
+			{
+				ring_index = slot->my_ring_index;
+			}
+			else
+			{
+				/*
+				 * Cannot use this prefetch, discard it
+				 *
+				 * We can't drop cache for not-yet-received requested items. It is
+				 * unlikely this happens, but it can happen if prefetch distance
+				 * is large enough and a backend didn't consume all prefetch
+				 * requests.
+				 */
+				if (slot->status == PRFS_REQUESTED)
+				{
+					if (!prefetch_wait_for(slot->my_ring_index))
+						goto Retry;
+				}
+				/* drop caches */
+				prefetch_set_unused(slot->my_ring_index);
+				pgBufferUsage.prefetch.expired += 1;
+				MyNeonCounters->getpage_prefetch_discards_total++;
+				/* make it look like a prefetch cache miss */
+				entry = NULL;
+			}
+		}
+
+		do
+		{
+			if (entry == NULL)
+			{
+				ring_index = prefetch_register_bufferv(hashkey.buftag, reqlsns, 1, NULL, false);
+				Assert(ring_index != UINT64_MAX);
+				slot = GetPrfSlot(ring_index);
+			}
+			else
+			{
+				/*
+				 * Empty our reference to the prefetch buffer's hash entry. When
+				 * we wait for prefetches, the entry reference is invalidated by
+				 * potential updates to the hash, and when we reconnect to the
+				 * pageserver the prefetch we're waiting for may be dropped, in
+				 * which case we need to retry and take the branch above.
+				 */
+				entry = NULL;
+			}
+
+			Assert(slot->my_ring_index == ring_index);
+			Assert(MyPState->ring_last <= ring_index &&
+				   MyPState->ring_unused > ring_index);
+			Assert(slot->status != PRFS_UNUSED);
+			Assert(GetPrfSlot(ring_index) == slot);
+
+		} while (!prefetch_wait_for(ring_index));
+
+		Assert(slot->status == PRFS_RECEIVED);
+		Assert(memcmp(&hashkey.buftag, &slot->buftag, sizeof(BufferTag)) == 0);
+		Assert(hashkey.buftag.blockNum == base_blockno + i);
+
+		resp = slot->response;
+
+		switch (resp->tag)
+		{
+			case T_NeonGetPageResponse:
+			{
+				NeonGetPageResponse* getpage_resp = (NeonGetPageResponse *) resp;
+				if (neon_protocol_version >= 3)
+				{
+					if (resp->reqid != slot->reqid ||
+						resp->lsn != slot->request_lsns.request_lsn ||
+						resp->not_modified_since != slot->request_lsns.not_modified_since ||
+						!RelFileInfoEquals(getpage_resp->req.rinfo, rinfo) ||
+						getpage_resp->req.forknum != forkNum ||
+						getpage_resp->req.blkno != base_blockno + i)
+					{
+						NEON_PANIC_CONNECTION_STATE(-1, PANIC,
+													"Unexpect response {reqid=%lx,lsn=%X/%08X, since=%X/%08X, rel=%u/%u/%u.%u, block=%u} to get page request {reqid=%lx,lsn=%X/%08X, since=%X/%08X, rel=%u/%u/%u.%u, block=%u}",
+													resp->reqid, LSN_FORMAT_ARGS(resp->lsn), LSN_FORMAT_ARGS(resp->not_modified_since), RelFileInfoFmt(getpage_resp->req.rinfo), getpage_resp->req.forknum, getpage_resp->req.blkno,
+													slot->reqid, LSN_FORMAT_ARGS(slot->request_lsns.request_lsn), LSN_FORMAT_ARGS(slot->request_lsns.not_modified_since), RelFileInfoFmt(rinfo), forkNum, base_blockno + i);
+					}
+				}
+				memcpy(buffer, getpage_resp->page, BLCKSZ);
+
+				/*
+				 * With lfc_store_prefetch_result=true prefetch result is stored in LFC in prefetch_pump_state when response is received
+				 * from page server. But if lfc_store_prefetch_result=false then it is not yet stored in LFC and we have to do it here
+				 * under buffer lock.
+				 */
+				if (!lfc_store_prefetch_result)
+					lfc_write(rinfo, forkNum, blockno, buffer);
+				break;
+			}
+			case T_NeonErrorResponse:
+				if (neon_protocol_version >= 3)
+				{
+					if (resp->reqid != slot->reqid ||
+						resp->lsn != slot->request_lsns.request_lsn ||
+						resp->not_modified_since != slot->request_lsns.not_modified_since)
+					{
+						elog(WARNING, NEON_TAG "Error message {reqid=%lx,lsn=%X/%08X, since=%X/%08X} doesn't match get relsize request {reqid=%lx,lsn=%X/%08X, since=%X/%08X}",
+							 resp->reqid, LSN_FORMAT_ARGS(resp->lsn), LSN_FORMAT_ARGS(resp->not_modified_since),
+							 slot->reqid, LSN_FORMAT_ARGS(slot->request_lsns.request_lsn), LSN_FORMAT_ARGS(slot->request_lsns.not_modified_since));
+					}
+				}
+				ereport(ERROR,
+						(errcode(ERRCODE_IO_ERROR),
+						 errmsg(NEON_TAG "[shard %d, reqid %lx] could not read block %u in rel %u/%u/%u.%u from page server at lsn %X/%08X",
+								slot->shard_no, resp->reqid, blockno, RelFileInfoFmt(rinfo),
+								forkNum, LSN_FORMAT_ARGS(reqlsns->effective_request_lsn)),
+						 errdetail("page server returned error: %s",
+								   ((NeonErrorResponse *) resp)->message)));
+				break;
+			default:
+				NEON_PANIC_CONNECTION_STATE(slot->shard_no, PANIC,
+											"Expected GetPage (0x%02x) or Error (0x%02x) response to GetPageRequest, but got 0x%02x",
+											T_NeonGetPageResponse, T_NeonErrorResponse, resp->tag);
+		}
+
+		/* buffer was used, clean up for later reuse */
+		prefetch_set_unused(ring_index);
+		prefetch_cleanup_trailing_unused();
+
+		end_ts = GetCurrentTimestamp();
+		inc_getpage_wait(end_ts >= start_ts ? (end_ts - start_ts) : 0);
+	}
+}
+
+/*
+ *	neon_nblocks() -- Get the number of blocks stored in a relation.
+ */
+BlockNumber
+communicator_nblocks(NRelFileInfo rinfo, ForkNumber forknum, neon_request_lsns *request_lsns)
+{
+	NeonResponse *resp;
+	BlockNumber n_blocks;
+
+	{
+		NeonNblocksRequest request = {
+			.hdr.tag = T_NeonNblocksRequest,
+			.hdr.reqid = GENERATE_REQUEST_ID(),
+			.hdr.lsn = request_lsns->request_lsn,
+			.hdr.not_modified_since = request_lsns->not_modified_since,
+			.rinfo = rinfo,
+			.forknum = forknum,
+		};
+
+		resp = page_server_request(&request);
+
+		switch (resp->tag)
+		{
+			case T_NeonNblocksResponse:
+			{
+				NeonNblocksResponse * relsize_resp = (NeonNblocksResponse *) resp;
+				if (neon_protocol_version >= 3)
+				{
+					if (!equal_requests(resp, &request.hdr) ||
+						!RelFileInfoEquals(relsize_resp->req.rinfo, request.rinfo) ||
+						relsize_resp->req.forknum != forknum)
+					{
+						NEON_PANIC_CONNECTION_STATE(-1, PANIC,
+													"Unexpect response {reqid=%lx,lsn=%X/%08X, since=%X/%08X, rel=%u/%u/%u.%u} to get relsize request {reqid=%lx,lsn=%X/%08X, since=%X/%08X, rel=%u/%u/%u.%u}",
+													resp->reqid, LSN_FORMAT_ARGS(resp->lsn), LSN_FORMAT_ARGS(resp->not_modified_since), RelFileInfoFmt(relsize_resp->req.rinfo), relsize_resp->req.forknum,
+													request.hdr.reqid, LSN_FORMAT_ARGS(request.hdr.lsn), LSN_FORMAT_ARGS(request.hdr.not_modified_since), RelFileInfoFmt(request.rinfo), forknum);
+					}
+				}
+				n_blocks = relsize_resp->n_blocks;
+				break;
+			}
+			case T_NeonErrorResponse:
+				if (neon_protocol_version >= 3)
+				{
+					if (!equal_requests(resp, &request.hdr))
+					{
+						elog(WARNING, NEON_TAG "Error message {reqid=%lx,lsn=%X/%08X, since=%X/%08X} doesn't match get relsize request {reqid=%lx,lsn=%X/%08X, since=%X/%08X}",
+							 resp->reqid, LSN_FORMAT_ARGS(resp->lsn), LSN_FORMAT_ARGS(resp->not_modified_since),
+							 request.hdr.reqid, LSN_FORMAT_ARGS(request.hdr.lsn), LSN_FORMAT_ARGS(request.hdr.not_modified_since));
+					}
+				}
+				ereport(ERROR,
+						(errcode(ERRCODE_IO_ERROR),
+						 errmsg(NEON_TAG "[reqid %lx] could not read relation size of rel %u/%u/%u.%u from page server at lsn %X/%08X",
+								resp->reqid,
+								RelFileInfoFmt(rinfo),
+								forknum,
+								LSN_FORMAT_ARGS(request_lsns->effective_request_lsn)),
+						 errdetail("page server returned error: %s",
+								   ((NeonErrorResponse *) resp)->message)));
+				break;
+
+			default:
+				NEON_PANIC_CONNECTION_STATE(-1, PANIC,
+											"Expected Nblocks (0x%02x) or Error (0x%02x) response to NblocksRequest, but got 0x%02x",
+											T_NeonNblocksResponse, T_NeonErrorResponse, resp->tag);
+		}
+
+		pfree(resp);
+	}
+	return n_blocks;
+}
+
+/*
+ *	neon_db_size() -- Get the size of the database in bytes.
+ */
+int64
+communicator_dbsize(Oid dbNode, neon_request_lsns *request_lsns)
+{
+	NeonResponse *resp;
+	int64		db_size;
+
+	{
+		NeonDbSizeRequest request = {
+			.hdr.tag = T_NeonDbSizeRequest,
+			.hdr.reqid = GENERATE_REQUEST_ID(),
+			.hdr.lsn = request_lsns->request_lsn,
+			.hdr.not_modified_since = request_lsns->not_modified_since,
+			.dbNode = dbNode,
+		};
+
+		resp = page_server_request(&request);
+
+		switch (resp->tag)
+		{
+			case T_NeonDbSizeResponse:
+			{
+				NeonDbSizeResponse* dbsize_resp = (NeonDbSizeResponse *) resp;
+				if (neon_protocol_version >= 3)
+				{
+					if (!equal_requests(resp, &request.hdr) ||
+						dbsize_resp->req.dbNode != dbNode)
+					{
+						NEON_PANIC_CONNECTION_STATE(-1, PANIC,
+													"Unexpect response {reqid=%lx,lsn=%X/%08X, since=%X/%08X, dbNode=%u} to get DB size request {reqid=%lx,lsn=%X/%08X, since=%X/%08X, dbNode=%u}",
+													resp->reqid, LSN_FORMAT_ARGS(resp->lsn), LSN_FORMAT_ARGS(resp->not_modified_since), dbsize_resp->req.dbNode,
+													request.hdr.reqid, LSN_FORMAT_ARGS(request.hdr.lsn), LSN_FORMAT_ARGS(request.hdr.not_modified_since), dbNode);
+					}
+				}
+				db_size = dbsize_resp->db_size;
+				break;
+			}
+			case T_NeonErrorResponse:
+				if (neon_protocol_version >= 3)
+				{
+					if (!equal_requests(resp, &request.hdr))
+					{
+						elog(WARNING, NEON_TAG "Error message {reqid=%lx,lsn=%X/%08X, since=%X/%08X} doesn't match get DB size request {reqid=%lx,lsn=%X/%08X, since=%X/%08X}",
+							 resp->reqid, LSN_FORMAT_ARGS(resp->lsn), LSN_FORMAT_ARGS(resp->not_modified_since),
+							 request.hdr.reqid, LSN_FORMAT_ARGS(request.hdr.lsn), LSN_FORMAT_ARGS(request.hdr.not_modified_since));
+					}
+				}
+				ereport(ERROR,
+						(errcode(ERRCODE_IO_ERROR),
+						 errmsg(NEON_TAG "[reqid %lx] could not read db size of db %u from page server at lsn %X/%08X",
+								resp->reqid,
+								dbNode, LSN_FORMAT_ARGS(request_lsns->effective_request_lsn)),
+						 errdetail("page server returned error: %s",
+								   ((NeonErrorResponse *) resp)->message)));
+				break;
+
+			default:
+				NEON_PANIC_CONNECTION_STATE(-1, PANIC,
+											"Expected DbSize (0x%02x) or Error (0x%02x) response to DbSizeRequest, but got 0x%02x",
+											T_NeonDbSizeResponse, T_NeonErrorResponse, resp->tag);
+		}
+
+		pfree(resp);
+	}
+	return db_size;
+}
+
+int
+communicator_read_slru_segment(SlruKind kind, int64 segno, neon_request_lsns *request_lsns,
+							   void *buffer)
+{
+	int			n_blocks;
+	shardno_t	shard_no = 0; /* All SLRUs are at shard 0 */
+	NeonResponse *resp;
+	NeonGetSlruSegmentRequest request;
+
+	request = (NeonGetSlruSegmentRequest) {
+		.hdr.tag = T_NeonGetSlruSegmentRequest,
+		.hdr.reqid = GENERATE_REQUEST_ID(),
+		.hdr.lsn = request_lsns->request_lsn,
+		.hdr.not_modified_since = request_lsns->not_modified_since,
+		.kind = kind,
+		.segno = segno
+	};
+
+	do
+	{
+		while (!page_server->send(shard_no, &request.hdr) || !page_server->flush(shard_no));
+
+		consume_prefetch_responses();
+
+		resp = page_server->receive(shard_no);
+	} while (resp == NULL);
+
+	switch (resp->tag)
+	{
+		case T_NeonGetSlruSegmentResponse:
+		{
+			NeonGetSlruSegmentResponse* slru_resp = (NeonGetSlruSegmentResponse *) resp;
+			if (neon_protocol_version >= 3)
+			{
+				if (!equal_requests(resp, &request.hdr) ||
+					slru_resp->req.kind != kind ||
+					slru_resp->req.segno != segno)
+				{
+					NEON_PANIC_CONNECTION_STATE(-1, PANIC,
+												"Unexpect response {reqid=%lx,lsn=%X/%08X, since=%X/%08X, kind=%u, segno=%u} to get SLRU segment request {reqid=%lx,lsn=%X/%08X, since=%X/%08X, kind=%u, segno=%lluu}",
+												resp->reqid, LSN_FORMAT_ARGS(resp->lsn), LSN_FORMAT_ARGS(resp->not_modified_since), slru_resp->req.kind, slru_resp->req.segno,
+												request.hdr.reqid, LSN_FORMAT_ARGS(request.hdr.lsn), LSN_FORMAT_ARGS(request.hdr.not_modified_since), kind, (unsigned long long) segno);
+				}
+			}
+			n_blocks = slru_resp->n_blocks;
+			memcpy(buffer, slru_resp->data, n_blocks*BLCKSZ);
+			break;
+		}
+		case T_NeonErrorResponse:
+			if (neon_protocol_version >= 3)
+			{
+				if (!equal_requests(resp, &request.hdr))
+				{
+					elog(WARNING, NEON_TAG "Error message {reqid=%lx,lsn=%X/%08X, since=%X/%08X} doesn't match get SLRU segment request {reqid=%lx,lsn=%X/%08X, since=%X/%08X}",
+						 resp->reqid, LSN_FORMAT_ARGS(resp->lsn), LSN_FORMAT_ARGS(resp->not_modified_since),
+						 request.hdr.reqid, LSN_FORMAT_ARGS(request.hdr.lsn), LSN_FORMAT_ARGS(request.hdr.not_modified_since));
+				}
+			}
+			ereport(ERROR,
+					(errcode(ERRCODE_IO_ERROR),
+					 errmsg(NEON_TAG "[reqid %lx] could not read SLRU %d segment %llu at lsn %X/%08X",
+							resp->reqid,
+							kind,
+							(unsigned long long) segno,
+							LSN_FORMAT_ARGS(request_lsns->request_lsn)),
+					 errdetail("page server returned error: %s",
+							   ((NeonErrorResponse *) resp)->message)));
+			break;
+
+		default:
+			NEON_PANIC_CONNECTION_STATE(-1, PANIC,
+										"Expected GetSlruSegment (0x%02x) or Error (0x%02x) response to GetSlruSegmentRequest, but got 0x%02x",
+										T_NeonGetSlruSegmentResponse, T_NeonErrorResponse, resp->tag);
+	}
+	pfree(resp);
+
+	communicator_reconfigure_timeout_if_needed();
+	return n_blocks;
+}
+
+void
+communicator_reconfigure_timeout_if_needed(void)
+{
+	bool	needs_set = MyPState->ring_receive != MyPState->ring_unused &&
+						readahead_getpage_pull_timeout_ms > 0;
+
+	if (needs_set != timeout_set)
+	{
+		/* The background writer doens't (shouldn't) read any pages */
+		Assert(!AmBackgroundWriterProcess());
+		/* The checkpointer doens't (shouldn't) read any pages */
+		Assert(!AmCheckpointerProcess());
+
+		if (unlikely(PS_TIMEOUT_ID == 0))
+		{
+			PS_TIMEOUT_ID = RegisterTimeout(USER_TIMEOUT, pagestore_timeout_handler);
+		}
+
+		if (needs_set)
+		{
+#if PG_MAJORVERSION_NUM <= 14
+			enable_timeout_after(PS_TIMEOUT_ID, readahead_getpage_pull_timeout_ms);
+#else
+			enable_timeout_every(
+				PS_TIMEOUT_ID,
+				TimestampTzPlusMilliseconds(GetCurrentTimestamp(),
+											readahead_getpage_pull_timeout_ms),
+				readahead_getpage_pull_timeout_ms
+			);
+#endif
+			timeout_set = true;
+		}
+		else
+		{
+			Assert(timeout_set);
+			disable_timeout(PS_TIMEOUT_ID, false);
+			timeout_set = false;
+		}
+	}
+}
+
+static void
+pagestore_timeout_handler(void)
+{
+#if PG_MAJORVERSION_NUM <= 14
+	/*
+	 * PG14: Setting a repeating timeout is not possible, so we signal here
+	 * that the timeout has already been reset, and by telling the system
+	 * that system will re-schedule it later if we need to.
+	 */
+	timeout_set = false;
+#endif
+	timeout_signaled = true;
+	InterruptPending = true;
+}
+
+/*
+ * Process new data received in our active PageStream sockets.
+ *
+ * This relies on the invariant that all pipelined yet-to-be-received requests
+ * are getPage requests managed by MyPState. This is currently true, any
+ * modification will probably require some stuff to make it work again.
+ */
+static bool
+communicator_processinterrupts(void)
+{
+	if (timeout_signaled)
+	{
+		if (!readpage_reentrant_guard && readahead_getpage_pull_timeout_ms > 0)
+			communicator_prefetch_pump_state(true);
+
+		timeout_signaled = false;
+		communicator_reconfigure_timeout_if_needed();
+	}
+
+	if (!prev_interrupt_cb)
+		return false;
+
+	return prev_interrupt_cb();
+}
diff --git a/pgxn/neon/communicator.h b/pgxn/neon/communicator.h
new file mode 100644
index 000000000000..72cba526c1a0
--- /dev/null
+++ b/pgxn/neon/communicator.h
@@ -0,0 +1,48 @@
+/*-------------------------------------------------------------------------
+ *
+ * communicator.h
+ *	  internal interface for communicating with remote pageservers
+ *
+ *
+ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef COMMUNICATOR_h
+#define COMMUNICATOR_h
+
+#include "neon_pgversioncompat.h"
+
+#include "storage/buf_internals.h"
+
+#include "pagestore_client.h"
+
+/* initialization at postmaster startup */
+extern void pg_init_communicator(void);
+
+/* initialization at backend startup */
+extern void communicator_init(void);
+
+extern bool communicator_exists(NRelFileInfo rinfo, ForkNumber forkNum,
+								neon_request_lsns *request_lsns);
+extern BlockNumber communicator_nblocks(NRelFileInfo rinfo, ForkNumber forknum,
+										neon_request_lsns *request_lsns);
+extern int64 communicator_dbsize(Oid dbNode, neon_request_lsns *request_lsns);
+extern void communicator_read_at_lsnv(NRelFileInfo rinfo, ForkNumber forkNum,
+									  BlockNumber base_blockno, neon_request_lsns *request_lsns,
+									  void **buffers, BlockNumber nblocks, const bits8 *mask);
+extern int communicator_prefetch_lookupv(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blocknum,
+										 neon_request_lsns *lsns,
+										 BlockNumber nblocks, void **buffers, bits8 *mask);
+extern void communicator_prefetch_register_bufferv(BufferTag tag, neon_request_lsns *frlsns,
+												   BlockNumber nblocks, const bits8 *mask);
+extern int communicator_read_slru_segment(SlruKind kind, int64 segno,
+										  neon_request_lsns *request_lsns,
+										  void *buffer);
+
+extern void communicator_reconfigure_timeout_if_needed(void);
+extern void communicator_prefetch_pump_state(bool IsHandlingInterrupts);
+
+
+#endif
diff --git a/pgxn/neon/file_cache.c b/pgxn/neon/file_cache.c
index 2505fcb84723..8c2990e57aed 100644
--- a/pgxn/neon/file_cache.c
+++ b/pgxn/neon/file_cache.c
@@ -21,7 +21,6 @@
 #include "access/xlog.h"
 #include "funcapi.h"
 #include "miscadmin.h"
-#include "pagestore_client.h"
 #include "common/hashfn.h"
 #include "pgstat.h"
 #include "port/pg_iovec.h"
@@ -43,6 +42,7 @@
 
 #include "hll.h"
 #include "bitmap.h"
+#include "file_cache.h"
 #include "neon.h"
 #include "neon_lwlsncache.h"
 #include "neon_perf_counters.h"
diff --git a/pgxn/neon/file_cache.h b/pgxn/neon/file_cache.h
new file mode 100644
index 000000000000..849558b83d9d
--- /dev/null
+++ b/pgxn/neon/file_cache.h
@@ -0,0 +1,52 @@
+/*-------------------------------------------------------------------------
+ *
+ * file_cache.h
+ *	  Local File Cache definitions
+ *
+ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef FILE_CACHE_h
+#define FILE_CACHE_h
+
+#include "neon_pgversioncompat.h"
+
+/* GUCs */
+extern bool lfc_store_prefetch_result;
+
+/* functions for local file cache */
+extern void lfc_writev(NRelFileInfo rinfo, ForkNumber forkNum,
+					   BlockNumber blkno, const void *const *buffers,
+					   BlockNumber nblocks);
+/* returns number of blocks read, with one bit set in *read for each  */
+extern int lfc_readv_select(NRelFileInfo rinfo, ForkNumber forkNum,
+							BlockNumber blkno, void **buffers,
+							BlockNumber nblocks, bits8 *mask);
+
+extern bool lfc_cache_contains(NRelFileInfo rinfo, ForkNumber forkNum,
+							   BlockNumber blkno);
+extern int lfc_cache_containsv(NRelFileInfo rinfo, ForkNumber forkNum,
+							   BlockNumber blkno, int nblocks, bits8 *bitmap);
+extern void lfc_init(void);
+extern bool lfc_prefetch(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blkno,
+						 const void* buffer, XLogRecPtr lsn);
+
+
+static inline bool
+lfc_read(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
+		 void *buffer)
+{
+	bits8		rv = 0;
+	return lfc_readv_select(rinfo, forkNum, blkno, &buffer, 1, &rv) == 1;
+}
+
+static inline void
+lfc_write(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
+		  const void *buffer)
+{
+	return lfc_writev(rinfo, forkNum, blkno, &buffer, 1);
+}
+
+#endif							/* FILE_CACHE_H */
diff --git a/pgxn/neon/libpagestore.c b/pgxn/neon/libpagestore.c
index 60b2249461a0..dfabb6919e5f 100644
--- a/pgxn/neon/libpagestore.c
+++ b/pgxn/neon/libpagestore.c
@@ -65,6 +65,9 @@ static const struct config_enum_entry neon_compute_modes[] = {
 /* GUCs */
 char	   *neon_timeline;
 char	   *neon_tenant;
+char	   *neon_project_id;
+char	   *neon_branch_id;
+char	   *neon_endpoint_id;
 int32		max_cluster_size;
 char	   *page_server_connstring;
 char	   *neon_auth_token;
@@ -1352,6 +1355,31 @@ pg_init_libpagestore(void)
 							   0,	/* no flags required */
 							   check_neon_id, NULL, NULL);
 
+	DefineCustomStringVariable("neon.project_id",
+							   "Neon project_id the server is running on",
+							   NULL,
+							   &neon_project_id,
+							   "",
+							   PGC_POSTMASTER,
+							   0,	/* no flags required */
+							   check_neon_id, NULL, NULL);
+	DefineCustomStringVariable("neon.branch_id",
+							   "Neon branch_id the server is running on",
+							   NULL,
+							   &neon_branch_id,
+							   "",
+							   PGC_POSTMASTER,
+							   0,	/* no flags required */
+							   check_neon_id, NULL, NULL);
+	DefineCustomStringVariable("neon.endpoint_id",
+							   "Neon endpoint_id the server is running on",
+							   NULL,
+							   &neon_endpoint_id,
+							   "",
+							   PGC_POSTMASTER,
+							   0,	/* no flags required */
+							   check_neon_id, NULL, NULL);
+
 	DefineCustomIntVariable("neon.stripe_size",
 							"sharding stripe size",
 							NULL,
@@ -1475,6 +1503,4 @@ pg_init_libpagestore(void)
 	}
 
 	memset(page_servers, 0, sizeof(page_servers));
-
-	lfc_init();
 }
diff --git a/pgxn/neon/neon.c b/pgxn/neon/neon.c
index 081025e2d52a..a6a70217566a 100644
--- a/pgxn/neon/neon.c
+++ b/pgxn/neon/neon.c
@@ -28,7 +28,9 @@
 #include "utils/guc.h"
 #include "utils/guc_tables.h"
 
+#include "communicator.h"
 #include "extension_server.h"
+#include "file_cache.h"
 #include "neon.h"
 #include "neon_lwlsncache.h"
 #include "control_plane_connector.h"
@@ -434,10 +436,11 @@ _PG_init(void)
 #endif
 
 	pg_init_libpagestore();
+	lfc_init();
 	pg_init_walproposer();
 	init_lwlsncache();
 
-	pagestore_smgr_init();
+	pg_init_communicator();
 	Custom_XLogReaderRoutines = NeonOnDemandXLogReaderRoutines;
 
 	InitUnstableExtensionsSupport();
diff --git a/pgxn/neon/neon.h b/pgxn/neon/neon.h
index e2fa136e37e9..a2e81feb5f75 100644
--- a/pgxn/neon/neon.h
+++ b/pgxn/neon/neon.h
@@ -47,9 +47,18 @@ extern uint32		WAIT_EVENT_NEON_WAL_DL;
 #define WAIT_EVENT_NEON_WAL_DL			WAIT_EVENT_WAL_READ
 #endif
 
+
+#define NEON_TAG "[NEON_SMGR] "
+#define neon_log(tag, fmt, ...) ereport(tag,                                  \
+										(errmsg(NEON_TAG fmt, ##__VA_ARGS__), \
+										 errhidestmt(true), errhidecontext(true), errposition(0), internalerrposition(0)))
+#define neon_shard_log(shard_no, tag, fmt, ...) ereport(tag,	\
+														(errmsg(NEON_TAG "[shard %d] " fmt, shard_no, ##__VA_ARGS__), \
+														 errhidestmt(true), errhidecontext(true), errposition(0), internalerrposition(0)))
+
+
 extern void pg_init_libpagestore(void);
 extern void pg_init_walproposer(void);
-extern void pagestore_smgr_init(void);
 
 extern uint64 BackpressureThrottlingTime(void);
 extern void SetNeonCurrentClusterSize(uint64 size);
diff --git a/pgxn/neon/pagestore_client.h b/pgxn/neon/pagestore_client.h
index a2e3d57e4743..0ab539fe5633 100644
--- a/pgxn/neon/pagestore_client.h
+++ b/pgxn/neon/pagestore_client.h
@@ -58,14 +58,6 @@ typedef struct
 
 #define messageTag(m) (((const NeonMessage *)(m))->tag)
 
-#define NEON_TAG "[NEON_SMGR] "
-#define neon_log(tag, fmt, ...) ereport(tag,                                  \
-										(errmsg(NEON_TAG fmt, ##__VA_ARGS__), \
-										 errhidestmt(true), errhidecontext(true), errposition(0), internalerrposition(0)))
-#define neon_shard_log(shard_no, tag, fmt, ...) ereport(tag,	\
-														(errmsg(NEON_TAG "[shard %d] " fmt, shard_no, ##__VA_ARGS__), \
-														 errhidestmt(true), errhidecontext(true), errposition(0), internalerrposition(0)))
-
 /* SLRUs downloadable from page server */
 typedef enum {
 	SLRU_CLOG,
@@ -234,7 +226,6 @@ extern char *neon_timeline;
 extern char *neon_tenant;
 extern int32 max_cluster_size;
 extern int  neon_protocol_version;
-extern bool lfc_store_prefetch_result;
 
 extern shardno_t get_shard_number(BufferTag* tag);
 
@@ -242,6 +233,7 @@ extern const f_smgr *smgr_neon(ProcNumber backend, NRelFileInfo rinfo);
 extern void smgr_init_neon(void);
 extern void readahead_buffer_resize(int newsize, void *extra);
 
+
 /*
  * LSN values associated with each request to the pageserver
  */
@@ -278,6 +270,10 @@ extern PGDLLEXPORT void neon_read_at_lsn(NRelFileInfo rnode, ForkNumber forkNum,
 										 neon_request_lsns request_lsns, void *buffer);
 extern int64 neon_dbsize(Oid dbNode);
 
+extern void neon_get_request_lsns(NRelFileInfo rinfo, ForkNumber forknum,
+								  BlockNumber blkno, neon_request_lsns *output,
+								  BlockNumber nblocks);
+
 /* utils for neon relsize cache */
 extern void relsize_hash_init(void);
 extern bool get_cached_relsize(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber *size);
@@ -285,37 +281,4 @@ extern void set_cached_relsize(NRelFileInfo rinfo, ForkNumber forknum, BlockNumb
 extern void update_cached_relsize(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber size);
 extern void forget_cached_relsize(NRelFileInfo rinfo, ForkNumber forknum);
 
-/* functions for local file cache */
-extern void lfc_writev(NRelFileInfo rinfo, ForkNumber forkNum,
-					   BlockNumber blkno, const void *const *buffers,
-					   BlockNumber nblocks);
-/* returns number of blocks read, with one bit set in *read for each  */
-extern int lfc_readv_select(NRelFileInfo rinfo, ForkNumber forkNum,
-							BlockNumber blkno, void **buffers,
-							BlockNumber nblocks, bits8 *mask);
-
-extern bool lfc_cache_contains(NRelFileInfo rinfo, ForkNumber forkNum,
-							   BlockNumber blkno);
-extern int lfc_cache_containsv(NRelFileInfo rinfo, ForkNumber forkNum,
-							   BlockNumber blkno, int nblocks, bits8 *bitmap);
-extern void lfc_init(void);
-extern bool lfc_prefetch(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blkno,
-						 const void* buffer, XLogRecPtr lsn);
-
-
-static inline bool
-lfc_read(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
-		 void *buffer)
-{
-	bits8		rv = 0;
-	return lfc_readv_select(rinfo, forkNum, blkno, &buffer, 1, &rv) == 1;
-}
-
-static inline void
-lfc_write(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
-		  const void *buffer)
-{
-	return lfc_writev(rinfo, forkNum, blkno, &buffer, 1);
-}
-
 #endif							/* PAGESTORE_CLIENT_H */
diff --git a/pgxn/neon/pagestore_smgr.c b/pgxn/neon/pagestore_smgr.c
index eb8df11923a1..ef6bd038bbd3 100644
--- a/pgxn/neon/pagestore_smgr.c
+++ b/pgxn/neon/pagestore_smgr.c
@@ -49,9 +49,6 @@
 #include "access/xlog_internal.h"
 #include "access/xlogutils.h"
 #include "catalog/pg_class.h"
-#include "common/hashfn.h"
-#include "executor/instrument.h"
-#include "libpq/pqformat.h"
 #include "pgstat.h"
 #include "postmaster/autovacuum.h"
 #include "postmaster/interrupt.h"
@@ -62,9 +59,10 @@
 #include "storage/fsm_internals.h"
 #include "storage/md.h"
 #include "storage/smgr.h"
-#include "utils/timeout.h"
 
 #include "bitmap.h"
+#include "communicator.h"
+#include "file_cache.h"
 #include "neon.h"
 #include "neon_lwlsncache.h"
 #include "neon_perf_counters.h"
@@ -101,1707 +99,22 @@ static char *hexdump_page(char *page);
 
 const int	SmgrTrace = DEBUG5;
 
-#define NEON_PANIC_CONNECTION_STATE(shard_no, elvl, message, ...) \
-	neon_shard_log(shard_no, elvl, "Broken connection state: " message, \
-				   ##__VA_ARGS__)
-
-page_server_api *page_server;
-
 /* unlogged relation build states */
 typedef enum
-{
-	UNLOGGED_BUILD_NOT_IN_PROGRESS = 0,
-	UNLOGGED_BUILD_PHASE_1,
-	UNLOGGED_BUILD_PHASE_2,
-	UNLOGGED_BUILD_NOT_PERMANENT
-} UnloggedBuildPhase;
-
-static SMgrRelation unlogged_build_rel = NULL;
-static UnloggedBuildPhase unlogged_build_phase = UNLOGGED_BUILD_NOT_IN_PROGRESS;
-
-static bool neon_redo_read_buffer_filter(XLogReaderState *record, uint8 block_id);
-static bool (*old_redo_read_buffer_filter) (XLogReaderState *record, uint8 block_id) = NULL;
-
-static BlockNumber neon_nblocks(SMgrRelation reln, ForkNumber forknum);
-
-static uint32 local_request_counter;
-#define GENERATE_REQUEST_ID() (((NeonRequestId)MyProcPid << 32) | ++local_request_counter)
-
-/*
- * Various settings related to prompt (fast) handling of PageStream responses
- * at any CHECK_FOR_INTERRUPTS point.
- */
-int				readahead_getpage_pull_timeout_ms = 0;
-static int		PS_TIMEOUT_ID = 0;
-static bool		timeout_set = false;
-static bool		timeout_signaled = false;
-
-/*
- * We have a CHECK_FOR_INTERRUPTS in page_server->receive(), and we don't want
- * that to handle any getpage responses if we're already working on the
- * backlog of those, as we'd hit issues with determining which prefetch slot
- * we just got a response for.
- *
- * To protect against that, we have this variable that's set whenever we start
- * receiving data for prefetch slots, so that we don't get confused.
- *
- * Note that in certain error cases during readpage we may leak r_r_g=true,
- * which results in a failure to pick up further responses until we first
- * actively try to receive new getpage responses.
- */
-static bool		readpage_reentrant_guard = false;
-
-static void reconfigure_timeout_if_needed(void);
-static void pagestore_timeout_handler(void);
-
-#define START_PREFETCH_RECEIVE_WORK() \
-	do { \
-		readpage_reentrant_guard = true; \
-	} while (false)
-
-#define END_PREFETCH_RECEIVE_WORK() \
-	do { \
-		readpage_reentrant_guard = false; \
-		if (unlikely(timeout_signaled && !InterruptPending)) \
-			InterruptPending = true; \
-	} while (false)
-
-/*
- * Prefetch implementation:
- *
- * Prefetch is performed locally by each backend.
- *
- * There can be up to readahead_buffer_size active IO requests registered at
- * any time. Requests using smgr_prefetch are sent to the pageserver, but we
- * don't wait on the response. Requests using smgr_read are either read from
- * the buffer, or (if that's not possible) we wait on the response to arrive -
- * this also will allow us to receive other prefetched pages.
- * Each request is immediately written to the output buffer of the pageserver
- * connection, but may not be flushed if smgr_prefetch is used: pageserver
- * flushes sent requests on manual flush, or every neon.flush_output_after
- * unflushed requests; which is not necessarily always and all the time.
- *
- * Once we have received a response, this value will be stored in the response
- * buffer, indexed in a hash table. This allows us to retain our buffered
- * prefetch responses even when we have cache misses.
- *
- * Reading of prefetch responses is delayed until them are actually needed
- * (smgr_read). In case of prefetch miss or any other SMGR request other than
- * smgr_read, all prefetch responses in the pipeline will need to be read from
- * the connection; the responses are stored for later use.
- *
- * NOTE: The current implementation of the prefetch system implements a ring
- * buffer of up to readahead_buffer_size requests. If there are more _read and
- * _prefetch requests between the initial _prefetch and the _read of a buffer,
- * the prefetch request will have been dropped from this prefetch buffer, and
- * your prefetch was wasted.
- */
-
-/*
- * State machine:
- *
- * not in hash : in hash
- *             :
- * UNUSED ------> REQUESTED --> RECEIVED
- *   ^         :      |            |
- *   |         :      v            |
- *   |         : TAG_REMAINS       |
- *   |         :      |            |
- *   +----------------+------------+
- *             :
- */
-typedef enum PrefetchStatus
-{
-	PRFS_UNUSED = 0,			/* unused slot */
-	PRFS_REQUESTED,				/* request was written to the sendbuffer to
-								 * PS, but not necessarily flushed. all fields
-								 * except response valid */
-	PRFS_RECEIVED,				/* all fields valid */
-	PRFS_TAG_REMAINS,			/* only buftag and my_ring_index are still
-								 * valid */
-} PrefetchStatus;
-
-/* must fit in uint8; bits 0x1 are used */
-typedef enum {
-	PRFSF_NONE	= 0x0,
-	PRFSF_LFC	= 0x1  /* received prefetch result is stored in LFC */
-} PrefetchRequestFlags;
-
-typedef struct PrefetchRequest
-{
-	BufferTag	buftag;			/* must be first entry in the struct */
-	shardno_t	shard_no;
-	uint8		status;		/* see PrefetchStatus for valid values */
-	uint8		flags;		/* see PrefetchRequestFlags */
-	neon_request_lsns request_lsns;
-	NeonRequestId reqid;
-	NeonResponse *response;		/* may be null */
-	uint64		my_ring_index;
-} PrefetchRequest;
-
-/* prefetch buffer lookup hash table */
-
-typedef struct PrfHashEntry
-{
-	PrefetchRequest *slot;
-	uint32		status;
-	uint32		hash;
-} PrfHashEntry;
-
-#define SH_PREFIX			prfh
-#define SH_ELEMENT_TYPE		PrfHashEntry
-#define SH_KEY_TYPE			PrefetchRequest *
-#define SH_KEY				slot
-#define SH_STORE_HASH
-#define SH_GET_HASH(tb, a)	((a)->hash)
-#define SH_HASH_KEY(tb, key) hash_bytes( \
-	((const unsigned char *) &(key)->buftag), \
-	sizeof(BufferTag) \
-)
-
-#define SH_EQUAL(tb, a, b)	(BufferTagsEqual(&(a)->buftag, &(b)->buftag))
-#define SH_SCOPE			static inline
-#define SH_DEFINE
-#define SH_DECLARE
-#include "lib/simplehash.h"
-
-/*
- * PrefetchState maintains the state of (prefetch) getPage@LSN requests.
- * It maintains a (ring) buffer of in-flight requests and responses.
- *
- * We maintain several indexes into the ring buffer:
- * ring_unused >= ring_flush >= ring_receive >= ring_last >= 0
- *
- * ring_unused points to the first unused slot of the buffer
- * ring_receive is the next request that is to be received
- * ring_last is the oldest received entry in the buffer
- *
- * Apart from being an entry in the ring buffer of prefetch requests, each
- * PrefetchRequest that is not UNUSED is indexed in prf_hash by buftag.
- */
-typedef struct PrefetchState
-{
-	MemoryContext bufctx;		/* context for prf_buffer[].response
-								 * allocations */
-	MemoryContext errctx;		/* context for prf_buffer[].response
-								 * allocations */
-	MemoryContext hashctx;		/* context for prf_buffer */
-
-	/* buffer indexes */
-	uint64		ring_unused;	/* first unused slot */
-	uint64		ring_flush;		/* next request to flush */
-	uint64		ring_receive;	/* next slot that is to receive a response */
-	uint64		ring_last;		/* min slot with a response value */
-
-	/* metrics / statistics  */
-	int			n_responses_buffered;	/* count of PS responses not yet in
-										 * buffers */
-	int			n_requests_inflight;	/* count of PS requests considered in
-										 * flight */
-	int			n_unused;		/* count of buffers < unused, > last, that are
-								 * also unused */
-
-	/* the buffers */
-	prfh_hash	*prf_hash;
-	int			max_shard_no;
-	/* Mark shards involved in prefetch */
-	uint8		shard_bitmap[(MAX_SHARDS + 7)/8];
-	PrefetchRequest prf_buffer[];	/* prefetch buffers */
-} PrefetchState;
-
-static PrefetchState *MyPState;
-
-#define GetPrfSlotNoCheck(ring_index) ( \
-	&MyPState->prf_buffer[((ring_index) % readahead_buffer_size)] \
-)
-
-#define GetPrfSlot(ring_index) ( \
-	( \
-		AssertMacro((ring_index) < MyPState->ring_unused && \
-					(ring_index) >= MyPState->ring_last), \
-		GetPrfSlotNoCheck(ring_index) \
-	) \
-)
-
-#define ReceiveBufferNeedsCompaction() (\
-	(MyPState->n_responses_buffered / 8) < ( \
-		MyPState->ring_receive - \
-			MyPState->ring_last - \
-			MyPState->n_responses_buffered \
-	) \
-)
-
-static bool compact_prefetch_buffers(void);
-static void consume_prefetch_responses(void);
-static bool prefetch_read(PrefetchRequest *slot);
-static void prefetch_do_request(PrefetchRequest *slot, neon_request_lsns *force_request_lsns);
-static bool prefetch_wait_for(uint64 ring_index);
-static void prefetch_cleanup_trailing_unused(void);
-static inline void prefetch_set_unused(uint64 ring_index);
-
-static void
-neon_get_request_lsns(NRelFileInfo rinfo, ForkNumber forknum,
-					  BlockNumber blkno, neon_request_lsns *output,
-					  BlockNumber nblocks);
-static bool neon_prefetch_response_usable(neon_request_lsns *request_lsns,
-										  PrefetchRequest *slot);
-
-static bool
-compact_prefetch_buffers(void)
-{
-	uint64		empty_ring_index = MyPState->ring_last;
-	uint64		search_ring_index = MyPState->ring_receive;
-	int			n_moved = 0;
-
-	if (MyPState->ring_receive == MyPState->ring_last)
-		return false;
-
-	while (search_ring_index > MyPState->ring_last)
-	{
-		search_ring_index--;
-		if (GetPrfSlot(search_ring_index)->status == PRFS_UNUSED)
-		{
-			empty_ring_index = search_ring_index;
-			break;
-		}
-	}
-
-	/*
-	 * Here we have established: slots < search_ring_index have an unknown
-	 * state (not scanned) slots >= search_ring_index and <= empty_ring_index
-	 * are unused slots > empty_ring_index are in use, or outside our buffer's
-	 * range. ... unless search_ring_index <= ring_last
-	 *
-	 * Therefore, there is a gap of at least one unused items between
-	 * search_ring_index and empty_ring_index (both inclusive), which grows as
-	 * we hit more unused items while moving backwards through the array.
-	 */
-
-	while (search_ring_index > MyPState->ring_last)
-	{
-		PrefetchRequest *source_slot;
-		PrefetchRequest *target_slot;
-		bool		found;
-
-		/* update search index to an unprocessed entry */
-		search_ring_index--;
-
-		source_slot = GetPrfSlot(search_ring_index);
-
-		if (source_slot->status == PRFS_UNUSED)
-			continue;
-
-		/* slot is used -- start moving slot */
-		target_slot = GetPrfSlot(empty_ring_index);
-
-		Assert(source_slot->status == PRFS_RECEIVED);
-		Assert(target_slot->status == PRFS_UNUSED);
-
-		target_slot->buftag = source_slot->buftag;
-		target_slot->shard_no = source_slot->shard_no;
-		target_slot->status = source_slot->status;
-		target_slot->flags = source_slot->flags;
-		target_slot->response = source_slot->response;
-		target_slot->reqid = source_slot->reqid;
-		target_slot->request_lsns = source_slot->request_lsns;
-		target_slot->my_ring_index = empty_ring_index;
-
-		prfh_delete(MyPState->prf_hash, source_slot);
-		prfh_insert(MyPState->prf_hash, target_slot, &found);
-
-		Assert(!found);
-
-		/* Adjust the location of our known-empty slot */
-		empty_ring_index--;
-
-		/* empty the moved slot */
-		source_slot->status = PRFS_UNUSED;
-		source_slot->buftag = (BufferTag)
-		{
-			0
-		};
-		source_slot->response = NULL;
-		source_slot->my_ring_index = 0;
-		source_slot->request_lsns = (neon_request_lsns) {
-			InvalidXLogRecPtr, InvalidXLogRecPtr, InvalidXLogRecPtr
-		};
-
-		/* update bookkeeping */
-		n_moved++;
-	}
-
-	/*
-	 * Only when we've moved slots we can expect trailing unused slots, so
-	 * only then we clean up trailing unused slots.
-	 */
-	if (n_moved > 0)
-	{
-		prefetch_cleanup_trailing_unused();
-		return true;
-	}
-
-	return false;
-}
-
-/*
- * If there might be responses still in the TCP buffer, then we should try to
- * use those, to reduce any TCP backpressure on the OS/PS side.
- *
- * This procedure handles that.
- *
- * Note that this works because we don't pipeline non-getPage requests.
- *
- * NOTE: This procedure is not allowed to throw errors that should be handled
- * by SMGR-related code, as this can be called from every CHECK_FOR_INTERRUPTS
- * point inside and outside PostgreSQL.
- *
- * This still does throw errors when it receives malformed responses from PS.
- *
- * When we're not called from CHECK_FOR_INTERRUPTS (indicated by
- * IsHandlingInterrupts) we also report we've ended prefetch receive work,
- * just in case state tracking was lost due to an error in the sync getPage
- * response code.
- */
-static void
-prefetch_pump_state(bool IsHandlingInterrupts)
-{
-	while (MyPState->ring_receive != MyPState->ring_flush)
-	{
-		NeonResponse   *response;
-		PrefetchRequest *slot;
-		MemoryContext	old;
-
-		slot = GetPrfSlot(MyPState->ring_receive);
-
-		old = MemoryContextSwitchTo(MyPState->errctx);
-		response = page_server->try_receive(slot->shard_no);
-		MemoryContextSwitchTo(old);
-
-		if (response == NULL)
-			break;
-
-		/* The slot should still be valid */
-		if (slot->status != PRFS_REQUESTED ||
-			slot->response != NULL ||
-			slot->my_ring_index != MyPState->ring_receive)
-			neon_shard_log(slot->shard_no, ERROR,
-						   "Incorrect prefetch slot state after receive: status=%d response=%p my=%lu receive=%lu",
-						   slot->status, slot->response,
-						   (long) slot->my_ring_index, (long) MyPState->ring_receive);
-
-		/* update prefetch state */
-		MyPState->n_responses_buffered += 1;
-		MyPState->n_requests_inflight -= 1;
-		MyPState->ring_receive += 1;
-		MyNeonCounters->getpage_prefetches_buffered =
-			MyPState->n_responses_buffered;
-
-		/* update slot state */
-		slot->status = PRFS_RECEIVED;
-		slot->response = response;
-
-		if (response->tag == T_NeonGetPageResponse && !(slot->flags & PRFSF_LFC) && lfc_store_prefetch_result)
-		{
-			/*
-			 * Store prefetched result in LFC (please read comments to lfc_prefetch
-			 * explaining why it can be done without holding shared buffer lock
-			 */
-			if (lfc_prefetch(BufTagGetNRelFileInfo(slot->buftag), slot->buftag.forkNum, slot->buftag.blockNum, ((NeonGetPageResponse*)response)->page, slot->request_lsns.not_modified_since))
-			{
-				slot->flags |= PRFSF_LFC;
-			}
-		}
-	}
-
-	/* We never pump the prefetch state while handling other pages */
-	if (!IsHandlingInterrupts)
-		END_PREFETCH_RECEIVE_WORK();
-
-	reconfigure_timeout_if_needed();
-}
-
-void
-readahead_buffer_resize(int newsize, void *extra)
-{
-	uint64		end,
-				nfree = newsize;
-	PrefetchState *newPState;
-	Size		newprfs_size = offsetof(PrefetchState, prf_buffer) +
-		(sizeof(PrefetchRequest) * newsize);
-
-	/* don't try to re-initialize if we haven't initialized yet */
-	if (MyPState == NULL)
-		return;
-
-	/*
-	 * Make sure that we don't lose track of active prefetch requests by
-	 * ensuring we have received all but the last n requests (n = newsize).
-	 */
-	if (MyPState->n_requests_inflight > newsize)
-	{
-		prefetch_wait_for(MyPState->ring_unused - newsize - 1);
-		Assert(MyPState->n_requests_inflight <= newsize);
-	}
-
-	/* construct the new PrefetchState, and copy over the memory contexts */
-	newPState = MemoryContextAllocZero(TopMemoryContext, newprfs_size);
-
-	newPState->bufctx = MyPState->bufctx;
-	newPState->errctx = MyPState->errctx;
-	newPState->hashctx = MyPState->hashctx;
-	newPState->prf_hash = prfh_create(MyPState->hashctx, newsize, NULL);
-	newPState->n_unused = newsize;
-	newPState->n_requests_inflight = 0;
-	newPState->n_responses_buffered = 0;
-	newPState->ring_last = newsize;
-	newPState->ring_unused = newsize;
-	newPState->ring_receive = newsize;
-	newPState->max_shard_no = MyPState->max_shard_no;
-	memcpy(newPState->shard_bitmap, MyPState->shard_bitmap, sizeof(MyPState->shard_bitmap));
-
-	/*
-	 * Copy over the prefetches.
-	 *
-	 * We populate the prefetch array from the end; to retain the most recent
-	 * prefetches, but this has the benefit of only needing to do one
-	 * iteration on the dataset, and trivial compaction.
-	 */
-	for (end = MyPState->ring_unused - 1;
-		 end >= MyPState->ring_last && end != UINT64_MAX && nfree != 0;
-		 end -= 1)
-	{
-		PrefetchRequest *slot = GetPrfSlot(end);
-		PrefetchRequest *newslot;
-		bool		found;
-
-		if (slot->status == PRFS_UNUSED)
-			continue;
-
-		nfree -= 1;
-
-		newslot = &newPState->prf_buffer[nfree];
-		*newslot = *slot;
-		newslot->my_ring_index = nfree;
-
-		prfh_insert(newPState->prf_hash, newslot, &found);
-
-		Assert(!found);
-
-		switch (newslot->status)
-		{
-			case PRFS_UNUSED:
-				pg_unreachable();
-			case PRFS_REQUESTED:
-				newPState->n_requests_inflight += 1;
-				newPState->ring_receive -= 1;
-				newPState->ring_last -= 1;
-				break;
-			case PRFS_RECEIVED:
-				newPState->n_responses_buffered += 1;
-				newPState->ring_last -= 1;
-				break;
-			case PRFS_TAG_REMAINS:
-				newPState->ring_last -= 1;
-				break;
-		}
-		newPState->n_unused -= 1;
-	}
-	newPState->ring_flush = newPState->ring_receive;
-
-	MyNeonCounters->getpage_prefetches_buffered =
-		MyPState->n_responses_buffered;
-	MyNeonCounters->pageserver_open_requests =
-		MyPState->n_requests_inflight;
-
-	for (; end >= MyPState->ring_last && end != UINT64_MAX; end -= 1)
-	{
-		PrefetchRequest *slot = GetPrfSlot(end);
-		Assert(slot->status != PRFS_REQUESTED);
-		if (slot->status == PRFS_RECEIVED)
-		{
-			pfree(slot->response);
-		}
-	}
-
-	prfh_destroy(MyPState->prf_hash);
-	pfree(MyPState);
-	MyPState = newPState;
-}
-
-
-
-/*
- * Make sure that there are no responses still in the buffer.
- *
- * This function may indirectly update MyPState->pfs_hash; which invalidates
- * any active pointers into the hash table.
- */
-static void
-consume_prefetch_responses(void)
-{
-	if (MyPState->ring_receive < MyPState->ring_unused)
-		prefetch_wait_for(MyPState->ring_unused - 1);
-}
-
-static void
-prefetch_cleanup_trailing_unused(void)
-{
-	uint64		ring_index;
-	PrefetchRequest *slot;
-
-	while (MyPState->ring_last < MyPState->ring_receive)
-	{
-		ring_index = MyPState->ring_last;
-		slot = GetPrfSlot(ring_index);
-
-		if (slot->status == PRFS_UNUSED)
-			MyPState->ring_last += 1;
-		else
-			break;
-	}
-}
-
-
-static bool
-prefetch_flush_requests(void)
-{
-	for (shardno_t shard_no = 0; shard_no < MyPState->max_shard_no; shard_no++)
-	{
-		if (BITMAP_ISSET(MyPState->shard_bitmap, shard_no))
-		{
-			if (!page_server->flush(shard_no))
-				return false;
-			BITMAP_CLR(MyPState->shard_bitmap, shard_no);
-		}
-	}
-	MyPState->max_shard_no = 0;
-	return true;
-}
-
-/*
- * Wait for slot of ring_index to have received its response.
- * The caller is responsible for making sure the request buffer is flushed.
- *
- * NOTE: this function may indirectly update MyPState->pfs_hash; which
- * invalidates any active pointers into the hash table.
- * NOTE: callers should make sure they can handle query cancellations in this
- * function's call path.
- */
-static bool
-prefetch_wait_for(uint64 ring_index)
-{
-	PrefetchRequest *entry;
-	bool		result = true;
-
-	if (MyPState->ring_flush <= ring_index &&
-		MyPState->ring_unused > MyPState->ring_flush)
-	{
-		if (!prefetch_flush_requests())
-			return false;
-		MyPState->ring_flush = MyPState->ring_unused;
-	}
-
-	Assert(MyPState->ring_unused > ring_index);
-
-	while (MyPState->ring_receive <= ring_index)
-	{
-		START_PREFETCH_RECEIVE_WORK();
-		entry = GetPrfSlot(MyPState->ring_receive);
-
-		Assert(entry->status == PRFS_REQUESTED);
-		if (!prefetch_read(entry))
-		{
-			result = false;
-			break;
-		}
-
-		END_PREFETCH_RECEIVE_WORK();
-		CHECK_FOR_INTERRUPTS();
-	}
-
-	return result;
-}
-
-/*
- * Read the response of a prefetch request into its slot.
- *
- * The caller is responsible for making sure that the request for this buffer
- * was flushed to the PageServer.
- *
- * NOTE: this function may indirectly update MyPState->pfs_hash; which
- * invalidates any active pointers into the hash table.
- *
- * NOTE: this does IO, and can get canceled out-of-line.
- */
-static bool
-prefetch_read(PrefetchRequest *slot)
-{
-	NeonResponse *response;
-	MemoryContext old;
-	BufferTag	buftag;
-	shardno_t	shard_no;
-	uint64		my_ring_index;
-
-	Assert(slot->status == PRFS_REQUESTED);
-	Assert(slot->response == NULL);
-	Assert(slot->my_ring_index == MyPState->ring_receive);
-
-	if (slot->status != PRFS_REQUESTED ||
-		slot->response != NULL ||
-		slot->my_ring_index != MyPState->ring_receive)
-		neon_shard_log(slot->shard_no, ERROR,
-					   "Incorrect prefetch read: status=%d response=%p my=%lu receive=%lu",
-					   slot->status, slot->response,
-					   (long)slot->my_ring_index, (long)MyPState->ring_receive);
-
-	/*
-	 * Copy the request info so that if an error happens and the prefetch
-	 * queue is flushed during the receive call, we can print the original
-	 * values in the error message
-	 */
-	buftag = slot->buftag;
-	shard_no = slot->shard_no;
-	my_ring_index = slot->my_ring_index;
-
-	old = MemoryContextSwitchTo(MyPState->errctx);
-	response = (NeonResponse *) page_server->receive(shard_no);
-	MemoryContextSwitchTo(old);
-	if (response)
-	{
-		/* The slot should still be valid */
-		if (slot->status != PRFS_REQUESTED ||
-			slot->response != NULL ||
-			slot->my_ring_index != MyPState->ring_receive)
-			neon_shard_log(shard_no, ERROR,
-						   "Incorrect prefetch slot state after receive: status=%d response=%p my=%lu receive=%lu",
-						   slot->status, slot->response,
-						   (long) slot->my_ring_index, (long) MyPState->ring_receive);
-
-		/* update prefetch state */
-		MyPState->n_responses_buffered += 1;
-		MyPState->n_requests_inflight -= 1;
-		MyPState->ring_receive += 1;
-		MyNeonCounters->getpage_prefetches_buffered =
-			MyPState->n_responses_buffered;
-
-		/* update slot state */
-		slot->status = PRFS_RECEIVED;
-		slot->response = response;
-
-		if (response->tag == T_NeonGetPageResponse && !(slot->flags & PRFSF_LFC) && lfc_store_prefetch_result)
-		{
-			/*
-			 * Store prefetched result in LFC (please read comments to lfc_prefetch
-			 * explaining why it can be done without holding shared buffer lock
-			 */
-			if (lfc_prefetch(BufTagGetNRelFileInfo(buftag), buftag.forkNum, buftag.blockNum, ((NeonGetPageResponse*)response)->page, slot->request_lsns.not_modified_since))
-			{
-				slot->flags |= PRFSF_LFC;
-			}
-		}
-		return true;
-	}
-	else
-	{
-		/*
-		 * Note: The slot might no longer be valid, if the connection was lost
-		 * and the prefetch queue was flushed during the receive call
-		 */
-		neon_shard_log(shard_no, LOG,
-					   "No response from reading prefetch entry %lu: %u/%u/%u.%u block %u. This can be caused by a concurrent disconnect",
-					   (long) my_ring_index,
-					   RelFileInfoFmt(BufTagGetNRelFileInfo(buftag)),
-					   buftag.forkNum, buftag.blockNum);
-		return false;
-	}
-}
-
-/*
- * Disconnect hook - drop prefetches when the connection drops
- *
- * If we don't remove the failed prefetches, we'd be serving incorrect
- * data to the smgr.
- */
-void
-prefetch_on_ps_disconnect(void)
-{
-	MyPState->ring_flush = MyPState->ring_unused;
-
-	while (MyPState->ring_receive < MyPState->ring_unused)
-	{
-		PrefetchRequest *slot;
-		uint64		ring_index = MyPState->ring_receive;
-
-		slot = GetPrfSlot(ring_index);
-
-		Assert(slot->status == PRFS_REQUESTED);
-		Assert(slot->my_ring_index == ring_index);
-
-		/*
-		 * Drop connection to all shards which have prefetch requests.
-		 * It is not a problem to call disconnect multiple times on the same connection
-		 * because disconnect implementation in libpagestore.c will check if connection
-		 * is alive and do nothing of connection was already dropped.
-		 */
-		page_server->disconnect(slot->shard_no);
-
-		/* clean up the request */
-		slot->status = PRFS_TAG_REMAINS;
-		MyPState->n_requests_inflight -= 1;
-		MyPState->ring_receive += 1;
-
-		prefetch_set_unused(ring_index);
-		pgBufferUsage.prefetch.expired += 1;
-		MyNeonCounters->getpage_prefetch_discards_total += 1;
-	}
-
-	/*
-	 * We can have gone into retry due to network error, so update stats with
-	 * the latest available
-	 */
-	MyNeonCounters->pageserver_open_requests =
-		MyPState->n_requests_inflight;
-	MyNeonCounters->getpage_prefetches_buffered =
-		MyPState->n_responses_buffered;
-}
-
-/*
- * prefetch_set_unused() - clear a received prefetch slot
- *
- * The slot at ring_index must be a current member of the ring buffer,
- * and may not be in the PRFS_REQUESTED state.
- *
- * NOTE: this function will update MyPState->pfs_hash; which invalidates any
- * active pointers into the hash table.
- */
-static inline void
-prefetch_set_unused(uint64 ring_index)
-{
-	PrefetchRequest *slot;
-
-	if (ring_index < MyPState->ring_last)
-		return;					/* Should already be unused */
-
-	slot = GetPrfSlot(ring_index);
-	if (slot->status == PRFS_UNUSED)
-		return;
-
-	Assert(slot->status == PRFS_RECEIVED || slot->status == PRFS_TAG_REMAINS);
-
-	if (slot->status == PRFS_RECEIVED)
-	{
-		pfree(slot->response);
-		slot->response = NULL;
-
-		MyPState->n_responses_buffered -= 1;
-		MyPState->n_unused += 1;
-
-		MyNeonCounters->getpage_prefetches_buffered =
-			MyPState->n_responses_buffered;
-	}
-	else
-	{
-		Assert(slot->response == NULL);
-	}
-
-	prfh_delete(MyPState->prf_hash, slot);
-
-	/* clear all fields */
-	MemSet(slot, 0, sizeof(PrefetchRequest));
-	slot->status = PRFS_UNUSED;
-
-	/* run cleanup if we're holding back ring_last */
-	if (MyPState->ring_last == ring_index)
-		prefetch_cleanup_trailing_unused();
-
-	/*
-	 * ... and try to store the buffered responses more compactly if > 12.5%
-	 * of the buffer is gaps
-	 */
-	else if (ReceiveBufferNeedsCompaction())
-		compact_prefetch_buffers();
-}
-
-/*
- * Send one prefetch request to the pageserver. To wait for the response, call
- * prefetch_wait_for().
- */
-static void
-prefetch_do_request(PrefetchRequest *slot, neon_request_lsns *force_request_lsns)
-{
-	bool		found;
-	uint64		mySlotNo PG_USED_FOR_ASSERTS_ONLY = slot->my_ring_index;
-
-	NeonGetPageRequest request = {
-		.hdr.tag = T_NeonGetPageRequest,
-		.hdr.reqid = GENERATE_REQUEST_ID(),
-		/* lsn and not_modified_since are filled in below */
-		.rinfo = BufTagGetNRelFileInfo(slot->buftag),
-		.forknum = slot->buftag.forkNum,
-		.blkno = slot->buftag.blockNum,
-	};
-
-	Assert(mySlotNo == MyPState->ring_unused);
-
-	slot->reqid = request.hdr.reqid;
-
-	if (force_request_lsns)
-		slot->request_lsns = *force_request_lsns;
-	else
-		neon_get_request_lsns(BufTagGetNRelFileInfo(slot->buftag),
-							  slot->buftag.forkNum, slot->buftag.blockNum,
-							  &slot->request_lsns, 1);
-	request.hdr.lsn = slot->request_lsns.request_lsn;
-	request.hdr.not_modified_since = slot->request_lsns.not_modified_since;
-
-	Assert(slot->response == NULL);
-	Assert(slot->my_ring_index == MyPState->ring_unused);
-
-	while (!page_server->send(slot->shard_no, (NeonRequest *) &request))
-	{
-		Assert(mySlotNo == MyPState->ring_unused);
-		/* loop */
-	}
-
-	/* update prefetch state */
-	MyPState->n_requests_inflight += 1;
-	MyPState->n_unused -= 1;
-	MyPState->ring_unused += 1;
-	BITMAP_SET(MyPState->shard_bitmap, slot->shard_no);
-	MyPState->max_shard_no = Max(slot->shard_no+1, MyPState->max_shard_no);
-
-	/* update slot state */
-	slot->status = PRFS_REQUESTED;
-	prfh_insert(MyPState->prf_hash, slot, &found);
-	Assert(!found);
-}
-
-/*
- * Lookup of already received prefetch requests. Only already received responses matching required LSNs are accepted.
- * Present pages are marked in "mask" bitmap and total number of such pages is returned.
- */
-static int
-prefetch_lookupv(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blocknum, neon_request_lsns *lsns,
-				 BlockNumber nblocks, void **buffers, bits8 *mask)
-{
-	int hits = 0;
-	PrefetchRequest hashkey;
-
-	/*
-	 * Use an intermediate PrefetchRequest struct as the hash key to ensure
-	 * correct alignment and that the padding bytes are cleared.
-	 */
-	memset(&hashkey.buftag, 0, sizeof(BufferTag));
-	CopyNRelFileInfoToBufTag(hashkey.buftag, rinfo);
-	hashkey.buftag.forkNum = forknum;
-
-	for (int i = 0; i < nblocks; i++)
-	{
-		PrfHashEntry *entry;
-
-		hashkey.buftag.blockNum = blocknum + i;
-		entry = prfh_lookup(MyPState->prf_hash, &hashkey);
-
-		if (entry != NULL)
-		{
-			PrefetchRequest *slot = entry->slot;
-			uint64 ring_index = slot->my_ring_index;
-			Assert(slot == GetPrfSlot(ring_index));
-
-			Assert(slot->status != PRFS_UNUSED);
-			Assert(MyPState->ring_last <= ring_index &&
-				   ring_index < MyPState->ring_unused);
-			Assert(BufferTagsEqual(&slot->buftag, &hashkey.buftag));
-
-			if (slot->status != PRFS_RECEIVED)
-				continue;
-
-			/*
-			 * If the caller specified a request LSN to use, only accept
-			 * prefetch responses that satisfy that request.
-			 */
-			if (!neon_prefetch_response_usable(&lsns[i], slot))
-				continue;
-
-			/*
-			 * Ignore errors
-			 */
-			if (slot->response->tag != T_NeonGetPageResponse)
-			{
-				if (slot->response->tag != T_NeonErrorResponse)
-				{
-					NEON_PANIC_CONNECTION_STATE(slot->shard_no, PANIC,
-											"Expected GetPage (0x%02x) or Error (0x%02x) response to GetPageRequest, but got 0x%02x",
-											T_NeonGetPageResponse, T_NeonErrorResponse, slot->response->tag);
-				}
-				continue;
-			}
-			memcpy(buffers[i], ((NeonGetPageResponse*)slot->response)->page, BLCKSZ);
-
-
-			/*
-			 * With lfc_store_prefetch_result=true prefetch result is stored in LFC in prefetch_pump_state when response is received
-			 * from page server. But if lfc_store_prefetch_result=false then it is not yet stored in LFC and we have to do it here
-			 * under buffer lock.
-			 */
-			if (!lfc_store_prefetch_result)
-				lfc_write(rinfo, forknum, blocknum + i, buffers[i]);
-
-			prefetch_set_unused(ring_index);
-			BITMAP_SET(mask, i);
-
-			hits += 1;
-			inc_getpage_wait(0);
-		}
-	}
-	pgBufferUsage.prefetch.hits += hits;
-	return hits;
-}
-
-#if PG_MAJORVERSION_NUM < 17
-static bool
-prefetch_lookup(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkn, neon_request_lsns *lsns, void *buffer)
-{
-	bits8 present = 0;
-	return prefetch_lookupv(rinfo, forkNum, blkn, lsns, 1, &buffer, &present) != 0;
-}
-#endif
-
-/*
- * prefetch_register_bufferv() - register and prefetch buffers
- *
- * Register that we may want the contents of BufferTag in the near future.
- * This is used when issuing a speculative prefetch request, but also when
- * performing a synchronous request and need the buffer right now.
- *
- * If force_request_lsns is not NULL, those values are sent to the
- * pageserver. If NULL, we utilize the lastWrittenLsn -infrastructure
- * to calculate the LSNs to send.
- *
- * Bits set in *mask (if present) indicate pages already read; i.e. pages we
- * can skip in this process.
- *
- * When performing a prefetch rather than a synchronous request,
- * is_prefetch==true. Currently, it only affects how the request is accounted
- * in the perf counters.
- *
- * NOTE: this function may indirectly update MyPState->pfs_hash; which
- * invalidates any active pointers into the hash table.
- */
-static uint64
-prefetch_register_bufferv(BufferTag tag, neon_request_lsns *frlsns,
-						  BlockNumber nblocks, const bits8 *mask,
-						  bool is_prefetch)
-{
-	uint64		min_ring_index;
-	PrefetchRequest hashkey;
-#ifdef USE_ASSERT_CHECKING
-	bool		any_hits = false;
-#endif
-	/* We will never read further ahead than our buffer can store. */
-	nblocks = Max(1, Min(nblocks, readahead_buffer_size));
-
-	/*
-	 * Use an intermediate PrefetchRequest struct as the hash key to ensure
-	 * correct alignment and that the padding bytes are cleared.
-	 */
-	memset(&hashkey.buftag, 0, sizeof(BufferTag));
-	hashkey.buftag = tag;
-
-Retry:
-	/*
-	 * We can have gone into retry due to network error, so update stats with
-	 * the latest available
-	 */
-	MyNeonCounters->pageserver_open_requests =
-		MyPState->ring_unused - MyPState->ring_receive;
-	MyNeonCounters->getpage_prefetches_buffered =
-		MyPState->n_responses_buffered;
-
-	min_ring_index = UINT64_MAX;
-	for (int i = 0; i < nblocks; i++)
-	{
-		PrefetchRequest *slot = NULL;
-		PrfHashEntry *entry = NULL;
-		uint64		ring_index;
-		neon_request_lsns *lsns;
-
-		if (PointerIsValid(mask) && BITMAP_ISSET(mask, i))
-			continue;
-
-		if (frlsns)
-			lsns = &frlsns[i];
-		else
-			lsns = NULL;
-
-#ifdef USE_ASSERT_CHECKING
-		any_hits = true;
-#endif
-
-		slot = NULL;
-		entry = NULL;
-
-		hashkey.buftag.blockNum = tag.blockNum + i;
-		entry = prfh_lookup(MyPState->prf_hash, &hashkey);
-
-		if (entry != NULL)
-		{
-			slot = entry->slot;
-			ring_index = slot->my_ring_index;
-			Assert(slot == GetPrfSlot(ring_index));
-
-			Assert(slot->status != PRFS_UNUSED);
-			Assert(MyPState->ring_last <= ring_index &&
-				   ring_index < MyPState->ring_unused);
-			Assert(BufferTagsEqual(&slot->buftag, &hashkey.buftag));
-
-			/*
-			 * If the caller specified a request LSN to use, only accept
-			 * prefetch responses that satisfy that request.
-			 */
-			if (lsns)
-			{
-				if (!neon_prefetch_response_usable(lsns, slot))
-				{
-					/* Wait for the old request to finish and discard it */
-					if (!prefetch_wait_for(ring_index))
-						goto Retry;
-					prefetch_set_unused(ring_index);
-					entry = NULL;
-					slot = NULL;
-					pgBufferUsage.prefetch.expired += 1;
-					MyNeonCounters->getpage_prefetch_discards_total += 1;
-				}
-			}
-
-			if (entry != NULL)
-			{
-				/*
-				 * We received a prefetch for a page that was recently read
-				 * and removed from the buffers. Remove that request from the
-				 * buffers.
-				 */
-				if (slot->status == PRFS_TAG_REMAINS)
-				{
-					prefetch_set_unused(ring_index);
-					entry = NULL;
-					slot = NULL;
-				}
-				else
-				{
-					min_ring_index = Min(min_ring_index, ring_index);
-					/* The buffered request is good enough, return that index */
-					if (is_prefetch)
-						pgBufferUsage.prefetch.duplicates++;
-					continue;
-				}
-			}
-		}
-		else if (!is_prefetch)
-		{
-			pgBufferUsage.prefetch.misses += 1;
-			MyNeonCounters->getpage_prefetch_misses_total++;
-		}
-		/*
-		 * We can only leave the block above by finding that there's
-		 * no entry that can satisfy this request, either because there
-		 * was no entry, or because the entry was invalid or didn't satisfy
-		 * the LSNs provided.
-		 *
-		 * The code should've made sure to clear up the data.
-		 */
-		Assert(entry == NULL);
-		Assert(slot == NULL);
-
-		/* There should be no buffer overflow */
-		Assert(MyPState->ring_last + readahead_buffer_size >= MyPState->ring_unused);
-
-		/*
-		 * If the prefetch queue is full, we need to make room by clearing the
-		 * oldest slot. If the oldest slot holds a buffer that was already
-		 * received, we can just throw it away; we fetched the page
-		 * unnecessarily in that case. If the oldest slot holds a request that
-		 * we haven't received a response for yet, we have to wait for the
-		 * response to that before we can continue. We might not have even
-		 * flushed the request to the pageserver yet, it might be just sitting
-		 * in the output buffer. In that case, we flush it and wait for the
-		 * response. (We could decide not to send it, but it's hard to abort
-		 * when the request is already in the output buffer, and 'not sending'
-		 * a prefetch request kind of goes against the principles of
-		 * prefetching)
-		 */
-		if (MyPState->ring_last + readahead_buffer_size == MyPState->ring_unused)
-		{
-			uint64		cleanup_index = MyPState->ring_last;
-
-			slot = GetPrfSlot(cleanup_index);
-
-			Assert(slot->status != PRFS_UNUSED);
-
-			/*
-			 * If there is good reason to run compaction on the prefetch buffers,
-			 * try to do that.
-			 */
-			if (ReceiveBufferNeedsCompaction() && compact_prefetch_buffers())
-			{
-				Assert(slot->status == PRFS_UNUSED);
-			}
-			else
-			{
-				/*
-				 * We have the slot for ring_last, so that must still be in
-				 * progress
-				 */
-				switch (slot->status)
-				{
-					case PRFS_REQUESTED:
-						Assert(MyPState->ring_receive == cleanup_index);
-						if (!prefetch_wait_for(cleanup_index))
-							goto Retry;
-						prefetch_set_unused(cleanup_index);
-						pgBufferUsage.prefetch.expired += 1;
-						MyNeonCounters->getpage_prefetch_discards_total += 1;
-						break;
-					case PRFS_RECEIVED:
-					case PRFS_TAG_REMAINS:
-						prefetch_set_unused(cleanup_index);
-						pgBufferUsage.prefetch.expired += 1;
-						MyNeonCounters->getpage_prefetch_discards_total += 1;
-						break;
-					default:
-						pg_unreachable();
-				}
-			}
-		}
-
-		/*
-		 * The next buffer pointed to by `ring_unused` is now definitely empty, so
-		 * we can insert the new request to it.
-		 */
-		ring_index = MyPState->ring_unused;
-
-		Assert(MyPState->ring_last <= ring_index &&
-			   ring_index <= MyPState->ring_unused);
-
-		slot = GetPrfSlotNoCheck(ring_index);
-
-		Assert(slot->status == PRFS_UNUSED);
-
-		/*
-		 * We must update the slot data before insertion, because the hash
-		 * function reads the buffer tag from the slot.
-		 */
-		slot->buftag = hashkey.buftag;
-		slot->shard_no = get_shard_number(&tag);
-		slot->my_ring_index = ring_index;
-		slot->flags = 0;
-
-		min_ring_index = Min(min_ring_index, ring_index);
-
-		if (is_prefetch)
-			MyNeonCounters->getpage_prefetch_requests_total++;
-		else
-			MyNeonCounters->getpage_sync_requests_total++;
-
-		prefetch_do_request(slot, lsns);
-	}
-
-	MyNeonCounters->pageserver_open_requests =
-		MyPState->ring_unused - MyPState->ring_receive;
-
-	Assert(any_hits);
-
-	Assert(GetPrfSlot(min_ring_index)->status == PRFS_REQUESTED ||
-		   GetPrfSlot(min_ring_index)->status == PRFS_RECEIVED);
-	Assert(MyPState->ring_last <= min_ring_index &&
-		   min_ring_index < MyPState->ring_unused);
-
-	if (flush_every_n_requests > 0 &&
-		MyPState->ring_unused - MyPState->ring_flush >= flush_every_n_requests)
-	{
-		if (!prefetch_flush_requests())
-		{
-			/*
-			 * Prefetch set is reset in case of error, so we should try to
-			 * register our request once again
-			 */
-			goto Retry;
-		}
-		MyPState->ring_flush = MyPState->ring_unused;
-	}
-
-	return min_ring_index;
-}
-
-static bool
-equal_requests(NeonRequest* a, NeonRequest* b)
-{
-	return a->reqid == b->reqid && a->lsn == b->lsn && a->not_modified_since == b->not_modified_since;
-}
-
-
-/*
- * Note: this function can get canceled and use a long jump to the next catch
- * context. Take care.
- */
-static NeonResponse *
-page_server_request(void const *req)
-{
-	NeonResponse *resp;
-	BufferTag tag = {0};
-	shardno_t shard_no;
-
-	switch (messageTag(req))
-	{
-		case T_NeonExistsRequest:
-			CopyNRelFileInfoToBufTag(tag, ((NeonExistsRequest *) req)->rinfo);
-			break;
-		case T_NeonNblocksRequest:
-			CopyNRelFileInfoToBufTag(tag, ((NeonNblocksRequest *) req)->rinfo);
-			break;
-		case T_NeonDbSizeRequest:
-			NInfoGetDbOid(BufTagGetNRelFileInfo(tag)) = ((NeonDbSizeRequest *) req)->dbNode;
-			break;
-		case T_NeonGetPageRequest:
-			CopyNRelFileInfoToBufTag(tag, ((NeonGetPageRequest *) req)->rinfo);
-			tag.blockNum = ((NeonGetPageRequest *) req)->blkno;
-			break;
-		default:
-			neon_log(ERROR, "Unexpected request tag: %d", messageTag(req));
-	}
-	shard_no = get_shard_number(&tag);
-
-	/*
-	 * Current sharding model assumes that all metadata is present only at shard 0.
-	 * We still need to call get_shard_no() to check if shard map is up-to-date.
-	 */
-	if (((NeonRequest *) req)->tag != T_NeonGetPageRequest)
-	{
-		shard_no = 0;
-	}
-
-	do
-	{
-		PG_TRY();
-		{
-			while (!page_server->send(shard_no, (NeonRequest *) req)
-				   || !page_server->flush(shard_no))
-			{
-				/* do nothing */
-			}
-			MyNeonCounters->pageserver_open_requests++;
-			consume_prefetch_responses();
-			resp = page_server->receive(shard_no);
-			MyNeonCounters->pageserver_open_requests--;
-		}
-		PG_CATCH();
-		{
-			/*
-			 * Cancellation in this code needs to be handled better at some
-			 * point, but this currently seems fine for now.
-			 */
-			page_server->disconnect(shard_no);
-			MyNeonCounters->pageserver_open_requests = 0;
-
-			/*
-			 * We know for sure we're not working on any prefetch pages after
-			 * this.
-			 */
-			END_PREFETCH_RECEIVE_WORK();
-
-			PG_RE_THROW();
-		}
-		PG_END_TRY();
-
-	} while (resp == NULL);
-
-	return resp;
-}
-
-
-StringInfoData
-nm_pack_request(NeonRequest *msg)
-{
-	StringInfoData s;
-
-	initStringInfo(&s);
-
-	pq_sendbyte(&s, msg->tag);
-	if (neon_protocol_version >= 3)
-	{
-		pq_sendint64(&s, msg->reqid);
-	}
-	pq_sendint64(&s, msg->lsn);
-	pq_sendint64(&s, msg->not_modified_since);
-
-	switch (messageTag(msg))
-	{
-			/* pagestore_client -> pagestore */
-		case T_NeonExistsRequest:
-			{
-				NeonExistsRequest *msg_req = (NeonExistsRequest *) msg;
-
-				pq_sendint32(&s, NInfoGetSpcOid(msg_req->rinfo));
-				pq_sendint32(&s, NInfoGetDbOid(msg_req->rinfo));
-				pq_sendint32(&s, NInfoGetRelNumber(msg_req->rinfo));
-				pq_sendbyte(&s, msg_req->forknum);
-
-				break;
-			}
-		case T_NeonNblocksRequest:
-			{
-				NeonNblocksRequest *msg_req = (NeonNblocksRequest *) msg;
-
-				pq_sendint32(&s, NInfoGetSpcOid(msg_req->rinfo));
-				pq_sendint32(&s, NInfoGetDbOid(msg_req->rinfo));
-				pq_sendint32(&s, NInfoGetRelNumber(msg_req->rinfo));
-				pq_sendbyte(&s, msg_req->forknum);
-
-				break;
-			}
-		case T_NeonDbSizeRequest:
-			{
-				NeonDbSizeRequest *msg_req = (NeonDbSizeRequest *) msg;
-
-				pq_sendint32(&s, msg_req->dbNode);
-
-				break;
-			}
-		case T_NeonGetPageRequest:
-			{
-				NeonGetPageRequest *msg_req = (NeonGetPageRequest *) msg;
-
-				pq_sendint32(&s, NInfoGetSpcOid(msg_req->rinfo));
-				pq_sendint32(&s, NInfoGetDbOid(msg_req->rinfo));
-				pq_sendint32(&s, NInfoGetRelNumber(msg_req->rinfo));
-				pq_sendbyte(&s, msg_req->forknum);
-				pq_sendint32(&s, msg_req->blkno);
-
-				break;
-			}
-
-		case T_NeonGetSlruSegmentRequest:
-			{
-				NeonGetSlruSegmentRequest *msg_req = (NeonGetSlruSegmentRequest *) msg;
-
-				pq_sendbyte(&s, msg_req->kind);
-				pq_sendint32(&s, msg_req->segno);
-
-				break;
-			}
-
-			/* pagestore -> pagestore_client. We never need to create these. */
-		case T_NeonExistsResponse:
-		case T_NeonNblocksResponse:
-		case T_NeonGetPageResponse:
-		case T_NeonErrorResponse:
-		case T_NeonDbSizeResponse:
-		case T_NeonGetSlruSegmentResponse:
-		default:
-			neon_log(ERROR, "unexpected neon message tag 0x%02x", msg->tag);
-			break;
-	}
-	return s;
-}
-
-NeonResponse *
-nm_unpack_response(StringInfo s)
-{
-	NeonMessageTag tag = pq_getmsgbyte(s);
-	NeonResponse resp_hdr = {0}; /* make valgrind happy */
-	NeonResponse *resp = NULL;
-
-	resp_hdr.tag = tag;
-	if (neon_protocol_version >= 3)
-	{
-		resp_hdr.reqid = pq_getmsgint64(s);
-		resp_hdr.lsn = pq_getmsgint64(s);
-		resp_hdr.not_modified_since = pq_getmsgint64(s);
-	}
-	switch (tag)
-	{
-			/* pagestore -> pagestore_client */
-		case T_NeonExistsResponse:
-			{
-				NeonExistsResponse *msg_resp = palloc0(sizeof(NeonExistsResponse));
-
-				if (neon_protocol_version >= 3)
-				{
-					NInfoGetSpcOid(msg_resp->req.rinfo) = pq_getmsgint(s, 4);
-					NInfoGetDbOid(msg_resp->req.rinfo) = pq_getmsgint(s, 4);
-					NInfoGetRelNumber(msg_resp->req.rinfo) = pq_getmsgint(s, 4);
-					msg_resp->req.forknum = pq_getmsgbyte(s);
-				}
-				msg_resp->req.hdr = resp_hdr;
-				msg_resp->exists = pq_getmsgbyte(s);
-				pq_getmsgend(s);
-
-				resp = (NeonResponse *) msg_resp;
-				break;
-			}
-
-		case T_NeonNblocksResponse:
-			{
-				NeonNblocksResponse *msg_resp = palloc0(sizeof(NeonNblocksResponse));
-
-				if (neon_protocol_version >= 3)
-				{
-					NInfoGetSpcOid(msg_resp->req.rinfo) = pq_getmsgint(s, 4);
-					NInfoGetDbOid(msg_resp->req.rinfo) = pq_getmsgint(s, 4);
-					NInfoGetRelNumber(msg_resp->req.rinfo) = pq_getmsgint(s, 4);
-					msg_resp->req.forknum = pq_getmsgbyte(s);
-				}
-				msg_resp->req.hdr = resp_hdr;
-				msg_resp->n_blocks = pq_getmsgint(s, 4);
-				pq_getmsgend(s);
-
-				resp = (NeonResponse *) msg_resp;
-				break;
-			}
-
-		case T_NeonGetPageResponse:
-			{
-				NeonGetPageResponse *msg_resp;
-
-				msg_resp = MemoryContextAllocZero(MyPState->bufctx, PS_GETPAGERESPONSE_SIZE);
-				if (neon_protocol_version >= 3)
-				{
-					NInfoGetSpcOid(msg_resp->req.rinfo) = pq_getmsgint(s, 4);
-					NInfoGetDbOid(msg_resp->req.rinfo) = pq_getmsgint(s, 4);
-					NInfoGetRelNumber(msg_resp->req.rinfo) = pq_getmsgint(s, 4);
-					msg_resp->req.forknum = pq_getmsgbyte(s);
-					msg_resp->req.blkno = pq_getmsgint(s, 4);
-				}
-				msg_resp->req.hdr = resp_hdr;
-				/* XXX:	should be varlena */
-				memcpy(msg_resp->page, pq_getmsgbytes(s, BLCKSZ), BLCKSZ);
-				pq_getmsgend(s);
-
-				Assert(msg_resp->req.hdr.tag == T_NeonGetPageResponse);
-
-				resp = (NeonResponse *) msg_resp;
-				break;
-			}
-
-		case T_NeonDbSizeResponse:
-			{
-				NeonDbSizeResponse *msg_resp = palloc0(sizeof(NeonDbSizeResponse));
-
-				if (neon_protocol_version >= 3)
-				{
-					msg_resp->req.dbNode = pq_getmsgint(s, 4);
-				}
-				msg_resp->req.hdr = resp_hdr;
-				msg_resp->db_size = pq_getmsgint64(s);
-				pq_getmsgend(s);
-
-				resp = (NeonResponse *) msg_resp;
-				break;
-			}
-
-		case T_NeonErrorResponse:
-			{
-				NeonErrorResponse *msg_resp;
-				size_t		msglen;
-				const char *msgtext;
-
-				msgtext = pq_getmsgrawstring(s);
-				msglen = strlen(msgtext);
-
-				msg_resp = palloc0(sizeof(NeonErrorResponse) + msglen + 1);
-				msg_resp->req = resp_hdr;
-				memcpy(msg_resp->message, msgtext, msglen + 1);
-				pq_getmsgend(s);
-
-				resp = (NeonResponse *) msg_resp;
-				break;
-			}
-
-		case T_NeonGetSlruSegmentResponse:
-		    {
-				NeonGetSlruSegmentResponse *msg_resp;
-				int n_blocks;
-				msg_resp = palloc0(sizeof(NeonGetSlruSegmentResponse));
-
-				if (neon_protocol_version >= 3)
-				{
-					msg_resp->req.kind = pq_getmsgbyte(s);
-					msg_resp->req.segno = pq_getmsgint(s, 4);
-				}
-				msg_resp->req.hdr = resp_hdr;
-
-				n_blocks = pq_getmsgint(s, 4);
-				msg_resp->n_blocks = n_blocks;
-				memcpy(msg_resp->data, pq_getmsgbytes(s, n_blocks * BLCKSZ), n_blocks * BLCKSZ);
-				pq_getmsgend(s);
-
-				resp = (NeonResponse *) msg_resp;
-				break;
-			}
-
-			/*
-			 * pagestore_client -> pagestore
-			 *
-			 * We create these ourselves, and don't need to decode them.
-			 */
-		case T_NeonExistsRequest:
-		case T_NeonNblocksRequest:
-		case T_NeonGetPageRequest:
-		case T_NeonDbSizeRequest:
-		case T_NeonGetSlruSegmentRequest:
-		default:
-			neon_log(ERROR, "unexpected neon message tag 0x%02x", tag);
-			break;
-	}
-
-	return resp;
-}
-
-/* dump to json for debugging / error reporting purposes */
-char *
-nm_to_string(NeonMessage *msg)
-{
-	StringInfoData s;
-
-	initStringInfo(&s);
-
-	switch (messageTag(msg))
-	{
-			/* pagestore_client -> pagestore */
-		case T_NeonExistsRequest:
-			{
-				NeonExistsRequest *msg_req = (NeonExistsRequest *) msg;
-
-				appendStringInfoString(&s, "{\"type\": \"NeonExistsRequest\"");
-				appendStringInfo(&s, ", \"rinfo\": \"%u/%u/%u\"", RelFileInfoFmt(msg_req->rinfo));
-				appendStringInfo(&s, ", \"forknum\": %d", msg_req->forknum);
-				appendStringInfo(&s, ", \"lsn\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->hdr.lsn));
-				appendStringInfo(&s, ", \"not_modified_since\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->hdr.not_modified_since));
-				appendStringInfoChar(&s, '}');
-				break;
-			}
-
-		case T_NeonNblocksRequest:
-			{
-				NeonNblocksRequest *msg_req = (NeonNblocksRequest *) msg;
-
-				appendStringInfoString(&s, "{\"type\": \"NeonNblocksRequest\"");
-				appendStringInfo(&s, ", \"rinfo\": \"%u/%u/%u\"", RelFileInfoFmt(msg_req->rinfo));
-				appendStringInfo(&s, ", \"forknum\": %d", msg_req->forknum);
-				appendStringInfo(&s, ", \"lsn\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->hdr.lsn));
-				appendStringInfo(&s, ", \"not_modified_since\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->hdr.not_modified_since));
-				appendStringInfoChar(&s, '}');
-				break;
-			}
-
-		case T_NeonGetPageRequest:
-			{
-				NeonGetPageRequest *msg_req = (NeonGetPageRequest *) msg;
-
-				appendStringInfoString(&s, "{\"type\": \"NeonGetPageRequest\"");
-				appendStringInfo(&s, ", \"rinfo\": \"%u/%u/%u\"", RelFileInfoFmt(msg_req->rinfo));
-				appendStringInfo(&s, ", \"forknum\": %d", msg_req->forknum);
-				appendStringInfo(&s, ", \"blkno\": %u", msg_req->blkno);
-				appendStringInfo(&s, ", \"lsn\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->hdr.lsn));
-				appendStringInfo(&s, ", \"not_modified_since\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->hdr.not_modified_since));
-				appendStringInfoChar(&s, '}');
-				break;
-			}
-		case T_NeonDbSizeRequest:
-			{
-				NeonDbSizeRequest *msg_req = (NeonDbSizeRequest *) msg;
-
-				appendStringInfoString(&s, "{\"type\": \"NeonDbSizeRequest\"");
-				appendStringInfo(&s, ", \"dbnode\": \"%u\"", msg_req->dbNode);
-				appendStringInfo(&s, ", \"lsn\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->hdr.lsn));
-				appendStringInfo(&s, ", \"not_modified_since\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->hdr.not_modified_since));
-				appendStringInfoChar(&s, '}');
-				break;
-			}
-		case T_NeonGetSlruSegmentRequest:
-			{
-				NeonGetSlruSegmentRequest *msg_req = (NeonGetSlruSegmentRequest *) msg;
-
-				appendStringInfoString(&s, "{\"type\": \"NeonGetSlruSegmentRequest\"");
-				appendStringInfo(&s, ", \"kind\": %u", msg_req->kind);
-				appendStringInfo(&s, ", \"segno\": %u", msg_req->segno);
-				appendStringInfo(&s, ", \"lsn\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->hdr.lsn));
-				appendStringInfo(&s, ", \"not_modified_since\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->hdr.not_modified_since));
-				appendStringInfoChar(&s, '}');
-				break;
-			}
-			/* pagestore -> pagestore_client */
-		case T_NeonExistsResponse:
-			{
-				NeonExistsResponse *msg_resp = (NeonExistsResponse *) msg;
-
-				appendStringInfoString(&s, "{\"type\": \"NeonExistsResponse\"");
-				appendStringInfo(&s, ", \"exists\": %d}",
-								 msg_resp->exists);
-				appendStringInfoChar(&s, '}');
-
-				break;
-			}
-		case T_NeonNblocksResponse:
-			{
-				NeonNblocksResponse *msg_resp = (NeonNblocksResponse *) msg;
-
-				appendStringInfoString(&s, "{\"type\": \"NeonNblocksResponse\"");
-				appendStringInfo(&s, ", \"n_blocks\": %u}",
-								 msg_resp->n_blocks);
-				appendStringInfoChar(&s, '}');
-
-				break;
-			}
-		case T_NeonGetPageResponse:
-			{
-#if 0
-				NeonGetPageResponse *msg_resp = (NeonGetPageResponse *) msg;
-#endif
-
-				appendStringInfoString(&s, "{\"type\": \"NeonGetPageResponse\"");
-				appendStringInfo(&s, ", \"page\": \"XXX\"}");
-				appendStringInfoChar(&s, '}');
-				break;
-			}
-		case T_NeonErrorResponse:
-			{
-				NeonErrorResponse *msg_resp = (NeonErrorResponse *) msg;
-
-				/* FIXME: escape double-quotes in the message */
-				appendStringInfoString(&s, "{\"type\": \"NeonErrorResponse\"");
-				appendStringInfo(&s, ", \"message\": \"%s\"}", msg_resp->message);
-				appendStringInfoChar(&s, '}');
-				break;
-			}
-		case T_NeonDbSizeResponse:
-			{
-				NeonDbSizeResponse *msg_resp = (NeonDbSizeResponse *) msg;
-
-				appendStringInfoString(&s, "{\"type\": \"NeonDbSizeResponse\"");
-				appendStringInfo(&s, ", \"db_size\": %ld}",
-								 msg_resp->db_size);
-				appendStringInfoChar(&s, '}');
-
-				break;
-			}
-		case T_NeonGetSlruSegmentResponse:
-			{
-				NeonGetSlruSegmentResponse *msg_resp = (NeonGetSlruSegmentResponse *) msg;
+{
+	UNLOGGED_BUILD_NOT_IN_PROGRESS = 0,
+	UNLOGGED_BUILD_PHASE_1,
+	UNLOGGED_BUILD_PHASE_2,
+	UNLOGGED_BUILD_NOT_PERMANENT
+} UnloggedBuildPhase;
 
-				appendStringInfoString(&s, "{\"type\": \"NeonGetSlruSegmentResponse\"");
-				appendStringInfo(&s, ", \"n_blocks\": %u}",
-								 msg_resp->n_blocks);
-				appendStringInfoChar(&s, '}');
+static SMgrRelation unlogged_build_rel = NULL;
+static UnloggedBuildPhase unlogged_build_phase = UNLOGGED_BUILD_NOT_IN_PROGRESS;
 
-				break;
-			}
+static bool neon_redo_read_buffer_filter(XLogReaderState *record, uint8 block_id);
+static bool (*old_redo_read_buffer_filter) (XLogReaderState *record, uint8 block_id) = NULL;
 
-		default:
-			appendStringInfo(&s, "{\"type\": \"unknown 0x%02x\"", msg->tag);
-	}
-	return s.data;
-}
+static BlockNumber neon_nblocks(SMgrRelation reln, ForkNumber forknum);
 
 /*
  * Wrapper around log_newpage() that makes a temporary copy of the block and
@@ -2148,11 +461,6 @@ neon_wallog_page(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, co
 static void
 neon_init(void)
 {
-	Size		prfs_size;
-
-	if (MyPState != NULL)
-		return;
-
 	/*
 	 * Sanity check that theperf counters array is sized correctly. We got
 	 * this wrong once, and the formula for max number of backends and aux
@@ -2167,27 +475,6 @@ neon_init(void)
 		elog(ERROR, "MyNeonCounters points past end of array");
 #endif
 
-	prfs_size = offsetof(PrefetchState, prf_buffer) +
-		sizeof(PrefetchRequest) * readahead_buffer_size;
-
-	MyPState = MemoryContextAllocZero(TopMemoryContext, prfs_size);
-
-	MyPState->n_unused = readahead_buffer_size;
-
-	MyPState->bufctx = SlabContextCreate(TopMemoryContext,
-										 "NeonSMGR/prefetch",
-										 SLAB_DEFAULT_BLOCK_SIZE * 17,
-										 PS_GETPAGERESPONSE_SIZE);
-	MyPState->errctx = AllocSetContextCreate(TopMemoryContext,
-											 "NeonSMGR/errors",
-											 ALLOCSET_DEFAULT_SIZES);
-	MyPState->hashctx = AllocSetContextCreate(TopMemoryContext,
-											  "NeonSMGR/prefetch",
-											  ALLOCSET_DEFAULT_SIZES);
-
-	MyPState->prf_hash = prfh_create(MyPState->hashctx,
-									 readahead_buffer_size, NULL);
-
 	old_redo_read_buffer_filter = redo_read_buffer_filter;
 	redo_read_buffer_filter = neon_redo_read_buffer_filter;
 
@@ -2224,8 +511,10 @@ nm_adjust_lsn(XLogRecPtr lsn)
 
 /*
  * Return LSN for requesting pages and number of blocks from page server
+ *
+ * XXX: exposed so that prefetch_do_request() can call back here.
  */
-static void
+void
 neon_get_request_lsns(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blkno,
 					  neon_request_lsns *output, BlockNumber nblocks)
 {
@@ -2428,112 +717,12 @@ neon_get_request_lsns(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blkno,
 	}
 }
 
-/*
- *  neon_prefetch_response_usable -- Can a new request be satisfied by old one?
- *
- * This is used to check if the response to a prefetch request can be used to
- * satisfy a page read now.
- */
-static bool
-neon_prefetch_response_usable(neon_request_lsns *request_lsns,
-							  PrefetchRequest *slot)
-{
-	/* sanity check the LSN's on the old and the new request */
-	Assert(request_lsns->request_lsn >= request_lsns->not_modified_since);
-	Assert(request_lsns->effective_request_lsn >= request_lsns->not_modified_since);
-	Assert(request_lsns->effective_request_lsn <= request_lsns->request_lsn);
-	Assert(slot->request_lsns.request_lsn >= slot->request_lsns.not_modified_since);
-	Assert(slot->request_lsns.effective_request_lsn >= slot->request_lsns.not_modified_since);
-	Assert(slot->request_lsns.effective_request_lsn <= slot->request_lsns.request_lsn);
-	Assert(slot->status != PRFS_UNUSED);
-
-	/*
-	 * The new request's LSN should never be older than the old one.  This
-	 * could be an Assert, except that for testing purposes, we do provide an
-	 * interface in neon_test_utils to fetch pages at arbitary LSNs, which
-	 * violates this.
-	 *
-	 * Similarly, the not_modified_since value calculated for a page should
-	 * never move backwards. This assumption is a bit fragile; if we updated
-	 * the last-written cache when we read in a page, for example, then it
-	 * might. But as the code stands, it should not.
-	 *
-	 * (If two backends issue a request at the same time, they might race and
-	 * calculate LSNs "out of order" with each other, but the prefetch queue
-	 * is backend-private at the moment.)
-	 */
-	if (request_lsns->effective_request_lsn < slot->request_lsns.effective_request_lsn ||
-		request_lsns->not_modified_since < slot->request_lsns.not_modified_since)
-	{
-		ereport(LOG,
-				(errcode(ERRCODE_IO_ERROR),
-				 errmsg(NEON_TAG "request with unexpected LSN after prefetch"),
-				 errdetail("Request %X/%X not_modified_since %X/%X, prefetch %X/%X not_modified_since %X/%X)",
-						   LSN_FORMAT_ARGS(request_lsns->effective_request_lsn),
-						   LSN_FORMAT_ARGS(request_lsns->not_modified_since),
-						   LSN_FORMAT_ARGS(slot->request_lsns.effective_request_lsn),
-						   LSN_FORMAT_ARGS(slot->request_lsns.not_modified_since))));
-		return false;
-	}
-
-	/*---
-	 * Each request to the pageserver has three LSN values associated with it:
-	 * `not_modified_since`, `request_lsn`, and 'effective_request_lsn'.
-	 * `not_modified_since` and `request_lsn` are sent to the pageserver, but
-	 * in the primary node, we always use UINT64_MAX as the `request_lsn`, so
-	 * we remember `effective_request_lsn` separately. In a primary,
-	 * `effective_request_lsn` is the same as  `not_modified_since`.
-	 * See comments in neon_get_request_lsns why we can not use last flush WAL position here.
-	 *
-	 * To determine whether a response to a GetPage request issued earlier is
-	 * still valid to satisfy a new page read, we look at the
-	 * (not_modified_since, effective_request_lsn] range of the request. It is
-	 * effectively a claim that the page has not been modified between those
-	 * LSNs.  If the range of the old request in the queue overlaps with the
-	 * new request, we know that the page hasn't been modified in the union of
-	 * the ranges. We can use the response to old request to satisfy the new
-	 * request in that case. For example:
-	 *
-	 *              100      500
-	 * Old request:  +--------+
-	 *
-	 *                     400      800
-	 * New request:         +--------+
-	 *
-	 * The old request claims that the page was not modified between LSNs 100
-	 * and 500, and the second claims that it was not modified between 400 and
-	 * 800. Together they mean that the page was not modified between 100 and
-	 * 800. Therefore the response to the old request is also valid for the
-	 * new request.
-	 *
-	 * This logic also holds at the boundary case that the old request's LSN
-	 * matches the new request's not_modified_since LSN exactly:
-	 *
-	 *              100      500
-	 * Old request:  +--------+
-	 *
-	 *                       500      900
-	 * New request:           +--------+
-	 *
-	 * The response to the old request is the page as it was at LSN 500, and
-	 * the page hasn't been changed in the range (500, 900], therefore the
-	 * response is valid also for the new request.
-	 */
-
-	/* this follows from the checks above */
-	Assert(request_lsns->effective_request_lsn >= slot->request_lsns.not_modified_since);
-
-	return request_lsns->not_modified_since <= slot->request_lsns.effective_request_lsn;
-}
-
 /*
  *	neon_exists() -- Does the physical file exist?
  */
 static bool
 neon_exists(SMgrRelation reln, ForkNumber forkNum)
 {
-	bool		exists;
-	NeonResponse *resp;
 	BlockNumber n_blocks;
 	neon_request_lsns request_lsns;
 
@@ -2592,67 +781,8 @@ neon_exists(SMgrRelation reln, ForkNumber forkNum)
 
 	neon_get_request_lsns(InfoFromSMgrRel(reln), forkNum,
 						  REL_METADATA_PSEUDO_BLOCKNO, &request_lsns, 1);
-	{
-		NeonExistsRequest request = {
-			.hdr.tag = T_NeonExistsRequest,
-			.hdr.reqid = GENERATE_REQUEST_ID(),
-			.hdr.lsn = request_lsns.request_lsn,
-			.hdr.not_modified_since = request_lsns.not_modified_since,
-			.rinfo = InfoFromSMgrRel(reln),
-			.forknum = forkNum
-		};
-
-		resp = page_server_request(&request);
-
-		switch (resp->tag)
-		{
-			case T_NeonExistsResponse:
-			{
-				NeonExistsResponse* exists_resp = (NeonExistsResponse *) resp;
-				if (neon_protocol_version >= 3)
-				{
-					if (!equal_requests(resp, &request.hdr) ||
-						!RelFileInfoEquals(exists_resp->req.rinfo, request.rinfo) ||
-						exists_resp->req.forknum != request.forknum)
-					{
-						NEON_PANIC_CONNECTION_STATE(-1, PANIC,
-													"Unexpect response {reqid=%lx,lsn=%X/%08X, since=%X/%08X, rel=%u/%u/%u.%u} to exits request {reqid=%lx,lsn=%X/%08X, since=%X/%08X, rel=%u/%u/%u.%u}",
-													resp->reqid, LSN_FORMAT_ARGS(resp->lsn), LSN_FORMAT_ARGS(resp->not_modified_since), RelFileInfoFmt(exists_resp->req.rinfo), exists_resp->req.forknum,
-													request.hdr.reqid, LSN_FORMAT_ARGS(request.hdr.lsn), LSN_FORMAT_ARGS(request.hdr.not_modified_since), RelFileInfoFmt(request.rinfo), request.forknum);
-					}
-				}
-				exists = exists_resp->exists;
-				break;
-			}
-			case T_NeonErrorResponse:
-				if (neon_protocol_version >= 3)
-				{
-					if (!equal_requests(resp, &request.hdr))
-					{
-						elog(WARNING, NEON_TAG "Error message {reqid=%lx,lsn=%X/%08X, since=%X/%08X} doesn't match exists request {reqid=%lx,lsn=%X/%08X, since=%X/%08X}",
-							 resp->reqid, LSN_FORMAT_ARGS(resp->lsn), LSN_FORMAT_ARGS(resp->not_modified_since),
-							 request.hdr.reqid, LSN_FORMAT_ARGS(request.hdr.lsn), LSN_FORMAT_ARGS(request.hdr.not_modified_since));
-					}
-				}
-				ereport(ERROR,
-						(errcode(ERRCODE_IO_ERROR),
-						 errmsg(NEON_TAG "[reqid %lx] could not read relation existence of rel %u/%u/%u.%u from page server at lsn %X/%08X",
-								resp->reqid,
-								RelFileInfoFmt(InfoFromSMgrRel(reln)),
-								forkNum,
-								LSN_FORMAT_ARGS(request_lsns.effective_request_lsn)),
-						 errdetail("page server returned error: %s",
-								   ((NeonErrorResponse *) resp)->message)));
-				break;
-
-			default:
-				NEON_PANIC_CONNECTION_STATE(-1, PANIC,
-											"Expected Exists (0x%02x) or Error (0x%02x) response to ExistsRequest, but got 0x%02x",
-											T_NeonExistsResponse, T_NeonErrorResponse, resp->tag);
-		}
-		pfree(resp);
-	}
-	return exists;
+
+	return communicator_exists(InfoFromSMgrRel(reln), forkNum, &request_lsns);
 }
 
 /*
@@ -3001,7 +1131,6 @@ static bool
 neon_prefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
 			  int nblocks)
 {
-	uint64		ring_index PG_USED_FOR_ASSERTS_ONLY;
 	BufferTag	tag;
 
 	switch (reln->smgr_relpersistence)
@@ -3038,17 +1167,13 @@ neon_prefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
 
 		tag.blockNum = blocknum;
 
-		ring_index = prefetch_register_bufferv(tag, NULL, iterblocks,
-											   lfc_present, true);
+		communicator_prefetch_register_bufferv(tag, NULL, iterblocks, lfc_present);
 
 		nblocks -= iterblocks;
 		blocknum += iterblocks;
-
-		Assert(ring_index < MyPState->ring_unused &&
-			   MyPState->ring_last <= ring_index);
 	}
 
-	prefetch_pump_state(false);
+	communicator_prefetch_pump_state(false);
 
 	return false;
 }
@@ -3061,7 +1186,6 @@ neon_prefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
 static bool
 neon_prefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum)
 {
-	uint64		ring_index PG_USED_FOR_ASSERTS_ONLY;
 	BufferTag	tag;
 
 	switch (reln->smgr_relpersistence)
@@ -3086,12 +1210,9 @@ neon_prefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum)
 
 	CopyNRelFileInfoToBufTag(tag, InfoFromSMgrRel(reln));
 
-	ring_index = prefetch_register_bufferv(tag, NULL, 1, NULL, true);
-
-	Assert(ring_index < MyPState->ring_unused &&
-		   MyPState->ring_last <= ring_index);
+	communicator_prefetch_register_bufferv(tag, NULL, 1, NULL);
 
-	prefetch_pump_state(false);
+	communicator_prefetch_pump_state(false);
 
 	return false;
 }
@@ -3135,7 +1256,7 @@ neon_writeback(SMgrRelation reln, ForkNumber forknum,
 	 */
 	neon_log(SmgrTrace, "writeback noop");
 
-	prefetch_pump_state(false);
+	communicator_prefetch_pump_state(false);
 
 #ifdef DEBUG_COMPARE_LOCAL
 	if (IS_LOCAL_REL(reln))
@@ -3143,208 +1264,6 @@ neon_writeback(SMgrRelation reln, ForkNumber forknum,
 #endif
 }
 
-/*
- * Read N pages at a specific LSN.
- *
- * *mask is set for pages read at a previous point in time, and which we
- * should not touch, nor overwrite.
- * New bits should be set in *mask for the pages we'successfully read.
- *
- * The offsets in request_lsns, buffers, and mask are linked.
- */
-static void
-neon_read_at_lsnv(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber base_blockno, neon_request_lsns *request_lsns,
-				  void **buffers, BlockNumber nblocks, const bits8 *mask)
-{
-	NeonResponse *resp;
-	uint64		ring_index;
-	PrfHashEntry *entry;
-	PrefetchRequest *slot;
-	PrefetchRequest hashkey;
-
-	Assert(PointerIsValid(request_lsns));
-	Assert(nblocks >= 1);
-
-	/*
-	 * Use an intermediate PrefetchRequest struct as the hash key to ensure
-	 * correct alignment and that the padding bytes are cleared.
-	 */
-	memset(&hashkey.buftag, 0, sizeof(BufferTag));
-	CopyNRelFileInfoToBufTag(hashkey.buftag, rinfo);
-	hashkey.buftag.forkNum = forkNum;
-	hashkey.buftag.blockNum = base_blockno;
-
-	/*
-	 * The redo process does not lock pages that it needs to replay but are
-	 * not in the shared buffers, so a concurrent process may request the page
-	 * after redo has decided it won't redo that page and updated the LwLSN
-	 * for that page. If we're in hot standby we need to take care that we
-	 * don't return until after REDO has finished replaying up to that LwLSN,
-	 * as the page should have been locked up to that point.
-	 *
-	 * See also the description on neon_redo_read_buffer_filter below.
-	 *
-	 * NOTE: It is possible that the WAL redo process will still do IO due to
-	 * concurrent failed read IOs. Those IOs should never have a request_lsn
-	 * that is as large as the WAL record we're currently replaying, if it
-	 * weren't for the behaviour of the LwLsn cache that uses the highest
-	 * value of the LwLsn cache when the entry is not found.
-	 */
-	prefetch_register_bufferv(hashkey.buftag, request_lsns, nblocks, mask, false);
-
-	for (int i = 0; i < nblocks; i++)
-	{
-		void	   *buffer = buffers[i];
-		BlockNumber blockno = base_blockno + i;
-		neon_request_lsns *reqlsns = &request_lsns[i];
-		TimestampTz		start_ts, end_ts;
-
-		if (PointerIsValid(mask) && BITMAP_ISSET(mask, i))
-			continue;
-
-		start_ts = GetCurrentTimestamp();
-
-		if (RecoveryInProgress() && MyBackendType != B_STARTUP)
-			XLogWaitForReplayOf(reqlsns->request_lsn);
-
-		/*
-		 * Try to find prefetched page in the list of received pages.
-		 */
-Retry:
-		hashkey.buftag.blockNum = blockno;
-		entry = prfh_lookup(MyPState->prf_hash, &hashkey);
-
-		if (entry != NULL)
-		{
-			slot = entry->slot;
-			if (neon_prefetch_response_usable(reqlsns, slot))
-			{
-				ring_index = slot->my_ring_index;
-			}
-			else
-			{
-				/*
-				 * Cannot use this prefetch, discard it
-				 *
-				 * We can't drop cache for not-yet-received requested items. It is
-				 * unlikely this happens, but it can happen if prefetch distance
-				 * is large enough and a backend didn't consume all prefetch
-				 * requests.
-				 */
-				if (slot->status == PRFS_REQUESTED)
-				{
-					if (!prefetch_wait_for(slot->my_ring_index))
-						goto Retry;
-				}
-				/* drop caches */
-				prefetch_set_unused(slot->my_ring_index);
-				pgBufferUsage.prefetch.expired += 1;
-				MyNeonCounters->getpage_prefetch_discards_total++;
-				/* make it look like a prefetch cache miss */
-				entry = NULL;
-			}
-		}
-
-		do
-		{
-			if (entry == NULL)
-			{
-				ring_index = prefetch_register_bufferv(hashkey.buftag, reqlsns, 1, NULL, false);
-				Assert(ring_index != UINT64_MAX);
-				slot = GetPrfSlot(ring_index);
-			}
-			else
-			{
-				/*
-				 * Empty our reference to the prefetch buffer's hash entry. When
-				 * we wait for prefetches, the entry reference is invalidated by
-				 * potential updates to the hash, and when we reconnect to the
-				 * pageserver the prefetch we're waiting for may be dropped, in
-				 * which case we need to retry and take the branch above.
-				 */
-				entry = NULL;
-			}
-
-			Assert(slot->my_ring_index == ring_index);
-			Assert(MyPState->ring_last <= ring_index &&
-				   MyPState->ring_unused > ring_index);
-			Assert(slot->status != PRFS_UNUSED);
-			Assert(GetPrfSlot(ring_index) == slot);
-
-		} while (!prefetch_wait_for(ring_index));
-
-		Assert(slot->status == PRFS_RECEIVED);
-		Assert(memcmp(&hashkey.buftag, &slot->buftag, sizeof(BufferTag)) == 0);
-		Assert(hashkey.buftag.blockNum == base_blockno + i);
-
-		resp = slot->response;
-
-		switch (resp->tag)
-		{
-			case T_NeonGetPageResponse:
-			{
-				NeonGetPageResponse* getpage_resp = (NeonGetPageResponse *) resp;
-				if (neon_protocol_version >= 3)
-				{
-					if (resp->reqid != slot->reqid ||
-						resp->lsn != slot->request_lsns.request_lsn ||
-						resp->not_modified_since != slot->request_lsns.not_modified_since ||
-						!RelFileInfoEquals(getpage_resp->req.rinfo, rinfo) ||
-						getpage_resp->req.forknum != forkNum ||
-						getpage_resp->req.blkno != base_blockno + i)
-					{
-						NEON_PANIC_CONNECTION_STATE(-1, PANIC,
-													"Unexpect response {reqid=%lx,lsn=%X/%08X, since=%X/%08X, rel=%u/%u/%u.%u, block=%u} to get page request {reqid=%lx,lsn=%X/%08X, since=%X/%08X, rel=%u/%u/%u.%u, block=%u}",
-													resp->reqid, LSN_FORMAT_ARGS(resp->lsn), LSN_FORMAT_ARGS(resp->not_modified_since), RelFileInfoFmt(getpage_resp->req.rinfo), getpage_resp->req.forknum, getpage_resp->req.blkno,
-													slot->reqid, LSN_FORMAT_ARGS(slot->request_lsns.request_lsn), LSN_FORMAT_ARGS(slot->request_lsns.not_modified_since), RelFileInfoFmt(rinfo), forkNum, base_blockno + i);
-					}
-				}
-				memcpy(buffer, getpage_resp->page, BLCKSZ);
-
-				/*
-				 * With lfc_store_prefetch_result=true prefetch result is stored in LFC in prefetch_pump_state when response is received
-				 * from page server. But if lfc_store_prefetch_result=false then it is not yet stored in LFC and we have to do it here
-				 * under buffer lock.
-				 */
-				if (!lfc_store_prefetch_result)
-					lfc_write(rinfo, forkNum, blockno, buffer);
-				break;
-			}
-			case T_NeonErrorResponse:
-				if (neon_protocol_version >= 3)
-				{
-					if (resp->reqid != slot->reqid ||
-						resp->lsn != slot->request_lsns.request_lsn ||
-						resp->not_modified_since != slot->request_lsns.not_modified_since)
-					{
-						elog(WARNING, NEON_TAG "Error message {reqid=%lx,lsn=%X/%08X, since=%X/%08X} doesn't match get relsize request {reqid=%lx,lsn=%X/%08X, since=%X/%08X}",
-							 resp->reqid, LSN_FORMAT_ARGS(resp->lsn), LSN_FORMAT_ARGS(resp->not_modified_since),
-							 slot->reqid, LSN_FORMAT_ARGS(slot->request_lsns.request_lsn), LSN_FORMAT_ARGS(slot->request_lsns.not_modified_since));
-					}
-				}
-				ereport(ERROR,
-						(errcode(ERRCODE_IO_ERROR),
-						 errmsg(NEON_TAG "[shard %d, reqid %lx] could not read block %u in rel %u/%u/%u.%u from page server at lsn %X/%08X",
-								slot->shard_no, resp->reqid, blockno, RelFileInfoFmt(rinfo),
-								forkNum, LSN_FORMAT_ARGS(reqlsns->effective_request_lsn)),
-						 errdetail("page server returned error: %s",
-								   ((NeonErrorResponse *) resp)->message)));
-				break;
-			default:
-				NEON_PANIC_CONNECTION_STATE(slot->shard_no, PANIC,
-											"Expected GetPage (0x%02x) or Error (0x%02x) response to GetPageRequest, but got 0x%02x",
-											T_NeonGetPageResponse, T_NeonErrorResponse, resp->tag);
-		}
-
-		/* buffer was used, clean up for later reuse */
-		prefetch_set_unused(ring_index);
-		prefetch_cleanup_trailing_unused();
-
-		end_ts = GetCurrentTimestamp();
-		inc_getpage_wait(end_ts >= start_ts ? (end_ts - start_ts) : 0);
-	}
-}
-
 /*
  * While function is defined in the neon extension it's used within neon_test_utils directly.
  * To avoid breaking tests in the runtime please keep function signature in sync.
@@ -3353,7 +1272,7 @@ void
 neon_read_at_lsn(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
 				 neon_request_lsns request_lsns, void *buffer)
 {
-	neon_read_at_lsnv(rinfo, forkNum, blkno, &request_lsns, &buffer, 1, NULL);
+	communicator_read_at_lsnv(rinfo, forkNum, blkno, &request_lsns, &buffer, 1, NULL);
 }
 
 #if PG_MAJORVERSION_NUM < 17
@@ -3369,6 +1288,8 @@ neon_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, void *buffer
 #endif
 {
 	neon_request_lsns request_lsns;
+	bits8		present;
+	void	   *bufferp;
 
 	switch (reln->smgr_relpersistence)
 	{
@@ -3388,11 +1309,13 @@ neon_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, void *buffer
 	}
 
 	/* Try to read PS results if they are available */
-	prefetch_pump_state(false);
+	communicator_prefetch_pump_state(false);
 
 	neon_get_request_lsns(InfoFromSMgrRel(reln), forkNum, blkno, &request_lsns, 1);
 
-	if (prefetch_lookup(InfoFromSMgrRel(reln), forkNum, blkno, &request_lsns, buffer))
+	present = 0;
+	bufferp = buffer;
+	if (communicator_prefetch_lookupv(InfoFromSMgrRel(reln), forkNum, blkno, &request_lsns, 1, &bufferp, &present))
 	{
 		/* Prefetch hit */
 		return;
@@ -3410,7 +1333,7 @@ neon_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, void *buffer
 	/*
 	 * Try to receive prefetch results once again just to make sure we don't leave the smgr code while the OS might still have buffered bytes.
 	 */
-	prefetch_pump_state(false);
+	communicator_prefetch_pump_state(false);
 
 #ifdef DEBUG_COMPARE_LOCAL
 	if (forkNum == MAIN_FORKNUM && IS_LOCAL_REL(reln))
@@ -3520,16 +1443,16 @@ neon_readv(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
 				 nblocks, PG_IOV_MAX);
 
 	/* Try to read PS results if they are available */
-	prefetch_pump_state(false);
+	communicator_prefetch_pump_state(false);
 
 	neon_get_request_lsns(InfoFromSMgrRel(reln), forknum, blocknum,
 						  request_lsns, nblocks);
 
 	memset(read_pages, 0, sizeof(read_pages));
 
-	prefetch_result = prefetch_lookupv(InfoFromSMgrRel(reln), forknum,
-									   blocknum, request_lsns, nblocks,
-									   buffers, read_pages);
+	prefetch_result = communicator_prefetch_lookupv(InfoFromSMgrRel(reln), forknum,
+													blocknum, request_lsns, nblocks,
+													buffers, read_pages);
 
 	if (prefetch_result == nblocks)
 		return;
@@ -3545,13 +1468,13 @@ neon_readv(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
 	if (prefetch_result + lfc_result == nblocks)
 		return;
 
-	neon_read_at_lsnv(InfoFromSMgrRel(reln), forknum, blocknum, request_lsns,
-					  buffers, nblocks, read_pages);
+	communicator_read_at_lsnv(InfoFromSMgrRel(reln), forknum, blocknum, request_lsns,
+							  buffers, nblocks, read_pages);
 
 	/*
 	 * Try to receive prefetch results once again just to make sure we don't leave the smgr code while the OS might still have buffered bytes.
 	 */
-	prefetch_pump_state(false);
+	communicator_prefetch_pump_state(false);
 
 #ifdef DEBUG_COMPARE_LOCAL
 	if (forknum == MAIN_FORKNUM && IS_LOCAL_REL(reln))
@@ -3564,7 +1487,7 @@ neon_readv(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
 		for (int i = 0; i < nblocks; i++)
 		{
 			BlockNumber blkno = blocknum + i;
-			if (!BITMAP_ISSET(read, i))
+			if (!BITMAP_ISSET(read_pages, i))
 				continue;
 
 #if PG_MAJORVERSION_NUM >= 17
@@ -3687,6 +1610,9 @@ neon_write(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, const vo
 #ifndef DEBUG_COMPARE_LOCAL
 			/* This is a bit tricky. Check if the relation exists locally */
 			if (mdexists(reln, forknum))
+#else
+			if (mdexists(reln, INIT_FORKNUM))
+#endif
 			{
 				/* It exists locally. Guess it's unlogged then. */
 #if PG_MAJORVERSION_NUM >= 17
@@ -3703,7 +1629,6 @@ neon_write(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, const vo
 				 */
 				return;
 			}
-#endif
 			break;
 
 		case RELPERSISTENCE_PERMANENT:
@@ -3734,7 +1659,7 @@ neon_write(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, const vo
 
 	lfc_write(InfoFromSMgrRel(reln), forknum, blocknum, buffer);
 
-	prefetch_pump_state(false);
+	communicator_prefetch_pump_state(false);
 
 #ifdef DEBUG_COMPARE_LOCAL
 	if (IS_LOCAL_REL(reln))
@@ -3760,6 +1685,9 @@ neon_writev(SMgrRelation reln, ForkNumber forknum, BlockNumber blkno,
 #ifndef DEBUG_COMPARE_LOCAL
 			/* This is a bit tricky. Check if the relation exists locally */
 			if (mdexists(reln, forknum))
+#else
+			if (mdexists(reln, INIT_FORKNUM))
+#endif
 			{
 				/* It exists locally. Guess it's unlogged then. */
 				mdwritev(reln, forknum, blkno, buffers, nblocks, skipFsync);
@@ -3773,7 +1701,6 @@ neon_writev(SMgrRelation reln, ForkNumber forknum, BlockNumber blkno,
 				 */
 				return;
 			}
-#endif
 			break;
 
 		case RELPERSISTENCE_PERMANENT:
@@ -3794,7 +1721,7 @@ neon_writev(SMgrRelation reln, ForkNumber forknum, BlockNumber blkno,
 
 	lfc_writev(InfoFromSMgrRel(reln), forknum, blkno, buffers, nblocks);
 
-	prefetch_pump_state(false);
+	communicator_prefetch_pump_state(false);
 
 #ifdef DEBUG_COMPARE_LOCAL
 	if (IS_LOCAL_REL(reln))
@@ -3810,7 +1737,6 @@ neon_writev(SMgrRelation reln, ForkNumber forknum, BlockNumber blkno,
 static BlockNumber
 neon_nblocks(SMgrRelation reln, ForkNumber forknum)
 {
-	NeonResponse *resp;
 	BlockNumber n_blocks;
 	neon_request_lsns request_lsns;
 
@@ -3842,74 +1768,15 @@ neon_nblocks(SMgrRelation reln, ForkNumber forknum)
 	neon_get_request_lsns(InfoFromSMgrRel(reln), forknum,
 						  REL_METADATA_PSEUDO_BLOCKNO, &request_lsns, 1);
 
-	{
-		NeonNblocksRequest request = {
-			.hdr.tag = T_NeonNblocksRequest,
-			.hdr.reqid = GENERATE_REQUEST_ID(),
-			.hdr.lsn = request_lsns.request_lsn,
-			.hdr.not_modified_since = request_lsns.not_modified_since,
-			.rinfo = InfoFromSMgrRel(reln),
-			.forknum = forknum,
-		};
-
-		resp = page_server_request(&request);
-
-		switch (resp->tag)
-		{
-			case T_NeonNblocksResponse:
-			{
-				NeonNblocksResponse * relsize_resp = (NeonNblocksResponse *) resp;
-				if (neon_protocol_version >= 3)
-				{
-					if (!equal_requests(resp, &request.hdr) ||
-						!RelFileInfoEquals(relsize_resp->req.rinfo, request.rinfo) ||
-						relsize_resp->req.forknum != forknum)
-					{
-						NEON_PANIC_CONNECTION_STATE(-1, PANIC,
-													"Unexpect response {reqid=%lx,lsn=%X/%08X, since=%X/%08X, rel=%u/%u/%u.%u} to get relsize request {reqid=%lx,lsn=%X/%08X, since=%X/%08X, rel=%u/%u/%u.%u}",
-													resp->reqid, LSN_FORMAT_ARGS(resp->lsn), LSN_FORMAT_ARGS(resp->not_modified_since), RelFileInfoFmt(relsize_resp->req.rinfo), relsize_resp->req.forknum,
-													request.hdr.reqid, LSN_FORMAT_ARGS(request.hdr.lsn), LSN_FORMAT_ARGS(request.hdr.not_modified_since), RelFileInfoFmt(request.rinfo), forknum);
-					}
-				}
-				n_blocks = relsize_resp->n_blocks;
-				break;
-			}
-			case T_NeonErrorResponse:
-				if (neon_protocol_version >= 3)
-				{
-					if (!equal_requests(resp, &request.hdr))
-					{
-						elog(WARNING, NEON_TAG "Error message {reqid=%lx,lsn=%X/%08X, since=%X/%08X} doesn't match get relsize request {reqid=%lx,lsn=%X/%08X, since=%X/%08X}",
-							 resp->reqid, LSN_FORMAT_ARGS(resp->lsn), LSN_FORMAT_ARGS(resp->not_modified_since),
-							 request.hdr.reqid, LSN_FORMAT_ARGS(request.hdr.lsn), LSN_FORMAT_ARGS(request.hdr.not_modified_since));
-					}
-				}
-				ereport(ERROR,
-						(errcode(ERRCODE_IO_ERROR),
-						 errmsg(NEON_TAG "[reqid %lx] could not read relation size of rel %u/%u/%u.%u from page server at lsn %X/%08X",
-								resp->reqid,
-								RelFileInfoFmt(InfoFromSMgrRel(reln)),
-								forknum,
-								LSN_FORMAT_ARGS(request_lsns.effective_request_lsn)),
-						 errdetail("page server returned error: %s",
-								   ((NeonErrorResponse *) resp)->message)));
-				break;
-
-			default:
-				NEON_PANIC_CONNECTION_STATE(-1, PANIC,
-											"Expected Nblocks (0x%02x) or Error (0x%02x) response to NblocksRequest, but got 0x%02x",
-											T_NeonNblocksResponse, T_NeonErrorResponse, resp->tag);
-		}
-		update_cached_relsize(InfoFromSMgrRel(reln), forknum, n_blocks);
+	n_blocks = communicator_nblocks(InfoFromSMgrRel(reln), forknum, &request_lsns);
+	update_cached_relsize(InfoFromSMgrRel(reln), forknum, n_blocks);
 
-		neon_log(SmgrTrace, "neon_nblocks: rel %u/%u/%u fork %u (request LSN %X/%08X): %u blocks",
-				 RelFileInfoFmt(InfoFromSMgrRel(reln)),
-				 forknum,
-				 LSN_FORMAT_ARGS(request_lsns.effective_request_lsn),
-				 n_blocks);
+	neon_log(SmgrTrace, "neon_nblocks: rel %u/%u/%u fork %u (request LSN %X/%08X): %u blocks",
+			 RelFileInfoFmt(InfoFromSMgrRel(reln)),
+			 forknum,
+			 LSN_FORMAT_ARGS(request_lsns.effective_request_lsn),
+			 n_blocks);
 
-		pfree(resp);
-	}
 	return n_blocks;
 }
 
@@ -3919,7 +1786,6 @@ neon_nblocks(SMgrRelation reln, ForkNumber forknum)
 int64
 neon_dbsize(Oid dbNode)
 {
-	NeonResponse *resp;
 	int64		db_size;
 	neon_request_lsns request_lsns;
 	NRelFileInfo dummy_node = {0};
@@ -3927,66 +1793,11 @@ neon_dbsize(Oid dbNode)
 	neon_get_request_lsns(dummy_node, MAIN_FORKNUM,
 						  REL_METADATA_PSEUDO_BLOCKNO, &request_lsns, 1);
 
-	{
-		NeonDbSizeRequest request = {
-			.hdr.tag = T_NeonDbSizeRequest,
-			.hdr.reqid = GENERATE_REQUEST_ID(),
-			.hdr.lsn = request_lsns.request_lsn,
-			.hdr.not_modified_since = request_lsns.not_modified_since,
-			.dbNode = dbNode,
-		};
-
-		resp = page_server_request(&request);
-
-		switch (resp->tag)
-		{
-			case T_NeonDbSizeResponse:
-			{
-				NeonDbSizeResponse* dbsize_resp = (NeonDbSizeResponse *) resp;
-				if (neon_protocol_version >= 3)
-				{
-					if (!equal_requests(resp, &request.hdr) ||
-						dbsize_resp->req.dbNode != dbNode)
-					{
-						NEON_PANIC_CONNECTION_STATE(-1, PANIC,
-													"Unexpect response {reqid=%lx,lsn=%X/%08X, since=%X/%08X, dbNode=%u} to get DB size request {reqid=%lx,lsn=%X/%08X, since=%X/%08X, dbNode=%u}",
-													resp->reqid, LSN_FORMAT_ARGS(resp->lsn), LSN_FORMAT_ARGS(resp->not_modified_since), dbsize_resp->req.dbNode,
-													request.hdr.reqid, LSN_FORMAT_ARGS(request.hdr.lsn), LSN_FORMAT_ARGS(request.hdr.not_modified_since), dbNode);
-					}
-				}
-				db_size = dbsize_resp->db_size;
-				break;
-			}
-			case T_NeonErrorResponse:
-				if (neon_protocol_version >= 3)
-				{
-					if (!equal_requests(resp, &request.hdr))
-					{
-						elog(WARNING, NEON_TAG "Error message {reqid=%lx,lsn=%X/%08X, since=%X/%08X} doesn't match get DB size request {reqid=%lx,lsn=%X/%08X, since=%X/%08X}",
-							 resp->reqid, LSN_FORMAT_ARGS(resp->lsn), LSN_FORMAT_ARGS(resp->not_modified_since),
-							 request.hdr.reqid, LSN_FORMAT_ARGS(request.hdr.lsn), LSN_FORMAT_ARGS(request.hdr.not_modified_since));
-					}
-				}
-				ereport(ERROR,
-						(errcode(ERRCODE_IO_ERROR),
-						 errmsg(NEON_TAG "[reqid %lx] could not read db size of db %u from page server at lsn %X/%08X",
-								resp->reqid,
-								dbNode, LSN_FORMAT_ARGS(request_lsns.effective_request_lsn)),
-						 errdetail("page server returned error: %s",
-								   ((NeonErrorResponse *) resp)->message)));
-				break;
-
-			default:
-				NEON_PANIC_CONNECTION_STATE(-1, PANIC,
-											"Expected DbSize (0x%02x) or Error (0x%02x) response to DbSizeRequest, but got 0x%02x",
-											T_NeonDbSizeResponse, T_NeonErrorResponse, resp->tag);
-		}
+	db_size = communicator_dbsize(dbNode, &request_lsns);
 
-		neon_log(SmgrTrace, "neon_dbsize: db %u (request LSN %X/%08X): %ld bytes",
-				 dbNode, LSN_FORMAT_ARGS(request_lsns.effective_request_lsn), db_size);
+	neon_log(SmgrTrace, "neon_dbsize: db %u (request LSN %X/%08X): %ld bytes",
+			 dbNode, LSN_FORMAT_ARGS(request_lsns.effective_request_lsn), db_size);
 
-		pfree(resp);
-	}
 	return db_size;
 }
 
@@ -4085,7 +1896,7 @@ neon_immedsync(SMgrRelation reln, ForkNumber forknum)
 
 	neon_log(SmgrTrace, "[NEON_SMGR] immedsync noop");
 
-	prefetch_pump_state(false);
+	communicator_prefetch_pump_state(false);
 
 #ifdef DEBUG_COMPARE_LOCAL
 	if (IS_LOCAL_REL(reln))
@@ -4187,6 +1998,8 @@ neon_start_unlogged_build(SMgrRelation reln)
 #ifndef DEBUG_COMPARE_LOCAL
  	if (!IsParallelWorker())
 		mdcreate(reln, MAIN_FORKNUM, false);
+#else
+	mdcreate(reln, INIT_FORKNUM, false);
 #endif
 }
 
@@ -4265,6 +2078,8 @@ neon_end_unlogged_build(SMgrRelation reln)
 #ifndef DEBUG_COMPARE_LOCAL
 			/* use isRedo == true, so that we drop it immediately */
 			mdunlink(rinfob, forknum, true);
+#else
+			mdunlink(rinfob, INIT_FORKNUM, true);
 #endif
 		}
 	}
@@ -4282,9 +2097,7 @@ neon_read_slru_segment(SMgrRelation reln, const char* path, int segno, void* buf
 				not_modified_since;
 	SlruKind	kind;
 	int			n_blocks;
-	shardno_t	shard_no = 0; /* All SLRUs are at shard 0 */
-	NeonResponse *resp;
-	NeonGetSlruSegmentRequest request;
+	neon_request_lsns request_lsns;
 
 	/*
 	 * Compute a request LSN to use, similar to neon_get_request_lsns() but the
@@ -4323,74 +2136,12 @@ neon_read_slru_segment(SMgrRelation reln, const char* path, int segno, void* buf
 	else
 		return -1;
 
-	request = (NeonGetSlruSegmentRequest) {
-		.hdr.tag = T_NeonGetSlruSegmentRequest,
-		.hdr.reqid = GENERATE_REQUEST_ID(),
-		.hdr.lsn = request_lsn,
-		.hdr.not_modified_since = not_modified_since,
-		.kind = kind,
-		.segno = segno
-	};
-
-	do
-	{
-		while (!page_server->send(shard_no, &request.hdr) || !page_server->flush(shard_no));
-
-		consume_prefetch_responses();
-
-		resp = page_server->receive(shard_no);
-	} while (resp == NULL);
+	request_lsns.request_lsn = request_lsn;
+	request_lsns.not_modified_since = not_modified_since;
+	request_lsns.effective_request_lsn = request_lsn;
 
-	switch (resp->tag)
-	{
-		case T_NeonGetSlruSegmentResponse:
-		{
-			NeonGetSlruSegmentResponse* slru_resp = (NeonGetSlruSegmentResponse *) resp;
-			if (neon_protocol_version >= 3)
-			{
-				if (!equal_requests(resp, &request.hdr) ||
-					slru_resp->req.kind != kind ||
-					slru_resp->req.segno != segno)
-				{
-					NEON_PANIC_CONNECTION_STATE(-1, PANIC,
-												"Unexpect response {reqid=%lx,lsn=%X/%08X, since=%X/%08X, kind=%u, segno=%u} to get SLRU segment request {reqid=%lx,lsn=%X/%08X, since=%X/%08X, kind=%u, segno=%u}",
-												resp->reqid, LSN_FORMAT_ARGS(resp->lsn), LSN_FORMAT_ARGS(resp->not_modified_since), slru_resp->req.kind, slru_resp->req.segno,
-												request.hdr.reqid, LSN_FORMAT_ARGS(request.hdr.lsn), LSN_FORMAT_ARGS(request.hdr.not_modified_since), kind, segno);
-				}
-			}
-			n_blocks = slru_resp->n_blocks;
-			memcpy(buffer, slru_resp->data, n_blocks*BLCKSZ);
-			break;
-		}
-		case T_NeonErrorResponse:
-			if (neon_protocol_version >= 3)
-			{
-				if (!equal_requests(resp, &request.hdr))
-				{
-					elog(WARNING, NEON_TAG "Error message {reqid=%lx,lsn=%X/%08X, since=%X/%08X} doesn't match get SLRU segment request {reqid=%lx,lsn=%X/%08X, since=%X/%08X}",
-						 resp->reqid, LSN_FORMAT_ARGS(resp->lsn), LSN_FORMAT_ARGS(resp->not_modified_since),
-						 request.hdr.reqid, LSN_FORMAT_ARGS(request.hdr.lsn), LSN_FORMAT_ARGS(request.hdr.not_modified_since));
-				}
-			}
-			ereport(ERROR,
-					(errcode(ERRCODE_IO_ERROR),
-					 errmsg(NEON_TAG "[reqid %lx] could not read SLRU %d segment %d at lsn %X/%08X",
-							resp->reqid,
-							kind,
-							segno,
-							LSN_FORMAT_ARGS(request_lsn)),
-					 errdetail("page server returned error: %s",
-							   ((NeonErrorResponse *) resp)->message)));
-			break;
-
-		default:
-			NEON_PANIC_CONNECTION_STATE(-1, PANIC,
-										"Expected GetSlruSegment (0x%02x) or Error (0x%02x) response to GetSlruSegmentRequest, but got 0x%02x",
-										T_NeonGetSlruSegmentResponse, T_NeonErrorResponse, resp->tag);
-	}
-	pfree(resp);
+	n_blocks = communicator_read_slru_segment(kind, segno, &request_lsns, buffer);
 
-	reconfigure_timeout_if_needed();
 	return n_blocks;
 }
 
@@ -4426,7 +2177,7 @@ AtEOXact_neon(XactEvent event, void *arg)
 			}
 			break;
 	}
-	reconfigure_timeout_if_needed();
+	communicator_reconfigure_timeout_if_needed();
 }
 
 static const struct f_smgr neon_smgr =
@@ -4484,6 +2235,7 @@ smgr_init_neon(void)
 
 	smgr_init_standard();
 	neon_init();
+	communicator_init();
 }
 
 
@@ -4513,25 +2265,14 @@ neon_extend_rel_size(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blkno,
 		 * This length is later reused when we open the smgr to read the
 		 * block, which is fine and expected.
 		 */
-		NeonResponse *response;
-		NeonNblocksResponse *nbresponse;
-		NeonNblocksRequest request = {
-			.hdr = (NeonRequest) {
-				.tag = T_NeonNblocksRequest,
-				.reqid = GENERATE_REQUEST_ID(),
-				.lsn = end_recptr,
-				.not_modified_since = end_recptr,
-			},
-			.rinfo = rinfo,
-			.forknum = forknum,
-		};
-
-		response = page_server_request(&request);
-
-		Assert(response->tag == T_NeonNblocksResponse);
-		nbresponse = (NeonNblocksResponse *) response;
-
-		relsize = Max(nbresponse->n_blocks, blkno + 1);
+		neon_request_lsns request_lsns;
+
+		neon_get_request_lsns(rinfo, forknum,
+							  REL_METADATA_PSEUDO_BLOCKNO, &request_lsns, 1);
+
+		relsize = communicator_nblocks(rinfo, forknum, &request_lsns);
+
+		relsize = Max(relsize, blkno + 1);
 
 		set_cached_relsize(rinfo, forknum, relsize);
 		neon_set_lwlsn_relation(end_recptr, rinfo, forknum);
@@ -4683,94 +2424,3 @@ neon_redo_read_buffer_filter(XLogReaderState *record, uint8 block_id)
 	}
 	return no_redo_needed;
 }
-
-static void
-reconfigure_timeout_if_needed(void)
-{
-	bool	needs_set = MyPState->ring_receive != MyPState->ring_unused &&
-						readahead_getpage_pull_timeout_ms > 0;
-
-	if (needs_set != timeout_set)
-	{
-		/* The background writer doens't (shouldn't) read any pages */
-		Assert(!AmBackgroundWriterProcess());
-		/* The checkpointer doens't (shouldn't) read any pages */
-		Assert(!AmCheckpointerProcess());
-
-		if (unlikely(PS_TIMEOUT_ID == 0))
-		{
-			PS_TIMEOUT_ID = RegisterTimeout(USER_TIMEOUT, pagestore_timeout_handler);
-		}
-
-		if (needs_set)
-		{
-#if PG_MAJORVERSION_NUM <= 14
-			enable_timeout_after(PS_TIMEOUT_ID, readahead_getpage_pull_timeout_ms);
-#else
-			enable_timeout_every(
-				PS_TIMEOUT_ID,
-				TimestampTzPlusMilliseconds(GetCurrentTimestamp(),
-											readahead_getpage_pull_timeout_ms),
-				readahead_getpage_pull_timeout_ms
-			);
-#endif
-			timeout_set = true;
-		}
-		else
-		{
-			Assert(timeout_set);
-			disable_timeout(PS_TIMEOUT_ID, false);
-			timeout_set = false;
-		}
-	}
-}
-
-static void
-pagestore_timeout_handler(void)
-{
-#if PG_MAJORVERSION_NUM <= 14
-	/*
-	 * PG14: Setting a repeating timeout is not possible, so we signal here
-	 * that the timeout has already been reset, and by telling the system
-	 * that system will re-schedule it later if we need to.
-	 */
-	timeout_set = false;
-#endif
-	timeout_signaled = true;
-	InterruptPending = true;
-}
-
-static process_interrupts_callback_t prev_interrupt_cb;
-
-/*
- * Process new data received in our active PageStream sockets.
- *
- * This relies on the invariant that all pipelined yet-to-be-received requests
- * are getPage requests managed by MyPState. This is currently true, any
- * modification will probably require some stuff to make it work again.
- */
-static bool
-pagestore_smgr_processinterrupts(void)
-{
-	if (timeout_signaled)
-	{
-		if (!readpage_reentrant_guard && readahead_getpage_pull_timeout_ms > 0)
-			prefetch_pump_state(true);
-
-		timeout_signaled = false;
-		reconfigure_timeout_if_needed();
-	}
-
-	if (!prev_interrupt_cb)
-		return false;
-
-	return prev_interrupt_cb();
-}
-
-
-void
-pagestore_smgr_init(void)
-{
-	prev_interrupt_cb = ProcessInterruptsCallback;
-	ProcessInterruptsCallback = pagestore_smgr_processinterrupts;
-}
diff --git a/pgxn/neon/walproposer.c b/pgxn/neon/walproposer.c
index 0336d63e8d70..b95b1451e4ed 100644
--- a/pgxn/neon/walproposer.c
+++ b/pgxn/neon/walproposer.c
@@ -99,6 +99,9 @@ WalProposerCreate(WalProposerConfig *config, walproposer_api api)
 	wp->config = config;
 	wp->api = api;
 	wp->state = WPS_COLLECTING_TERMS;
+	wp->mconf.generation = INVALID_GENERATION;
+	wp->mconf.members.len = 0;
+	wp->mconf.new_members.len = 0;
 
 	wp_log(LOG, "neon.safekeepers=%s", wp->config->safekeepers_list);
 
@@ -170,6 +173,8 @@ WalProposerCreate(WalProposerConfig *config, walproposer_api api)
 
 	if (wp->config->proto_version != 2 && wp->config->proto_version != 3)
 		wp_log(FATAL, "unsupported safekeeper protocol version %d", wp->config->proto_version);
+	if (wp->safekeepers_generation > INVALID_GENERATION && wp->config->proto_version < 3)
+		wp_log(FATAL, "enabling generations requires protocol version 3");
 	wp_log(LOG, "using safekeeper protocol version %d", wp->config->proto_version);
 
 	/* Fill the greeting package */
@@ -214,7 +219,7 @@ WalProposerFree(WalProposer *wp)
 static bool
 WalProposerGenerationsEnabled(WalProposer *wp)
 {
-	return wp->safekeepers_generation != 0;
+	return wp->safekeepers_generation != INVALID_GENERATION;
 }
 
 /*
@@ -723,13 +728,176 @@ SendProposerGreeting(Safekeeper *sk)
 	BlockingWrite(sk, sk->outbuf.data, sk->outbuf.len, SS_HANDSHAKE_RECV);
 }
 
+/*
+ * Assuming `sk` sent its node id, find such member(s) in wp->mconf and set ptr in
+ * members_safekeepers & new_members_safekeepers to sk.
+ */
+static void
+UpdateMemberSafekeeperPtr(WalProposer *wp, Safekeeper *sk)
+{
+	/* members_safekeepers etc are fixed size, sanity check mconf size */
+	if (wp->mconf.members.len > MAX_SAFEKEEPERS)
+		wp_log(FATAL, "too many members %d in mconf", wp->mconf.members.len);
+	if (wp->mconf.new_members.len > MAX_SAFEKEEPERS)
+		wp_log(FATAL, "too many new_members %d in mconf", wp->mconf.new_members.len);
+
+	/* node id is not known until greeting is received */
+	if (sk->state < SS_WAIT_VOTING)
+		return;
+
+	/* 0 is assumed to be invalid node id, should never happen */
+	if (sk->greetResponse.nodeId == 0)
+	{
+		wp_log(WARNING, "safekeeper %s:%s sent zero node id", sk->host, sk->port);
+		return;
+	}
+
+	for (uint32 i = 0; i < wp->mconf.members.len; i++)
+	{
+		SafekeeperId *sk_id = &wp->mconf.members.m[i];
+
+		if (wp->mconf.members.m[i].node_id == sk->greetResponse.nodeId)
+		{
+			/*
+			 * If mconf or list of safekeepers to connect to changed (the
+			 * latter always currently goes through restart though),
+			 * ResetMemberSafekeeperPtrs is expected to be called before
+			 * UpdateMemberSafekeeperPtr. So, other value suggests that we are
+			 * connected to the same sk under different host name, complain
+			 * about that.
+			 */
+			if (wp->members_safekeepers[i] != NULL && wp->members_safekeepers[i] != sk)
+			{
+				wp_log(WARNING, "safekeeper {id = %lu, ep = %s:%u } in members[%u] is already mapped to connection slot %lu",
+					   sk_id->node_id, sk_id->host, sk_id->port, i, wp->members_safekeepers[i] - wp->safekeeper);
+			}
+			wp_log(LOG, "safekeeper {id = %lu, ep = %s:%u } in members[%u] mapped to connection slot %lu",
+				   sk_id->node_id, sk_id->host, sk_id->port, i, sk - wp->safekeeper);
+			wp->members_safekeepers[i] = sk;
+		}
+	}
+	/* repeat for new_members */
+	for (uint32 i = 0; i < wp->mconf.new_members.len; i++)
+	{
+		SafekeeperId *sk_id = &wp->mconf.new_members.m[i];
+
+		if (wp->mconf.new_members.m[i].node_id == sk->greetResponse.nodeId)
+		{
+			if (wp->new_members_safekeepers[i] != NULL && wp->new_members_safekeepers[i] != sk)
+			{
+				wp_log(WARNING, "safekeeper {id = %lu, ep = %s:%u } in new_members[%u] is already mapped to connection slot %lu",
+					   sk_id->node_id, sk_id->host, sk_id->port, i, wp->new_members_safekeepers[i] - wp->safekeeper);
+			}
+			wp_log(LOG, "safekeeper {id = %lu, ep = %s:%u } in new_members[%u] mapped to connection slot %lu",
+				   sk_id->node_id, sk_id->host, sk_id->port, i, sk - wp->safekeeper);
+			wp->new_members_safekeepers[i] = sk;
+		}
+	}
+}
+
+/*
+ * Reset wp->members_safekeepers & new_members_safekeepers and refill them.
+ * Called after wp changes mconf.
+ */
+static void
+ResetMemberSafekeeperPtrs(WalProposer *wp)
+{
+	memset(&wp->members_safekeepers, 0, sizeof(Safekeeper *) * MAX_SAFEKEEPERS);
+	memset(&wp->new_members_safekeepers, 0, sizeof(Safekeeper *) * MAX_SAFEKEEPERS);
+	for (int i = 0; i < wp->n_safekeepers; i++)
+	{
+		if (wp->safekeeper[i].state >= SS_WAIT_VOTING)
+			UpdateMemberSafekeeperPtr(wp, &wp->safekeeper[i]);
+	}
+}
+
+static uint32
+MsetQuorum(MemberSet *mset)
+{
+	Assert(mset->len > 0);
+	return mset->len / 2 + 1;
+}
+
+/* Does n forms quorum in mset? */
+static bool
+MsetHasQuorum(MemberSet *mset, uint32 n)
+{
+	return n >= MsetQuorum(mset);
+}
+
+/*
+ * TermsCollected helper for a single member set `mset`.
+ *
+ * `msk` is the member -> safekeeper mapping for mset, i.e. members_safekeepers
+ * or new_members_safekeepers.
+ */
+static bool
+TermsCollectedMset(WalProposer *wp, MemberSet *mset, Safekeeper **msk, StringInfo s)
+{
+	uint32		n_greeted = 0;
+
+	for (uint32 i = 0; i < wp->mconf.members.len; i++)
+	{
+		Safekeeper *sk = msk[i];
+
+		if (sk != NULL && sk->state == SS_WAIT_VOTING)
+		{
+			if (n_greeted > 0)
+				appendStringInfoString(s, ", ");
+			appendStringInfo(s, "{id = %lu, ep = %s:%s}", sk->greetResponse.nodeId, sk->host, sk->port);
+			n_greeted++;
+		}
+	}
+	appendStringInfo(s, ", %u/%u total", n_greeted, mset->len);
+	return MsetHasQuorum(mset, n_greeted);
+}
+
 /*
  * Have we received greeting from enough (quorum) safekeepers to start voting?
  */
 static bool
 TermsCollected(WalProposer *wp)
 {
-	return wp->n_connected >= wp->quorum;
+	StringInfoData s;			/* str for logging */
+	bool		collected = false;
+
+	/* legacy: generations disabled */
+	if (!WalProposerGenerationsEnabled(wp) && wp->mconf.generation == INVALID_GENERATION)
+	{
+		collected = wp->n_connected >= wp->quorum;
+		if (collected)
+		{
+			wp->propTerm++;
+			wp_log(LOG, "walproposer connected to quorum (%d) safekeepers, propTerm=" INT64_FORMAT ", starting voting", wp->quorum, wp->propTerm);
+		}
+		return collected;
+	}
+
+	/*
+	 * With generations enabled, we start campaign only when 1) some mconf is
+	 * actually received 2) we have greetings from majority of members as well
+	 * as from majority of new_members if it exists.
+	 */
+	if (wp->mconf.generation == INVALID_GENERATION)
+		return false;
+
+	initStringInfo(&s);
+	appendStringInfoString(&s, "mset greeters: ");
+	if (!TermsCollectedMset(wp, &wp->mconf.members, wp->members_safekeepers, &s))
+		goto res;
+	if (wp->mconf.new_members.len > 0)
+	{
+		appendStringInfoString(&s, ", new_mset greeters: ");
+		if (!TermsCollectedMset(wp, &wp->mconf.new_members, wp->new_members_safekeepers, &s))
+			goto res;
+	}
+	wp->propTerm++;
+	wp_log(LOG, "walproposer connected to quorum of safekeepers: %s, propTerm=" INT64_FORMAT ", starting voting", s.data, wp->propTerm);
+	collected = true;
+
+res:
+	pfree(s.data);
+	return collected;
 }
 
 static void
@@ -753,13 +921,41 @@ RecvAcceptorGreeting(Safekeeper *sk)
 	pfree(mconf_toml);
 
 	/*
-	 * Adopt mconf of safekeepers if it is higher. TODO: mconf change should
-	 * restart wp if it started voting.
+	 * Adopt mconf of safekeepers if it is higher.
 	 */
 	if (sk->greetResponse.mconf.generation > wp->mconf.generation)
 	{
+		/* sanity check before adopting, should never happen */
+		if (sk->greetResponse.mconf.members.len == 0)
+		{
+			wp_log(FATAL, "mconf %u has zero members", sk->greetResponse.mconf.generation);
+		}
+
+		/*
+		 * If we at least started campaign, restart wp to get elected in the
+		 * new mconf. Note: in principle once wp is already elected
+		 * re-election is not required, but being conservative here is not
+		 * bad.
+		 *
+		 * TODO: put mconf to shmem to immediately pick it up on start,
+		 * otherwise if some safekeeper(s) misses latest mconf and gets
+		 * connected the first, it may cause redundant restarts here.
+		 *
+		 * More generally, it would be nice to restart walproposer (wiping
+		 * election state) without restarting the process. In particular, that
+		 * would allow sync-safekeepers not to die here if it intersected with
+		 * sk migration (as well as remove 1s delay).
+		 *
+		 * Note that assign_neon_safekeepers also currently restarts the
+		 * process, so during normal migration walproposer may restart twice.
+		 */
+		if (wp->state >= WPS_CAMPAIGN)
+		{
+			wp_log(FATAL, "restarting to adopt mconf generation %d", sk->greetResponse.mconf.generation);
+		}
 		MembershipConfigurationFree(&wp->mconf);
 		MembershipConfigurationCopy(&sk->greetResponse.mconf, &wp->mconf);
+		ResetMemberSafekeeperPtrs(wp);
 		/* full conf was just logged above */
 		wp_log(LOG, "changed mconf to generation %u", wp->mconf.generation);
 	}
@@ -767,6 +963,9 @@ RecvAcceptorGreeting(Safekeeper *sk)
 	/* Protocol is all good, move to voting. */
 	sk->state = SS_WAIT_VOTING;
 
+	/* In greeting safekeeper sent its id; update mappings accordingly. */
+	UpdateMemberSafekeeperPtr(wp, sk);
+
 	/*
 	 * Note: it would be better to track the counter on per safekeeper basis,
 	 * but at worst walproposer would restart with 'term rejected', so leave
@@ -778,12 +977,9 @@ RecvAcceptorGreeting(Safekeeper *sk)
 		/* We're still collecting terms from the majority. */
 		wp->propTerm = Max(sk->greetResponse.term, wp->propTerm);
 
-		/* Quorum is acquried, prepare the vote request. */
+		/* Quorum is acquired, prepare the vote request. */
 		if (TermsCollected(wp))
 		{
-			wp->propTerm++;
-			wp_log(LOG, "proposer connected to quorum (%d) safekeepers, propTerm=" INT64_FORMAT, wp->quorum, wp->propTerm);
-
 			wp->state = WPS_CAMPAIGN;
 			wp->voteRequest.pam.tag = 'v';
 			wp->voteRequest.generation = wp->mconf.generation;
@@ -832,8 +1028,8 @@ SendVoteRequest(Safekeeper *sk)
 					   &sk->outbuf, wp->config->proto_version);
 
 	/* We have quorum for voting, send our vote request */
-	wp_log(LOG, "requesting vote from %s:%s for generation %u term " UINT64_FORMAT, sk->host, sk->port,
-		   wp->voteRequest.generation, wp->voteRequest.term);
+	wp_log(LOG, "requesting vote from sk {id = %lu, ep = %s:%s} for generation %u term " UINT64_FORMAT,
+		   sk->greetResponse.nodeId, sk->host, sk->port, wp->voteRequest.generation, wp->voteRequest.term);
 	/* On failure, logging & resetting is handled */
 	BlockingWrite(sk, sk->outbuf.data, sk->outbuf.len, SS_WAIT_VERDICT);
 	/* If successful, wait for read-ready with SS_WAIT_VERDICT */
@@ -851,8 +1047,8 @@ RecvVoteResponse(Safekeeper *sk)
 		return;
 
 	wp_log(LOG,
-		   "got VoteResponse from acceptor %s:%s, generation=%u, term=%lu, voteGiven=%u, last_log_term=" UINT64_FORMAT ", flushLsn=%X/%X, truncateLsn=%X/%X",
-		   sk->host, sk->port, sk->voteResponse.generation, sk->voteResponse.term,
+		   "got VoteResponse from sk {id = %lu, ep = %s:%s}, generation=%u, term=%lu, voteGiven=%u, last_log_term=" UINT64_FORMAT ", flushLsn=%X/%X, truncateLsn=%X/%X",
+		   sk->greetResponse.nodeId, sk->host, sk->port, sk->voteResponse.generation, sk->voteResponse.term,
 		   sk->voteResponse.voteGiven,
 		   GetHighestTerm(&sk->voteResponse.termHistory),
 		   LSN_FORMAT_ARGS(sk->voteResponse.flushLsn),
@@ -899,6 +1095,53 @@ RecvVoteResponse(Safekeeper *sk)
 	}
 }
 
+/*
+ * VotesCollected helper for a single member set `mset`.
+ *
+ * `msk` is the member -> safekeeper mapping for mset, i.e. members_safekeepers
+ * or new_members_safekeepers.
+ */
+static bool
+VotesCollectedMset(WalProposer *wp, MemberSet *mset, Safekeeper **msk, StringInfo s)
+{
+	uint32		n_votes = 0;
+
+	for (uint32 i = 0; i < wp->mconf.members.len; i++)
+	{
+		Safekeeper *sk = msk[i];
+
+		if (sk != NULL && sk->state == SS_WAIT_ELECTED)
+		{
+			Assert(sk->voteResponse.voteGiven);
+
+			/*
+			 * Find the highest vote. NULL check is for the legacy case where
+			 * safekeeper might be not initialized with LSN at all and return
+			 * 0 LSN in the vote response; we still want to set donor to
+			 * something in this case.
+			 */
+			if (GetLastLogTerm(sk) > wp->donorLastLogTerm ||
+				(GetLastLogTerm(sk) == wp->donorLastLogTerm &&
+				 sk->voteResponse.flushLsn > wp->propTermStartLsn) ||
+				wp->donor == NULL)
+			{
+				wp->donorLastLogTerm = GetLastLogTerm(sk);
+				wp->propTermStartLsn = sk->voteResponse.flushLsn;
+				wp->donor = sk;
+			}
+			wp->truncateLsn = Max(wp->safekeeper[i].voteResponse.truncateLsn, wp->truncateLsn);
+
+			if (n_votes > 0)
+				appendStringInfoString(s, ", ");
+			appendStringInfo(s, "{id = %lu, ep = %s:%s}", sk->greetResponse.nodeId, sk->host, sk->port);
+			n_votes++;
+		}
+	}
+	appendStringInfo(s, ", %u/%u total", n_votes, mset->len);
+	return MsetHasQuorum(mset, n_votes);
+}
+
+
 /*
  * Checks if enough votes has been collected to get elected and if that's the
  * case finds the highest vote, setting donor, donorLastLogTerm,
@@ -907,7 +1150,8 @@ RecvVoteResponse(Safekeeper *sk)
 static bool
 VotesCollected(WalProposer *wp)
 {
-	int			n_ready = 0;
+	StringInfoData s;			/* str for logging */
+	bool		collected = false;
 
 	/* assumed to be called only when not elected yet */
 	Assert(wp->state == WPS_CAMPAIGN);
@@ -916,25 +1160,62 @@ VotesCollected(WalProposer *wp)
 	wp->donorLastLogTerm = 0;
 	wp->truncateLsn = InvalidXLogRecPtr;
 
-	for (int i = 0; i < wp->n_safekeepers; i++)
+	/* legacy: generations disabled */
+	if (!WalProposerGenerationsEnabled(wp) && wp->mconf.generation == INVALID_GENERATION)
 	{
-		if (wp->safekeeper[i].state == SS_WAIT_ELECTED)
-		{
-			n_ready++;
+		int			n_ready = 0;
 
-			if (GetLastLogTerm(&wp->safekeeper[i]) > wp->donorLastLogTerm ||
-				(GetLastLogTerm(&wp->safekeeper[i]) == wp->donorLastLogTerm &&
-				 wp->safekeeper[i].voteResponse.flushLsn > wp->propTermStartLsn))
+		for (int i = 0; i < wp->n_safekeepers; i++)
+		{
+			if (wp->safekeeper[i].state == SS_WAIT_ELECTED)
 			{
-				wp->donorLastLogTerm = GetLastLogTerm(&wp->safekeeper[i]);
-				wp->propTermStartLsn = wp->safekeeper[i].voteResponse.flushLsn;
-				wp->donor = i;
+				n_ready++;
+
+				if (GetLastLogTerm(&wp->safekeeper[i]) > wp->donorLastLogTerm ||
+					(GetLastLogTerm(&wp->safekeeper[i]) == wp->donorLastLogTerm &&
+					 wp->safekeeper[i].voteResponse.flushLsn > wp->propTermStartLsn) ||
+					wp->donor == NULL)
+				{
+					wp->donorLastLogTerm = GetLastLogTerm(&wp->safekeeper[i]);
+					wp->propTermStartLsn = wp->safekeeper[i].voteResponse.flushLsn;
+					wp->donor = &wp->safekeeper[i];
+				}
+				wp->truncateLsn = Max(wp->safekeeper[i].voteResponse.truncateLsn, wp->truncateLsn);
 			}
-			wp->truncateLsn = Max(wp->safekeeper[i].voteResponse.truncateLsn, wp->truncateLsn);
 		}
+		collected = n_ready >= wp->quorum;
+		if (collected)
+		{
+			wp_log(LOG, "walproposer elected with %d/%d votes", n_ready, wp->n_safekeepers);
+		}
+		return collected;
+	}
+
+	/*
+	 * if generations are enabled we're expected to get to voting only when
+	 * mconf is established.
+	 */
+	Assert(wp->mconf.generation != INVALID_GENERATION);
+
+	/*
+	 * We must get votes from both msets if both are present.
+	 */
+	initStringInfo(&s);
+	appendStringInfoString(&s, "mset voters: ");
+	if (!VotesCollectedMset(wp, &wp->mconf.members, wp->members_safekeepers, &s))
+		goto res;
+	if (wp->mconf.new_members.len > 0)
+	{
+		appendStringInfoString(&s, ", new_mset voters: ");
+		if (!VotesCollectedMset(wp, &wp->mconf.new_members, wp->new_members_safekeepers, &s))
+			goto res;
 	}
+	wp_log(LOG, "walproposer elected, %s", s.data);
+	collected = true;
 
-	return n_ready >= wp->quorum;
+res:
+	pfree(s.data);
+	return collected;
 }
 
 /*
@@ -955,7 +1236,7 @@ HandleElectedProposer(WalProposer *wp)
 	 * that only for logical replication (and switching logical walsenders to
 	 * neon_walreader is a todo.)
 	 */
-	if (!wp->api.recovery_download(wp, &wp->safekeeper[wp->donor]))
+	if (!wp->api.recovery_download(wp, wp->donor))
 	{
 		wp_log(FATAL, "failed to download WAL for logical replicaiton");
 	}
@@ -1078,7 +1359,7 @@ ProcessPropStartPos(WalProposer *wp)
 	/*
 	 * Proposer's term history is the donor's + its own entry.
 	 */
-	dth = &wp->safekeeper[wp->donor].voteResponse.termHistory;
+	dth = &wp->donor->voteResponse.termHistory;
 	wp->propTermHistory.n_entries = dth->n_entries + 1;
 	wp->propTermHistory.entries = palloc(sizeof(TermSwitchEntry) * wp->propTermHistory.n_entries);
 	if (dth->n_entries > 0)
@@ -1086,11 +1367,10 @@ ProcessPropStartPos(WalProposer *wp)
 	wp->propTermHistory.entries[wp->propTermHistory.n_entries - 1].term = wp->propTerm;
 	wp->propTermHistory.entries[wp->propTermHistory.n_entries - 1].lsn = wp->propTermStartLsn;
 
-	wp_log(LOG, "got votes from majority (%d) of nodes, term " UINT64_FORMAT ", epochStartLsn %X/%X, donor %s:%s, truncate_lsn %X/%X",
-		   wp->quorum,
+	wp_log(LOG, "walproposer elected in term " UINT64_FORMAT ", epochStartLsn %X/%X, donor %s:%s, truncate_lsn %X/%X",
 		   wp->propTerm,
 		   LSN_FORMAT_ARGS(wp->propTermStartLsn),
-		   wp->safekeeper[wp->donor].host, wp->safekeeper[wp->donor].port,
+		   wp->donor->host, wp->donor->port,
 		   LSN_FORMAT_ARGS(wp->truncateLsn));
 
 	/*
@@ -1508,6 +1788,14 @@ RecvAppendResponses(Safekeeper *sk)
 
 		readAnything = true;
 
+		/* should never happen: sk is expected to send ERROR instead */
+		if (sk->appendResponse.generation != wp->mconf.generation)
+		{
+			wp_log(FATAL, "safekeeper {id = %lu, ep = %s:%s} sent response with generation %u, expected %u",
+				   sk->greetResponse.nodeId, sk->host, sk->port,
+				   sk->appendResponse.generation, wp->mconf.generation);
+		}
+
 		if (sk->appendResponse.term > wp->propTerm)
 		{
 			/*
@@ -1624,30 +1912,101 @@ CalculateMinFlushLsn(WalProposer *wp)
 }
 
 /*
- * Calculate WAL position acknowledged by quorum
+ * GetAcknowledgedByQuorumWALPosition for a single member set `mset`.
+ *
+ * `msk` is the member -> safekeeper mapping for mset, i.e. members_safekeepers
+ * or new_members_safekeepers.
  */
 static XLogRecPtr
-GetAcknowledgedByQuorumWALPosition(WalProposer *wp)
+GetCommittedMset(WalProposer *wp, MemberSet *mset, Safekeeper **msk)
 {
 	XLogRecPtr	responses[MAX_SAFEKEEPERS];
 
 	/*
-	 * Sort acknowledged LSNs
+	 * Ascending sort acknowledged LSNs.
 	 */
-	for (int i = 0; i < wp->n_safekeepers; i++)
+	Assert(mset->len <= MAX_SAFEKEEPERS);
+	for (uint32 i = 0; i < mset->len; i++)
 	{
+		Safekeeper *sk = msk[i];
+
 		/*
 		 * Like in Raft, we aren't allowed to commit entries from previous
-		 * terms, so ignore reported LSN until it gets to epochStartLsn.
+		 * terms, so ignore reported LSN until it gets to propTermStartLsn.
+		 *
+		 * Note: we ignore sk state, which is ok: before first ack flushLsn is
+		 * 0, and later we just preserve value across reconnections. It would
+		 * be ok to check for SS_ACTIVE as well.
 		 */
-		responses[i] = wp->safekeeper[i].appendResponse.flushLsn >= wp->propTermStartLsn ? wp->safekeeper[i].appendResponse.flushLsn : 0;
+		if (sk != NULL && sk->appendResponse.flushLsn >= wp->propTermStartLsn)
+		{
+			responses[i] = sk->appendResponse.flushLsn;
+		}
+		else
+		{
+			responses[i] = 0;
+		}
 	}
-	qsort(responses, wp->n_safekeepers, sizeof(XLogRecPtr), CompareLsn);
+	qsort(responses, mset->len, sizeof(XLogRecPtr), CompareLsn);
 
 	/*
-	 * Get the smallest LSN committed by quorum
+	 * And get value committed by the quorum. A way to view this: to get the
+	 * highest value committed on the quorum, in the ordered array we skip n -
+	 * n_quorum elements to get to the first (lowest) value present on all sks
+	 * of the highest quorum.
 	 */
-	return responses[wp->n_safekeepers - wp->quorum];
+	return responses[mset->len - MsetQuorum(mset)];
+}
+
+/*
+ * Calculate WAL position acknowledged by quorum, i.e. which may be regarded
+ * committed.
+ *
+ * Zero may be returned when there is no quorum of nodes recovered to term start
+ * lsn which sent feedback yet.
+ */
+static XLogRecPtr
+GetAcknowledgedByQuorumWALPosition(WalProposer *wp)
+{
+	XLogRecPtr	committed;
+
+	/* legacy: generations disabled */
+	if (!WalProposerGenerationsEnabled(wp) && wp->mconf.generation == INVALID_GENERATION)
+	{
+		XLogRecPtr	responses[MAX_SAFEKEEPERS];
+
+		/*
+		 * Sort acknowledged LSNs
+		 */
+		for (int i = 0; i < wp->n_safekeepers; i++)
+		{
+			/*
+			 * Like in Raft, we aren't allowed to commit entries from previous
+			 * terms, so ignore reported LSN until it gets to
+			 * propTermStartLsn.
+			 *
+			 * Note: we ignore sk state, which is ok: before first ack
+			 * flushLsn is 0, and later we just preserve value across
+			 * reconnections. It would be ok to check for SS_ACTIVE as well.
+			 */
+			responses[i] = wp->safekeeper[i].appendResponse.flushLsn >= wp->propTermStartLsn ? wp->safekeeper[i].appendResponse.flushLsn : 0;
+		}
+		qsort(responses, wp->n_safekeepers, sizeof(XLogRecPtr), CompareLsn);
+
+		/*
+		 * Get the smallest LSN committed by quorum
+		 */
+		return responses[wp->n_safekeepers - wp->quorum];
+	}
+
+	committed = GetCommittedMset(wp, &wp->mconf.members, wp->members_safekeepers);
+	if (wp->mconf.new_members.len > 0)
+	{
+		XLogRecPtr	new_mset_committed = GetCommittedMset(wp, &wp->mconf.new_members, wp->new_members_safekeepers);
+
+		committed = Min(committed, new_mset_committed);
+	}
+	return committed;
 }
 
 /*
@@ -1662,7 +2021,7 @@ UpdateDonorShmem(WalProposer *wp)
 	int			i;
 	XLogRecPtr	donor_lsn = InvalidXLogRecPtr;
 
-	if (wp->n_votes < wp->quorum)
+	if (wp->state < WPS_ELECTED)
 	{
 		wp_log(WARNING, "UpdateDonorShmem called before elections are won");
 		return;
@@ -1673,9 +2032,9 @@ UpdateDonorShmem(WalProposer *wp)
 	 * about its position immediately after election before any feedbacks are
 	 * sent.
 	 */
-	if (wp->safekeeper[wp->donor].state >= SS_WAIT_ELECTED)
+	if (wp->donor->state >= SS_WAIT_ELECTED)
 	{
-		donor = &wp->safekeeper[wp->donor];
+		donor = wp->donor;
 		donor_lsn = wp->propTermStartLsn;
 	}
 
@@ -1746,22 +2105,19 @@ HandleSafekeeperResponse(WalProposer *wp, Safekeeper *fromsk)
 	}
 
 	/*
-	 * Generally sync is done when majority switched the epoch so we committed
-	 * epochStartLsn and made the majority aware of it, ensuring they are
-	 * ready to give all WAL to pageserver. It would mean whichever majority
-	 * is alive, there will be at least one safekeeper who is able to stream
-	 * WAL to pageserver to make basebackup possible. However, since at the
-	 * moment we don't have any good mechanism of defining the healthy and
-	 * most advanced safekeeper who should push the wal into pageserver and
+	 * Generally sync is done when majority reached propTermStartLsn so we
+	 * committed it and made the majority aware of it, ensuring they are ready
+	 * to give all WAL to pageserver. It would mean whichever majority is
+	 * alive, there will be at least one safekeeper who is able to stream WAL
+	 * to pageserver to make basebackup possible. However, since at the moment
+	 * we don't have any good mechanism of defining the healthy and most
+	 * advanced safekeeper who should push the wal into pageserver and
 	 * basically the random one gets connected, to prevent hanging basebackup
 	 * (due to pageserver connecting to not-synced-safekeeper) we currently
 	 * wait for all seemingly alive safekeepers to get synced.
 	 */
 	if (wp->config->syncSafekeepers)
 	{
-		int			n_synced;
-
-		n_synced = 0;
 		for (int i = 0; i < wp->n_safekeepers; i++)
 		{
 			Safekeeper *sk = &wp->safekeeper[i];
@@ -1770,11 +2126,9 @@ HandleSafekeeperResponse(WalProposer *wp, Safekeeper *fromsk)
 			/* alive safekeeper which is not synced yet; wait for it */
 			if (sk->state != SS_OFFLINE && !synced)
 				return;
-			if (synced)
-				n_synced++;
 		}
 
-		if (n_synced >= wp->quorum)
+		if (newCommitLsn >= wp->propTermStartLsn)
 		{
 			/* A quorum of safekeepers has been synced! */
 
diff --git a/pgxn/neon/walproposer.h b/pgxn/neon/walproposer.h
index d116bce80644..648b0015ad8f 100644
--- a/pgxn/neon/walproposer.h
+++ b/pgxn/neon/walproposer.h
@@ -145,6 +145,7 @@ typedef uint64 NNodeId;
  * This and following structs pair ones in membership.rs.
  */
 typedef uint32 Generation;
+#define INVALID_GENERATION 0
 
 typedef struct SafekeeperId
 {
@@ -771,7 +772,17 @@ typedef struct WalProposer
 	/* Current walproposer membership configuration */
 	MembershipConfiguration mconf;
 
-	/* (n_safekeepers / 2) + 1 */
+	/*
+	 * Parallels mconf.members with pointers to the member's slot in
+	 * safekeepers array of connections, or NULL if such member is not
+	 * connected. Helps to avoid looking slot per id through all
+	 * .safekeepers[] when doing quorum checks.
+	 */
+	Safekeeper *members_safekeepers[MAX_SAFEKEEPERS];
+	/* As above, but for new_members. */
+	Safekeeper *new_members_safekeepers[MAX_SAFEKEEPERS];
+
+	/* (n_safekeepers / 2) + 1. Used for static pre-generations quorum checks. */
 	int			quorum;
 
 	/*
@@ -829,7 +840,7 @@ typedef struct WalProposer
 	term_t		donorLastLogTerm;
 
 	/* Most advanced acceptor */
-	int			donor;
+	Safekeeper *donor;
 
 	/* timeline globally starts at this LSN */
 	XLogRecPtr	timelineStartLsn;
diff --git a/proxy/src/binary/proxy.rs b/proxy/src/binary/proxy.rs
index 62fdc18207f1..e03f2f33d972 100644
--- a/proxy/src/binary/proxy.rs
+++ b/proxy/src/binary/proxy.rs
@@ -509,7 +509,14 @@ pub async fn run() -> anyhow::Result<()> {
             if let Some(mut redis_kv_client) = redis_kv_client {
                 maintenance_tasks.spawn(async move {
                     redis_kv_client.try_connect().await?;
-                    handle_cancel_messages(&mut redis_kv_client, rx_cancel).await
+                    handle_cancel_messages(&mut redis_kv_client, rx_cancel).await?;
+
+                    drop(redis_kv_client);
+
+                    // `handle_cancel_messages` was terminated due to the tx_cancel
+                    // being dropped. this is not worthy of an error, and this task can only return `Err`,
+                    // so let's wait forever instead.
+                    std::future::pending().await
                 });
             }
 
diff --git a/proxy/src/cancellation.rs b/proxy/src/cancellation.rs
index 8263e5aa2aa8..c5ba04eb8c8e 100644
--- a/proxy/src/cancellation.rs
+++ b/proxy/src/cancellation.rs
@@ -1,16 +1,17 @@
-use std::convert::Infallible;
 use std::net::{IpAddr, SocketAddr};
 use std::sync::Arc;
 
+use anyhow::{Context, anyhow};
 use ipnet::{IpNet, Ipv4Net, Ipv6Net};
 use postgres_client::CancelToken;
 use postgres_client::tls::MakeTlsConnect;
 use pq_proto::CancelKeyData;
+use redis::{FromRedisValue, Pipeline, Value, pipe};
 use serde::{Deserialize, Serialize};
 use thiserror::Error;
 use tokio::net::TcpStream;
 use tokio::sync::{mpsc, oneshot};
-use tracing::{debug, info};
+use tracing::{debug, info, warn};
 
 use crate::auth::backend::ComputeUserInfo;
 use crate::auth::{AuthError, check_peer_addr_is_in_list};
@@ -30,6 +31,7 @@ type IpSubnetKey = IpNet;
 
 const CANCEL_KEY_TTL: i64 = 1_209_600; // 2 weeks cancellation key expire time
 const REDIS_SEND_TIMEOUT: std::time::Duration = std::time::Duration::from_millis(10);
+const BATCH_SIZE: usize = 8;
 
 // Message types for sending through mpsc channel
 pub enum CancelKeyOp {
@@ -54,78 +56,168 @@ pub enum CancelKeyOp {
     },
 }
 
+impl CancelKeyOp {
+    fn register(self, pipe: &mut Pipeline) -> Option<CancelReplyOp> {
+        #[allow(clippy::used_underscore_binding)]
+        match self {
+            CancelKeyOp::StoreCancelKey {
+                key,
+                field,
+                value,
+                resp_tx,
+                _guard,
+                expire,
+            } => {
+                pipe.hset(&key, field, value);
+                pipe.expire(key, expire);
+                let resp_tx = resp_tx?;
+                Some(CancelReplyOp::StoreCancelKey { resp_tx, _guard })
+            }
+            CancelKeyOp::GetCancelData {
+                key,
+                resp_tx,
+                _guard,
+            } => {
+                pipe.hgetall(key);
+                Some(CancelReplyOp::GetCancelData { resp_tx, _guard })
+            }
+            CancelKeyOp::RemoveCancelKey {
+                key,
+                field,
+                resp_tx,
+                _guard,
+            } => {
+                pipe.hdel(key, field);
+                let resp_tx = resp_tx?;
+                Some(CancelReplyOp::RemoveCancelKey { resp_tx, _guard })
+            }
+        }
+    }
+}
+
+// Message types for sending through mpsc channel
+pub enum CancelReplyOp {
+    StoreCancelKey {
+        resp_tx: oneshot::Sender<anyhow::Result<()>>,
+        _guard: CancelChannelSizeGuard<'static>,
+    },
+    GetCancelData {
+        resp_tx: oneshot::Sender<anyhow::Result<Vec<(String, String)>>>,
+        _guard: CancelChannelSizeGuard<'static>,
+    },
+    RemoveCancelKey {
+        resp_tx: oneshot::Sender<anyhow::Result<()>>,
+        _guard: CancelChannelSizeGuard<'static>,
+    },
+}
+
+impl CancelReplyOp {
+    fn send_err(self, e: anyhow::Error) {
+        match self {
+            CancelReplyOp::StoreCancelKey { resp_tx, _guard } => {
+                resp_tx
+                    .send(Err(e))
+                    .inspect_err(|_| tracing::debug!("could not send reply"))
+                    .ok();
+            }
+            CancelReplyOp::GetCancelData { resp_tx, _guard } => {
+                resp_tx
+                    .send(Err(e))
+                    .inspect_err(|_| tracing::debug!("could not send reply"))
+                    .ok();
+            }
+            CancelReplyOp::RemoveCancelKey { resp_tx, _guard } => {
+                resp_tx
+                    .send(Err(e))
+                    .inspect_err(|_| tracing::debug!("could not send reply"))
+                    .ok();
+            }
+        }
+    }
+
+    fn send_value(self, v: redis::Value) {
+        match self {
+            CancelReplyOp::StoreCancelKey { resp_tx, _guard } => {
+                let send =
+                    FromRedisValue::from_owned_redis_value(v).context("could not parse value");
+                resp_tx
+                    .send(send)
+                    .inspect_err(|_| tracing::debug!("could not send reply"))
+                    .ok();
+            }
+            CancelReplyOp::GetCancelData { resp_tx, _guard } => {
+                let send =
+                    FromRedisValue::from_owned_redis_value(v).context("could not parse value");
+                resp_tx
+                    .send(send)
+                    .inspect_err(|_| tracing::debug!("could not send reply"))
+                    .ok();
+            }
+            CancelReplyOp::RemoveCancelKey { resp_tx, _guard } => {
+                let send =
+                    FromRedisValue::from_owned_redis_value(v).context("could not parse value");
+                resp_tx
+                    .send(send)
+                    .inspect_err(|_| tracing::debug!("could not send reply"))
+                    .ok();
+            }
+        }
+    }
+}
+
 // Running as a separate task to accept messages through the rx channel
-// In case of problems with RTT: switch to recv_many() + redis pipeline
 pub async fn handle_cancel_messages(
     client: &mut RedisKVClient,
     mut rx: mpsc::Receiver<CancelKeyOp>,
-) -> anyhow::Result<Infallible> {
+) -> anyhow::Result<()> {
+    let mut batch = Vec::new();
+    let mut replies = vec![];
+
     loop {
-        if let Some(msg) = rx.recv().await {
-            match msg {
-                CancelKeyOp::StoreCancelKey {
-                    key,
-                    field,
-                    value,
-                    resp_tx,
-                    _guard,
-                    expire,
-                } => {
-                    let res = client.hset(&key, field, value).await;
-                    if let Some(resp_tx) = resp_tx {
-                        if res.is_ok() {
-                            resp_tx
-                                .send(client.expire(key, expire).await)
-                                .inspect_err(|e| {
-                                    tracing::debug!(
-                                        "failed to send StoreCancelKey response: {:?}",
-                                        e
-                                    );
-                                })
-                                .ok();
-                        } else {
-                            resp_tx
-                                .send(res)
-                                .inspect_err(|e| {
-                                    tracing::debug!(
-                                        "failed to send StoreCancelKey response: {:?}",
-                                        e
-                                    );
-                                })
-                                .ok();
-                        }
-                    } else if res.is_ok() {
-                        drop(client.expire(key, expire).await);
-                    } else {
-                        tracing::warn!("failed to store cancel key: {:?}", res);
-                    }
+        if rx.recv_many(&mut batch, BATCH_SIZE).await == 0 {
+            warn!("shutting down cancellation queue");
+            break Ok(());
+        }
+
+        let batch_size = batch.len();
+        debug!(batch_size, "running cancellation jobs");
+
+        let mut pipe = pipe();
+        for msg in batch.drain(..) {
+            if let Some(reply) = msg.register(&mut pipe) {
+                replies.push(reply);
+            } else {
+                pipe.ignore();
+            }
+        }
+
+        let responses = replies.len();
+
+        match client.query(pipe).await {
+            // for each reply, we expect that many values.
+            Ok(Value::Array(values)) if values.len() == responses => {
+                debug!(
+                    batch_size,
+                    responses, "successfully completed cancellation jobs",
+                );
+                for (value, reply) in std::iter::zip(values, replies.drain(..)) {
+                    reply.send_value(value);
                 }
-                CancelKeyOp::GetCancelData {
-                    key,
-                    resp_tx,
-                    _guard,
-                } => {
-                    drop(resp_tx.send(client.hget_all(key).await));
+            }
+            Ok(value) => {
+                debug!(?value, "unexpected redis return value");
+                for reply in replies.drain(..) {
+                    reply.send_err(anyhow!("incorrect response type from redis"));
                 }
-                CancelKeyOp::RemoveCancelKey {
-                    key,
-                    field,
-                    resp_tx,
-                    _guard,
-                } => {
-                    if let Some(resp_tx) = resp_tx {
-                        resp_tx
-                            .send(client.hdel(key, field).await)
-                            .inspect_err(|e| {
-                                tracing::debug!("failed to send StoreCancelKey response: {:?}", e);
-                            })
-                            .ok();
-                    } else {
-                        drop(client.hdel(key, field).await);
-                    }
+            }
+            Err(err) => {
+                for reply in replies.drain(..) {
+                    reply.send_err(anyhow!("could not send cmd to redis: {err}"));
                 }
             }
         }
+
+        replies.clear();
     }
 }
 
@@ -425,12 +517,7 @@ impl CancelClosure {
             &mut mk_tls,
             &self.hostname,
         )
-        .map_err(|e| {
-            CancelError::IO(std::io::Error::new(
-                std::io::ErrorKind::Other,
-                e.to_string(),
-            ))
-        })?;
+        .map_err(|e| CancelError::IO(std::io::Error::other(e.to_string())))?;
 
         self.cancel_token.cancel_query_raw(socket, tls).await?;
         debug!("query was cancelled");
diff --git a/proxy/src/proxy/tests/mod.rs b/proxy/src/proxy/tests/mod.rs
index 2c3e70138d9e..2268e60d257d 100644
--- a/proxy/src/proxy/tests/mod.rs
+++ b/proxy/src/proxy/tests/mod.rs
@@ -568,7 +568,7 @@ fn helper_create_cached_node_info(cache: &'static NodeInfoCache) -> CachedNodeIn
 fn helper_create_connect_info(
     mechanism: &TestConnectMechanism,
 ) -> auth::Backend<'static, ComputeCredentials> {
-    let user_info = auth::Backend::ControlPlane(
+    auth::Backend::ControlPlane(
         MaybeOwned::Owned(ControlPlaneClient::Test(Box::new(mechanism.clone()))),
         ComputeCredentials {
             info: ComputeUserInfo {
@@ -578,8 +578,7 @@ fn helper_create_connect_info(
             },
             keys: ComputeCredentialKeys::Password("password".into()),
         },
-    );
-    user_info
+    )
 }
 
 fn config() -> ComputeConfig {
diff --git a/proxy/src/redis/kv_ops.rs b/proxy/src/redis/kv_ops.rs
index 3689bf7ae29b..aa627b29a6f2 100644
--- a/proxy/src/redis/kv_ops.rs
+++ b/proxy/src/redis/kv_ops.rs
@@ -1,4 +1,5 @@
-use redis::{AsyncCommands, ToRedisArgs};
+use redis::aio::ConnectionLike;
+use redis::{Cmd, FromRedisValue, Pipeline, RedisResult};
 
 use super::connection_with_credentials_provider::ConnectionWithCredentialsProvider;
 use crate::rate_limiter::{GlobalRateLimiter, RateBucketInfo};
@@ -8,6 +9,23 @@ pub struct RedisKVClient {
     limiter: GlobalRateLimiter,
 }
 
+#[allow(async_fn_in_trait)]
+pub trait Queryable {
+    async fn query<T: FromRedisValue>(&self, conn: &mut impl ConnectionLike) -> RedisResult<T>;
+}
+
+impl Queryable for Pipeline {
+    async fn query<T: FromRedisValue>(&self, conn: &mut impl ConnectionLike) -> RedisResult<T> {
+        self.query_async(conn).await
+    }
+}
+
+impl Queryable for Cmd {
+    async fn query<T: FromRedisValue>(&self, conn: &mut impl ConnectionLike) -> RedisResult<T> {
+        self.query_async(conn).await
+    }
+}
+
 impl RedisKVClient {
     pub fn new(client: ConnectionWithCredentialsProvider, info: &'static [RateBucketInfo]) -> Self {
         Self {
@@ -27,158 +45,24 @@ impl RedisKVClient {
         Ok(())
     }
 
-    pub(crate) async fn hset<K, F, V>(&mut self, key: K, field: F, value: V) -> anyhow::Result<()>
-    where
-        K: ToRedisArgs + Send + Sync,
-        F: ToRedisArgs + Send + Sync,
-        V: ToRedisArgs + Send + Sync,
-    {
-        if !self.limiter.check() {
-            tracing::info!("Rate limit exceeded. Skipping hset");
-            return Err(anyhow::anyhow!("Rate limit exceeded"));
-        }
-
-        match self.client.hset(&key, &field, &value).await {
-            Ok(()) => return Ok(()),
-            Err(e) => {
-                tracing::error!("failed to set a key-value pair: {e}");
-            }
-        }
-
-        tracing::info!("Redis client is disconnected. Reconnectiong...");
-        self.try_connect().await?;
-        self.client
-            .hset(key, field, value)
-            .await
-            .map_err(anyhow::Error::new)
-    }
-
-    #[allow(dead_code)]
-    pub(crate) async fn hset_multiple<K, V>(
+    pub(crate) async fn query<T: FromRedisValue>(
         &mut self,
-        key: &str,
-        items: &[(K, V)],
-    ) -> anyhow::Result<()>
-    where
-        K: ToRedisArgs + Send + Sync,
-        V: ToRedisArgs + Send + Sync,
-    {
-        if !self.limiter.check() {
-            tracing::info!("Rate limit exceeded. Skipping hset_multiple");
-            return Err(anyhow::anyhow!("Rate limit exceeded"));
-        }
-
-        match self.client.hset_multiple(key, items).await {
-            Ok(()) => return Ok(()),
-            Err(e) => {
-                tracing::error!("failed to set a key-value pair: {e}");
-            }
-        }
-
-        tracing::info!("Redis client is disconnected. Reconnectiong...");
-        self.try_connect().await?;
-        self.client
-            .hset_multiple(key, items)
-            .await
-            .map_err(anyhow::Error::new)
-    }
-
-    #[allow(dead_code)]
-    pub(crate) async fn expire<K>(&mut self, key: K, seconds: i64) -> anyhow::Result<()>
-    where
-        K: ToRedisArgs + Send + Sync,
-    {
-        if !self.limiter.check() {
-            tracing::info!("Rate limit exceeded. Skipping expire");
-            return Err(anyhow::anyhow!("Rate limit exceeded"));
-        }
-
-        match self.client.expire(&key, seconds).await {
-            Ok(()) => return Ok(()),
-            Err(e) => {
-                tracing::error!("failed to set a key-value pair: {e}");
-            }
-        }
-
-        tracing::info!("Redis client is disconnected. Reconnectiong...");
-        self.try_connect().await?;
-        self.client
-            .expire(key, seconds)
-            .await
-            .map_err(anyhow::Error::new)
-    }
-
-    #[allow(dead_code)]
-    pub(crate) async fn hget<K, F, V>(&mut self, key: K, field: F) -> anyhow::Result<V>
-    where
-        K: ToRedisArgs + Send + Sync,
-        F: ToRedisArgs + Send + Sync,
-        V: redis::FromRedisValue,
-    {
-        if !self.limiter.check() {
-            tracing::info!("Rate limit exceeded. Skipping hget");
-            return Err(anyhow::anyhow!("Rate limit exceeded"));
-        }
-
-        match self.client.hget(&key, &field).await {
-            Ok(value) => return Ok(value),
-            Err(e) => {
-                tracing::error!("failed to get a value: {e}");
-            }
-        }
-
-        tracing::info!("Redis client is disconnected. Reconnectiong...");
-        self.try_connect().await?;
-        self.client
-            .hget(key, field)
-            .await
-            .map_err(anyhow::Error::new)
-    }
-
-    pub(crate) async fn hget_all<K, V>(&mut self, key: K) -> anyhow::Result<V>
-    where
-        K: ToRedisArgs + Send + Sync,
-        V: redis::FromRedisValue,
-    {
-        if !self.limiter.check() {
-            tracing::info!("Rate limit exceeded. Skipping hgetall");
-            return Err(anyhow::anyhow!("Rate limit exceeded"));
-        }
-
-        match self.client.hgetall(&key).await {
-            Ok(value) => return Ok(value),
-            Err(e) => {
-                tracing::error!("failed to get a value: {e}");
-            }
-        }
-
-        tracing::info!("Redis client is disconnected. Reconnectiong...");
-        self.try_connect().await?;
-        self.client.hgetall(key).await.map_err(anyhow::Error::new)
-    }
-
-    pub(crate) async fn hdel<K, F>(&mut self, key: K, field: F) -> anyhow::Result<()>
-    where
-        K: ToRedisArgs + Send + Sync,
-        F: ToRedisArgs + Send + Sync,
-    {
+        q: impl Queryable,
+    ) -> anyhow::Result<T> {
         if !self.limiter.check() {
-            tracing::info!("Rate limit exceeded. Skipping hdel");
+            tracing::info!("Rate limit exceeded. Skipping query");
             return Err(anyhow::anyhow!("Rate limit exceeded"));
         }
 
-        match self.client.hdel(&key, &field).await {
-            Ok(()) => return Ok(()),
+        match q.query(&mut self.client).await {
+            Ok(t) => return Ok(t),
             Err(e) => {
-                tracing::error!("failed to delete a key-value pair: {e}");
+                tracing::error!("failed to run query: {e}");
             }
         }
 
-        tracing::info!("Redis client is disconnected. Reconnectiong...");
+        tracing::info!("Redis client is disconnected. Reconnecting...");
         self.try_connect().await?;
-        self.client
-            .hdel(key, field)
-            .await
-            .map_err(anyhow::Error::new)
+        Ok(q.query(&mut self.client).await?)
     }
 }
diff --git a/proxy/src/serverless/conn_pool_lib.rs b/proxy/src/serverless/conn_pool_lib.rs
index 77b548cc43a7..42a3ea17a248 100644
--- a/proxy/src/serverless/conn_pool_lib.rs
+++ b/proxy/src/serverless/conn_pool_lib.rs
@@ -47,6 +47,7 @@ impl ConnInfo {
 }
 
 #[derive(Clone)]
+#[allow(clippy::large_enum_variant, reason = "TODO")]
 pub(crate) enum ClientDataEnum {
     Remote(ClientDataRemote),
     Local(ClientDataLocal),
diff --git a/safekeeper/src/http/mod.rs b/safekeeper/src/http/mod.rs
index 003a75faa6a7..6e7c5d971df8 100644
--- a/safekeeper/src/http/mod.rs
+++ b/safekeeper/src/http/mod.rs
@@ -31,6 +31,7 @@ pub async fn task_main_https(
     global_timelines: Arc<GlobalTimelines>,
 ) -> anyhow::Result<()> {
     let cert_resolver = ReloadingCertificateResolver::new(
+        "main",
         &conf.ssl_key_file,
         &conf.ssl_cert_file,
         conf.ssl_cert_reload_period,
diff --git a/safekeeper/src/timeline.rs b/safekeeper/src/timeline.rs
index e6a7ade9f2e2..b7ba28f4356d 100644
--- a/safekeeper/src/timeline.rs
+++ b/safekeeper/src/timeline.rs
@@ -138,6 +138,7 @@ impl Drop for WriteGuardSharedState<'_> {
 /// Usually it holds SafeKeeper, but it also supports offloaded timeline state. In this
 /// case, SafeKeeper is not available (because WAL is not present on disk) and all
 /// operations can be done only with control file.
+#[allow(clippy::large_enum_variant, reason = "TODO")]
 pub enum StateSK {
     Loaded(SafeKeeper<control_file::FileStorage, wal_storage::PhysicalStorage>),
     Offloaded(Box<TimelineState<control_file::FileStorage>>),
diff --git a/safekeeper/src/timeline_eviction.rs b/safekeeper/src/timeline_eviction.rs
index 06ccb32d03a1..84c636daf627 100644
--- a/safekeeper/src/timeline_eviction.rs
+++ b/safekeeper/src/timeline_eviction.rs
@@ -35,7 +35,7 @@ impl Manager {
         next_event: &Option<tokio::time::Instant>,
         state: &StateSnapshot,
     ) -> bool {
-        let ready = self.backup_task.is_none()
+        self.backup_task.is_none()
             && self.recovery_task.is_none()
             && self.wal_removal_task.is_none()
             && self.partial_backup_task.is_none()
@@ -61,8 +61,7 @@ impl Manager {
                 .unwrap()
                 .flush_lsn
                 .segment_number(self.wal_seg_size)
-                == self.last_removed_segno + 1;
-        ready
+                == self.last_removed_segno + 1
     }
 
     /// Evict the timeline to remote storage. Returns whether the eviction was successful.
diff --git a/storage_broker/src/bin/storage_broker.rs b/storage_broker/src/bin/storage_broker.rs
index f1bd7ba708d9..a7e0c986e6da 100644
--- a/storage_broker/src/bin/storage_broker.rs
+++ b/storage_broker/src/bin/storage_broker.rs
@@ -96,6 +96,7 @@ enum Message {
 
 impl Message {
     /// Convert proto message to internal message.
+    #[allow(clippy::result_large_err, reason = "TODO")]
     pub fn from(proto_msg: TypedMessage) -> Result<Self, Status> {
         match proto_msg.r#type() {
             MessageType::SafekeeperTimelineInfo => Ok(Message::SafekeeperTimelineInfo(
@@ -127,6 +128,7 @@ impl Message {
     }
 
     /// Get the tenant_timeline_id from the message.
+    #[allow(clippy::result_large_err, reason = "TODO")]
     pub fn tenant_timeline_id(&self) -> Result<Option<TenantTimelineId>, Status> {
         match self {
             Message::SafekeeperTimelineInfo(msg) => Ok(msg
@@ -185,6 +187,7 @@ enum SubscriptionKey {
 
 impl SubscriptionKey {
     /// Parse protobuf subkey (protobuf doesn't have fixed size bytes, we get vectors).
+    #[allow(clippy::result_large_err, reason = "TODO")]
     pub fn from_proto_subscription_key(key: ProtoSubscriptionKey) -> Result<Self, Status> {
         match key {
             ProtoSubscriptionKey::All(_) => Ok(SubscriptionKey::All),
@@ -195,6 +198,7 @@ impl SubscriptionKey {
     }
 
     /// Parse from FilterTenantTimelineId
+    #[allow(clippy::result_large_err, reason = "TODO")]
     pub fn from_proto_filter_tenant_timeline_id(
         opt: Option<&FilterTenantTimelineId>,
     ) -> Result<Self, Status> {
@@ -385,6 +389,7 @@ impl Registry {
     }
 
     /// Send msg to relevant subscribers.
+    #[allow(clippy::result_large_err, reason = "TODO")]
     pub fn send_msg(&self, msg: &Message) -> Result<(), Status> {
         PROCESSED_MESSAGES_TOTAL.inc();
 
@@ -436,6 +441,7 @@ struct Publisher {
 
 impl Publisher {
     /// Send msg to relevant subscribers.
+    #[allow(clippy::result_large_err, reason = "TODO")]
     pub fn send_msg(&mut self, msg: &Message) -> Result<(), Status> {
         self.registry.send_msg(msg)
     }
diff --git a/storage_broker/src/lib.rs b/storage_broker/src/lib.rs
index 55d411f607fb..7b36f5e9483c 100644
--- a/storage_broker/src/lib.rs
+++ b/storage_broker/src/lib.rs
@@ -79,6 +79,7 @@ impl BrokerClientChannel {
 }
 
 // parse variable length bytes from protobuf
+#[allow(clippy::result_large_err, reason = "TODO")]
 pub fn parse_proto_ttid(proto_ttid: &ProtoTenantTimelineId) -> Result<TenantTimelineId, Status> {
     let tenant_id = TenantId::from_slice(&proto_ttid.tenant_id)
         .map_err(|e| Status::new(Code::InvalidArgument, format!("malformed tenant_id: {}", e)))?;
diff --git a/storage_controller/src/compute_hook.rs b/storage_controller/src/compute_hook.rs
index 31ab443ccdb6..57709302e18b 100644
--- a/storage_controller/src/compute_hook.rs
+++ b/storage_controller/src/compute_hook.rs
@@ -629,15 +629,13 @@ impl ComputeHook {
         };
 
         let result = if !self.config.use_local_compute_notifications {
-            let compute_hook_url = if let Some(control_plane_url) = &self.config.control_plane_url {
-                Some(if control_plane_url.ends_with('/') {
-                    format!("{control_plane_url}notify-attach")
-                } else {
-                    format!("{control_plane_url}/notify-attach")
-                })
-            } else {
-                self.config.compute_hook_url.clone()
-            };
+            let compute_hook_url =
+                self.config
+                    .control_plane_url
+                    .as_ref()
+                    .map(|control_plane_url| {
+                        format!("{}/notify-attach", control_plane_url.trim_end_matches('/'))
+                    });
 
             // We validate this at startup
             let notify_url = compute_hook_url.as_ref().unwrap();
@@ -800,7 +798,7 @@ impl ComputeHook {
 
 #[cfg(test)]
 pub(crate) mod tests {
-    use pageserver_api::shard::{ShardCount, ShardNumber};
+    use pageserver_api::shard::{DEFAULT_STRIPE_SIZE, ShardCount, ShardNumber};
     use utils::id::TenantId;
 
     use super::*;
@@ -808,6 +806,7 @@ pub(crate) mod tests {
     #[test]
     fn tenant_updates() -> anyhow::Result<()> {
         let tenant_id = TenantId::generate();
+        let stripe_size = DEFAULT_STRIPE_SIZE;
         let mut tenant_state = ComputeHookTenant::new(
             TenantShardId {
                 tenant_id,
@@ -848,7 +847,7 @@ pub(crate) mod tests {
                 shard_count: ShardCount::new(2),
                 shard_number: ShardNumber(1),
             },
-            stripe_size: ShardStripeSize(32768),
+            stripe_size,
             preferred_az: None,
             node_id: NodeId(1),
         });
@@ -864,7 +863,7 @@ pub(crate) mod tests {
                 shard_count: ShardCount::new(2),
                 shard_number: ShardNumber(0),
             },
-            stripe_size: ShardStripeSize(32768),
+            stripe_size,
             preferred_az: None,
             node_id: NodeId(1),
         });
@@ -874,7 +873,7 @@ pub(crate) mod tests {
             anyhow::bail!("Wrong send result");
         };
         assert_eq!(request.shards.len(), 2);
-        assert_eq!(request.stripe_size, Some(ShardStripeSize(32768)));
+        assert_eq!(request.stripe_size, Some(stripe_size));
 
         // Simulate successful send
         *guard = Some(ComputeRemoteState {
diff --git a/storage_controller/src/http.rs b/storage_controller/src/http.rs
index 0d1dc8f8eec1..fb4530d0d219 100644
--- a/storage_controller/src/http.rs
+++ b/storage_controller/src/http.rs
@@ -22,6 +22,7 @@ use pageserver_api::controller_api::{
     MetadataHealthListUnhealthyResponse, MetadataHealthUpdateRequest, MetadataHealthUpdateResponse,
     NodeAvailability, NodeConfigureRequest, NodeRegisterRequest, SafekeeperSchedulingPolicyRequest,
     ShardsPreferredAzsRequest, TenantCreateRequest, TenantPolicyRequest, TenantShardMigrateRequest,
+    TimelineImportRequest,
 };
 use pageserver_api::models::{
     DetachBehavior, LsnLeaseRequest, TenantConfigPatchRequest, TenantConfigRequest,
@@ -1235,8 +1236,18 @@ async fn handle_step_down(req: Request<Body>) -> Result<Response<Body>, ApiError
         ForwardOutcome::NotForwarded(req) => req,
     };
 
-    let state = get_state(&req);
-    json_response(StatusCode::OK, state.service.step_down().await)
+    // Spawn a background task: once we start stepping down, we must finish: if the client drops
+    // their request we should avoid stopping in some part-stepped-down state.
+    let handle = tokio::spawn(async move {
+        let state = get_state(&req);
+        state.service.step_down().await
+    });
+
+    let result = handle
+        .await
+        .map_err(|e| ApiError::InternalServerError(e.into()))?;
+
+    json_response(StatusCode::OK, result)
 }
 
 async fn handle_tenant_drop(req: Request<Body>) -> Result<Response<Body>, ApiError> {
@@ -1276,6 +1287,37 @@ async fn handle_tenant_import(req: Request<Body>) -> Result<Response<Body>, ApiE
     )
 }
 
+async fn handle_timeline_import(req: Request<Body>) -> Result<Response<Body>, ApiError> {
+    let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
+    let timeline_id: TimelineId = parse_request_param(&req, "timeline_id")?;
+    check_permissions(&req, Scope::PageServerApi)?;
+    maybe_rate_limit(&req, tenant_id).await;
+
+    let mut req = match maybe_forward(req).await {
+        ForwardOutcome::Forwarded(res) => {
+            return res;
+        }
+        ForwardOutcome::NotForwarded(req) => req,
+    };
+
+    let import_req = json_request::<TimelineImportRequest>(&mut req).await?;
+
+    let state = get_state(&req);
+
+    if import_req.tenant_id != tenant_id || import_req.timeline_id != timeline_id {
+        return Err(ApiError::BadRequest(anyhow::anyhow!(
+            "tenant id or timeline id mismatch: url={tenant_id}/{timeline_id}, body={}/{}",
+            import_req.tenant_id,
+            import_req.timeline_id
+        )));
+    }
+
+    json_response(
+        StatusCode::OK,
+        state.service.timeline_import(import_req).await?,
+    )
+}
+
 async fn handle_tenants_dump(req: Request<Body>) -> Result<Response<Body>, ApiError> {
     check_permissions(&req, Scope::Admin)?;
 
@@ -1949,6 +1991,16 @@ pub fn make_router(
                 RequestName("debug_v1_tenant_locate"),
             )
         })
+        .post(
+            "/debug/v1/tenant/:tenant_id/timeline/:timeline_id/import",
+            |r| {
+                named_request_span(
+                    r,
+                    handle_timeline_import,
+                    RequestName("debug_v1_timeline_import"),
+                )
+            },
+        )
         .get("/debug/v1/scheduler", |r| {
             named_request_span(r, handle_scheduler_dump, RequestName("debug_v1_scheduler"))
         })
diff --git a/storage_controller/src/main.rs b/storage_controller/src/main.rs
index 1aa9ae10aea6..a924e5b6c558 100644
--- a/storage_controller/src/main.rs
+++ b/storage_controller/src/main.rs
@@ -86,10 +86,6 @@ struct Cli {
     #[arg(long)]
     peer_jwt_token: Option<String>,
 
-    /// URL to control plane compute notification endpoint
-    #[arg(long)]
-    compute_hook_url: Option<String>,
-
     /// URL to control plane storage API prefix
     #[arg(long)]
     control_plane_url: Option<String>,
@@ -360,13 +356,11 @@ async fn async_main() -> anyhow::Result<()> {
                 "Insecure config!  One or more secrets is not set.  This is only permitted in `--dev` mode"
             );
         }
-        StrictMode::Strict
-            if args.compute_hook_url.is_none() && args.control_plane_url.is_none() =>
-        {
+        StrictMode::Strict if args.control_plane_url.is_none() => {
             // Production systems should always have a control plane URL set, to prevent falling
             // back to trying to use neon_local.
             anyhow::bail!(
-                "neither `--compute-hook-url` nor `--control-plane-url` are set: this is only permitted in `--dev` mode"
+                "`--control-plane-url` is not set: this is only permitted in `--dev` mode"
             );
         }
         StrictMode::Strict if args.use_local_compute_notifications => {
@@ -394,7 +388,6 @@ async fn async_main() -> anyhow::Result<()> {
         safekeeper_jwt_token: secrets.safekeeper_jwt_token,
         control_plane_jwt_token: secrets.control_plane_jwt_token,
         peer_jwt_token: secrets.peer_jwt_token,
-        compute_hook_url: args.compute_hook_url,
         control_plane_url: args.control_plane_url,
         max_offline_interval: args
             .max_offline_interval
@@ -472,6 +465,7 @@ async fn async_main() -> anyhow::Result<()> {
             let https_listener = tcp_listener::bind(https_addr)?;
 
             let resolver = ReloadingCertificateResolver::new(
+                "main",
                 &args.ssl_key_file,
                 &args.ssl_cert_file,
                 *args.ssl_cert_reload_period,
diff --git a/storage_controller/src/metrics.rs b/storage_controller/src/metrics.rs
index ea390df726a6..5ce2fb65e4bb 100644
--- a/storage_controller/src/metrics.rs
+++ b/storage_controller/src/metrics.rs
@@ -44,6 +44,15 @@ pub(crate) struct StorageControllerMetricGroup {
     /// Size of the in-memory map of pageserver_nodes
     pub(crate) storage_controller_pageserver_nodes: measured::Gauge,
 
+    /// Count of how many pageserver nodes from in-memory map have https configured
+    pub(crate) storage_controller_https_pageserver_nodes: measured::Gauge,
+
+    /// Size of the in-memory map of safekeeper_nodes
+    pub(crate) storage_controller_safekeeper_nodes: measured::Gauge,
+
+    /// Count of how many safekeeper nodes from in-memory map have https configured
+    pub(crate) storage_controller_https_safekeeper_nodes: measured::Gauge,
+
     /// Reconciler tasks completed, broken down by success/failure/cancelled
     pub(crate) storage_controller_reconcile_complete:
         measured::CounterVec<ReconcileCompleteLabelGroupSet>,
diff --git a/storage_controller/src/node.rs b/storage_controller/src/node.rs
index f667514517b0..e180c49b4319 100644
--- a/storage_controller/src/node.rs
+++ b/storage_controller/src/node.rs
@@ -89,6 +89,10 @@ impl Node {
         self.scheduling = scheduling
     }
 
+    pub(crate) fn has_https_port(&self) -> bool {
+        self.listen_https_port.is_some()
+    }
+
     /// Does this registration request match `self`?  This is used when deciding whether a registration
     /// request should be allowed to update an existing record with the same node ID.
     pub(crate) fn registration_match(&self, register_req: &NodeRegisterRequest) -> bool {
diff --git a/storage_controller/src/safekeeper.rs b/storage_controller/src/safekeeper.rs
index 3b731acf7e11..5a13ef750e30 100644
--- a/storage_controller/src/safekeeper.rs
+++ b/storage_controller/src/safekeeper.rs
@@ -89,6 +89,9 @@ impl Safekeeper {
     pub(crate) fn availability(&self) -> SafekeeperState {
         self.availability.clone()
     }
+    pub(crate) fn has_https_port(&self) -> bool {
+        self.listen_https_port.is_some()
+    }
     /// Perform an operation (which is given a [`SafekeeperClient`]) with retries
     #[allow(clippy::too_many_arguments)]
     pub(crate) async fn with_client_retries<T, O, F>(
diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs
index 50f642deaf7e..a02131347400 100644
--- a/storage_controller/src/service.rs
+++ b/storage_controller/src/service.rs
@@ -43,7 +43,7 @@ use pageserver_api::models::{
     TimelineInfo, TopTenantShardItem, TopTenantShardsRequest,
 };
 use pageserver_api::shard::{
-    ShardCount, ShardIdentity, ShardNumber, ShardStripeSize, TenantShardId,
+    DEFAULT_STRIPE_SIZE, ShardCount, ShardIdentity, ShardNumber, ShardStripeSize, TenantShardId,
 };
 use pageserver_api::upcall_api::{
     ReAttachRequest, ReAttachResponse, ReAttachResponseTenant, ValidateRequest, ValidateResponse,
@@ -61,7 +61,7 @@ use utils::completion::Barrier;
 use utils::generation::Generation;
 use utils::id::{NodeId, TenantId, TimelineId};
 use utils::lsn::Lsn;
-use utils::sync::gate::Gate;
+use utils::sync::gate::{Gate, GateGuard};
 use utils::{failpoint_support, pausable_failpoint};
 
 use crate::background_node_operations::{
@@ -357,18 +357,10 @@ pub struct Config {
     // This JWT token will be used to authenticate with other storage controller instances
     pub peer_jwt_token: Option<String>,
 
-    /// Where the compute hook should send notifications of pageserver attachment locations
-    /// (this URL points to the control plane in prod). If this is None, the compute hook will
-    /// assume it is running in a test environment and try to update neon_local.
-    pub compute_hook_url: Option<String>,
-
     /// Prefix for storage API endpoints of the control plane. We use this prefix to compute
     /// URLs that we use to send pageserver and safekeeper attachment locations.
     /// If this is None, the compute hook will assume it is running in a test environment
     /// and try to invoke neon_local instead.
-    ///
-    /// For now, there is also `compute_hook_url` which allows configuration of the pageserver
-    /// specific endpoint, but it is in the process of being phased out.
     pub control_plane_url: Option<String>,
 
     /// Grace period within which a pageserver does not respond to heartbeats, but is still
@@ -594,6 +586,8 @@ struct TenantShardSplitAbort {
     new_stripe_size: Option<ShardStripeSize>,
     /// Until this abort op is complete, no other operations may be done on the tenant
     _tenant_lock: TracingExclusiveGuard<TenantOperations>,
+    /// The reconciler gate for the duration of the split operation, and any included abort.
+    _gate: GateGuard,
 }
 
 #[derive(thiserror::Error, Debug)]
@@ -1460,7 +1454,7 @@ impl Service {
             // Retry until shutdown: we must keep this request object alive until it is properly
             // processed, as it holds a lock guard that prevents other operations trying to do things
             // to the tenant while it is in a weird part-split state.
-            while !self.cancel.is_cancelled() {
+            while !self.reconcilers_cancel.is_cancelled() {
                 match self.abort_tenant_shard_split(&op).await {
                     Ok(_) => break,
                     Err(e) => {
@@ -1473,9 +1467,12 @@ impl Service {
                         // when we retry, so that the abort op will succeed.  If the abort op is failing
                         // for some other reason, we will keep retrying forever, or until a human notices
                         // and does something about it (either fixing a pageserver or restarting the controller).
-                        tokio::time::timeout(Duration::from_secs(5), self.cancel.cancelled())
-                            .await
-                            .ok();
+                        tokio::time::timeout(
+                            Duration::from_secs(5),
+                            self.reconcilers_cancel.cancelled(),
+                        )
+                        .await
+                        .ok();
                     }
                 }
             }
@@ -1509,6 +1506,10 @@ impl Service {
             .metrics_group
             .storage_controller_pageserver_nodes
             .set(nodes.len() as i64);
+        metrics::METRICS_REGISTRY
+            .metrics_group
+            .storage_controller_https_pageserver_nodes
+            .set(nodes.values().filter(|n| n.has_https_port()).count() as i64);
 
         tracing::info!("Loading safekeepers from database...");
         let safekeepers = persistence
@@ -1526,6 +1527,14 @@ impl Service {
         let safekeepers: HashMap<NodeId, Safekeeper> =
             safekeepers.into_iter().map(|n| (n.get_id(), n)).collect();
         tracing::info!("Loaded {} safekeepers from database.", safekeepers.len());
+        metrics::METRICS_REGISTRY
+            .metrics_group
+            .storage_controller_safekeeper_nodes
+            .set(safekeepers.len() as i64);
+        metrics::METRICS_REGISTRY
+            .metrics_group
+            .storage_controller_https_safekeeper_nodes
+            .set(safekeepers.values().filter(|s| s.has_https_port()).count() as i64);
 
         tracing::info!("Loading shards from database...");
         let mut tenant_shard_persistence = persistence.load_active_tenant_shards().await?;
@@ -1835,6 +1844,7 @@ impl Service {
         };
 
         if insert {
+            let config = attach_req.config.clone().unwrap_or_default();
             let tsp = TenantShardPersistence {
                 tenant_id: attach_req.tenant_shard_id.tenant_id.to_string(),
                 shard_number: attach_req.tenant_shard_id.shard_number.0 as i32,
@@ -1843,7 +1853,7 @@ impl Service {
                 generation: attach_req.generation_override.or(Some(0)),
                 generation_pageserver: None,
                 placement_policy: serde_json::to_string(&PlacementPolicy::Attached(0)).unwrap(),
-                config: serde_json::to_string(&TenantConfig::default()).unwrap(),
+                config: serde_json::to_string(&config).unwrap(),
                 splitting: SplitState::default(),
                 scheduling_policy: serde_json::to_string(&ShardSchedulingPolicy::default())
                     .unwrap(),
@@ -1866,16 +1876,16 @@ impl Service {
                 Ok(()) => {
                     tracing::info!("Inserted shard {} in database", attach_req.tenant_shard_id);
 
-                    let mut locked = self.inner.write().unwrap();
-                    locked.tenants.insert(
+                    let mut shard = TenantShard::new(
                         attach_req.tenant_shard_id,
-                        TenantShard::new(
-                            attach_req.tenant_shard_id,
-                            ShardIdentity::unsharded(),
-                            PlacementPolicy::Attached(0),
-                            None,
-                        ),
+                        ShardIdentity::unsharded(),
+                        PlacementPolicy::Attached(0),
+                        None,
                     );
+                    shard.config = config;
+
+                    let mut locked = self.inner.write().unwrap();
+                    locked.tenants.insert(attach_req.tenant_shard_id, shard);
                     tracing::info!("Inserted shard {} in memory", attach_req.tenant_shard_id);
                 }
             }
@@ -1960,11 +1970,12 @@ impl Service {
             .set_attached(scheduler, attach_req.node_id);
 
         tracing::info!(
-            "attach_hook: tenant {} set generation {:?}, pageserver {}",
+            "attach_hook: tenant {} set generation {:?}, pageserver {}, config {:?}",
             attach_req.tenant_shard_id,
             tenant_shard.generation,
             // TODO: this is an odd number of 0xf's
-            attach_req.node_id.unwrap_or(utils::id::NodeId(0xfffffff))
+            attach_req.node_id.unwrap_or(utils::id::NodeId(0xfffffff)),
+            attach_req.config,
         );
 
         // Trick the reconciler into not doing anything for this tenant: this helps
@@ -2742,7 +2753,7 @@ impl Service {
                         count: tenant_shard_id.shard_count,
                         // We only import un-sharded or single-sharded tenants, so stripe
                         // size can be made up arbitrarily here.
-                        stripe_size: ShardParameters::DEFAULT_STRIPE_SIZE,
+                        stripe_size: DEFAULT_STRIPE_SIZE,
                     },
                     placement_policy: Some(placement_policy),
                     config: req.config.tenant_conf,
@@ -4898,7 +4909,7 @@ impl Service {
                     1,
                     10,
                     Duration::from_secs(5),
-                    &self.cancel,
+                    &self.reconcilers_cancel,
                 )
                 .await
             {
@@ -5149,6 +5160,11 @@ impl Service {
         )
         .await;
 
+        let _gate = self
+            .reconcilers_gate
+            .enter()
+            .map_err(|_| ApiError::ShuttingDown)?;
+
         let new_shard_count = ShardCount::new(split_req.new_shard_count);
         let new_stripe_size = split_req.new_stripe_size;
 
@@ -5176,6 +5192,7 @@ impl Service {
                         new_shard_count,
                         new_stripe_size,
                         _tenant_lock,
+                        _gate,
                     })
                     // Ignore error sending: that just means we're shutting down: aborts are ephemeral so it's fine to drop it.
                     .ok();
@@ -5515,7 +5532,10 @@ impl Service {
                 "failpoint".to_string()
             )));
 
-            failpoint_support::sleep_millis_async!("shard-split-post-remote-sleep", &self.cancel);
+            failpoint_support::sleep_millis_async!(
+                "shard-split-post-remote-sleep",
+                &self.reconcilers_cancel
+            );
 
             tracing::info!(
                 "Split {} into {}",
@@ -5573,7 +5593,7 @@ impl Service {
                         stripe_size,
                         preferred_az: preferred_az_id.as_ref().map(Cow::Borrowed),
                     },
-                    &self.cancel,
+                    &self.reconcilers_cancel,
                 )
                 .await
             {
@@ -6014,9 +6034,21 @@ impl Service {
             .max()
             .expect("We already validated >0 shards");
 
-        // FIXME: we have no way to recover the shard stripe size from contents of remote storage: this will
-        // only work if they were using the default stripe size.
-        let stripe_size = ShardParameters::DEFAULT_STRIPE_SIZE;
+        // Find the tenant's stripe size. This wasn't always persisted in the tenant manifest, so
+        // fall back to the original default stripe size of 32768 (256 MB) if it's not specified.
+        const ORIGINAL_STRIPE_SIZE: ShardStripeSize = ShardStripeSize(32768);
+        let stripe_size = scan_result
+            .shards
+            .iter()
+            .find(|s| s.tenant_shard_id.shard_count == shard_count && s.generation == generation)
+            .expect("we validated >0 shards above")
+            .stripe_size
+            .unwrap_or_else(|| {
+                if shard_count.count() > 1 {
+                    warn!("unknown stripe size, assuming {ORIGINAL_STRIPE_SIZE}");
+                }
+                ORIGINAL_STRIPE_SIZE
+            });
 
         let (response, waiters) = self
             .do_tenant_create(TenantCreateRequest {
@@ -6242,6 +6274,10 @@ impl Service {
             .metrics_group
             .storage_controller_pageserver_nodes
             .set(locked.nodes.len() as i64);
+        metrics::METRICS_REGISTRY
+            .metrics_group
+            .storage_controller_https_pageserver_nodes
+            .set(locked.nodes.values().filter(|n| n.has_https_port()).count() as i64);
 
         locked.scheduler.node_remove(node_id);
 
@@ -6333,6 +6369,10 @@ impl Service {
                     .metrics_group
                     .storage_controller_pageserver_nodes
                     .set(nodes.len() as i64);
+                metrics::METRICS_REGISTRY
+                    .metrics_group
+                    .storage_controller_https_pageserver_nodes
+                    .set(nodes.values().filter(|n| n.has_https_port()).count() as i64);
             }
         }
 
@@ -6557,6 +6597,10 @@ impl Service {
             .metrics_group
             .storage_controller_pageserver_nodes
             .set(locked.nodes.len() as i64);
+        metrics::METRICS_REGISTRY
+            .metrics_group
+            .storage_controller_https_pageserver_nodes
+            .set(locked.nodes.values().filter(|n| n.has_https_port()).count() as i64);
 
         match registration_status {
             RegistrationStatus::New => {
@@ -7270,7 +7314,7 @@ impl Service {
             }
 
             // Eventual consistency: if an earlier reconcile job failed, and the shard is still
-            // dirty, spawn another rone
+            // dirty, spawn another one
             if self
                 .maybe_reconcile_shard(shard, &pageservers, ReconcilerPriority::Normal)
                 .is_some()
@@ -7829,7 +7873,7 @@ impl Service {
         // old, persisted stripe size.
         let new_stripe_size = match candidate.id.shard_count.count() {
             0 => panic!("invalid shard count 0"),
-            1 => Some(ShardParameters::DEFAULT_STRIPE_SIZE),
+            1 => Some(DEFAULT_STRIPE_SIZE),
             2.. => None,
         };
 
@@ -8634,9 +8678,24 @@ impl Service {
         failpoint_support::sleep_millis_async!("sleep-on-step-down-handling");
 
         self.inner.write().unwrap().step_down();
-        // TODO: would it make sense to have a time-out for this?
-        self.stop_reconciliations(StopReconciliationsReason::SteppingDown)
-            .await;
+
+        // Wait for reconciliations to stop, or terminate this process if they
+        // fail to stop in time (this indicates a bug in shutdown)
+        tokio::select! {
+            _ = self.stop_reconciliations(StopReconciliationsReason::SteppingDown) => {
+                tracing::info!("Reconciliations stopped, proceeding with step down");
+            }
+            _ = async {
+                failpoint_support::sleep_millis_async!("step-down-delay-timeout");
+                tokio::time::sleep(Duration::from_secs(10)).await
+            } => {
+                tracing::warn!("Step down timed out while waiting for reconciliation gate, terminating process");
+
+                // The caller may proceed to act as leader when it sees this request fail: reduce the chance
+                // of a split-brain situation by terminating this controller instead of leaving it up in a partially-shut-down state.
+                std::process::exit(1);
+            }
+        }
 
         let mut global_observed = GlobalObservedState::default();
         let locked = self.inner.read().unwrap();
diff --git a/storage_controller/src/service/safekeeper_service.rs b/storage_controller/src/service/safekeeper_service.rs
index 7f2c63b9afcc..a23b9a4a0260 100644
--- a/storage_controller/src/service/safekeeper_service.rs
+++ b/storage_controller/src/service/safekeeper_service.rs
@@ -5,19 +5,23 @@ use std::time::Duration;
 
 use super::safekeeper_reconciler::ScheduleRequest;
 use crate::heartbeater::SafekeeperState;
+use crate::metrics;
 use crate::persistence::{
     DatabaseError, SafekeeperTimelineOpKind, TimelinePendingOpPersistence, TimelinePersistence,
 };
 use crate::safekeeper::Safekeeper;
 use anyhow::Context;
 use http_utils::error::ApiError;
-use pageserver_api::controller_api::{SafekeeperDescribeResponse, SkSchedulingPolicy};
+use pageserver_api::controller_api::{
+    SafekeeperDescribeResponse, SkSchedulingPolicy, TimelineImportRequest,
+};
 use pageserver_api::models::{self, SafekeeperInfo, SafekeepersInfo, TimelineInfo};
 use safekeeper_api::membership::{MemberSet, SafekeeperId};
 use tokio::task::JoinSet;
 use tokio_util::sync::CancellationToken;
 use utils::id::{NodeId, TenantId, TimelineId};
 use utils::logging::SecretString;
+use utils::lsn::Lsn;
 
 use super::Service;
 
@@ -297,6 +301,31 @@ impl Service {
             timeline_id,
         })
     }
+
+    /// Directly insert the timeline into the database without reconciling it with safekeepers.
+    ///
+    /// Useful if the timeline already exists on the specified safekeepers,
+    /// but we want to make it storage controller managed.
+    pub(crate) async fn timeline_import(&self, req: TimelineImportRequest) -> Result<(), ApiError> {
+        let persistence = TimelinePersistence {
+            tenant_id: req.tenant_id.to_string(),
+            timeline_id: req.timeline_id.to_string(),
+            start_lsn: Lsn::INVALID.into(),
+            generation: 1,
+            sk_set: req.sk_set.iter().map(|sk_id| sk_id.0 as i64).collect(),
+            new_sk_set: None,
+            cplane_notified_generation: 1,
+            deleted_at: None,
+        };
+        let inserted = self.persistence.insert_timeline(persistence).await?;
+        if inserted {
+            tracing::info!("imported timeline into db");
+        } else {
+            tracing::info!("didn't import timeline into db, as it is already present in db");
+        }
+        Ok(())
+    }
+
     /// Perform timeline deletion on safekeepers. Will return success: we persist the deletion into the reconciler.
     pub(super) async fn tenant_timeline_delete_safekeepers(
         self: &Arc<Self>,
@@ -590,6 +619,20 @@ impl Service {
                 }
             }
             locked.safekeepers = Arc::new(safekeepers);
+            metrics::METRICS_REGISTRY
+                .metrics_group
+                .storage_controller_safekeeper_nodes
+                .set(locked.safekeepers.len() as i64);
+            metrics::METRICS_REGISTRY
+                .metrics_group
+                .storage_controller_https_safekeeper_nodes
+                .set(
+                    locked
+                        .safekeepers
+                        .values()
+                        .filter(|s| s.has_https_port())
+                        .count() as i64,
+                );
         }
         Ok(())
     }
diff --git a/storage_controller/src/tenant_shard.rs b/storage_controller/src/tenant_shard.rs
index 8424c65aba4c..3a75e96cb2b5 100644
--- a/storage_controller/src/tenant_shard.rs
+++ b/storage_controller/src/tenant_shard.rs
@@ -2000,7 +2000,7 @@ pub(crate) mod tests {
     use std::rc::Rc;
 
     use pageserver_api::controller_api::NodeAvailability;
-    use pageserver_api::shard::{ShardCount, ShardNumber};
+    use pageserver_api::shard::{DEFAULT_STRIPE_SIZE, ShardCount, ShardNumber};
     use rand::SeedableRng;
     use rand::rngs::StdRng;
     use utils::id::TenantId;
@@ -2012,6 +2012,7 @@ pub(crate) mod tests {
         let tenant_id = TenantId::generate();
         let shard_number = ShardNumber(0);
         let shard_count = ShardCount::new(1);
+        let stripe_size = DEFAULT_STRIPE_SIZE;
 
         let tenant_shard_id = TenantShardId {
             tenant_id,
@@ -2020,12 +2021,7 @@ pub(crate) mod tests {
         };
         TenantShard::new(
             tenant_shard_id,
-            ShardIdentity::new(
-                shard_number,
-                shard_count,
-                pageserver_api::shard::ShardStripeSize(32768),
-            )
-            .unwrap(),
+            ShardIdentity::new(shard_number, shard_count, stripe_size).unwrap(),
             policy,
             None,
         )
@@ -2045,6 +2041,7 @@ pub(crate) mod tests {
         shard_count: ShardCount,
         preferred_az: Option<AvailabilityZone>,
     ) -> Vec<TenantShard> {
+        let stripe_size = DEFAULT_STRIPE_SIZE;
         (0..shard_count.count())
             .map(|i| {
                 let shard_number = ShardNumber(i);
@@ -2056,12 +2053,7 @@ pub(crate) mod tests {
                 };
                 TenantShard::new(
                     tenant_shard_id,
-                    ShardIdentity::new(
-                        shard_number,
-                        shard_count,
-                        pageserver_api::shard::ShardStripeSize(32768),
-                    )
-                    .unwrap(),
+                    ShardIdentity::new(shard_number, shard_count, stripe_size).unwrap(),
                     policy.clone(),
                     preferred_az.clone(),
                 )
diff --git a/test_runner/fixtures/metrics.py b/test_runner/fixtures/metrics.py
index df500544dc70..879808b7baae 100644
--- a/test_runner/fixtures/metrics.py
+++ b/test_runner/fixtures/metrics.py
@@ -194,6 +194,7 @@ def counter(name: str) -> str:
     counter("pageserver_wait_lsn_started_count"),
     counter("pageserver_wait_lsn_finished_count"),
     counter("pageserver_wait_ondemand_download_seconds_sum"),
+    counter("pageserver_page_service_batch_break_reason"),
     *histogram("pageserver_page_service_batch_size"),
     *histogram("pageserver_page_service_pagestream_batch_wait_time_seconds"),
     *PAGESERVER_PER_TENANT_REMOTE_TIMELINE_CLIENT_METRICS,
diff --git a/test_runner/fixtures/neon_cli.py b/test_runner/fixtures/neon_cli.py
index d555ee298915..5f5626fb98c0 100644
--- a/test_runner/fixtures/neon_cli.py
+++ b/test_runner/fixtures/neon_cli.py
@@ -417,6 +417,19 @@ def storage_controller_stop(self, immediate: bool, instance_id: int | None = Non
             cmd.append(f"--instance-id={instance_id}")
         return self.raw_cli(cmd)
 
+    def object_storage_start(self, timeout_in_seconds: int | None = None):
+        cmd = ["object-storage", "start"]
+        if timeout_in_seconds is not None:
+            cmd.append(f"--start-timeout={timeout_in_seconds}s")
+        return self.raw_cli(cmd)
+
+    def object_storage_stop(self, immediate: bool):
+        cmd = ["object-storage", "stop"]
+        if immediate:
+            cmd.extend(["-m", "immediate"])
+        return self.raw_cli(cmd)
+        pass
+
     def pageserver_start(
         self,
         id: int,
diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py
index 5694bf170e36..10bbb7020bf1 100644
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -14,6 +14,7 @@
 import time
 import uuid
 from collections import defaultdict
+from collections.abc import Mapping
 from contextlib import closing, contextmanager
 from dataclasses import dataclass
 from datetime import datetime
@@ -79,7 +80,12 @@
     default_remote_storage,
     remote_storage_to_toml_dict,
 )
-from fixtures.safekeeper.http import SafekeeperHttpClient
+from fixtures.safekeeper.http import (
+    MembershipConfiguration,
+    SafekeeperHttpClient,
+    SafekeeperId,
+    TimelineCreateRequest,
+)
 from fixtures.safekeeper.utils import wait_walreceivers_absent
 from fixtures.utils import (
     ATTACHMENT_NAME_REGEX,
@@ -941,6 +947,8 @@ def cleanup_local_storage(self):
                     continue
                 if SMALL_DB_FILE_NAME_REGEX.fullmatch(test_file.name):
                     continue
+                if FINAL_METRICS_FILE_NAME == test_file.name:
+                    continue
                 log.debug(f"Removing large database {test_file} file")
                 test_file.unlink()
             elif test_entry.is_dir():
@@ -1023,6 +1031,8 @@ def __exit__(
 
             self.env.broker.assert_no_errors()
 
+            self.env.object_storage.assert_no_errors()
+
         try:
             self.overlay_cleanup_teardown()
         except Exception as e:
@@ -1118,6 +1128,8 @@ def __init__(self, config: NeonEnvBuilder):
             pagectl_env_vars["RUST_LOG"] = self.rust_log_override
         self.pagectl = Pagectl(extra_env=pagectl_env_vars, binpath=self.neon_binpath)
 
+        self.object_storage = ObjectStorage(self)
+
         # The URL for the pageserver to use as its control_plane_api config
         if config.storage_controller_port_override is not None:
             log.info(
@@ -1173,6 +1185,7 @@ def __init__(self, config: NeonEnvBuilder):
             },
             "safekeepers": [],
             "pageservers": [],
+            "object_storage": {"port": self.port_distributor.get_port()},
             "generate_local_ssl_certs": self.generate_local_ssl_certs,
         }
 
@@ -1244,6 +1257,7 @@ def __init__(self, config: NeonEnvBuilder):
                 "mode": "pipelined",
                 "execution": "concurrent-futures",
                 "max_batch_size": 32,
+                "batching": "scattered-lsn",
             }
 
             get_vectored_concurrent_io = self.pageserver_get_vectored_concurrent_io
@@ -1408,6 +1422,8 @@ def start(self, timeout_in_seconds: int | None = None):
                 self.storage_controller.on_safekeeper_deploy(sk_id, body)
                 self.storage_controller.safekeeper_scheduling_policy(sk_id, "Active")
 
+        self.object_storage.start(timeout_in_seconds=timeout_in_seconds)
+
     def stop(self, immediate=False, ps_assert_metric_no_errors=False, fail_on_endpoint_errors=True):
         """
         After this method returns, there should be no child processes running.
@@ -1425,6 +1441,8 @@ def stop(self, immediate=False, ps_assert_metric_no_errors=False, fail_on_endpoi
         except Exception as e:
             raise_later = e
 
+        self.object_storage.stop(immediate=immediate)
+
         # Stop storage controller before pageservers: we don't want it to spuriously
         # detect a pageserver "failure" during test teardown
         self.storage_controller.stop(immediate=immediate)
@@ -1441,6 +1459,12 @@ def stop(self, immediate=False, ps_assert_metric_no_errors=False, fail_on_endpoi
                 except Exception as e:
                     metric_errors.append(e)
                     log.error(f"metric validation failed on {pageserver.id}: {e}")
+
+            try:
+                pageserver.snapshot_final_metrics()
+            except Exception as e:
+                log.error(f"metric snapshot failed on {pageserver.id}: {e}")
+
             try:
                 pageserver.stop(immediate=immediate)
             except RuntimeError:
@@ -1971,10 +1995,13 @@ def attach_hook_issue(
         tenant_shard_id: TenantId | TenantShardId,
         pageserver_id: int,
         generation_override: int | None = None,
+        config: None | dict[str, Any] = None,
     ) -> int:
         body = {"tenant_shard_id": str(tenant_shard_id), "node_id": pageserver_id}
         if generation_override is not None:
             body["generation_override"] = generation_override
+        if config is not None:
+            body["config"] = config
 
         response = self.request(
             "POST",
@@ -2635,6 +2662,26 @@ def __exit__(
         self.stop(immediate=True)
 
 
+class ObjectStorage(LogUtils):
+    def __init__(self, env: NeonEnv):
+        service_dir = env.repo_dir / "object_storage"
+        super().__init__(logfile=service_dir / "object_storage.log")
+        self.conf_path = service_dir / "object_storage.json"
+        self.env = env
+
+    def base_url(self):
+        return json.loads(self.conf_path.read_text())["listen"]
+
+    def start(self, timeout_in_seconds: int | None = None):
+        self.env.neon_cli.object_storage_start(timeout_in_seconds)
+
+    def stop(self, immediate: bool = False):
+        self.env.neon_cli.object_storage_stop(immediate)
+
+    def assert_no_errors(self):
+        assert_no_errors(self.logfile, "object_storage", [])
+
+
 class NeonProxiedStorageController(NeonStorageController):
     def __init__(self, env: NeonEnv, proxy_port: int, auth_enabled: bool, use_https: bool):
         super().__init__(env, proxy_port, auth_enabled, use_https)
@@ -2849,13 +2896,14 @@ def restart(
         self,
         immediate: bool = False,
         timeout_in_seconds: int | None = None,
+        extra_env_vars: dict[str, str] | None = None,
     ):
         """
         High level wrapper for restart: restarts the process, and waits for
         tenant state to stabilize.
         """
         self.stop(immediate=immediate)
-        self.start(timeout_in_seconds=timeout_in_seconds)
+        self.start(timeout_in_seconds=timeout_in_seconds, extra_env_vars=extra_env_vars)
         self.quiesce_tenants()
 
     def quiesce_tenants(self):
@@ -2932,6 +2980,20 @@ def assert_no_metric_errors(self):
             value = self.http_client().get_metric_value(metric)
             assert value == 0, f"Nonzero {metric} == {value}"
 
+    def snapshot_final_metrics(self):
+        """
+        Take a snapshot of this pageserver's metrics and stash in its work directory.
+        """
+        if not self.running:
+            log.info(f"Skipping metrics snapshot on pageserver {self.id}, it is not running")
+            return
+
+        metrics = self.http_client().get_metrics_str()
+        metrics_snapshot_path = self.workdir / FINAL_METRICS_FILE_NAME
+
+        with open(metrics_snapshot_path, "w") as f:
+            f.write(metrics)
+
     def tenant_attach(
         self,
         tenant_id: TenantId,
@@ -2944,11 +3006,12 @@ def tenant_attach(
         to call into the pageserver HTTP client.
         """
         client = self.http_client()
-        if generation is None:
-            generation = self.env.storage_controller.attach_hook_issue(tenant_id, self.id)
-        elif override_storage_controller_generation:
+        if generation is None or override_storage_controller_generation:
             generation = self.env.storage_controller.attach_hook_issue(
-                tenant_id, self.id, generation
+                tenant_id,
+                self.id,
+                generation_override=generation if override_storage_controller_generation else None,
+                config=config,
             )
         return client.tenant_attach(
             tenant_id,
@@ -4263,31 +4326,32 @@ def respec(self, **kwargs: Any) -> None:
     def respec_deep(self, **kwargs: Any) -> None:
         """
         Update the endpoint.json file taking into account nested keys.
-        It does one level deep update. Should enough for most cases.
         Distinct method from respec() to do not break existing functionality.
-        NOTE: This method also updates the spec.json file, not endpoint.json.
-        We need it because neon_local also writes to spec.json, so intended
+        NOTE: This method also updates the config.json file, not endpoint.json.
+        We need it because neon_local also writes to config.json, so intended
         use-case is i) start endpoint with some config, ii) respec_deep(),
         iii) call reconfigure() to apply the changes.
         """
-        config_path = os.path.join(self.endpoint_path(), "spec.json")
+
+        def update(curr, patch):
+            for k, v in patch.items():
+                if isinstance(v, Mapping):
+                    curr[k] = update(curr.get(k, {}), v)
+                else:
+                    curr[k] = v
+            return curr
+
+        config_path = os.path.join(self.endpoint_path(), "config.json")
         with open(config_path) as f:
-            data_dict: dict[str, Any] = json.load(f)
+            config: dict[str, Any] = json.load(f)
 
-        log.debug("Current compute spec: %s", json.dumps(data_dict, indent=4))
+        log.debug("Current compute config: %s", json.dumps(config, indent=4))
 
-        for key, value in kwargs.items():
-            if isinstance(value, dict):
-                if key not in data_dict:
-                    data_dict[key] = value
-                else:
-                    data_dict[key] = {**data_dict[key], **value}
-            else:
-                data_dict[key] = value
+        update(config, kwargs)
 
         with open(config_path, "w") as file:
-            log.debug("Updating compute spec to: %s", json.dumps(data_dict, indent=4))
-            json.dump(data_dict, file, indent=4)
+            log.debug("Updating compute config to: %s", json.dumps(config, indent=4))
+            json.dump(config, file, indent=4)
 
     def wait_for_migrations(self, wait_for: int = NUM_COMPUTE_MIGRATIONS) -> None:
         """
@@ -4304,7 +4368,7 @@ def check_migrations_done():
             wait_until(check_migrations_done)
 
     # Mock the extension part of spec passed from control plane for local testing
-    # endpooint.rs adds content of this file as a part of the spec.json
+    # endpooint.rs adds content of this file as a part of the config.json
     def create_remote_extension_spec(self, spec: dict[str, Any]):
         """Create a remote extension spec file for the endpoint."""
         remote_extensions_spec_path = os.path.join(
@@ -4810,6 +4874,50 @@ def paused():
 
         wait_until(paused)
 
+    @staticmethod
+    def sks_to_safekeeper_ids(sks: list[Safekeeper]) -> list[SafekeeperId]:
+        return [SafekeeperId(sk.id, "localhost", sk.port.pg_tenant_only) for sk in sks]
+
+    @staticmethod
+    def mconf_sks(env: NeonEnv, mconf: MembershipConfiguration) -> list[Safekeeper]:
+        """
+        List of Safekeepers which are members in `mconf`.
+        """
+        members_ids = [m.id for m in mconf.members]
+        new_members_ids = [m.id for m in mconf.new_members] if mconf.new_members is not None else []
+        return [sk for sk in env.safekeepers if sk.id in members_ids or sk.id in new_members_ids]
+
+    @staticmethod
+    def create_timeline(
+        tenant_id: TenantId,
+        timeline_id: TimelineId,
+        ps: NeonPageserver,
+        mconf: MembershipConfiguration,
+        members_sks: list[Safekeeper],
+    ):
+        """
+        Manually create timeline on safekeepers with given (presumably inital)
+        mconf: figure out LSN from pageserver, bake request and execute it on
+        given safekeepers.
+
+        Normally done by storcon, but some tests want to do it manually so far.
+        """
+        ps_http_cli = ps.http_client()
+        # figure out initial LSN.
+        ps_timeline_detail = ps_http_cli.timeline_detail(tenant_id, timeline_id)
+        init_lsn = ps_timeline_detail["last_record_lsn"]
+        log.info(f"initial LSN: {init_lsn}")
+        # sk timeline creation request expects minor version
+        pg_version = ps_timeline_detail["pg_version"] * 10000
+        # create inital mconf
+        create_r = TimelineCreateRequest(
+            tenant_id, timeline_id, mconf, pg_version, Lsn(init_lsn), commit_lsn=None
+        )
+        log.info(f"sending timeline create: {create_r.to_json()}")
+
+        for sk in members_sks:
+            sk.http_client().timeline_create(create_r)
+
 
 class NeonBroker(LogUtils):
     """An object managing storage_broker instance"""
@@ -5048,6 +5156,8 @@ def pytest_addoption(parser: Parser):
     r"config-v1|heatmap-v1|tenant-manifest|metadata|.+\.(?:toml|pid|json|sql|conf)"
 )
 
+FINAL_METRICS_FILE_NAME: str = "final_metrics.txt"
+
 
 SKIP_DIRS = frozenset(
     (
diff --git a/test_runner/fixtures/pageserver/common_types.py b/test_runner/fixtures/pageserver/common_types.py
index 0e068db59307..0a92883e9646 100644
--- a/test_runner/fixtures/pageserver/common_types.py
+++ b/test_runner/fixtures/pageserver/common_types.py
@@ -105,7 +105,7 @@ def parse_layer_file_name(file_name: str) -> LayerName:
     except InvalidFileName:
         pass
 
-    raise InvalidFileName("neither image nor delta layer")
+    raise InvalidFileName(f"neither image nor delta layer: {file_name}")
 
 
 def is_future_layer(layer_file_name: LayerName, disk_consistent_lsn: Lsn):
diff --git a/test_runner/fixtures/pageserver/many_tenants.py b/test_runner/fixtures/pageserver/many_tenants.py
index eedb693e3d62..71c750b9eb3b 100644
--- a/test_runner/fixtures/pageserver/many_tenants.py
+++ b/test_runner/fixtures/pageserver/many_tenants.py
@@ -65,13 +65,11 @@ def single_timeline(
     assert ps_http.tenant_list() == []
 
     def attach(tenant):
-        # NB: create the new tenant in the storage controller with the correct tenant config. This
-        # will pick up the existing tenant data from remote storage. If we just attach it to the
-        # Pageserver, the storage controller will reset the tenant config to the default.
-        env.create_tenant(
-            tenant_id=tenant,
-            timeline_id=template_timeline,
-            conf=template_config,
+        env.pageserver.tenant_attach(
+            tenant,
+            config=template_config,
+            generation=100,
+            override_storage_controller_generation=True,
         )
 
     with concurrent.futures.ThreadPoolExecutor(max_workers=22) as executor:
diff --git a/test_runner/fixtures/safekeeper/http.py b/test_runner/fixtures/safekeeper/http.py
index e409151b7604..839e985419e8 100644
--- a/test_runner/fixtures/safekeeper/http.py
+++ b/test_runner/fixtures/safekeeper/http.py
@@ -25,7 +25,7 @@ class Walreceiver:
 
 @dataclass
 class SafekeeperTimelineStatus:
-    mconf: Configuration | None
+    mconf: MembershipConfiguration | None
     term: int
     last_log_term: int
     pg_version: int  # Not exactly a PgVersion, safekeeper returns version as int, for example 150002 for 15.2
@@ -78,17 +78,17 @@ class SafekeeperId:
 
 
 @dataclass
-class Configuration:
+class MembershipConfiguration:
     generation: int
     members: list[SafekeeperId]
     new_members: list[SafekeeperId] | None
 
     @classmethod
-    def from_json(cls, d: dict[str, Any]) -> Configuration:
+    def from_json(cls, d: dict[str, Any]) -> MembershipConfiguration:
         generation = d["generation"]
         members = d["members"]
         new_members = d.get("new_members")
-        return Configuration(generation, members, new_members)
+        return MembershipConfiguration(generation, members, new_members)
 
     def to_json(self) -> str:
         return json.dumps(self, cls=EnhancedJSONEncoder)
@@ -98,7 +98,7 @@ def to_json(self) -> str:
 class TimelineCreateRequest:
     tenant_id: TenantId
     timeline_id: TimelineId
-    mconf: Configuration
+    mconf: MembershipConfiguration
     # not exactly PgVersion, for example 150002 for 15.2
     pg_version: int
     start_lsn: Lsn
@@ -110,13 +110,13 @@ def to_json(self) -> str:
 
 @dataclass
 class TimelineMembershipSwitchResponse:
-    previous_conf: Configuration
-    current_conf: Configuration
+    previous_conf: MembershipConfiguration
+    current_conf: MembershipConfiguration
 
     @classmethod
     def from_json(cls, d: dict[str, Any]) -> TimelineMembershipSwitchResponse:
-        previous_conf = Configuration.from_json(d["previous_conf"])
-        current_conf = Configuration.from_json(d["current_conf"])
+        previous_conf = MembershipConfiguration.from_json(d["previous_conf"])
+        current_conf = MembershipConfiguration.from_json(d["current_conf"])
         return TimelineMembershipSwitchResponse(previous_conf, current_conf)
 
 
@@ -194,7 +194,7 @@ def timeline_status(
         resj = res.json()
         walreceivers = [Walreceiver(wr["conn_id"], wr["status"]) for wr in resj["walreceivers"]]
         # It is always normally not None, it is allowed only to make forward compat tests happy.
-        mconf = Configuration.from_json(resj["mconf"]) if "mconf" in resj else None
+        mconf = MembershipConfiguration.from_json(resj["mconf"]) if "mconf" in resj else None
         return SafekeeperTimelineStatus(
             mconf=mconf,
             term=resj["acceptor_state"]["term"],
@@ -223,7 +223,9 @@ def get_commit_lsn(self, tenant_id: TenantId, timeline_id: TimelineId) -> Lsn:
         return self.timeline_status(tenant_id, timeline_id).commit_lsn
 
     # Get timeline membership configuration.
-    def get_membership(self, tenant_id: TenantId, timeline_id: TimelineId) -> Configuration:
+    def get_membership(
+        self, tenant_id: TenantId, timeline_id: TimelineId
+    ) -> MembershipConfiguration:
         # make mypy happy
         return self.timeline_status(tenant_id, timeline_id).mconf  # type: ignore
 
@@ -275,7 +277,7 @@ def pull_timeline(self, body: dict[str, Any]) -> dict[str, Any]:
         return res_json
 
     def timeline_exclude(
-        self, tenant_id: TenantId, timeline_id: TimelineId, to: Configuration
+        self, tenant_id: TenantId, timeline_id: TimelineId, to: MembershipConfiguration
     ) -> dict[str, Any]:
         res = self.put(
             f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/exclude",
@@ -287,7 +289,7 @@ def timeline_exclude(
         return res_json
 
     def membership_switch(
-        self, tenant_id: TenantId, timeline_id: TimelineId, to: Configuration
+        self, tenant_id: TenantId, timeline_id: TimelineId, to: MembershipConfiguration
     ) -> TimelineMembershipSwitchResponse:
         res = self.put(
             f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/membership",
diff --git a/test_runner/performance/pageserver/pagebench/test_large_slru_basebackup.py b/test_runner/performance/pageserver/pagebench/test_large_slru_basebackup.py
index efd423104d23..8af52dcbd05f 100644
--- a/test_runner/performance/pageserver/pagebench/test_large_slru_basebackup.py
+++ b/test_runner/performance/pageserver/pagebench/test_large_slru_basebackup.py
@@ -66,11 +66,11 @@ def record(metric, **kwargs):
 
     n_txns = 500000
 
-    def setup_wrapper(env: NeonEnv):
-        return setup_tenant_template(env, n_txns)
-
     env = setup_pageserver_with_tenants(
-        neon_env_builder, f"large_slru_count-{n_tenants}-{n_txns}", n_tenants, setup_wrapper
+        neon_env_builder,
+        f"large_slru_count-{n_tenants}-{n_txns}",
+        n_tenants,
+        lambda env: setup_tenant_template(env, n_txns),
     )
     run_benchmark(env, pg_bin, record, duration)
 
@@ -80,10 +80,6 @@ def setup_tenant_template(env: NeonEnv, n_txns: int):
         "gc_period": "0s",  # disable periodic gc
         "checkpoint_timeout": "10 years",
         "compaction_period": "0s",  # disable periodic compaction
-        "compaction_threshold": 10,
-        "compaction_target_size": 134217728,
-        "checkpoint_distance": 268435456,
-        "image_creation_threshold": 3,
     }
 
     template_tenant, template_timeline = env.create_tenant(set_default=True)
diff --git a/test_runner/performance/pageserver/test_page_service_batching.py b/test_runner/performance/pageserver/test_page_service_batching.py
index 2c27368001b3..b17ca772c9c7 100644
--- a/test_runner/performance/pageserver/test_page_service_batching.py
+++ b/test_runner/performance/pageserver/test_page_service_batching.py
@@ -1,5 +1,7 @@
+import concurrent.futures
 import dataclasses
 import json
+import threading
 import time
 from dataclasses import dataclass
 from pathlib import Path
@@ -28,38 +30,33 @@ class PageServicePipeliningConfigSerial(PageServicePipeliningConfig):
 class PageServicePipeliningConfigPipelined(PageServicePipeliningConfig):
     max_batch_size: int
     execution: str
+    batching: str
     mode: str = "pipelined"
 
 
-EXECUTION = ["concurrent-futures", "tasks"]
+EXECUTION = ["concurrent-futures"]
+BATCHING = ["uniform-lsn", "scattered-lsn"]
 
 NON_BATCHABLE: list[PageServicePipeliningConfig] = [PageServicePipeliningConfigSerial()]
 for max_batch_size in [1, 32]:
     for execution in EXECUTION:
-        NON_BATCHABLE.append(PageServicePipeliningConfigPipelined(max_batch_size, execution))
+        for batching in BATCHING:
+            NON_BATCHABLE.append(
+                PageServicePipeliningConfigPipelined(max_batch_size, execution, batching)
+            )
 
-BATCHABLE: list[PageServicePipeliningConfig] = [PageServicePipeliningConfigSerial()]
-for max_batch_size in [1, 2, 4, 8, 16, 32]:
+BATCHABLE: list[PageServicePipeliningConfig] = []
+for max_batch_size in [32]:
     for execution in EXECUTION:
-        BATCHABLE.append(PageServicePipeliningConfigPipelined(max_batch_size, execution))
+        for batching in BATCHING:
+            BATCHABLE.append(
+                PageServicePipeliningConfigPipelined(max_batch_size, execution, batching)
+            )
 
 
 @pytest.mark.parametrize(
     "tablesize_mib, pipelining_config, target_runtime, effective_io_concurrency, readhead_buffer_size, name",
     [
-        # non-batchable workloads
-        # (A separate benchmark will consider latency).
-        *[
-            (
-                50,
-                config,
-                TARGET_RUNTIME,
-                1,
-                128,
-                f"not batchable {dataclasses.asdict(config)}",
-            )
-            for config in NON_BATCHABLE
-        ],
         # batchable workloads should show throughput and CPU efficiency improvements
         *[
             (
@@ -137,7 +134,14 @@ def test_throughput(
 
     env = neon_env_builder.init_start()
     ps_http = env.pageserver.http_client()
-    endpoint = env.endpoints.create_start("main")
+    endpoint = env.endpoints.create_start(
+        "main",
+        config_lines=[
+            # minimal lfc & small shared buffers to force requests to pageserver
+            "neon.max_file_cache_size=1MB",
+            "shared_buffers=10MB",
+        ],
+    )
     conn = endpoint.connect()
     cur = conn.cursor()
 
@@ -155,7 +159,6 @@ def test_throughput(
     tablesize = tablesize_mib * 1024 * 1024
     npages = tablesize // (8 * 1024)
     cur.execute("INSERT INTO t SELECT generate_series(1, %s)", (npages,))
-    # TODO: can we force postgres to do sequential scans?
 
     #
     # Run the workload, collect `Metrics` before and after, calculate difference, normalize.
@@ -166,6 +169,7 @@ class Metrics:
         time: float
         pageserver_batch_size_histo_sum: float
         pageserver_batch_size_histo_count: float
+        pageserver_batch_breaks_reason_count: dict[str, int]
         compute_getpage_count: float
         pageserver_cpu_seconds_total: float
 
@@ -179,6 +183,10 @@ def __sub__(self, other: "Metrics") -> "Metrics":
                 compute_getpage_count=self.compute_getpage_count - other.compute_getpage_count,
                 pageserver_cpu_seconds_total=self.pageserver_cpu_seconds_total
                 - other.pageserver_cpu_seconds_total,
+                pageserver_batch_breaks_reason_count={
+                    reason: count - other.pageserver_batch_breaks_reason_count.get(reason, 0)
+                    for reason, count in self.pageserver_batch_breaks_reason_count.items()
+                },
             )
 
         def normalize(self, by) -> "Metrics":
@@ -188,6 +196,10 @@ def normalize(self, by) -> "Metrics":
                 pageserver_batch_size_histo_count=self.pageserver_batch_size_histo_count / by,
                 compute_getpage_count=self.compute_getpage_count / by,
                 pageserver_cpu_seconds_total=self.pageserver_cpu_seconds_total / by,
+                pageserver_batch_breaks_reason_count={
+                    reason: count / by
+                    for reason, count in self.pageserver_batch_breaks_reason_count.items()
+                },
             )
 
     def get_metrics() -> Metrics:
@@ -197,6 +209,20 @@ def get_metrics() -> Metrics:
             )
             compute_getpage_count = cur.fetchall()[0][0]
             pageserver_metrics = ps_http.get_metrics()
+            for name, samples in pageserver_metrics.metrics.items():
+                for sample in samples:
+                    log.info(f"{name=} labels={sample.labels} {sample.value}")
+
+            raw_batch_break_reason_count = pageserver_metrics.query_all(
+                "pageserver_page_service_batch_break_reason_total",
+                filter={"timeline_id": str(env.initial_timeline)},
+            )
+
+            batch_break_reason_count = {
+                sample.labels["reason"]: int(sample.value)
+                for sample in raw_batch_break_reason_count
+            }
+
             return Metrics(
                 time=time.time(),
                 pageserver_batch_size_histo_sum=pageserver_metrics.query_one(
@@ -205,34 +231,58 @@ def get_metrics() -> Metrics:
                 pageserver_batch_size_histo_count=pageserver_metrics.query_one(
                     "pageserver_page_service_batch_size_count"
                 ).value,
+                pageserver_batch_breaks_reason_count=batch_break_reason_count,
                 compute_getpage_count=compute_getpage_count,
                 pageserver_cpu_seconds_total=pageserver_metrics.query_one(
                     "libmetrics_process_cpu_seconds_highres"
                 ).value,
             )
 
-    def workload() -> Metrics:
+    def workload(disruptor_started: threading.Event) -> Metrics:
+        disruptor_started.wait()
         start = time.time()
         iters = 0
         while time.time() - start < target_runtime or iters < 2:
-            log.info("Seqscan %d", iters)
             if iters == 1:
                 # round zero for warming up
                 before = get_metrics()
-            cur.execute(
-                "select clear_buffer_cache()"
-            )  # TODO: what about LFC? doesn't matter right now because LFC isn't enabled by default in tests
             cur.execute("select sum(data::bigint) from t")
             assert cur.fetchall()[0][0] == npages * (npages + 1) // 2
             iters += 1
         after = get_metrics()
         return (after - before).normalize(iters - 1)
 
+    def disruptor(disruptor_started: threading.Event, stop_disruptor: threading.Event):
+        conn = endpoint.connect()
+        cur = conn.cursor()
+        iters = 0
+        while True:
+            cur.execute("SELECT pg_logical_emit_message(true, 'test', 'advancelsn')")
+            if stop_disruptor.is_set():
+                break
+            disruptor_started.set()
+            iters += 1
+            time.sleep(0.001)
+        return iters
+
     env.pageserver.patch_config_toml_nonrecursive(
         {"page_service_pipelining": dataclasses.asdict(pipelining_config)}
     )
-    env.pageserver.restart()
-    metrics = workload()
+
+    # set trace for log analysis below
+    env.pageserver.restart(extra_env_vars={"RUST_LOG": "info,pageserver::page_service=trace"})
+
+    log.info("Starting workload")
+
+    with concurrent.futures.ThreadPoolExecutor() as executor:
+        disruptor_started = threading.Event()
+        stop_disruptor = threading.Event()
+        disruptor_fut = executor.submit(disruptor, disruptor_started, stop_disruptor)
+        workload_fut = executor.submit(workload, disruptor_started)
+        metrics = workload_fut.result()
+        stop_disruptor.set()
+        ndisruptions = disruptor_fut.result()
+        log.info("Disruptor issued %d disrupting requests", ndisruptions)
 
     log.info("Results: %s", metrics)
 
@@ -249,7 +299,16 @@ def workload() -> Metrics:
     #
 
     for metric, value in dataclasses.asdict(metrics).items():
-        zenbenchmark.record(f"counters.{metric}", value, unit="", report=MetricReport.TEST_PARAM)
+        if metric == "pageserver_batch_breaks_reason_count":
+            assert isinstance(value, dict)
+            for reason, count in value.items():
+                zenbenchmark.record(
+                    f"counters.{metric}_{reason}", count, unit="", report=MetricReport.TEST_PARAM
+                )
+        else:
+            zenbenchmark.record(
+                f"counters.{metric}", value, unit="", report=MetricReport.TEST_PARAM
+            )
 
     zenbenchmark.record(
         "perfmetric.batching_factor",
@@ -262,7 +321,10 @@ def workload() -> Metrics:
 PRECISION_CONFIGS: list[PageServicePipeliningConfig] = [PageServicePipeliningConfigSerial()]
 for max_batch_size in [1, 32]:
     for execution in EXECUTION:
-        PRECISION_CONFIGS.append(PageServicePipeliningConfigPipelined(max_batch_size, execution))
+        for batching in BATCHING:
+            PRECISION_CONFIGS.append(
+                PageServicePipeliningConfigPipelined(max_batch_size, execution, batching)
+            )
 
 
 @pytest.mark.parametrize(
diff --git a/test_runner/performance/pageserver/util.py b/test_runner/performance/pageserver/util.py
index 7a6d88f79c77..b50659defc4f 100644
--- a/test_runner/performance/pageserver/util.py
+++ b/test_runner/performance/pageserver/util.py
@@ -40,6 +40,8 @@ def ensure_pageserver_ready_for_benchmarking(env: NeonEnv, n_tenants: int):
             for layer in info.historic_layers:
                 assert not layer.remote
 
+    env.storage_controller.reconcile_until_idle(timeout_secs=60)
+
     log.info("ready")
 
 
diff --git a/test_runner/performance/test_perf_oltp_large_tenant.py b/test_runner/performance/test_perf_oltp_large_tenant.py
index 957a4ec796ac..b45394d6271b 100644
--- a/test_runner/performance/test_perf_oltp_large_tenant.py
+++ b/test_runner/performance/test_perf_oltp_large_tenant.py
@@ -145,11 +145,14 @@ def run_database_maintenance(env: PgCompare):
                 END $$;
                 """
             )
-
-            log.info("start REINDEX TABLE CONCURRENTLY transaction.transaction")
-            with env.zenbenchmark.record_duration("reindex concurrently"):
-                cur.execute("REINDEX TABLE CONCURRENTLY transaction.transaction;")
-            log.info("finished REINDEX TABLE CONCURRENTLY transaction.transaction")
+            # in production a customer would likely use reindex concurrently
+            # but for our test we don't care about the downtime
+            # and it would just about double the time we report in the test
+            # because we need one more table scan for each index
+            log.info("start REINDEX TABLE transaction.transaction")
+            with env.zenbenchmark.record_duration("reindex"):
+                cur.execute("REINDEX TABLE transaction.transaction;")
+            log.info("finished REINDEX TABLE transaction.transaction")
 
 
 @pytest.mark.parametrize("custom_scripts", get_custom_scripts())
diff --git a/test_runner/performance/test_physical_replication.py b/test_runner/performance/test_physical_replication.py
index df5419f29268..16cdab155a60 100644
--- a/test_runner/performance/test_physical_replication.py
+++ b/test_runner/performance/test_physical_replication.py
@@ -64,8 +64,8 @@ def test_ro_replica_lag(
 
     project = neon_api.create_project(pg_version)
     project_id = project["project"]["id"]
-    log.info("Project ID: {}", project_id)
-    log.info("Primary endpoint ID: {}", project["project"]["endpoints"][0]["id"])
+    log.info("Project ID: %s", project_id)
+    log.info("Primary endpoint ID: %s", project["project"]["endpoints"][0]["id"])
     neon_api.wait_for_operation_to_finish(project_id)
     error_occurred = False
     try:
@@ -81,7 +81,7 @@ def test_ro_replica_lag(
             endpoint_type="read_only",
             settings={"pg_settings": {"hot_standby_feedback": "on"}},
         )
-        log.info("Replica endpoint ID: {}", replica["endpoint"]["id"])
+        log.info("Replica endpoint ID: %s", replica["endpoint"]["id"])
         replica_env = master_env.copy()
         replica_env["PGHOST"] = replica["endpoint"]["host"]
         neon_api.wait_for_operation_to_finish(project_id)
@@ -197,8 +197,8 @@ def test_replication_start_stop(
 
     project = neon_api.create_project(pg_version)
     project_id = project["project"]["id"]
-    log.info("Project ID: {}", project_id)
-    log.info("Primary endpoint ID: {}", project["project"]["endpoints"][0]["id"])
+    log.info("Project ID: %s", project_id)
+    log.info("Primary endpoint ID: %s", project["project"]["endpoints"][0]["id"])
     neon_api.wait_for_operation_to_finish(project_id)
     try:
         branch_id = project["branch"]["id"]
@@ -215,7 +215,7 @@ def test_replication_start_stop(
                 endpoint_type="read_only",
                 settings={"pg_settings": {"hot_standby_feedback": "on"}},
             )
-            log.info("Replica {} endpoint ID: {}", i + 1, replica["endpoint"]["id"])
+            log.info("Replica %d endpoint ID: %s", i + 1, replica["endpoint"]["id"])
             replicas.append(replica)
             neon_api.wait_for_operation_to_finish(project_id)
 
diff --git a/test_runner/pg_clients/rust/tokio-postgres/Cargo.lock b/test_runner/pg_clients/rust/tokio-postgres/Cargo.lock
index 027be03707b7..22c0e461b5c0 100644
--- a/test_runner/pg_clients/rust/tokio-postgres/Cargo.lock
+++ b/test_runner/pg_clients/rust/tokio-postgres/Cargo.lock
@@ -808,9 +808,9 @@ checksum = "1f3ccbac311fea05f86f61904b462b55fb3df8837a366dfc601a0161d0532f20"
 
 [[package]]
 name = "tokio"
-version = "1.38.0"
+version = "1.38.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "ba4f4a02a7a80d6f274636f0aa95c7e383b912d41fe721a31f29e29698585a4a"
+checksum = "68722da18b0fc4a05fdc1120b302b82051265792a1e1b399086e9b204b10ad3d"
 dependencies = [
  "backtrace",
  "bytes",
diff --git a/test_runner/regress/test_attach_tenant_config.py b/test_runner/regress/test_attach_tenant_config.py
index 5021cc4b1790..9b6930695c79 100644
--- a/test_runner/regress/test_attach_tenant_config.py
+++ b/test_runner/regress/test_attach_tenant_config.py
@@ -187,6 +187,7 @@ def test_fully_custom_config(positive_env: NeonEnv):
         },
         "rel_size_v2_enabled": False,  # test suite enables it by default as of https://github.com/neondatabase/neon/issues/11081, so, custom config means disabling it
         "gc_compaction_enabled": True,
+        "gc_compaction_verification": False,
         "gc_compaction_initial_threshold_kb": 1024000,
         "gc_compaction_ratio_percent": 200,
         "image_creation_preempt_threshold": 5,
diff --git a/test_runner/regress/test_compaction.py b/test_runner/regress/test_compaction.py
index 6789939e0c32..84d37de9f14b 100644
--- a/test_runner/regress/test_compaction.py
+++ b/test_runner/regress/test_compaction.py
@@ -38,12 +38,34 @@
     "compaction_target_size": 1024**2,
     "image_creation_threshold": 1,
     "image_creation_preempt_threshold": 1,
-    # compact more frequently
+    # Compact more frequently
     "compaction_threshold": 3,
     "compaction_upper_limit": 6,
     "lsn_lease_length": "0s",
 }
 
+PREEMPT_GC_COMPACTION_TENANT_CONF = {
+    "gc_period": "5s",
+    "compaction_period": "5s",
+    # Small checkpoint distance to create many layers
+    "checkpoint_distance": 1024**2,
+    # Compact small layers
+    "compaction_target_size": 1024**2,
+    "image_creation_threshold": 10000,  # Do not create image layers at all
+    "image_creation_preempt_threshold": 10000,
+    # Compact more frequently
+    "compaction_threshold": 3,
+    "compaction_upper_limit": 6,
+    "lsn_lease_length": "0s",
+    # Enable gc-compaction
+    "gc_compaction_enabled": "true",
+    "gc_compaction_initial_threshold_kb": 1024,  # At a small threshold
+    "gc_compaction_ratio_percent": 1,
+    # No PiTR interval and small GC horizon
+    "pitr_interval": "0s",
+    "gc_horizon": f"{1024**2}",
+}
+
 
 @skip_in_debug_build("only run with release build")
 @pytest.mark.parametrize(
@@ -140,6 +162,8 @@ def test_pageserver_compaction_preempt(
     conf = PREEMPT_COMPACTION_TENANT_CONF.copy()
     env = neon_env_builder.init_start(initial_tenant_conf=conf)
 
+    env.pageserver.allowed_errors.append(".*The timeline or pageserver is shutting down.*")
+
     tenant_id = env.initial_tenant
     timeline_id = env.initial_timeline
 
@@ -165,6 +189,41 @@ def test_pageserver_compaction_preempt(
     env.pageserver.assert_log_contains("resuming image layer creation")
 
 
+@skip_in_debug_build("only run with release build")
+def test_pageserver_gc_compaction_preempt(
+    neon_env_builder: NeonEnvBuilder,
+):
+    # Ideally we should be able to do unit tests for this, but we need real Postgres
+    # WALs in order to do unit testing...
+
+    conf = PREEMPT_GC_COMPACTION_TENANT_CONF.copy()
+    env = neon_env_builder.init_start(initial_tenant_conf=conf)
+
+    tenant_id = env.initial_tenant
+    timeline_id = env.initial_timeline
+
+    row_count = 200000
+    churn_rounds = 10
+
+    ps_http = env.pageserver.http_client()
+
+    workload = Workload(env, tenant_id, timeline_id)
+    workload.init(env.pageserver.id)
+
+    log.info("Writing initial data ...")
+    workload.write_rows(row_count, env.pageserver.id)
+
+    for i in range(1, churn_rounds + 1):
+        log.info(f"Running churn round {i}/{churn_rounds} ...")
+        workload.churn_rows(row_count, env.pageserver.id, upload=False)
+        workload.validate(env.pageserver.id)
+    ps_http.timeline_compact(tenant_id, timeline_id, wait_until_uploaded=True)
+    log.info("Validating at workload end ...")
+    workload.validate(env.pageserver.id)
+    # ensure gc_compaction gets preempted and then resumed
+    env.pageserver.assert_log_contains("preempt gc-compaction")
+
+
 @skip_in_debug_build("only run with release build")
 @pytest.mark.timeout(900)  # This test is slow with sanitizers enabled, especially on ARM
 @pytest.mark.parametrize(
diff --git a/test_runner/regress/test_compatibility.py b/test_runner/regress/test_compatibility.py
index ee96daca3328..e23b1e0bca98 100644
--- a/test_runner/regress/test_compatibility.py
+++ b/test_runner/regress/test_compatibility.py
@@ -148,9 +148,9 @@ def test_create_snapshot(
     env = neon_env_builder.init_start(
         initial_tenant_conf={
             # Miniature layers to enable generating non-trivial layer map without writing lots of data.
-            "checkpoint_distance": f"{128 * 1024}",
-            "compaction_threshold": "1",
-            "compaction_target_size": f"{128 * 1024}",
+            "checkpoint_distance": f"{256 * 1024}",
+            "compaction_threshold": "5",
+            "compaction_target_size": f"{256 * 1024}",
         }
     )
     endpoint = env.endpoints.create_start("main")
@@ -492,6 +492,13 @@ def __str__(self):
         PgVersion.V17,
         "https://neon-github-public-dev.s3.eu-central-1.amazonaws.com/compatibility-data-snapshots/2025-02-07-pgv17-nogenerations.tar.zst",
     ),
+    # Tenant manifest v1.
+    HistoricDataSet(
+        "2025-04-08-tenant-manifest-v1",
+        TenantId("c547c28588abf1d7b7139ff1f1158345"),
+        PgVersion.V17,
+        "https://neon-github-public-dev.s3.eu-central-1.amazonaws.com/compatibility-data-snapshots/2025-04-08-pgv17-tenant-manifest-v1.tar.zst",
+    ),
 ]
 
 
diff --git a/test_runner/regress/test_compute_catalog.py b/test_runner/regress/test_compute_catalog.py
index c1f05830b742..37208c9fff3d 100644
--- a/test_runner/regress/test_compute_catalog.py
+++ b/test_runner/regress/test_compute_catalog.py
@@ -90,10 +90,12 @@ def test_compute_catalog(neon_simple_env: NeonEnv):
     # and reconfigure the endpoint to create some test databases.
     endpoint.respec_deep(
         **{
-            "skip_pg_catalog_updates": False,
-            "cluster": {
-                "roles": TEST_ROLE_NAMES,
-                "databases": TEST_DB_NAMES,
+            "spec": {
+                "skip_pg_catalog_updates": False,
+                "cluster": {
+                    "roles": TEST_ROLE_NAMES,
+                    "databases": TEST_DB_NAMES,
+                },
             },
         }
     )
@@ -155,10 +157,12 @@ def test_compute_create_drop_dbs_and_roles(neon_simple_env: NeonEnv):
     # and reconfigure the endpoint to apply the changes.
     endpoint.respec_deep(
         **{
-            "skip_pg_catalog_updates": False,
-            "cluster": {
-                "roles": TEST_ROLE_NAMES,
-                "databases": TEST_DB_NAMES,
+            "spec": {
+                "skip_pg_catalog_updates": False,
+                "cluster": {
+                    "roles": TEST_ROLE_NAMES,
+                    "databases": TEST_DB_NAMES,
+                },
             },
         }
     )
@@ -196,12 +200,14 @@ def test_compute_create_drop_dbs_and_roles(neon_simple_env: NeonEnv):
 
     endpoint.respec_deep(
         **{
-            "skip_pg_catalog_updates": False,
-            "cluster": {
-                "roles": [],
-                "databases": [],
+            "spec": {
+                "skip_pg_catalog_updates": False,
+                "cluster": {
+                    "roles": [],
+                    "databases": [],
+                },
+                "delta_operations": delta_operations,
             },
-            "delta_operations": delta_operations,
         }
     )
     endpoint.reconfigure()
@@ -250,9 +256,11 @@ def test_dropdb_with_subscription(neon_simple_env: NeonEnv):
     # and reconfigure the endpoint to apply the changes.
     endpoint.respec_deep(
         **{
-            "skip_pg_catalog_updates": False,
-            "cluster": {
-                "databases": TEST_DB_NAMES,
+            "spec": {
+                "skip_pg_catalog_updates": False,
+                "cluster": {
+                    "databases": TEST_DB_NAMES,
+                },
             },
         }
     )
@@ -306,17 +314,19 @@ def test_dropdb_with_subscription(neon_simple_env: NeonEnv):
     # and reconfigure the endpoint to apply the changes.
     endpoint.respec_deep(
         **{
-            "skip_pg_catalog_updates": False,
-            "cluster": {
-                "databases": TEST_DB_NAMES_NEW,
+            "spec": {
+                "skip_pg_catalog_updates": False,
+                "cluster": {
+                    "databases": TEST_DB_NAMES_NEW,
+                },
+                "delta_operations": [
+                    {"action": "delete_db", "name": SUB_DB_NAME},
+                    # also test the case when we try to delete a non-existent database
+                    # shouldn't happen in normal operation,
+                    # but can occur when failed operations are retried
+                    {"action": "delete_db", "name": "nonexistent_db"},
+                ],
             },
-            "delta_operations": [
-                {"action": "delete_db", "name": SUB_DB_NAME},
-                # also test the case when we try to delete a non-existent database
-                # shouldn't happen in normal operation,
-                # but can occur when failed operations are retried
-                {"action": "delete_db", "name": "nonexistent_db"},
-            ],
         }
     )
 
@@ -354,25 +364,27 @@ def test_drop_role_with_table_privileges_from_neon_superuser(neon_simple_env: Ne
 
     endpoint.respec_deep(
         **{
-            "skip_pg_catalog_updates": False,
-            "cluster": {
-                "roles": [
-                    {
-                        # We need to create role via compute_ctl, because in this case it will receive
-                        # additional grants equivalent to our real environment, so we can repro some
-                        # issues.
-                        "name": "neon",
-                        # Some autocomplete-suggested hash, no specific meaning.
-                        "encrypted_password": "SCRAM-SHA-256$4096:hBT22QjqpydQWqEulorfXA==$miBogcoj68JWYdsNB5PW1X6PjSLBEcNuctuhtGkb4PY=:hxk2gxkwxGo6P7GCtfpMlhA9zwHvPMsCz+NQf2HfvWk=",
-                        "options": [],
-                    },
-                ],
-                "databases": [
-                    {
-                        "name": TEST_DB_NAME,
-                        "owner": "neon",
-                    },
-                ],
+            "spec": {
+                "skip_pg_catalog_updates": False,
+                "cluster": {
+                    "roles": [
+                        {
+                            # We need to create role via compute_ctl, because in this case it will receive
+                            # additional grants equivalent to our real environment, so we can repro some
+                            # issues.
+                            "name": "neon",
+                            # Some autocomplete-suggested hash, no specific meaning.
+                            "encrypted_password": "SCRAM-SHA-256$4096:hBT22QjqpydQWqEulorfXA==$miBogcoj68JWYdsNB5PW1X6PjSLBEcNuctuhtGkb4PY=:hxk2gxkwxGo6P7GCtfpMlhA9zwHvPMsCz+NQf2HfvWk=",
+                            "options": [],
+                        },
+                    ],
+                    "databases": [
+                        {
+                            "name": TEST_DB_NAME,
+                            "owner": "neon",
+                        },
+                    ],
+                },
             },
         }
     )
@@ -415,13 +427,15 @@ def test_drop_role_with_table_privileges_from_neon_superuser(neon_simple_env: Ne
     # Drop role via compute_ctl
     endpoint.respec_deep(
         **{
-            "skip_pg_catalog_updates": False,
-            "delta_operations": [
-                {
-                    "action": "delete_role",
-                    "name": TEST_GRANTEE,
-                },
-            ],
+            "spec": {
+                "skip_pg_catalog_updates": False,
+                "delta_operations": [
+                    {
+                        "action": "delete_role",
+                        "name": TEST_GRANTEE,
+                    },
+                ],
+            },
         }
     )
     endpoint.reconfigure()
@@ -444,13 +458,15 @@ def test_drop_role_with_table_privileges_from_neon_superuser(neon_simple_env: Ne
 
     endpoint.respec_deep(
         **{
-            "skip_pg_catalog_updates": False,
-            "delta_operations": [
-                {
-                    "action": "delete_role",
-                    "name": "readonly2",
-                },
-            ],
+            "spec": {
+                "skip_pg_catalog_updates": False,
+                "delta_operations": [
+                    {
+                        "action": "delete_role",
+                        "name": "readonly2",
+                    },
+                ],
+            },
         }
     )
     endpoint.reconfigure()
@@ -475,25 +491,27 @@ def test_drop_role_with_table_privileges_from_non_neon_superuser(neon_simple_env
     endpoint = env.endpoints.create_start("main")
     endpoint.respec_deep(
         **{
-            "skip_pg_catalog_updates": False,
-            "cluster": {
-                "roles": [
-                    {
-                        # We need to create role via compute_ctl, because in this case it will receive
-                        # additional grants equivalent to our real environment, so we can repro some
-                        # issues.
-                        "name": TEST_GRANTOR,
-                        # Some autocomplete-suggested hash, no specific meaning.
-                        "encrypted_password": "SCRAM-SHA-256$4096:hBT22QjqpydQWqEulorfXA==$miBogcoj68JWYdsNB5PW1X6PjSLBEcNuctuhtGkb4PY=:hxk2gxkwxGo6P7GCtfpMlhA9zwHvPMsCz+NQf2HfvWk=",
-                        "options": [],
-                    },
-                ],
-                "databases": [
-                    {
-                        "name": TEST_DB_NAME,
-                        "owner": TEST_GRANTOR,
-                    },
-                ],
+            "spec": {
+                "skip_pg_catalog_updates": False,
+                "cluster": {
+                    "roles": [
+                        {
+                            # We need to create role via compute_ctl, because in this case it will receive
+                            # additional grants equivalent to our real environment, so we can repro some
+                            # issues.
+                            "name": TEST_GRANTOR,
+                            # Some autocomplete-suggested hash, no specific meaning.
+                            "encrypted_password": "SCRAM-SHA-256$4096:hBT22QjqpydQWqEulorfXA==$miBogcoj68JWYdsNB5PW1X6PjSLBEcNuctuhtGkb4PY=:hxk2gxkwxGo6P7GCtfpMlhA9zwHvPMsCz+NQf2HfvWk=",
+                            "options": [],
+                        },
+                    ],
+                    "databases": [
+                        {
+                            "name": TEST_DB_NAME,
+                            "owner": TEST_GRANTOR,
+                        },
+                    ],
+                },
             },
         }
     )
@@ -507,13 +525,15 @@ def test_drop_role_with_table_privileges_from_non_neon_superuser(neon_simple_env
 
     endpoint.respec_deep(
         **{
-            "skip_pg_catalog_updates": False,
-            "delta_operations": [
-                {
-                    "action": "delete_role",
-                    "name": TEST_GRANTEE,
-                },
-            ],
+            "spec": {
+                "skip_pg_catalog_updates": False,
+                "delta_operations": [
+                    {
+                        "action": "delete_role",
+                        "name": TEST_GRANTEE,
+                    },
+                ],
+            },
         }
     )
     endpoint.reconfigure()
diff --git a/test_runner/regress/test_compute_reconfigure.py b/test_runner/regress/test_compute_reconfigure.py
index 6396ba67a10f..b533d45b1eeb 100644
--- a/test_runner/regress/test_compute_reconfigure.py
+++ b/test_runner/regress/test_compute_reconfigure.py
@@ -31,15 +31,17 @@ def test_compute_reconfigure(neon_simple_env: NeonEnv):
 
     endpoint.respec_deep(
         **{
-            "skip_pg_catalog_updates": True,
-            "cluster": {
-                "settings": [
-                    {
-                        "name": "log_line_prefix",
-                        "vartype": "string",
-                        "value": TEST_LOG_LINE_PREFIX,
-                    }
-                ]
+            "spec": {
+                "skip_pg_catalog_updates": True,
+                "cluster": {
+                    "settings": [
+                        {
+                            "name": "log_line_prefix",
+                            "vartype": "string",
+                            "value": TEST_LOG_LINE_PREFIX,
+                        }
+                    ]
+                },
             },
         }
     )
diff --git a/test_runner/regress/test_lsn_mapping.py b/test_runner/regress/test_lsn_mapping.py
index 7280a91a1249..c5a1bf0d16d5 100644
--- a/test_runner/regress/test_lsn_mapping.py
+++ b/test_runner/regress/test_lsn_mapping.py
@@ -276,3 +276,34 @@ def test_ts_of_lsn_api(neon_env_builder: NeonEnvBuilder):
             if i > 1:
                 before_timestamp = tbl[i - step_size][1]
                 assert timestamp >= before_timestamp, "before_timestamp before timestamp"
+
+
+def test_timestamp_of_lsn_empty_branch(neon_env_builder: NeonEnvBuilder):
+    """
+    Test that getting the timestamp of the head LSN of a newly created branch works.
+    This verifies that we don't get a 404 error when trying to get the timestamp
+    of the head LSN of a branch that was just created.
+    We now return a special status code 412 to indicate if there is no timestamp found for lsn.
+
+    Reproducer for https://github.com/neondatabase/neon/issues/11439
+    """
+    env = neon_env_builder.init_start()
+
+    # Create a new branch
+    new_timeline_id = env.create_branch("test_timestamp_of_lsn_empty_branch")
+
+    # Retrieve the commit LSN of the empty branch, which we have never run postgres on
+    detail = env.pageserver.http_client().timeline_detail(
+        tenant_id=env.initial_tenant, timeline_id=new_timeline_id
+    )
+    head_lsn = detail["last_record_lsn"]
+
+    # Verify that we get 412 status code
+    with env.pageserver.http_client() as client:
+        with pytest.raises(PageserverApiException) as err:
+            client.timeline_get_timestamp_of_lsn(
+                env.initial_tenant,
+                new_timeline_id,
+                head_lsn,
+            )
+        assert err.value.status_code == 412
diff --git a/test_runner/regress/test_neon_cli.py b/test_runner/regress/test_neon_cli.py
index 8bd0662ef8bd..e6bcdf8e67a8 100644
--- a/test_runner/regress/test_neon_cli.py
+++ b/test_runner/regress/test_neon_cli.py
@@ -134,10 +134,11 @@ def test_cli_start_stop(neon_env_builder: NeonEnvBuilder):
     """
     env = neon_env_builder.init_start()
 
-    # Stop default ps/sk
+    # Stop default services
     env.neon_cli.pageserver_stop(env.pageserver.id)
     env.neon_cli.safekeeper_stop()
     env.neon_cli.storage_controller_stop(False)
+    env.neon_cli.object_storage_stop(False)
     env.neon_cli.storage_broker_stop()
 
     # Keep NeonEnv state up to date, it usually owns starting/stopping services
@@ -179,11 +180,13 @@ def test_cli_start_stop_multi(neon_env_builder: NeonEnvBuilder):
 
     # Using the single-pageserver shortcut property throws when there are multiple pageservers
     with pytest.raises(AssertionError):
-        _drop = env.pageserver
+        _ = env.pageserver
 
     env.neon_cli.safekeeper_stop(neon_env_builder.safekeepers_id_start + 1)
     env.neon_cli.safekeeper_stop(neon_env_builder.safekeepers_id_start + 2)
 
+    env.neon_cli.object_storage_stop(False)
+
     # Stop this to get out of the way of the following `start`
     env.neon_cli.storage_controller_stop(False)
     env.neon_cli.storage_broker_stop()
diff --git a/test_runner/regress/test_object_storage.py b/test_runner/regress/test_object_storage.py
new file mode 100644
index 000000000000..0b1cfa344fe6
--- /dev/null
+++ b/test_runner/regress/test_object_storage.py
@@ -0,0 +1,56 @@
+from time import time
+
+import pytest
+from aiohttp import ClientSession
+from fixtures.log_helper import log
+from fixtures.neon_fixtures import NeonEnv
+from jwcrypto import jwk, jwt
+
+
+@pytest.mark.asyncio
+async def test_object_storage_insert_retrieve_delete(neon_simple_env: NeonEnv):
+    """
+    Inserts, retrieves, and deletes test file using a JWT token
+    """
+    env = neon_simple_env
+    ep = env.endpoints.create_start(branch_name="main")
+    tenant_id = str(ep.tenant_id)
+    timeline_id = str(ep.show_timeline_id())
+    endpoint_id = ep.endpoint_id
+
+    key_path = env.repo_dir / "auth_private_key.pem"
+    key = jwk.JWK.from_pem(key_path.read_bytes())
+    claims = {
+        "tenant_id": tenant_id,
+        "timeline_id": timeline_id,
+        "endpoint_id": endpoint_id,
+        "exp": round(time()) + 99,
+    }
+    log.info(f"key path {key_path}\nclaims {claims}")
+    token = jwt.JWT(header={"alg": "EdDSA"}, claims=claims)
+    token.make_signed_token(key)
+    token = token.serialize()
+
+    base_url = env.object_storage.base_url()
+    key = f"http://{base_url}/{tenant_id}/{timeline_id}/{endpoint_id}/key"
+    headers = {"Authorization": f"Bearer {token}"}
+    log.info(f"cache key url {key}")
+    log.info(f"token {token}")
+
+    async with ClientSession(headers=headers) as session:
+        async with session.get(key) as res:
+            assert res.status == 404, f"Non-existing file is present: {res}"
+
+        data = b"cheburash"
+        async with session.put(key, data=data) as res:
+            assert res.status == 200, f"Error writing file: {res}"
+
+        async with session.get(key) as res:
+            read_data = await res.read()
+            assert data == read_data
+
+        async with session.delete(key) as res:
+            assert res.status == 200, f"Error removing file {res}"
+
+        async with session.get(key) as res:
+            assert res.status == 404, f"File was not deleted: {res}"
diff --git a/test_runner/regress/test_page_service_batching_regressions.py b/test_runner/regress/test_page_service_batching_regressions.py
index fa85e1210b30..50303a498622 100644
--- a/test_runner/regress/test_page_service_batching_regressions.py
+++ b/test_runner/regress/test_page_service_batching_regressions.py
@@ -16,6 +16,7 @@ def patch_pageserver_toml(config):
             "mode": "pipelined",
             "max_batch_size": 32,
             "execution": "concurrent-futures",
+            "batching": "uniform-lsn",
         }
 
     neon_env_builder.pageserver_config_override = patch_pageserver_toml
diff --git a/test_runner/regress/test_pageserver_getpage_throttle.py b/test_runner/regress/test_pageserver_getpage_throttle.py
index 3d7204d88388..5ef63e2fe92f 100644
--- a/test_runner/regress/test_pageserver_getpage_throttle.py
+++ b/test_runner/regress/test_pageserver_getpage_throttle.py
@@ -1,6 +1,5 @@
 from __future__ import annotations
 
-import copy
 import json
 import uuid
 from typing import TYPE_CHECKING
@@ -16,7 +15,6 @@
     from fixtures.neon_fixtures import NeonEnvBuilder, PgBin
 
 
-@pytest.mark.skip("See https://github.com/neondatabase/neon/issues/11395")
 def test_pageserver_getpage_throttle(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin):
     env = neon_env_builder.init_start()
 
@@ -44,7 +42,6 @@ def test_pageserver_getpage_throttle(neon_env_builder: NeonEnvBuilder, pg_bin: P
                 "refill_interval": "100ms",
                 "refill_amount": int(rate_limit_rps / 10),
                 "max": int(rate_limit_rps / 10),
-                "fair": True,
             },
         },
     )
@@ -98,17 +95,12 @@ def run_pagebench_at_max_speed_and_get_total_requests_completed(duration_secs: i
     _, marker_offset = wait_until(lambda: env.pageserver.assert_log_contains(marker, offset=None))
 
     log.info("run pagebench")
-    duration_secs = 10
+    duration_secs = 20
     actual_ncompleted = run_pagebench_at_max_speed_and_get_total_requests_completed(duration_secs)
 
     log.info("validate the client is capped at the configured rps limit")
     expect_ncompleted = duration_secs * rate_limit_rps
-    delta_abs = abs(expect_ncompleted - actual_ncompleted)
-    threshold = 0.05 * expect_ncompleted
-    assert threshold / rate_limit_rps < 0.1 * duration_secs, (
-        "test self-test: unrealistic expecations regarding precision in this test"
-    )
-    assert delta_abs < 0.05 * expect_ncompleted, (
+    assert pytest.approx(expect_ncompleted, 0.05) == actual_ncompleted, (
         "the throttling deviates more than 5percent from the expectation"
     )
 
@@ -122,6 +114,7 @@ def run_pagebench_at_max_speed_and_get_total_requests_completed(duration_secs: i
         timeout=compaction_period,
     )
 
+    log.info("validate the metrics")
     smgr_query_seconds_post = ps_http.get_metric_value(smgr_metric_name, smgr_metrics_query)
     assert smgr_query_seconds_post is not None
     throttled_usecs_post = ps_http.get_metric_value(throttle_metric_name, throttle_metrics_query)
@@ -130,72 +123,13 @@ def run_pagebench_at_max_speed_and_get_total_requests_completed(duration_secs: i
     actual_throttled_usecs = throttled_usecs_post - throttled_usecs_pre
     actual_throttled_secs = actual_throttled_usecs / 1_000_000
 
-    log.info("validate that the metric doesn't include throttle wait time")
-    assert duration_secs >= 10 * actual_smgr_query_seconds, (
-        "smgr metrics should not include throttle wait time"
-    )
-
-    log.info("validate that the throttling wait time metrics is correct")
     assert pytest.approx(actual_throttled_secs + actual_smgr_query_seconds, 0.1) == duration_secs, (
-        "most of the time in this test is spent throttled because the rate-limit's contribution to latency dominates"
-    )
-
-
-throttle_config_with_field_fair_set = {
-    "task_kinds": ["PageRequestHandler"],
-    "fair": True,
-    "initial": 27,
-    "refill_interval": "43s",
-    "refill_amount": 23,
-    "max": 42,
-}
-
-
-def assert_throttle_config_with_field_fair_set(conf):
-    """
-    Field `fair` is ignored, so, responses don't contain it
-    """
-    without_fair = copy.deepcopy(throttle_config_with_field_fair_set)
-    without_fair.pop("fair")
-
-    assert conf == without_fair
-
-
-def test_throttle_fair_config_is_settable_but_ignored_in_mgmt_api(neon_env_builder: NeonEnvBuilder):
-    """
-    To be removed after https://github.com/neondatabase/neon/pull/8539 is rolled out.
-    """
-    env = neon_env_builder.init_start()
-    vps_http = env.storage_controller.pageserver_api()
-    # with_fair config should still be settable
-    vps_http.set_tenant_config(
-        env.initial_tenant,
-        {"timeline_get_throttle": throttle_config_with_field_fair_set},
-    )
-    conf = vps_http.tenant_config(env.initial_tenant)
-    assert_throttle_config_with_field_fair_set(conf.effective_config["timeline_get_throttle"])
-    assert_throttle_config_with_field_fair_set(
-        conf.tenant_specific_overrides["timeline_get_throttle"]
+        "throttling and processing latency = total request time; this assert validates thi holds on average"
     )
 
-
-def test_throttle_fair_config_is_settable_but_ignored_in_config_toml(
-    neon_env_builder: NeonEnvBuilder,
-):
-    """
-    To be removed after https://github.com/neondatabase/neon/pull/8539 is rolled out.
-    """
-
-    def set_tenant_config(ps_cfg):
-        tenant_config = ps_cfg.setdefault("tenant_config", {})
-        tenant_config["timeline_get_throttle"] = throttle_config_with_field_fair_set
-
-    neon_env_builder.pageserver_config_override = set_tenant_config
-    env = neon_env_builder.init_start()
-    ps_http = env.pageserver.http_client()
-    conf = ps_http.tenant_config(env.initial_tenant)
-    assert_throttle_config_with_field_fair_set(conf.effective_config["timeline_get_throttle"])
-
-    env.pageserver.allowed_errors.append(
-        r'.*ignoring unknown configuration item path="tenant_config\.timeline_get_throttle\.fair"*'
+    # without this assertion, the test would pass even if the throttling was completely broken
+    # but the request processing is so slow that it makes up for the latency that a correct throttling
+    # implementation would add
+    assert actual_smgr_query_seconds < 0.66 * duration_secs, (
+        "test self-test: request processing is consuming most of the wall clock time; this risks that we're not actually testing throttling"
     )
diff --git a/test_runner/regress/test_pageserver_secondary.py b/test_runner/regress/test_pageserver_secondary.py
index c73a592d98d6..d48e73139403 100644
--- a/test_runner/regress/test_pageserver_secondary.py
+++ b/test_runner/regress/test_pageserver_secondary.py
@@ -61,7 +61,7 @@ def evict_random_layers(
     )
     client = pageserver.http_client()
     for layer in initial_local_layers:
-        if "ephemeral" in layer.name or "temp_download" in layer.name:
+        if "ephemeral" in layer.name or "temp_download" in layer.name or ".___temp" in layer.name:
             continue
 
         layer_name = parse_layer_file_name(layer.name)
@@ -242,7 +242,13 @@ def ignore_notify(request: Request):
             pageserver.tenant_location_configure(tenant_id, location_conf)
             last_state[pageserver.id] = (mode, generation)
 
-            if mode.startswith("Attached"):
+            # It's only valid to connect to the last generation. Newer generations may yank layer
+            # files used in older generations.
+            last_generation = max(
+                [s[1] for s in last_state.values() if s[1] is not None], default=None
+            )
+
+            if mode.startswith("Attached") and generation == last_generation:
                 # This is a basic test: we are validating that he endpoint works properly _between_
                 # configuration changes.  A stronger test would be to validate that clients see
                 # no errors while we are making the changes.
diff --git a/test_runner/regress/test_pg_regress.py b/test_runner/regress/test_pg_regress.py
index a3fae9732741..0fea70688801 100644
--- a/test_runner/regress/test_pg_regress.py
+++ b/test_runner/regress/test_pg_regress.py
@@ -239,6 +239,8 @@ def test_isolation(
             "neon.regress_test_mode = true",
             # Stack size should be increased for tests to pass with asan.
             "max_stack_depth = 4MB",
+            # Neon extensiosn starts 2 BGW so decreasing number of parallel workers which can affect deadlock-parallel test if it hits max_worker_processes.
+            "max_worker_processes = 16",
         ],
     )
     endpoint.safe_psql(f"CREATE DATABASE {DBNAME}")
diff --git a/test_runner/regress/test_ssl.py b/test_runner/regress/test_ssl.py
index 9a7204ca17cc..39c94c05a9d7 100644
--- a/test_runner/regress/test_ssl.py
+++ b/test_runner/regress/test_ssl.py
@@ -1,5 +1,6 @@
 import os
 import ssl
+from datetime import datetime, timedelta
 
 import pytest
 import requests
@@ -151,3 +152,63 @@ def cert_reloaded():
     requests.get(addr, verify=str(env.ssl_ca_file)).raise_for_status()
     cur_cert = ssl.get_server_certificate(("localhost", port))
     assert cur_cert == sk_cert
+
+
+def test_server_and_cert_metrics(neon_env_builder: NeonEnvBuilder):
+    """
+    Test metrics exported from http/https server and tls cert reloader.
+    """
+    neon_env_builder.use_https_pageserver_api = True
+    neon_env_builder.pageserver_config_override = "ssl_cert_reload_period='100 ms'"
+    env = neon_env_builder.init_start()
+
+    env.pageserver.allowed_errors.append(".*Error reloading certificate.*")
+
+    ps_client = env.pageserver.http_client()
+
+    # 1. Test connection started metric.
+    filter_https = {"scheme": "https"}
+    old_https_conn_count = (
+        ps_client.get_metric_value("http_server_connection_started_total", filter_https) or 0
+    )
+
+    addr = f"https://localhost:{env.pageserver.service_port.https}/v1/status"
+    requests.get(addr, verify=str(env.ssl_ca_file)).raise_for_status()
+
+    new_https_conn_count = (
+        ps_client.get_metric_value("http_server_connection_started_total", filter_https) or 0
+    )
+    # The counter should increase after the request,
+    # but it may increase by more than one because of storcon requests.
+    assert new_https_conn_count > old_https_conn_count
+
+    # 2. Test tls connection error.
+    # Request without specified CA cert file should fail.
+    with pytest.raises(requests.exceptions.SSLError):
+        requests.get(addr)
+
+    tls_error_cnt = (
+        ps_client.get_metric_value("http_server_connection_errors_total", {"type": "tls"}) or 0
+    )
+    assert tls_error_cnt == 1
+
+    # 3. Test expiration time metric.
+    expiration_time = datetime.fromtimestamp(
+        ps_client.get_metric_value("tls_certs_expiration_time_seconds") or 0
+    )
+    now = datetime.now()
+    # neon_local generates certs valid for 100 years.
+    # Compare with +-1 year to not care about leap years.
+    assert now + timedelta(days=365 * 99) < expiration_time < now + timedelta(days=365 * 101)
+
+    # 4. Test cert reload failed metric.
+    reload_error_cnt = ps_client.get_metric_value("tls_certs_reload_failed_total")
+    assert reload_error_cnt == 0
+
+    os.remove(env.pageserver.workdir / "server.crt")
+
+    def reload_failed():
+        reload_error_cnt = ps_client.get_metric_value("tls_certs_reload_failed_total") or 0
+        assert reload_error_cnt > 0
+
+    wait_until(reload_failed)
diff --git a/test_runner/regress/test_storage_controller.py b/test_runner/regress/test_storage_controller.py
index 702f4eeccfe2..b2c8415e9a09 100644
--- a/test_runner/regress/test_storage_controller.py
+++ b/test_runner/regress/test_storage_controller.py
@@ -95,6 +95,7 @@ def test_storage_controller_smoke(
     env.pageservers[1].start()
     for sk in env.safekeepers:
         sk.start()
+    env.object_storage.start()
 
     # The pageservers we started should have registered with the sharding service on startup
     nodes = env.storage_controller.node_list()
@@ -346,6 +347,7 @@ def prepare_onboarding_env(
     env = neon_env_builder.init_configs()
     env.broker.start()
     env.storage_controller.start()
+    env.object_storage.start()
 
     # This is the pageserver where we'll initially create the tenant.  Run it in emergency
     # mode so that it doesn't talk to storage controller, and do not register it.
@@ -675,7 +677,7 @@ def received_restart_notification():
     env.storage_controller.tenant_shard_split(env.initial_tenant, shard_count=2)
     expect = {
         "tenant_id": str(env.initial_tenant),
-        "stripe_size": 32768,
+        "stripe_size": 2048,
         "shards": [
             {"node_id": int(env.pageservers[1].id), "shard_number": 0},
             {"node_id": int(env.pageservers[1].id), "shard_number": 1},
@@ -2890,10 +2892,12 @@ def new_becomes_leader():
         )
 
 
+@pytest.mark.parametrize("step_down_times_out", [False, True])
 def test_storage_controller_leadership_transfer_during_split(
     neon_env_builder: NeonEnvBuilder,
     storage_controller_proxy: StorageControllerProxy,
     port_distributor: PortDistributor,
+    step_down_times_out: bool,
 ):
     """
     Exercise a race between shard splitting and graceful leadership transfer.  This is
@@ -2934,6 +2938,18 @@ def test_storage_controller_leadership_transfer_during_split(
         )
     env.storage_controller.reconcile_until_idle()
 
+    # We are testing scenarios where the step down API does not complete: either because it is stuck
+    # doing a shard split, or because it totally times out on some other failpoint.
+    env.storage_controller.allowed_errors.extend(
+        [
+            ".*step_down.*request was dropped before completing.*",
+            ".*step_down.*operation timed out.*",
+            ".*Send step down request failed, will retry.*",
+            ".*Send step down request still failed after.*retries.*",
+            ".*Leader .+ did not respond to step-down request.*",
+        ]
+    )
+
     with concurrent.futures.ThreadPoolExecutor(max_workers=2) as executor:
         # Start a shard split
         env.storage_controller.allowed_errors.extend(
@@ -2941,6 +2957,14 @@ def test_storage_controller_leadership_transfer_during_split(
         )
         pause_failpoint = "shard-split-pre-complete"
         env.storage_controller.configure_failpoints((pause_failpoint, "pause"))
+
+        if not step_down_times_out:
+            # Prevent the timeout self-terminate code from executing: we will block step down on the
+            # shard split itself
+            env.storage_controller.configure_failpoints(
+                ("step-down-delay-timeout", "return(3600000)")
+            )
+
         split_fut = executor.submit(
             env.storage_controller.tenant_shard_split, list(tenants)[0], shard_count * 2
         )
@@ -2959,12 +2983,20 @@ def hit_failpoint():
             timeout_in_seconds=30, instance_id=2, base_port=storage_controller_2_port
         )
 
+        if step_down_times_out:
+            # Step down will time out, original controller will terminate itself
+            env.storage_controller.allowed_errors.extend([".*terminating process.*"])
+        else:
+            # Step down does not time out: original controller hits its shard split completion
+            # code path and realises that it must not purge the parent shards from the database.
+            env.storage_controller.allowed_errors.extend([".*Enqueuing background abort.*"])
+
         def passed_split_abort():
             try:
                 log.info("Checking log for pattern...")
-                assert env.storage_controller.log_contains(
-                    ".*Using observed state received from leader.*"
-                )
+                # This log is indicative of entering startup_reconcile, which happens
+                # after the point we would abort shard splits
+                assert env.storage_controller.log_contains(".*Populating tenant shards.*")
             except Exception:
                 log.exception("Failed to find pattern in log")
                 raise
@@ -2973,34 +3005,42 @@ def passed_split_abort():
         wait_until(passed_split_abort, interval=0.1, status_interval=1.0)
         assert env.storage_controller.log_contains(".*Aborting shard split.*")
 
-        # Proxy is still talking to original controller here: disable its pause failpoint so
-        # that its shard split can run to completion.
-        log.info("Disabling failpoint")
-        # Bypass the proxy: the python test HTTPServer is single threaded and still blocked
-        # on handling the shard split request.
-        env.storage_controller.request(
-            "PUT",
-            f"http://127.0.0.1:{storage_controller_1_port}/debug/v1/failpoints",
-            json=[{"name": "shard-split-pre-complete", "actions": "off"}],
-            headers=env.storage_controller.headers(TokenScope.ADMIN),
-        )
+        if step_down_times_out:
+            # We will let the old controller hit a timeout path where it terminates itself, rather than
+            # completing step_down and trying to complete a shard split
+            def old_controller_terminated():
+                assert env.storage_controller.log_contains(".*terminating process.*")
 
-        def previous_stepped_down():
-            assert (
-                env.storage_controller.get_leadership_status()
-                == StorageControllerLeadershipStatus.STEPPED_DOWN
+            wait_until(old_controller_terminated)
+        else:
+            # Proxy is still talking to original controller here: disable its pause failpoint so
+            # that its shard split can run to completion.
+            log.info("Disabling failpoint")
+            # Bypass the proxy: the python test HTTPServer is single threaded and still blocked
+            # on handling the shard split request.
+            env.storage_controller.request(
+                "PUT",
+                f"http://127.0.0.1:{storage_controller_1_port}/debug/v1/failpoints",
+                json=[{"name": "shard-split-pre-complete", "actions": "off"}],
+                headers=env.storage_controller.headers(TokenScope.ADMIN),
             )
 
-        log.info("Awaiting step down")
-        wait_until(previous_stepped_down)
+            def previous_stepped_down():
+                assert (
+                    env.storage_controller.get_leadership_status()
+                    == StorageControllerLeadershipStatus.STEPPED_DOWN
+                )
 
-        # Let the shard split complete: this may happen _after_ the replacement has come up
-        # and tried to clean up the databases
-        log.info("Unblocking & awaiting shard split")
-        with pytest.raises(Exception, match="Unexpected child shard count"):
-            # This split fails when it tries to persist results, because it encounters
-            # changes already made by the new controller's abort-on-startup
-            split_fut.result()
+            log.info("Awaiting step down")
+            wait_until(previous_stepped_down)
+
+            # Let the shard split complete: this may happen _after_ the replacement has come up
+            # and tried to clean up the databases
+            log.info("Unblocking & awaiting shard split")
+            with pytest.raises(Exception, match="Unexpected child shard count"):
+                # This split fails when it tries to persist results, because it encounters
+                # changes already made by the new controller's abort-on-startup
+                split_fut.result()
 
         log.info("Routing to new leader")
         storage_controller_proxy.route_to(f"http://127.0.0.1:{storage_controller_2_port}")
@@ -3018,13 +3058,14 @@ def new_becomes_leader():
     env.storage_controller.wait_until_ready()
     env.storage_controller.consistency_check()
 
-    # Check that the stepped down instance forwards requests
-    # to the new leader while it's still running.
-    storage_controller_proxy.route_to(f"http://127.0.0.1:{storage_controller_1_port}")
-    env.storage_controller.tenant_shard_dump()
-    env.storage_controller.node_configure(env.pageservers[0].id, {"scheduling": "Pause"})
-    status = env.storage_controller.node_status(env.pageservers[0].id)
-    assert status["scheduling"] == "Pause"
+    if not step_down_times_out:
+        # Check that the stepped down instance forwards requests
+        # to the new leader while it's still running.
+        storage_controller_proxy.route_to(f"http://127.0.0.1:{storage_controller_1_port}")
+        env.storage_controller.tenant_shard_dump()
+        env.storage_controller.node_configure(env.pageservers[0].id, {"scheduling": "Pause"})
+        status = env.storage_controller.node_status(env.pageservers[0].id)
+        assert status["scheduling"] == "Pause"
 
 
 def test_storage_controller_ps_restarted_during_drain(neon_env_builder: NeonEnvBuilder):
diff --git a/test_runner/regress/test_storage_scrubber.py b/test_runner/regress/test_storage_scrubber.py
index 70af299de355..03cd133ccbb8 100644
--- a/test_runner/regress/test_storage_scrubber.py
+++ b/test_runner/regress/test_storage_scrubber.py
@@ -75,7 +75,7 @@ def test_scrubber_tenant_snapshot(neon_env_builder: NeonEnvBuilder, shard_count:
         tenant_shard_ids = [TenantShardId(tenant_id, 0, 0)]
 
     # Let shards finish rescheduling to other pageservers: this makes the rest of the test more stable
-    # is it won't overlap with migrations
+    # as it won't overlap with migrations
     env.storage_controller.reconcile_until_idle(max_interval=0.1, timeout_secs=120)
 
     output_path = neon_env_builder.test_output_dir / "snapshot"
@@ -87,6 +87,13 @@ def test_scrubber_tenant_snapshot(neon_env_builder: NeonEnvBuilder, shard_count:
 
     workload.stop()
 
+    # Disable scheduling, so the storage controller doesn't migrate shards around
+    # while we are stopping pageservers
+    env.storage_controller.tenant_policy_update(tenant_id, {"scheduling": "Stop"})
+    env.storage_controller.allowed_errors.extend(
+        [".*Scheduling is disabled by policy Stop.*", ".*Skipping reconcile for policy Stop.*"]
+    )
+
     # Stop pageservers
     for pageserver in env.pageservers:
         pageserver.stop()
@@ -127,9 +134,16 @@ def test_scrubber_tenant_snapshot(neon_env_builder: NeonEnvBuilder, shard_count:
     for pageserver in env.pageservers:
         pageserver.start()
 
+    # Turn scheduling back on.
+    # We don't care about optimizations, so enable only essential scheduling
+    env.storage_controller.tenant_policy_update(tenant_id, {"scheduling": "Essential"})
+
     # Check we can read everything
     workload.validate()
 
+    # Reconcile to avoid a race between test shutdown and background reconciliation (#11278)
+    env.storage_controller.reconcile_until_idle()
+
 
 def drop_local_state(env: NeonEnv, tenant_id: TenantId):
     env.storage_controller.tenant_policy_update(tenant_id, {"placement": "Detached"})
diff --git a/test_runner/regress/test_subscriber_branching.py b/test_runner/regress/test_subscriber_branching.py
index 6175643389a4..83bebc19becb 100644
--- a/test_runner/regress/test_subscriber_branching.py
+++ b/test_runner/regress/test_subscriber_branching.py
@@ -251,7 +251,7 @@ def test_multiple_subscription_branching(neon_simple_env: NeonEnv):
     NUMBER_OF_DBS = 5
 
     # Create and start endpoint so that neon_local put all the generated
-    # stuff into the spec.json file.
+    # stuff into the config.json file.
     endpoint = env.endpoints.create_start(
         "main",
         config_lines=[
@@ -280,13 +280,15 @@ def test_multiple_subscription_branching(neon_simple_env: NeonEnv):
             }
         )
 
-    # Update the spec.json file to create the databases
+    # Update the config.json file to create the databases
     # and reconfigure the endpoint to apply the changes.
     endpoint.respec_deep(
         **{
-            "skip_pg_catalog_updates": False,
-            "cluster": {
-                "databases": TEST_DB_NAMES,
+            "spec": {
+                "skip_pg_catalog_updates": False,
+                "cluster": {
+                    "databases": TEST_DB_NAMES,
+                },
             },
         }
     )
diff --git a/test_runner/regress/test_tenants.py b/test_runner/regress/test_tenants.py
index c613a79374f1..c00f8f4ca5a2 100644
--- a/test_runner/regress/test_tenants.py
+++ b/test_runner/regress/test_tenants.py
@@ -390,6 +390,7 @@ def test_create_churn_during_restart(neon_env_builder: NeonEnvBuilder):
     # Tenant creation requests which arrive out of order will generate complaints about
     # generation nubmers out of order.
     env.pageserver.allowed_errors.append(".*Generation .+ is less than existing .+")
+    env.pageserver.allowed_errors.append(".*due to stale generation.+")
 
     # Timeline::flush_and_shutdown cannot tell if it is hitting a failure because of
     # an incomplete attach, or some other problem.  In the field this should be rare,
diff --git a/test_runner/regress/test_wal_acceptor.py b/test_runner/regress/test_wal_acceptor.py
index e3d39f9315ee..a9a6699e5cb5 100644
--- a/test_runner/regress/test_wal_acceptor.py
+++ b/test_runner/regress/test_wal_acceptor.py
@@ -45,7 +45,7 @@
     s3_storage,
 )
 from fixtures.safekeeper.http import (
-    Configuration,
+    MembershipConfiguration,
     SafekeeperHttpClient,
     SafekeeperId,
     TimelineCreateRequest,
@@ -589,7 +589,7 @@ def test_s3_wal_replay(neon_env_builder: NeonEnvBuilder, remote_storage_kind: Re
     for sk in env.safekeepers:
         sk.start()
         cli = sk.http_client()
-        mconf = Configuration(generation=0, members=[], new_members=None)
+        mconf = MembershipConfiguration(generation=0, members=[], new_members=None)
         # set start_lsn to the beginning of the first segment to allow reading
         # WAL from there (could you intidb LSN as well).
         r = TimelineCreateRequest(
@@ -1948,7 +1948,7 @@ def test_membership_api(neon_env_builder: NeonEnvBuilder):
     sk_id_2 = SafekeeperId(11, "localhost", 5434)  # just a mock
 
     # Request to switch before timeline creation should fail.
-    init_conf = Configuration(generation=1, members=[sk_id_1], new_members=None)
+    init_conf = MembershipConfiguration(generation=1, members=[sk_id_1], new_members=None)
     with pytest.raises(requests.exceptions.HTTPError):
         http_cli.membership_switch(tenant_id, timeline_id, init_conf)
 
@@ -1960,7 +1960,7 @@ def test_membership_api(neon_env_builder: NeonEnvBuilder):
     http_cli.timeline_create(create_r)
 
     # Switch into some conf.
-    joint_conf = Configuration(generation=4, members=[sk_id_1], new_members=[sk_id_2])
+    joint_conf = MembershipConfiguration(generation=4, members=[sk_id_1], new_members=[sk_id_2])
     resp = http_cli.membership_switch(tenant_id, timeline_id, joint_conf)
     log.info(f"joint switch resp: {resp}")
     assert resp.previous_conf.generation == 1
@@ -1973,24 +1973,26 @@ def test_membership_api(neon_env_builder: NeonEnvBuilder):
     assert after_restart.generation == 4
 
     # Switch into non joint conf of which sk is not a member, must fail.
-    non_joint_not_member = Configuration(generation=5, members=[sk_id_2], new_members=None)
+    non_joint_not_member = MembershipConfiguration(
+        generation=5, members=[sk_id_2], new_members=None
+    )
     with pytest.raises(requests.exceptions.HTTPError):
         resp = http_cli.membership_switch(tenant_id, timeline_id, non_joint_not_member)
 
     # Switch into good non joint conf.
-    non_joint = Configuration(generation=6, members=[sk_id_1], new_members=None)
+    non_joint = MembershipConfiguration(generation=6, members=[sk_id_1], new_members=None)
     resp = http_cli.membership_switch(tenant_id, timeline_id, non_joint)
     log.info(f"non joint switch resp: {resp}")
     assert resp.previous_conf.generation == 4
     assert resp.current_conf.generation == 6
 
     # Switch request to lower conf should be rejected.
-    lower_conf = Configuration(generation=3, members=[sk_id_1], new_members=None)
+    lower_conf = MembershipConfiguration(generation=3, members=[sk_id_1], new_members=None)
     with pytest.raises(requests.exceptions.HTTPError):
         http_cli.membership_switch(tenant_id, timeline_id, lower_conf)
 
     # Now, exclude sk from the membership, timeline should be deleted.
-    excluded_conf = Configuration(generation=7, members=[sk_id_2], new_members=None)
+    excluded_conf = MembershipConfiguration(generation=7, members=[sk_id_2], new_members=None)
     http_cli.timeline_exclude(tenant_id, timeline_id, excluded_conf)
     with pytest.raises(requests.exceptions.HTTPError):
         http_cli.timeline_status(tenant_id, timeline_id)
@@ -2010,11 +2012,6 @@ def test_explicit_timeline_creation(neon_env_builder: NeonEnvBuilder):
     tenant_id = env.initial_tenant
     timeline_id = env.initial_timeline
 
-    ps = env.pageservers[0]
-    ps_http_cli = ps.http_client()
-
-    http_clis = [sk.http_client() for sk in env.safekeepers]
-
     config_lines = [
         "neon.safekeeper_proto_version = 3",
     ]
@@ -2023,22 +2020,11 @@ def test_explicit_timeline_creation(neon_env_builder: NeonEnvBuilder):
     # expected to fail because timeline is not created on safekeepers
     with pytest.raises(Exception, match=r".*timed out.*"):
         ep.start(safekeeper_generation=1, safekeepers=[1, 2, 3], timeout="2s")
-    # figure out initial LSN.
-    ps_timeline_detail = ps_http_cli.timeline_detail(tenant_id, timeline_id)
-    init_lsn = ps_timeline_detail["last_record_lsn"]
-    log.info(f"initial LSN: {init_lsn}")
-    # sk timeline creation request expects minor version
-    pg_version = ps_timeline_detail["pg_version"] * 10000
     # create inital mconf
-    sk_ids = [SafekeeperId(sk.id, "localhost", sk.port.pg_tenant_only) for sk in env.safekeepers]
-    mconf = Configuration(generation=1, members=sk_ids, new_members=None)
-    create_r = TimelineCreateRequest(
-        tenant_id, timeline_id, mconf, pg_version, Lsn(init_lsn), commit_lsn=None
+    mconf = MembershipConfiguration(
+        generation=1, members=Safekeeper.sks_to_safekeeper_ids(env.safekeepers), new_members=None
     )
-    log.info(f"sending timeline create: {create_r.to_json()}")
-
-    for sk_http_cli in http_clis:
-        sk_http_cli.timeline_create(create_r)
+    Safekeeper.create_timeline(tenant_id, timeline_id, env.pageservers[0], mconf, env.safekeepers)
     # Once timeline created endpoint should start.
     ep.start(safekeeper_generation=1, safekeepers=[1, 2, 3])
     ep.safe_psql("CREATE TABLE IF NOT EXISTS t(key int, value text)")
diff --git a/test_runner/regress/test_wal_acceptor_async.py b/test_runner/regress/test_wal_acceptor_async.py
index b7c7478e7816..c5dd34f64ff2 100644
--- a/test_runner/regress/test_wal_acceptor_async.py
+++ b/test_runner/regress/test_wal_acceptor_async.py
@@ -18,6 +18,7 @@
     Safekeeper,
 )
 from fixtures.remote_storage import RemoteStorageKind
+from fixtures.safekeeper.http import MembershipConfiguration
 from fixtures.utils import skip_in_debug_build
 
 if TYPE_CHECKING:
@@ -452,20 +453,24 @@ def test_concurrent_computes(neon_env_builder: NeonEnvBuilder):
     asyncio.run(run_concurrent_computes(env))
 
 
+async def assert_query_hangs(endpoint: Endpoint, query: str):
+    """
+    Start on endpoint query which is expected to hang and check that it does.
+    """
+    conn = await endpoint.connect_async()
+    bg_query = asyncio.create_task(conn.execute(query))
+    await asyncio.sleep(2)
+    assert not bg_query.done()
+    return bg_query
+
+
 # Stop safekeeper and check that query cannot be executed while safekeeper is down.
 # Query will insert a single row into a table.
-async def check_unavailability(
-    sk: Safekeeper, conn: asyncpg.Connection, key: int, start_delay_sec: int = 2
-):
+async def check_unavailability(sk: Safekeeper, ep: Endpoint, key: int, start_delay_sec: int = 2):
     # shutdown one of two acceptors, that is, majority
     sk.stop()
 
-    bg_query = asyncio.create_task(conn.execute(f"INSERT INTO t values ({key}, 'payload')"))
-
-    await asyncio.sleep(start_delay_sec)
-    # ensure that the query has not been executed yet
-    assert not bg_query.done()
-
+    bg_query = await assert_query_hangs(ep, f"INSERT INTO t values ({key}, 'payload')")
     # start safekeeper and await the query
     sk.start()
     await bg_query
@@ -480,10 +485,10 @@ async def run_unavailability(env: NeonEnv, endpoint: Endpoint):
     await conn.execute("INSERT INTO t values (1, 'payload')")
 
     # stop safekeeper and check that query cannot be executed while safekeeper is down
-    await check_unavailability(env.safekeepers[0], conn, 2)
+    await check_unavailability(env.safekeepers[0], endpoint, 2)
 
     # for the world's balance, do the same with second safekeeper
-    await check_unavailability(env.safekeepers[1], conn, 3)
+    await check_unavailability(env.safekeepers[1], endpoint, 3)
 
     # check that we can execute queries after restart
     await conn.execute("INSERT INTO t values (4, 'payload')")
@@ -514,15 +519,7 @@ async def run_recovery_uncommitted(env: NeonEnv):
     # insert with only one safekeeper up to create tail of flushed but not committed WAL
     sk1.stop()
     sk2.stop()
-    conn = await ep.connect_async()
-    # query should hang, so execute in separate task
-    bg_query = asyncio.create_task(
-        conn.execute("insert into t select generate_series(1, 2000), 'payload'")
-    )
-    sleep_sec = 2
-    await asyncio.sleep(sleep_sec)
-    # it must still be not finished
-    assert not bg_query.done()
+    await assert_query_hangs(ep, "insert into t select generate_series(1, 2000), 'payload'")
     # note: destoy will kill compute_ctl, preventing it waiting for hanging sync-safekeepers.
     ep.stop_and_destroy()
 
@@ -559,15 +556,7 @@ async def run_wal_truncation(env: NeonEnv, safekeeper_proto_version: int):
     # insert with only one sk3 up to create tail of flushed but not committed WAL on it
     sk1.stop()
     sk2.stop()
-    conn = await ep.connect_async()
-    # query should hang, so execute in separate task
-    bg_query = asyncio.create_task(
-        conn.execute("insert into t select generate_series(1, 180000), 'Papaya'")
-    )
-    sleep_sec = 2
-    await asyncio.sleep(sleep_sec)
-    # it must still be not finished
-    assert not bg_query.done()
+    await assert_query_hangs(ep, "insert into t select generate_series(1, 180000), 'Papaya'")
     # note: destoy will kill compute_ctl, preventing it waiting for hanging sync-safekeepers.
     ep.stop_and_destroy()
 
@@ -607,6 +596,132 @@ def test_wal_truncation(neon_env_builder: NeonEnvBuilder, safekeeper_proto_versi
     asyncio.run(run_wal_truncation(env, safekeeper_proto_version))
 
 
+async def quorum_sanity_single(
+    env: NeonEnv,
+    compute_sks_ids: list[int],
+    members_sks_ids: list[int],
+    new_members_sks_ids: list[int] | None,
+    sks_to_stop_ids: list[int],
+    should_work_when_stopped: bool,
+):
+    """
+    *_ids params contain safekeeper node ids; it is assumed they are issued
+    from 1 and sequentially assigned to env.safekeepers.
+    """
+    members_sks = [env.safekeepers[i - 1] for i in members_sks_ids]
+    new_members_sks = (
+        [env.safekeepers[i - 1] for i in new_members_sks_ids] if new_members_sks_ids else None
+    )
+    sks_to_stop = [env.safekeepers[i - 1] for i in sks_to_stop_ids]
+
+    mconf = MembershipConfiguration(
+        generation=1,
+        members=Safekeeper.sks_to_safekeeper_ids(members_sks),
+        new_members=Safekeeper.sks_to_safekeeper_ids(new_members_sks) if new_members_sks else None,
+    )
+    members_sks = Safekeeper.mconf_sks(env, mconf)
+
+    tenant_id = env.initial_tenant
+    compute_sks_ids_str = "-".join([str(sk_id) for sk_id in compute_sks_ids])
+    members_sks_ids_str = "-".join([str(sk.id) for sk in mconf.members])
+    new_members_sks_ids_str = "-".join(
+        [str(sk.id) for sk in mconf.new_members] if mconf.new_members is not None else []
+    )
+    sks_to_stop_ids_str = "-".join([str(sk.id) for sk in sks_to_stop])
+    log.info(
+        f"running quorum_sanity_single with compute_sks={compute_sks_ids_str}, members_sks={members_sks_ids_str}, new_members_sks={new_members_sks_ids_str}, sks_to_stop={sks_to_stop_ids_str}, should_work_when_stopped={should_work_when_stopped}"
+    )
+    branch_name = f"test_quorum_single_c{compute_sks_ids_str}_m{members_sks_ids_str}_{new_members_sks_ids_str}_s{sks_to_stop_ids_str}"
+    timeline_id = env.create_branch(branch_name)
+
+    # create timeline on `members_sks`
+    Safekeeper.create_timeline(tenant_id, timeline_id, env.pageservers[0], mconf, members_sks)
+
+    config_lines = [
+        "neon.safekeeper_proto_version = 3",
+    ]
+    ep = env.endpoints.create(branch_name, config_lines=config_lines)
+    ep.start(safekeeper_generation=1, safekeepers=compute_sks_ids)
+    ep.safe_psql("create table t(key int, value text)")
+
+    # stop specified sks and check whether writes work
+    for sk in sks_to_stop:
+        sk.stop()
+    if should_work_when_stopped:
+        log.info("checking that writes still work")
+        ep.safe_psql("insert into t select generate_series(1, 100), 'Papaya'")
+        # restarting ep should also be fine
+        ep.stop()
+        ep.start()
+        ep.safe_psql("insert into t select generate_series(1, 100), 'plum'")
+        bg_query = None
+    else:
+        log.info("checking that writes hang")
+        bg_query = await assert_query_hangs(
+            ep, "insert into t select generate_series(1, 100), 'Papaya'"
+        )
+    # start again; now they should work
+    for sk in sks_to_stop:
+        sk.start()
+    if bg_query:
+        log.info("awaiting query")
+        await bg_query
+
+
+# It's a bit tempting to iterate over all possible combinations, but let's stick
+# with this for now.
+async def run_quorum_sanity(env: NeonEnv):
+    # 3 members, all up, should work
+    await quorum_sanity_single(env, [1, 2, 3], [1, 2, 3], None, [], True)
+    # 3 members, 2/3 up, should work
+    await quorum_sanity_single(env, [1, 2, 3], [1, 2, 3], None, [3], True)
+    # 3 members, 1/3 up, should not work
+    await quorum_sanity_single(env, [1, 2, 3], [1, 2, 3], None, [2, 3], False)
+
+    # 3 members, all up, should work; wp redundantly talks to 4th.
+    await quorum_sanity_single(env, [1, 2, 3, 4], [1, 2, 3], None, [], True)
+    # 3 members, all up, should work with wp talking to 2 of these 3 + plus one redundant
+    await quorum_sanity_single(env, [2, 3, 4], [1, 2, 3], None, [], True)
+    # 3 members, 2/3 up, could work but wp talks to different 3s, so it shouldn't
+    await quorum_sanity_single(env, [2, 3, 4], [1, 2, 3], None, [3], False)
+
+    # joint conf of 1-2-3 and 4, all up, should work
+    await quorum_sanity_single(env, [1, 2, 3, 4], [1, 2, 3], [4], [], True)
+    # joint conf of 1-2-3 and 4, 4 down, shouldn't work
+    await quorum_sanity_single(env, [1, 2, 3, 4], [1, 2, 3], [4], [4], False)
+
+    # joint conf of 1-2-3 and 2-3-4, all up, should work
+    await quorum_sanity_single(env, [1, 2, 3, 4], [1, 2, 3], [2, 3, 4], [], True)
+    # joint conf of 1-2-3 and 2-3-4, 1 and 4 down, should work
+    await quorum_sanity_single(env, [1, 2, 3, 4], [1, 2, 3], [2, 3, 4], [1, 4], True)
+    # joint conf of 1-2-3 and 2-3-4, 2 down, should work
+    await quorum_sanity_single(env, [1, 2, 3, 4], [1, 2, 3], [2, 3, 4], [2], True)
+    # joint conf of 1-2-3 and 2-3-4, 3 down, should work
+    await quorum_sanity_single(env, [1, 2, 3, 4], [1, 2, 3], [2, 3, 4], [3], True)
+    # joint conf of 1-2-3 and 2-3-4, 1 and 2 down, shouldn't work
+    await quorum_sanity_single(env, [1, 2, 3, 4], [1, 2, 3], [2, 3, 4], [1, 2], False)
+    # joint conf of 1-2-3 and 2-3-4, 2 and 4 down, shouldn't work
+    await quorum_sanity_single(env, [1, 2, 3, 4], [1, 2, 3], [2, 3, 4], [2, 4], False)
+
+    # joint conf of 1-2-3 and 2-3-4 with wp talking to 2-3-4 only.
+    await quorum_sanity_single(env, [2, 3, 4], [1, 2, 3], [2, 3, 4], [], True)
+    # with 1 down should still be ok
+    await quorum_sanity_single(env, [2, 3, 4], [1, 2, 3], [2, 3, 4], [1], True)
+    # but with 2 down not ok
+    await quorum_sanity_single(env, [2, 3, 4], [1, 2, 3], [2, 3, 4], [2], False)
+
+
+# Test various combinations of membership configurations / neon.safekeepers
+# (list of safekeepers endpoint connects to) values / up & down safekeepers and
+# check that endpont can start and write data when we have quorum and can't when
+# we don't.
+def test_quorum_sanity(neon_env_builder: NeonEnvBuilder):
+    neon_env_builder.num_safekeepers = 4
+    env = neon_env_builder.init_start()
+
+    asyncio.run(run_quorum_sanity(env))
+
+
 async def run_segment_init_failure(env: NeonEnv):
     env.create_branch("test_segment_init_failure")
     ep = env.endpoints.create_start("test_segment_init_failure")