diff --git a/.dockerignore b/.dockerignore index 9fafc2e4baea..ffa72eaf51e4 100644 --- a/.dockerignore +++ b/.dockerignore @@ -19,6 +19,7 @@ !pageserver/ !pgxn/ !proxy/ +!object_storage/ !storage_scrubber/ !safekeeper/ !storage_broker/ diff --git a/.gitignore b/.gitignore index a07a65ccef1c..45eb4dbf0ee8 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,4 @@ +/artifact_cache /pg_install /target /tmp_check diff --git a/Cargo.lock b/Cargo.lock index dbbf2c335737..5d2cdcea272e 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2837,6 +2837,7 @@ dependencies = [ "utils", "uuid", "workspace_hack", + "x509-cert", ] [[package]] @@ -3991,6 +3992,33 @@ dependencies = [ "memchr", ] +[[package]] +name = "object_storage" +version = "0.0.1" +dependencies = [ + "anyhow", + "axum", + "axum-extra", + "camino", + "camino-tempfile", + "futures", + "http-body-util", + "itertools 0.10.5", + "jsonwebtoken", + "prometheus", + "rand 0.8.5", + "remote_storage", + "serde", + "serde_json", + "test-log", + "tokio", + "tokio-util", + "tower 0.5.2", + "tracing", + "utils", + "workspace_hack", +] + [[package]] name = "once_cell" version = "1.20.2" @@ -4693,7 +4721,7 @@ dependencies = [ [[package]] name = "postgres-protocol" version = "0.6.6" -source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#1f21e7959a96a34dcfbfce1b14b73286cdadffe9" +source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#f3cf448febde5fd298071d54d568a9c875a7a62b" dependencies = [ "base64 0.22.1", "byteorder", @@ -4727,7 +4755,7 @@ dependencies = [ [[package]] name = "postgres-types" version = "0.2.6" -source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#1f21e7959a96a34dcfbfce1b14b73286cdadffe9" +source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#f3cf448febde5fd298071d54d568a9c875a7a62b" dependencies = [ "bytes", "chrono", @@ -6925,6 +6953,28 @@ dependencies = [ "syn 2.0.100", ] +[[package]] +name = "test-log" +version = "0.2.17" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "e7f46083d221181166e5b6f6b1e5f1d499f3a76888826e6cb1d057554157cd0f" +dependencies = [ + "env_logger", + "test-log-macros", + "tracing-subscriber", +] + +[[package]] +name = "test-log-macros" +version = "0.2.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "888d0c3c6db53c0fdab160d2ed5e12ba745383d3e85813f2ea0f2b1475ab553f" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.100", +] + [[package]] name = "thiserror" version = "1.0.69" @@ -7172,7 +7222,7 @@ dependencies = [ [[package]] name = "tokio-postgres" version = "0.7.10" -source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#1f21e7959a96a34dcfbfce1b14b73286cdadffe9" +source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#f3cf448febde5fd298071d54d568a9c875a7a62b" dependencies = [ "async-trait", "byteorder", diff --git a/Cargo.toml b/Cargo.toml index 1f605681dbb4..d957fa90708f 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -40,6 +40,7 @@ members = [ "libs/proxy/postgres-protocol2", "libs/proxy/postgres-types2", "libs/proxy/tokio-postgres2", + "object_storage", ] [workspace.package] @@ -208,6 +209,7 @@ tracing-opentelemetry = "0.28" tracing-serde = "0.2.0" tracing-subscriber = { version = "0.3", default-features = false, features = ["smallvec", "fmt", "tracing-log", "std", "env-filter", "json"] } try-lock = "0.2.5" +test-log = { version = "0.2.17", default-features = false, features = ["log"] } twox-hash = { version = "1.6.3", default-features = false } typed-json = "0.1" url = "2.2" diff --git a/Dockerfile b/Dockerfile index 01540e192586..848bfab92196 100644 --- a/Dockerfile +++ b/Dockerfile @@ -89,6 +89,7 @@ RUN set -e \ --bin storage_broker \ --bin storage_controller \ --bin proxy \ + --bin object_storage \ --bin neon_local \ --bin storage_scrubber \ --locked --release @@ -121,6 +122,7 @@ COPY --from=build --chown=neon:neon /home/nonroot/target/release/safekeeper COPY 
--from=build --chown=neon:neon /home/nonroot/target/release/storage_broker /usr/local/bin COPY --from=build --chown=neon:neon /home/nonroot/target/release/storage_controller /usr/local/bin COPY --from=build --chown=neon:neon /home/nonroot/target/release/proxy /usr/local/bin +COPY --from=build --chown=neon:neon /home/nonroot/target/release/object_storage /usr/local/bin COPY --from=build --chown=neon:neon /home/nonroot/target/release/neon_local /usr/local/bin COPY --from=build --chown=neon:neon /home/nonroot/target/release/storage_scrubber /usr/local/bin diff --git a/compute_tools/src/bin/compute_ctl.rs b/compute_tools/src/bin/compute_ctl.rs index da11ac2860fd..16fd51d79aa5 100644 --- a/compute_tools/src/bin/compute_ctl.rs +++ b/compute_tools/src/bin/compute_ctl.rs @@ -29,13 +29,12 @@ //! ```sh //! compute_ctl -D /var/db/postgres/compute \ //! -C 'postgresql://cloud_admin@localhost/postgres' \ -//! -S /var/db/postgres/specs/current.json \ +//! -c /var/db/postgres/configs/config.json \ //! -b /usr/local/bin/postgres \ //! -r http://pg-ext-s3-gateway \ //! 
``` use std::ffi::OsString; use std::fs::File; -use std::path::Path; use std::process::exit; use std::sync::mpsc; use std::thread; @@ -43,8 +42,7 @@ use std::time::Duration; use anyhow::{Context, Result}; use clap::Parser; -use compute_api::responses::ComputeCtlConfig; -use compute_api::spec::ComputeSpec; +use compute_api::responses::ComputeConfig; use compute_tools::compute::{ BUILD_TAG, ComputeNode, ComputeNodeParams, forward_termination_signal, }; @@ -118,16 +116,21 @@ struct Cli { #[arg(long)] pub set_disk_quota_for_fs: Option, - #[arg(short = 's', long = "spec", group = "spec")] - pub spec_json: Option, - - #[arg(short = 'S', long, group = "spec-path")] - pub spec_path: Option, + // TODO(tristan957): remove alias after compatibility tests are no longer + // an issue + #[arg(short = 'c', long, alias = "spec-path")] + pub config: Option, #[arg(short = 'i', long, group = "compute-id")] pub compute_id: String, - #[arg(short = 'p', long, conflicts_with_all = ["spec", "spec-path"], value_name = "CONTROL_PLANE_API_BASE_URL")] + #[arg( + short = 'p', + long, + conflicts_with = "config", + value_name = "CONTROL_PLANE_API_BASE_URL", + requires = "compute-id" + )] pub control_plane_uri: Option, } @@ -136,7 +139,7 @@ fn main() -> Result<()> { let scenario = failpoint_support::init(); - // For historical reasons, the main thread that processes the spec and launches postgres + // For historical reasons, the main thread that processes the config and launches postgres // is synchronous, but we always have this tokio runtime available and we "enter" it so // that you can use tokio::spawn() and tokio::runtime::Handle::current().block_on(...) // from all parts of compute_ctl. 
@@ -152,7 +155,7 @@ fn main() -> Result<()> { let connstr = Url::parse(&cli.connstr).context("cannot parse connstr as a URL")?; - let cli_spec = try_spec_from_cli(&cli)?; + let config = get_config(&cli)?; let compute_node = ComputeNode::new( ComputeNodeParams { @@ -172,10 +175,8 @@ fn main() -> Result<()> { cgroup: cli.cgroup, #[cfg(target_os = "linux")] vm_monitor_addr: cli.vm_monitor_addr, - live_config_allowed: cli_spec.live_config_allowed, }, - cli_spec.spec, - cli_spec.compute_ctl_config, + config, )?; let exit_code = compute_node.run()?; @@ -200,37 +201,17 @@ async fn init() -> Result<()> { Ok(()) } -fn try_spec_from_cli(cli: &Cli) -> Result { - // First, try to get cluster spec from the cli argument - if let Some(ref spec_json) = cli.spec_json { - info!("got spec from cli argument {}", spec_json); - return Ok(CliSpecParams { - spec: Some(serde_json::from_str(spec_json)?), - compute_ctl_config: ComputeCtlConfig::default(), - live_config_allowed: false, - }); +fn get_config(cli: &Cli) -> Result { + // First, read the config from the path if provided + if let Some(ref config) = cli.config { + let file = File::open(config)?; + return Ok(serde_json::from_reader(&file)?); } - // Second, try to read it from the file if path is provided - if let Some(ref spec_path) = cli.spec_path { - let file = File::open(Path::new(spec_path))?; - return Ok(CliSpecParams { - spec: Some(serde_json::from_reader(file)?), - compute_ctl_config: ComputeCtlConfig::default(), - live_config_allowed: true, - }); - } - - if cli.control_plane_uri.is_none() { - panic!("must specify --control-plane-uri"); - }; - - match get_spec_from_control_plane(cli.control_plane_uri.as_ref().unwrap(), &cli.compute_id) { - Ok(resp) => Ok(CliSpecParams { - spec: resp.0, - compute_ctl_config: resp.1, - live_config_allowed: true, - }), + // If the config wasn't provided in the CLI arguments, then retrieve it from + // the control plane + match get_config_from_control_plane(cli.control_plane_uri.as_ref().unwrap(), 
&cli.compute_id) { + Ok(config) => Ok(config), Err(e) => { error!( "cannot get response from control plane: {}\n\ @@ -242,14 +223,6 @@ fn try_spec_from_cli(cli: &Cli) -> Result { } } -struct CliSpecParams { - /// If a spec was provided via CLI or file, the [`ComputeSpec`] - spec: Option, - #[allow(dead_code)] - compute_ctl_config: ComputeCtlConfig, - live_config_allowed: bool, -} - fn deinit_and_exit(exit_code: Option) -> ! { // Shutdown trace pipeline gracefully, so that it has a chance to send any // pending traces before we exit. Shutting down OTEL tracing provider may diff --git a/compute_tools/src/catalog.rs b/compute_tools/src/catalog.rs index db3e07e086b8..082ba62b8e99 100644 --- a/compute_tools/src/catalog.rs +++ b/compute_tools/src/catalog.rs @@ -98,13 +98,15 @@ pub async fn get_database_schema( .kill_on_drop(true) .spawn()?; - let stdout = cmd.stdout.take().ok_or_else(|| { - std::io::Error::new(std::io::ErrorKind::Other, "Failed to capture stdout.") - })?; + let stdout = cmd + .stdout + .take() + .ok_or_else(|| std::io::Error::other("Failed to capture stdout."))?; - let stderr = cmd.stderr.take().ok_or_else(|| { - std::io::Error::new(std::io::ErrorKind::Other, "Failed to capture stderr.") - })?; + let stderr = cmd + .stderr + .take() + .ok_or_else(|| std::io::Error::other("Failed to capture stderr."))?; let mut stdout_reader = FramedRead::new(stdout, BytesCodec::new()); let stderr_reader = BufReader::new(stderr); @@ -128,8 +130,7 @@ pub async fn get_database_schema( } }); - return Err(SchemaDumpError::IO(std::io::Error::new( - std::io::ErrorKind::Other, + return Err(SchemaDumpError::IO(std::io::Error::other( "failed to start pg_dump", ))); } diff --git a/compute_tools/src/compute.rs b/compute_tools/src/compute.rs index 70b91c781a64..c7b4bdd24013 100644 --- a/compute_tools/src/compute.rs +++ b/compute_tools/src/compute.rs @@ -11,7 +11,7 @@ use std::{env, fs}; use anyhow::{Context, Result}; use chrono::{DateTime, Utc}; use compute_api::privilege::Privilege; 
-use compute_api::responses::{ComputeCtlConfig, ComputeMetrics, ComputeStatus}; +use compute_api::responses::{ComputeConfig, ComputeCtlConfig, ComputeMetrics, ComputeStatus}; use compute_api::spec::{ ComputeAudit, ComputeFeature, ComputeMode, ComputeSpec, ExtVersion, PgIdent, }; @@ -93,20 +93,6 @@ pub struct ComputeNodeParams { /// the address of extension storage proxy gateway pub ext_remote_storage: Option, - - /// We should only allow live re- / configuration of the compute node if - /// it uses 'pull model', i.e. it can go to control-plane and fetch - /// the latest configuration. Otherwise, there could be a case: - /// - we start compute with some spec provided as argument - /// - we push new spec and it does reconfiguration - /// - but then something happens and compute pod / VM is destroyed, - /// so k8s controller starts it again with the **old** spec - /// - /// and the same for empty computes: - /// - we started compute without any spec - /// - we push spec and it does configuration - /// - but then it is restarted without any spec again - pub live_config_allowed: bool, } /// Compute node info shared across several `compute_ctl` threads. 
@@ -317,11 +303,7 @@ struct StartVmMonitorResult { } impl ComputeNode { - pub fn new( - params: ComputeNodeParams, - cli_spec: Option, - compute_ctl_config: ComputeCtlConfig, - ) -> Result { + pub fn new(params: ComputeNodeParams, config: ComputeConfig) -> Result { let connstr = params.connstr.as_str(); let conn_conf = postgres::config::Config::from_str(connstr) .context("cannot build postgres config from connstr")?; @@ -329,8 +311,8 @@ impl ComputeNode { .context("cannot build tokio postgres config from connstr")?; let mut new_state = ComputeState::new(); - if let Some(cli_spec) = cli_spec { - let pspec = ParsedSpec::try_from(cli_spec).map_err(|msg| anyhow::anyhow!(msg))?; + if let Some(spec) = config.spec { + let pspec = ParsedSpec::try_from(spec).map_err(|msg| anyhow::anyhow!(msg))?; new_state.pspec = Some(pspec); } @@ -341,7 +323,7 @@ impl ComputeNode { state: Mutex::new(new_state), state_changed: Condvar::new(), ext_download_progress: RwLock::new(HashMap::new()), - compute_ctl_config, + compute_ctl_config: config.compute_ctl_config, }) } @@ -537,11 +519,14 @@ impl ComputeNode { let pspec = compute_state.pspec.as_ref().expect("spec must be set"); info!( - "starting compute for project {}, operation {}, tenant {}, timeline {}, features {:?}, spec.remote_extensions {:?}", + "starting compute for project {}, operation {}, tenant {}, timeline {}, project {}, branch {}, endpoint {}, features {:?}, spec.remote_extensions {:?}", pspec.spec.cluster.cluster_id.as_deref().unwrap_or("None"), pspec.spec.operation_uuid.as_deref().unwrap_or("None"), pspec.tenant_id, pspec.timeline_id, + pspec.spec.project_id.as_deref().unwrap_or("None"), + pspec.spec.branch_id.as_deref().unwrap_or("None"), + pspec.spec.endpoint_id.as_deref().unwrap_or("None"), pspec.spec.features, pspec.spec.remote_extensions, ); @@ -645,31 +630,28 @@ impl ComputeNode { }); } - // Configure and start rsyslog for HIPAA if necessary - if let ComputeAudit::Hipaa = pspec.spec.audit_log_level { - let 
remote_endpoint = std::env::var("AUDIT_LOGGING_ENDPOINT").unwrap_or("".to_string()); - if remote_endpoint.is_empty() { - anyhow::bail!("AUDIT_LOGGING_ENDPOINT is empty"); - } + // Configure and start rsyslog for compliance audit logging + match pspec.spec.audit_log_level { + ComputeAudit::Hipaa | ComputeAudit::Extended | ComputeAudit::Full => { + let remote_endpoint = + std::env::var("AUDIT_LOGGING_ENDPOINT").unwrap_or("".to_string()); + if remote_endpoint.is_empty() { + anyhow::bail!("AUDIT_LOGGING_ENDPOINT is empty"); + } - let log_directory_path = Path::new(&self.params.pgdata).join("log"); - let log_directory_path = log_directory_path.to_string_lossy().to_string(); - configure_audit_rsyslog(log_directory_path.clone(), "hipaa", &remote_endpoint)?; + let log_directory_path = Path::new(&self.params.pgdata).join("log"); + let log_directory_path = log_directory_path.to_string_lossy().to_string(); + configure_audit_rsyslog(log_directory_path.clone(), "hipaa", &remote_endpoint)?; - // Launch a background task to clean up the audit logs - launch_pgaudit_gc(log_directory_path); + // Launch a background task to clean up the audit logs + launch_pgaudit_gc(log_directory_path); + } + _ => {} } // Configure and start rsyslog for Postgres logs export - if self.has_feature(ComputeFeature::PostgresLogsExport) { - if let Some(ref project_id) = pspec.spec.cluster.cluster_id { - let host = PostgresLogsRsyslogConfig::default_host(project_id); - let conf = PostgresLogsRsyslogConfig::new(Some(&host)); - configure_postgres_logs_export(conf)?; - } else { - warn!("not configuring rsyslog for Postgres logs export: project ID is missing") - } - } + let conf = PostgresLogsRsyslogConfig::new(pspec.spec.logs_export_host.as_deref()); + configure_postgres_logs_export(conf)?; // Launch remaining service threads let _monitor_handle = launch_monitor(self); @@ -1573,6 +1555,10 @@ impl ComputeNode { }); } + // Reconfigure rsyslog for Postgres logs export + let conf = 
PostgresLogsRsyslogConfig::new(spec.logs_export_host.as_deref()); + configure_postgres_logs_export(conf)?; + // Write new config let pgdata_path = Path::new(&self.params.pgdata); config::write_postgres_conf( diff --git a/compute_tools/src/config.rs b/compute_tools/src/config.rs index 614ab076ffec..71c6123c3bf3 100644 --- a/compute_tools/src/config.rs +++ b/compute_tools/src/config.rs @@ -7,7 +7,7 @@ use std::io::prelude::*; use std::path::Path; use compute_api::responses::TlsConfig; -use compute_api::spec::{ComputeAudit, ComputeFeature, ComputeMode, ComputeSpec, GenericOption}; +use compute_api::spec::{ComputeAudit, ComputeMode, ComputeSpec, GenericOption}; use crate::pg_helpers::{ GenericOptionExt, GenericOptionsSearch, PgOptionsSerialize, escape_conf_value, @@ -89,6 +89,15 @@ pub fn write_postgres_conf( escape_conf_value(&s.to_string()) )?; } + if let Some(s) = &spec.project_id { + writeln!(file, "neon.project_id={}", escape_conf_value(s))?; + } + if let Some(s) = &spec.branch_id { + writeln!(file, "neon.branch_id={}", escape_conf_value(s))?; + } + if let Some(s) = &spec.endpoint_id { + writeln!(file, "neon.endpoint_id={}", escape_conf_value(s))?; + } // tls if let Some(tls_config) = tls_config { @@ -169,7 +178,7 @@ pub fn write_postgres_conf( // and don't allow the user or the control plane admin to change them. 
match spec.audit_log_level { ComputeAudit::Disabled => {} - ComputeAudit::Log => { + ComputeAudit::Log | ComputeAudit::Base => { writeln!(file, "# Managed by compute_ctl base audit settings: start")?; writeln!(file, "pgaudit.log='ddl,role'")?; // Disable logging of catalog queries to reduce the noise @@ -193,16 +202,20 @@ pub fn write_postgres_conf( } writeln!(file, "# Managed by compute_ctl base audit settings: end")?; } - ComputeAudit::Hipaa => { + ComputeAudit::Hipaa | ComputeAudit::Extended | ComputeAudit::Full => { writeln!( file, "# Managed by compute_ctl compliance audit settings: begin" )?; - // This log level is very verbose - // but this is necessary for HIPAA compliance. - // Exclude 'misc' category, because it doesn't contain anythig relevant. - writeln!(file, "pgaudit.log='all, -misc'")?; - writeln!(file, "pgaudit.log_parameter=on")?; + // Enable logging of parameters. + // This is very verbose and may contain sensitive data. + if spec.audit_log_level == ComputeAudit::Full { + writeln!(file, "pgaudit.log_parameter=on")?; + writeln!(file, "pgaudit.log='all'")?; + } else { + writeln!(file, "pgaudit.log_parameter=off")?; + writeln!(file, "pgaudit.log='all, -misc'")?; + } // Disable logging of catalog queries // The catalog doesn't contain sensitive data, so we don't need to audit it. writeln!(file, "pgaudit.log_catalog=off")?; @@ -255,7 +268,7 @@ pub fn write_postgres_conf( // We need Postgres to send logs to rsyslog so that we can forward them // further to customers' log aggregation systems. 
- if spec.features.contains(&ComputeFeature::PostgresLogsExport) { + if spec.logs_export_host.is_some() { writeln!(file, "log_destination='stderr,syslog'")?; } diff --git a/compute_tools/src/http/middleware/authorize.rs b/compute_tools/src/http/middleware/authorize.rs index 89d55e1af32b..f221752c38c3 100644 --- a/compute_tools/src/http/middleware/authorize.rs +++ b/compute_tools/src/http/middleware/authorize.rs @@ -6,20 +6,15 @@ use axum_extra::{ TypedHeader, headers::{Authorization, authorization::Bearer}, }; +use compute_api::requests::ComputeClaims; use futures::future::BoxFuture; use http::{Request, Response, StatusCode}; use jsonwebtoken::{Algorithm, DecodingKey, TokenData, Validation, jwk::JwkSet}; -use serde::Deserialize; use tower_http::auth::AsyncAuthorizeRequest; -use tracing::warn; +use tracing::{debug, warn}; use crate::http::{JsonResponse, extract::RequestId}; -#[derive(Clone, Debug, Deserialize)] -pub(in crate::http) struct Claims { - compute_id: String, -} - #[derive(Clone, Debug)] pub(in crate::http) struct Authorize { compute_id: String, @@ -97,7 +92,7 @@ impl AsyncAuthorizeRequest for Authorize { if data.claims.compute_id != compute_id { return Err(JsonResponse::error( StatusCode::UNAUTHORIZED, - "invalid claims in authorization token", + "invalid compute ID in authorization token claims", )); } @@ -112,13 +107,19 @@ impl AsyncAuthorizeRequest for Authorize { impl Authorize { /// Verify the token using the JSON Web Key set and return the token data. 
- fn verify(jwks: &JwkSet, token: &str, validation: &Validation) -> Result> { + fn verify( + jwks: &JwkSet, + token: &str, + validation: &Validation, + ) -> Result> { + debug!("verifying token {}", token); + for jwk in jwks.keys.iter() { let decoding_key = match DecodingKey::from_jwk(jwk) { Ok(key) => key, Err(e) => { warn!( - "Failed to construct decoding key from {}: {}", + "failed to construct decoding key from {}: {}", jwk.common.key_id.as_ref().unwrap(), e ); @@ -127,11 +128,11 @@ impl Authorize { } }; - match jsonwebtoken::decode::(token, &decoding_key, validation) { + match jsonwebtoken::decode::(token, &decoding_key, validation) { Ok(data) => return Ok(data), Err(e) => { warn!( - "Failed to decode authorization token using {}: {}", + "failed to decode authorization token using {}: {}", jwk.common.key_id.as_ref().unwrap(), e ); @@ -141,6 +142,6 @@ impl Authorize { } } - Err(anyhow!("Failed to verify authorization token")) + Err(anyhow!("failed to verify authorization token")) } } diff --git a/compute_tools/src/http/openapi_spec.yaml b/compute_tools/src/http/openapi_spec.yaml index 7c8f72440f6b..bbdb7d091728 100644 --- a/compute_tools/src/http/openapi_spec.yaml +++ b/compute_tools/src/http/openapi_spec.yaml @@ -306,36 +306,6 @@ paths: schema: $ref: "#/components/schemas/GenericError" - /configure_telemetry: - post: - tags: - - Configure - summary: Configure rsyslog - description: | - This API endpoint configures rsyslog to forward Postgres logs - to a specified otel collector. - operationId: configureTelemetry - requestBody: - required: true - content: - application/json: - schema: - type: object - properties: - logs_export_host: - type: string - description: | - Hostname and the port of the otel collector. Leave empty to disable logs forwarding. 
- Example: config-shy-breeze-123-collector-monitoring.neon-telemetry.svc.cluster.local:54526 - responses: - 204: - description: "Telemetry configured successfully" - 500: - content: - application/json: - schema: - $ref: "#/components/schemas/GenericError" - components: securitySchemes: JWT: diff --git a/compute_tools/src/http/routes/configure.rs b/compute_tools/src/http/routes/configure.rs index 5c9dd22c3dab..f7a19da61156 100644 --- a/compute_tools/src/http/routes/configure.rs +++ b/compute_tools/src/http/routes/configure.rs @@ -1,11 +1,9 @@ use std::sync::Arc; -use axum::body::Body; use axum::extract::State; use axum::response::Response; -use compute_api::requests::{ConfigurationRequest, ConfigureTelemetryRequest}; +use compute_api::requests::ConfigurationRequest; use compute_api::responses::{ComputeStatus, ComputeStatusResponse}; -use compute_api::spec::ComputeFeature; use http::StatusCode; use tokio::task; use tracing::info; @@ -13,7 +11,6 @@ use tracing::info; use crate::compute::{ComputeNode, ParsedSpec}; use crate::http::JsonResponse; use crate::http::extract::Json; -use crate::rsyslog::{PostgresLogsRsyslogConfig, configure_postgres_logs_export}; // Accept spec in JSON format and request compute configuration. 
If anything // goes wrong after we set the compute status to `ConfigurationPending` and @@ -25,13 +22,6 @@ pub(in crate::http) async fn configure( State(compute): State>, request: Json, ) -> Response { - if !compute.params.live_config_allowed { - return JsonResponse::error( - StatusCode::PRECONDITION_FAILED, - "live configuration is not allowed for this compute node".to_string(), - ); - } - let pspec = match ParsedSpec::try_from(request.spec.clone()) { Ok(p) => p, Err(e) => return JsonResponse::error(StatusCode::BAD_REQUEST, e), @@ -95,25 +85,3 @@ pub(in crate::http) async fn configure( JsonResponse::success(StatusCode::OK, body) } - -pub(in crate::http) async fn configure_telemetry( - State(compute): State>, - request: Json, -) -> Response { - if !compute.has_feature(ComputeFeature::PostgresLogsExport) { - return JsonResponse::error( - StatusCode::PRECONDITION_FAILED, - "Postgres logs export feature is not enabled".to_string(), - ); - } - - let conf = PostgresLogsRsyslogConfig::new(request.logs_export_host.as_deref()); - if let Err(err) = configure_postgres_logs_export(conf) { - return JsonResponse::error(StatusCode::INTERNAL_SERVER_ERROR, err.to_string()); - } - - Response::builder() - .status(StatusCode::NO_CONTENT) - .body(Body::from("")) - .unwrap() -} diff --git a/compute_tools/src/http/server.rs b/compute_tools/src/http/server.rs index 179369e3efd9..10f767e97ca7 100644 --- a/compute_tools/src/http/server.rs +++ b/compute_tools/src/http/server.rs @@ -87,7 +87,6 @@ impl From<&Server> for Router> { let authenticated_router = Router::>::new() .route("/check_writability", post(check_writability::is_writable)) .route("/configure", post(configure::configure)) - .route("/configure_telemetry", post(configure::configure_telemetry)) .route("/database_schema", get(database_schema::get_schema_dump)) .route("/dbs_and_roles", get(dbs_and_roles::get_catalog_objects)) .route("/insights", get(insights::get_insights)) diff --git a/compute_tools/src/metrics.rs 
b/compute_tools/src/metrics.rs index 52f1795703bb..fa00476fd2e8 100644 --- a/compute_tools/src/metrics.rs +++ b/compute_tools/src/metrics.rs @@ -19,13 +19,13 @@ pub(crate) static INSTALLED_EXTENSIONS: Lazy = Lazy::new(|| { // but for all our APIs we defined a 'slug'/method/operationId in the OpenAPI spec. // And it's fair to call it a 'RPC' (Remote Procedure Call). pub enum CPlaneRequestRPC { - GetSpec, + GetConfig, } impl CPlaneRequestRPC { pub fn as_str(&self) -> &str { match self { - CPlaneRequestRPC::GetSpec => "GetSpec", + CPlaneRequestRPC::GetConfig => "GetConfig", } } } diff --git a/compute_tools/src/rsyslog.rs b/compute_tools/src/rsyslog.rs index 80594db3f152..ba08302df2b0 100644 --- a/compute_tools/src/rsyslog.rs +++ b/compute_tools/src/rsyslog.rs @@ -119,16 +119,9 @@ impl<'a> PostgresLogsRsyslogConfig<'a> { }; Ok(config_content) } - - /// Returns the default host for otel collector that receives Postgres logs - pub fn default_host(project_id: &str) -> String { - format!( - "config-{}-collector.neon-telemetry.svc.cluster.local:10514", - project_id - ) - } } +/// Writes rsyslogd configuration for Postgres logs export and restarts rsyslog. 
pub fn configure_postgres_logs_export(conf: PostgresLogsRsyslogConfig) -> Result<()> { let new_config = conf.build()?; let current_config = PostgresLogsRsyslogConfig::current_config()?; @@ -261,16 +254,5 @@ mod tests { let res = conf.build(); assert!(res.is_err()); } - - { - // Verify config with default host - let host = PostgresLogsRsyslogConfig::default_host("shy-breeze-123"); - let conf = PostgresLogsRsyslogConfig::new(Some(&host)); - let res = conf.build(); - assert!(res.is_ok()); - let conf_str = res.unwrap(); - assert!(conf_str.contains(r#"shy-breeze-123"#)); - assert!(conf_str.contains(r#"port="10514""#)); - } } } diff --git a/compute_tools/src/spec.rs b/compute_tools/src/spec.rs index a76af21e9f28..4b38e6e29c72 100644 --- a/compute_tools/src/spec.rs +++ b/compute_tools/src/spec.rs @@ -3,9 +3,8 @@ use std::path::Path; use anyhow::{Result, anyhow, bail}; use compute_api::responses::{ - ComputeCtlConfig, ControlPlaneComputeStatus, ControlPlaneSpecResponse, + ComputeConfig, ControlPlaneComputeStatus, ControlPlaneConfigResponse, }; -use compute_api::spec::ComputeSpec; use reqwest::StatusCode; use tokio_postgres::Client; use tracing::{error, info, instrument}; @@ -21,7 +20,7 @@ use crate::params::PG_HBA_ALL_MD5; fn do_control_plane_request( uri: &str, jwt: &str, -) -> Result { +) -> Result { let resp = reqwest::blocking::Client::new() .get(uri) .header("Authorization", format!("Bearer {}", jwt)) @@ -29,14 +28,14 @@ fn do_control_plane_request( .map_err(|e| { ( true, - format!("could not perform spec request to control plane: {:?}", e), + format!("could not perform request to control plane: {:?}", e), UNKNOWN_HTTP_STATUS.to_string(), ) })?; let status = resp.status(); match status { - StatusCode::OK => match resp.json::() { + StatusCode::OK => match resp.json::() { Ok(spec_resp) => Ok(spec_resp), Err(e) => Err(( true, @@ -69,40 +68,35 @@ fn do_control_plane_request( } } -/// Request spec from the control-plane by compute_id. 
If `NEON_CONTROL_PLANE_TOKEN` -/// env variable is set, it will be used for authorization. -pub fn get_spec_from_control_plane( - base_uri: &str, - compute_id: &str, -) -> Result<(Option, ComputeCtlConfig)> { +/// Request config from the control-plane by compute_id. If +/// `NEON_CONTROL_PLANE_TOKEN` env variable is set, it will be used for +/// authorization. +pub fn get_config_from_control_plane(base_uri: &str, compute_id: &str) -> Result { let cp_uri = format!("{base_uri}/compute/api/v2/computes/{compute_id}/spec"); - let jwt: String = match std::env::var("NEON_CONTROL_PLANE_TOKEN") { - Ok(v) => v, - Err(_) => "".to_string(), - }; + let jwt: String = std::env::var("NEON_CONTROL_PLANE_TOKEN").unwrap_or_default(); let mut attempt = 1; - info!("getting spec from control plane: {}", cp_uri); + info!("getting config from control plane: {}", cp_uri); // Do 3 attempts to get spec from the control plane using the following logic: // - network error -> then retry // - compute id is unknown or any other error -> bail out // - no spec for compute yet (Empty state) -> return Ok(None) - // - got spec -> return Ok(Some(spec)) + // - got config -> return Ok(Some(config)) while attempt < 4 { let result = match do_control_plane_request(&cp_uri, &jwt) { - Ok(spec_resp) => { + Ok(config_resp) => { CPLANE_REQUESTS_TOTAL .with_label_values(&[ - CPlaneRequestRPC::GetSpec.as_str(), + CPlaneRequestRPC::GetConfig.as_str(), &StatusCode::OK.to_string(), ]) .inc(); - match spec_resp.status { - ControlPlaneComputeStatus::Empty => Ok((None, spec_resp.compute_ctl_config)), + match config_resp.status { + ControlPlaneComputeStatus::Empty => Ok(config_resp.into()), ControlPlaneComputeStatus::Attached => { - if let Some(spec) = spec_resp.spec { - Ok((Some(spec), spec_resp.compute_ctl_config)) + if config_resp.spec.is_some() { + Ok(config_resp.into()) } else { bail!("compute is attached, but spec is empty") } @@ -111,7 +105,7 @@ pub fn get_spec_from_control_plane( } Err((retry, msg, status)) => { 
CPLANE_REQUESTS_TOTAL - .with_label_values(&[CPlaneRequestRPC::GetSpec.as_str(), &status]) + .with_label_values(&[CPlaneRequestRPC::GetConfig.as_str(), &status]) .inc(); if retry { Err(anyhow!(msg)) @@ -122,7 +116,7 @@ pub fn get_spec_from_control_plane( }; if let Err(e) = &result { - error!("attempt {} to get spec failed with: {}", attempt, e); + error!("attempt {} to get config failed with: {}", attempt, e); } else { return result; } @@ -133,13 +127,13 @@ pub fn get_spec_from_control_plane( // All attempts failed, return error. Err(anyhow::anyhow!( - "Exhausted all attempts to retrieve the spec from the control plane" + "Exhausted all attempts to retrieve the config from the control plane" )) } /// Check `pg_hba.conf` and update if needed to allow external connections. pub fn update_pg_hba(pgdata_path: &Path) -> Result<()> { - // XXX: consider making it a part of spec.json + // XXX: consider making it a part of config.json let pghba_path = pgdata_path.join("pg_hba.conf"); if config::line_in_file(&pghba_path, PG_HBA_ALL_MD5)? { @@ -153,7 +147,7 @@ pub fn update_pg_hba(pgdata_path: &Path) -> Result<()> { /// Create a standby.signal file pub fn add_standby_signal(pgdata_path: &Path) -> Result<()> { - // XXX: consider making it a part of spec.json + // XXX: consider making it a part of config.json let signalfile = pgdata_path.join("standby.signal"); if !signalfile.exists() { diff --git a/compute_tools/src/spec_apply.rs b/compute_tools/src/spec_apply.rs index e7d67f6ac524..0d1389dbad04 100644 --- a/compute_tools/src/spec_apply.rs +++ b/compute_tools/src/spec_apply.rs @@ -278,12 +278,12 @@ impl ComputeNode { // so that all config operations are audit logged. 
match spec.audit_log_level { - ComputeAudit::Hipaa => { + ComputeAudit::Hipaa | ComputeAudit::Extended | ComputeAudit::Full => { phases.push(CreatePgauditExtension); phases.push(CreatePgauditlogtofileExtension); phases.push(DisablePostgresDBPgAudit); } - ComputeAudit::Log => { + ComputeAudit::Log | ComputeAudit::Base => { phases.push(CreatePgauditExtension); phases.push(DisablePostgresDBPgAudit); } diff --git a/control_plane/src/bin/neon_local.rs b/control_plane/src/bin/neon_local.rs index 747268f80b29..db9715dc6264 100644 --- a/control_plane/src/bin/neon_local.rs +++ b/control_plane/src/bin/neon_local.rs @@ -20,8 +20,10 @@ use compute_api::spec::ComputeMode; use control_plane::endpoint::ComputeControlPlane; use control_plane::local_env::{ InitForceMode, LocalEnv, NeonBroker, NeonLocalInitConf, NeonLocalInitPageserverConf, - SafekeeperConf, + ObjectStorageConf, SafekeeperConf, }; +use control_plane::object_storage::OBJECT_STORAGE_DEFAULT_PORT; +use control_plane::object_storage::ObjectStorage; use control_plane::pageserver::PageServerNode; use control_plane::safekeeper::SafekeeperNode; use control_plane::storage_controller::{ @@ -39,7 +41,7 @@ use pageserver_api::controller_api::{ use pageserver_api::models::{ ShardParameters, TenantConfigRequest, TimelineCreateRequest, TimelineInfo, }; -use pageserver_api::shard::{ShardCount, ShardStripeSize, TenantShardId}; +use pageserver_api::shard::{DEFAULT_STRIPE_SIZE, ShardCount, ShardStripeSize, TenantShardId}; use postgres_backend::AuthType; use postgres_connection::parse_host_port; use safekeeper_api::membership::SafekeeperGeneration; @@ -91,6 +93,8 @@ enum NeonLocalCmd { #[command(subcommand)] Safekeeper(SafekeeperCmd), #[command(subcommand)] + ObjectStorage(ObjectStorageCmd), + #[command(subcommand)] Endpoint(EndpointCmd), #[command(subcommand)] Mappings(MappingsCmd), @@ -454,6 +458,32 @@ enum SafekeeperCmd { Restart(SafekeeperRestartCmdArgs), } +#[derive(clap::Subcommand)] +#[clap(about = "Manage object storage")] 
+enum ObjectStorageCmd { + Start(ObjectStorageStartCmd), + Stop(ObjectStorageStopCmd), +} + +#[derive(clap::Args)] +#[clap(about = "Start object storage")] +struct ObjectStorageStartCmd { + #[clap(short = 't', long, help = "timeout until we fail the command")] + #[arg(default_value = "10s")] + start_timeout: humantime::Duration, +} + +#[derive(clap::Args)] +#[clap(about = "Stop object storage")] +struct ObjectStorageStopCmd { + #[arg(value_enum, default_value = "fast")] + #[clap( + short = 'm', + help = "If 'immediate', don't flush repository data at shutdown" + )] + stop_mode: StopMode, +} + #[derive(clap::Args)] #[clap(about = "Start local safekeeper")] struct SafekeeperStartCmdArgs { @@ -759,6 +789,7 @@ fn main() -> Result<()> { } NeonLocalCmd::StorageBroker(subcmd) => rt.block_on(handle_storage_broker(&subcmd, env)), NeonLocalCmd::Safekeeper(subcmd) => rt.block_on(handle_safekeeper(&subcmd, env)), + NeonLocalCmd::ObjectStorage(subcmd) => rt.block_on(handle_object_storage(&subcmd, env)), NeonLocalCmd::Endpoint(subcmd) => rt.block_on(handle_endpoint(&subcmd, env)), NeonLocalCmd::Mappings(subcmd) => handle_mappings(&subcmd, env), }; @@ -975,6 +1006,9 @@ fn handle_init(args: &InitCmdArgs) -> anyhow::Result { } }) .collect(), + object_storage: ObjectStorageConf { + port: OBJECT_STORAGE_DEFAULT_PORT, + }, pg_distrib_dir: None, neon_distrib_dir: None, default_tenant_id: TenantId::from_array(std::array::from_fn(|_| 0)), @@ -1083,7 +1117,7 @@ async fn handle_tenant(subcmd: &TenantCmd, env: &mut local_env::LocalEnv) -> any stripe_size: args .shard_stripe_size .map(ShardStripeSize) - .unwrap_or(ShardParameters::DEFAULT_STRIPE_SIZE), + .unwrap_or(DEFAULT_STRIPE_SIZE), }, placement_policy: args.placement_policy.clone(), config: tenant_conf, @@ -1396,7 +1430,7 @@ async fn handle_endpoint(subcmd: &EndpointCmd, env: &local_env::LocalEnv) -> Res vec![(parsed.0, parsed.1.unwrap_or(5432))], // If caller is telling us what pageserver to use, this is not a tenant which is // full 
managed by storage controller, therefore not sharded. - ShardParameters::DEFAULT_STRIPE_SIZE, + DEFAULT_STRIPE_SIZE, ) } else { // Look up the currently attached location of the tenant, and its striping metadata, @@ -1683,6 +1717,41 @@ async fn handle_safekeeper(subcmd: &SafekeeperCmd, env: &local_env::LocalEnv) -> Ok(()) } +async fn handle_object_storage(subcmd: &ObjectStorageCmd, env: &local_env::LocalEnv) -> Result<()> { + use ObjectStorageCmd::*; + let storage = ObjectStorage::from_env(env); + + // In tests like test_forward_compatibility or test_graceful_cluster_restart + // old neon binaries (without object_storage) are present + if !storage.bin.exists() { + eprintln!( + "{} binary not found. Ignore if this is a compatibility test", + storage.bin + ); + return Ok(()); + } + + match subcmd { + Start(ObjectStorageStartCmd { start_timeout }) => { + if let Err(e) = storage.start(start_timeout).await { + eprintln!("object_storage start failed: {e}"); + exit(1); + } + } + Stop(ObjectStorageStopCmd { stop_mode }) => { + let immediate = match stop_mode { + StopMode::Fast => false, + StopMode::Immediate => true, + }; + if let Err(e) = storage.stop(immediate) { + eprintln!("proxy stop failed: {e}"); + exit(1); + } + } + }; + Ok(()) +} + async fn handle_storage_broker(subcmd: &StorageBrokerCmd, env: &local_env::LocalEnv) -> Result<()> { match subcmd { StorageBrokerCmd::Start(args) => { @@ -1777,6 +1846,13 @@ async fn handle_start_all_impl( .map_err(|e| e.context(format!("start safekeeper {}", safekeeper.id))) }); } + + js.spawn(async move { + ObjectStorage::from_env(env) + .start(&retry_timeout) + .await + .map_err(|e| e.context("start object_storage")) + }); })(); let mut errors = Vec::new(); @@ -1874,6 +1950,11 @@ async fn try_stop_all(env: &local_env::LocalEnv, immediate: bool) { } } + let storage = ObjectStorage::from_env(env); + if let Err(e) = storage.stop(immediate) { + eprintln!("object_storage stop failed: {:#}", e); + } + for ps_conf in &env.pageservers { let 
pageserver = PageServerNode::from_env(env, ps_conf); if let Err(e) = pageserver.stop(immediate) { diff --git a/control_plane/src/endpoint.rs b/control_plane/src/endpoint.rs index b46d61682794..2fa7a62f8fdc 100644 --- a/control_plane/src/endpoint.rs +++ b/control_plane/src/endpoint.rs @@ -29,7 +29,7 @@ //! compute.log - log output of `compute_ctl` and `postgres` //! endpoint.json - serialized `EndpointConf` struct //! postgresql.conf - postgresql settings -//! spec.json - passed to `compute_ctl` +//! config.json - passed to `compute_ctl` //! pgdata/ //! postgresql.conf - copy of postgresql.conf created by `compute_ctl` //! zenith.signal @@ -46,7 +46,9 @@ use std::time::{Duration, Instant, SystemTime, UNIX_EPOCH}; use anyhow::{Context, Result, anyhow, bail}; use compute_api::requests::ConfigurationRequest; -use compute_api::responses::{ComputeCtlConfig, ComputeStatus, ComputeStatusResponse}; +use compute_api::responses::{ + ComputeConfig, ComputeCtlConfig, ComputeStatus, ComputeStatusResponse, +}; use compute_api::spec::{ Cluster, ComputeAudit, ComputeFeature, ComputeMode, ComputeSpec, Database, PgIdent, RemoteExtSpec, Role, @@ -619,86 +621,101 @@ impl Endpoint { remote_extensions = None; }; - // Create spec file - let mut spec = ComputeSpec { - skip_pg_catalog_updates: self.skip_pg_catalog_updates, - format_version: 1.0, - operation_uuid: None, - features: self.features.clone(), - swap_size_bytes: None, - disk_quota_bytes: None, - disable_lfc_resizing: None, - cluster: Cluster { - cluster_id: None, // project ID: not used - name: None, // project name: not used - state: None, - roles: if create_test_user { - vec![Role { + // Create config file + let config = { + let mut spec = ComputeSpec { + skip_pg_catalog_updates: self.skip_pg_catalog_updates, + format_version: 1.0, + operation_uuid: None, + features: self.features.clone(), + swap_size_bytes: None, + disk_quota_bytes: None, + disable_lfc_resizing: None, + cluster: Cluster { + cluster_id: None, // project ID: not 
used + name: None, // project name: not used + state: None, + roles: if create_test_user { + vec![Role { + name: PgIdent::from_str("test").unwrap(), + encrypted_password: None, + options: None, + }] + } else { + Vec::new() + }, + databases: if create_test_user { + vec![Database { + name: PgIdent::from_str("neondb").unwrap(), + owner: PgIdent::from_str("test").unwrap(), + options: None, + restrict_conn: false, + invalid: false, + }] + } else { + Vec::new() + }, + settings: None, + postgresql_conf: Some(postgresql_conf.clone()), + }, + delta_operations: None, + tenant_id: Some(self.tenant_id), + timeline_id: Some(self.timeline_id), + project_id: None, + branch_id: None, + endpoint_id: Some(self.endpoint_id.clone()), + mode: self.mode, + pageserver_connstring: Some(pageserver_connstring), + safekeepers_generation: safekeepers_generation.map(|g| g.into_inner()), + safekeeper_connstrings, + storage_auth_token: auth_token.clone(), + remote_extensions, + pgbouncer_settings: None, + shard_stripe_size: Some(shard_stripe_size), + local_proxy_config: None, + reconfigure_concurrency: self.reconfigure_concurrency, + drop_subscriptions_before_start: self.drop_subscriptions_before_start, + audit_log_level: ComputeAudit::Disabled, + logs_export_host: None::, + }; + + // this strange code is needed to support respec() in tests + if self.cluster.is_some() { + debug!("Cluster is already set in the endpoint spec, using it"); + spec.cluster = self.cluster.clone().unwrap(); + + debug!("spec.cluster {:?}", spec.cluster); + + // fill missing fields again + if create_test_user { + spec.cluster.roles.push(Role { name: PgIdent::from_str("test").unwrap(), encrypted_password: None, options: None, - }] - } else { - Vec::new() - }, - databases: if create_test_user { - vec![Database { + }); + spec.cluster.databases.push(Database { name: PgIdent::from_str("neondb").unwrap(), owner: PgIdent::from_str("test").unwrap(), options: None, restrict_conn: false, invalid: false, - }] - } else { - Vec::new() 
- }, - settings: None, - postgresql_conf: Some(postgresql_conf.clone()), - }, - delta_operations: None, - tenant_id: Some(self.tenant_id), - timeline_id: Some(self.timeline_id), - mode: self.mode, - pageserver_connstring: Some(pageserver_connstring), - safekeepers_generation: safekeepers_generation.map(|g| g.into_inner()), - safekeeper_connstrings, - storage_auth_token: auth_token.clone(), - remote_extensions, - pgbouncer_settings: None, - shard_stripe_size: Some(shard_stripe_size), - local_proxy_config: None, - reconfigure_concurrency: self.reconfigure_concurrency, - drop_subscriptions_before_start: self.drop_subscriptions_before_start, - audit_log_level: ComputeAudit::Disabled, - }; + }); + } + spec.cluster.postgresql_conf = Some(postgresql_conf); + } - // this strange code is needed to support respec() in tests - if self.cluster.is_some() { - debug!("Cluster is already set in the endpoint spec, using it"); - spec.cluster = self.cluster.clone().unwrap(); - - debug!("spec.cluster {:?}", spec.cluster); - - // fill missing fields again - if create_test_user { - spec.cluster.roles.push(Role { - name: PgIdent::from_str("test").unwrap(), - encrypted_password: None, - options: None, - }); - spec.cluster.databases.push(Database { - name: PgIdent::from_str("neondb").unwrap(), - owner: PgIdent::from_str("test").unwrap(), - options: None, - restrict_conn: false, - invalid: false, - }); + ComputeConfig { + spec: Some(spec), + compute_ctl_config: ComputeCtlConfig::default(), } - spec.cluster.postgresql_conf = Some(postgresql_conf); - } + }; + // TODO(tristan957): Remove the write to spec.json after compatibility + // tests work themselves out let spec_path = self.endpoint_path().join("spec.json"); - std::fs::write(spec_path, serde_json::to_string_pretty(&spec)?)?; + std::fs::write(spec_path, serde_json::to_string_pretty(&config.spec)?)?; + let config_path = self.endpoint_path().join("config.json"); + std::fs::write(config_path, serde_json::to_string_pretty(&config)?)?; // 
Open log file. We'll redirect the stdout and stderr of `compute_ctl` to it. let logfile = std::fs::OpenOptions::new() @@ -706,6 +723,16 @@ impl Endpoint { .append(true) .open(self.endpoint_path().join("compute.log"))?; + // TODO(tristan957): Remove when compatibility tests are no longer an + // issue + let old_compute_ctl = { + let mut cmd = Command::new(self.env.neon_distrib_dir.join("compute_ctl")); + let help_output = cmd.arg("--help").output()?; + let help_output = String::from_utf8_lossy(&help_output.stdout); + + !help_output.contains("--config") + }; + // Launch compute_ctl let conn_str = self.connstr("cloud_admin", "postgres"); println!("Starting postgres node at '{}'", conn_str); @@ -724,9 +751,18 @@ impl Endpoint { ]) .args(["--pgdata", self.pgdata().to_str().unwrap()]) .args(["--connstr", &conn_str]) + // TODO(tristan957): Change this to --config when compatibility tests + // are no longer an issue .args([ "--spec-path", - self.endpoint_path().join("spec.json").to_str().unwrap(), + self.endpoint_path() + .join(if old_compute_ctl { + "spec.json" + } else { + "config.json" + }) + .to_str() + .unwrap(), ]) .args([ "--pgbin", @@ -869,10 +905,12 @@ impl Endpoint { stripe_size: Option, safekeepers: Option>, ) -> Result<()> { - let mut spec: ComputeSpec = { - let spec_path = self.endpoint_path().join("spec.json"); - let file = std::fs::File::open(spec_path)?; - serde_json::from_reader(file)? 
+ let (mut spec, compute_ctl_config) = { + let config_path = self.endpoint_path().join("config.json"); + let file = std::fs::File::open(config_path)?; + let config: ComputeConfig = serde_json::from_reader(file)?; + + (config.spec.unwrap(), config.compute_ctl_config) }; let postgresql_conf = self.read_postgresql_conf()?; @@ -922,7 +960,7 @@ impl Endpoint { .body( serde_json::to_string(&ConfigurationRequest { spec, - compute_ctl_config: ComputeCtlConfig::default(), + compute_ctl_config, }) .unwrap(), ) diff --git a/control_plane/src/lib.rs b/control_plane/src/lib.rs index 2af272f3885d..2d9fe2c807b3 100644 --- a/control_plane/src/lib.rs +++ b/control_plane/src/lib.rs @@ -10,6 +10,7 @@ mod background_process; pub mod broker; pub mod endpoint; pub mod local_env; +pub mod object_storage; pub mod pageserver; pub mod postgresql_conf; pub mod safekeeper; diff --git a/control_plane/src/local_env.rs b/control_plane/src/local_env.rs index 3f3794c0eef4..fa10abe91a4c 100644 --- a/control_plane/src/local_env.rs +++ b/control_plane/src/local_env.rs @@ -15,9 +15,10 @@ use clap::ValueEnum; use postgres_backend::AuthType; use reqwest::Url; use serde::{Deserialize, Serialize}; -use utils::auth::{Claims, encode_from_key_file}; +use utils::auth::encode_from_key_file; use utils::id::{NodeId, TenantId, TenantTimelineId, TimelineId}; +use crate::object_storage::{OBJECT_STORAGE_REMOTE_STORAGE_DIR, ObjectStorage}; use crate::pageserver::{PAGESERVER_REMOTE_STORAGE_DIR, PageServerNode}; use crate::safekeeper::SafekeeperNode; @@ -55,6 +56,7 @@ pub struct LocalEnv { // used to issue tokens during e.g pg start pub private_key_path: PathBuf, + pub public_key_path: PathBuf, pub broker: NeonBroker, @@ -68,6 +70,8 @@ pub struct LocalEnv { pub safekeepers: Vec, + pub object_storage: ObjectStorageConf, + // Control plane upcall API for pageserver: if None, we will not run storage_controller If set, this will // be propagated into each pageserver's configuration. 
pub control_plane_api: Url, @@ -95,6 +99,7 @@ pub struct OnDiskConfig { pub neon_distrib_dir: PathBuf, pub default_tenant_id: Option, pub private_key_path: PathBuf, + pub public_key_path: PathBuf, pub broker: NeonBroker, pub storage_controller: NeonStorageControllerConf, #[serde( @@ -103,6 +108,7 @@ pub struct OnDiskConfig { )] pub pageservers: Vec, pub safekeepers: Vec, + pub object_storage: ObjectStorageConf, pub control_plane_api: Option, pub control_plane_hooks_api: Option, pub control_plane_compute_hook_api: Option, @@ -136,11 +142,18 @@ pub struct NeonLocalInitConf { pub storage_controller: Option, pub pageservers: Vec, pub safekeepers: Vec, + pub object_storage: ObjectStorageConf, pub control_plane_api: Option, pub control_plane_hooks_api: Option, pub generate_local_ssl_certs: bool, } +#[derive(Serialize, Default, Deserialize, PartialEq, Eq, Clone, Debug)] +#[serde(default)] +pub struct ObjectStorageConf { + pub port: u16, +} + /// Broker config for cluster internal communication. 
#[derive(Serialize, Deserialize, PartialEq, Eq, Clone, Debug)] #[serde(default)] @@ -398,6 +411,10 @@ impl LocalEnv { self.pg_dir(pg_version, "lib") } + pub fn object_storage_bin(&self) -> PathBuf { + self.neon_distrib_dir.join("object_storage") + } + pub fn pageserver_bin(&self) -> PathBuf { self.neon_distrib_dir.join("pageserver") } @@ -431,6 +448,10 @@ impl LocalEnv { self.base_data_dir.join("safekeepers").join(data_dir_name) } + pub fn object_storage_data_dir(&self) -> PathBuf { + self.base_data_dir.join("object_storage") + } + pub fn get_pageserver_conf(&self, id: NodeId) -> anyhow::Result<&PageServerConf> { if let Some(conf) = self.pageservers.iter().find(|node| node.id == id) { Ok(conf) @@ -582,6 +603,7 @@ impl LocalEnv { neon_distrib_dir, default_tenant_id, private_key_path, + public_key_path, broker, storage_controller, pageservers, @@ -591,6 +613,7 @@ impl LocalEnv { control_plane_compute_hook_api: _, branch_name_mappings, generate_local_ssl_certs, + object_storage, } = on_disk_config; LocalEnv { base_data_dir: repopath.to_owned(), @@ -598,6 +621,7 @@ impl LocalEnv { neon_distrib_dir, default_tenant_id, private_key_path, + public_key_path, broker, storage_controller, pageservers, @@ -606,6 +630,7 @@ impl LocalEnv { control_plane_hooks_api, branch_name_mappings, generate_local_ssl_certs, + object_storage, } }; @@ -705,6 +730,7 @@ impl LocalEnv { neon_distrib_dir: self.neon_distrib_dir.clone(), default_tenant_id: self.default_tenant_id, private_key_path: self.private_key_path.clone(), + public_key_path: self.public_key_path.clone(), broker: self.broker.clone(), storage_controller: self.storage_controller.clone(), pageservers: vec![], // it's skip_serializing anyway @@ -714,6 +740,7 @@ impl LocalEnv { control_plane_compute_hook_api: None, branch_name_mappings: self.branch_name_mappings.clone(), generate_local_ssl_certs: self.generate_local_ssl_certs, + object_storage: self.object_storage.clone(), }, ) } @@ -730,7 +757,7 @@ impl LocalEnv { } // this function 
is used only for testing purposes in CLI e g generate tokens during init - pub fn generate_auth_token(&self, claims: &Claims) -> anyhow::Result { + pub fn generate_auth_token(&self, claims: &S) -> anyhow::Result { let private_key_path = self.get_private_key_path(); let key_data = fs::read(private_key_path)?; encode_from_key_file(claims, &key_data) @@ -797,6 +824,7 @@ impl LocalEnv { control_plane_api, generate_local_ssl_certs, control_plane_hooks_api, + object_storage, } = conf; // Find postgres binaries. @@ -828,6 +856,7 @@ impl LocalEnv { ) .context("generate auth keys")?; let private_key_path = PathBuf::from("auth_private_key.pem"); + let public_key_path = PathBuf::from("auth_public_key.pem"); // create the runtime type because the remaining initialization code below needs // a LocalEnv instance op operation @@ -838,6 +867,7 @@ impl LocalEnv { neon_distrib_dir, default_tenant_id: Some(default_tenant_id), private_key_path, + public_key_path, broker, storage_controller: storage_controller.unwrap_or_default(), pageservers: pageservers.iter().map(Into::into).collect(), @@ -846,6 +876,7 @@ impl LocalEnv { control_plane_hooks_api, branch_name_mappings: Default::default(), generate_local_ssl_certs, + object_storage, }; if generate_local_ssl_certs { @@ -873,8 +904,13 @@ impl LocalEnv { .context("pageserver init failed")?; } + ObjectStorage::from_env(&env) + .init() + .context("object storage init failed")?; + // setup remote remote location for default LocalFs remote storage std::fs::create_dir_all(env.base_data_dir.join(PAGESERVER_REMOTE_STORAGE_DIR))?; + std::fs::create_dir_all(env.base_data_dir.join(OBJECT_STORAGE_REMOTE_STORAGE_DIR))?; env.persist_config() } @@ -944,7 +980,7 @@ fn generate_ssl_ca_cert(cert_path: &Path, key_path: &Path) -> anyhow::Result<()> // -out rootCA.crt -keyout rootCA.key let keygen_output = Command::new("openssl") .args([ - "req", "-x509", "-newkey", "rsa:2048", "-nodes", "-days", "36500", + "req", "-x509", "-newkey", "ed25519", "-nodes", 
"-days", "36500", ]) .args(["-subj", "/CN=Neon Local CA"]) .args(["-out", cert_path.to_str().unwrap()]) @@ -974,7 +1010,7 @@ fn generate_ssl_cert( // -subj "/CN=localhost" -addext "subjectAltName=DNS:localhost,IP:127.0.0.1" let keygen_output = Command::new("openssl") .args(["req", "-new", "-nodes"]) - .args(["-newkey", "rsa:2048"]) + .args(["-newkey", "ed25519"]) .args(["-subj", "/CN=localhost"]) .args(["-addext", "subjectAltName=DNS:localhost,IP:127.0.0.1"]) .args(["-keyout", key_path.to_str().unwrap()]) diff --git a/control_plane/src/object_storage.rs b/control_plane/src/object_storage.rs new file mode 100644 index 000000000000..1a595b780972 --- /dev/null +++ b/control_plane/src/object_storage.rs @@ -0,0 +1,107 @@ +use crate::background_process::{self, start_process, stop_process}; +use crate::local_env::LocalEnv; +use anyhow::anyhow; +use anyhow::{Context, Result}; +use camino::Utf8PathBuf; +use std::io::Write; +use std::time::Duration; + +/// Directory within .neon which will be used by default for LocalFs remote storage. 
+pub const OBJECT_STORAGE_REMOTE_STORAGE_DIR: &str = "local_fs_remote_storage/object_storage"; +pub const OBJECT_STORAGE_DEFAULT_PORT: u16 = 9993; + +pub struct ObjectStorage { + pub bin: Utf8PathBuf, + pub data_dir: Utf8PathBuf, + pub pemfile: Utf8PathBuf, + pub port: u16, +} + +impl ObjectStorage { + pub fn from_env(env: &LocalEnv) -> ObjectStorage { + ObjectStorage { + bin: Utf8PathBuf::from_path_buf(env.object_storage_bin()).unwrap(), + data_dir: Utf8PathBuf::from_path_buf(env.object_storage_data_dir()).unwrap(), + pemfile: Utf8PathBuf::from_path_buf(env.public_key_path.clone()).unwrap(), + port: env.object_storage.port, + } + } + + fn config_path(&self) -> Utf8PathBuf { + self.data_dir.join("object_storage.json") + } + + fn listen_addr(&self) -> Utf8PathBuf { + format!("127.0.0.1:{}", self.port).into() + } + + pub fn init(&self) -> Result<()> { + println!("Initializing object storage in {:?}", self.data_dir); + let parent = self.data_dir.parent().unwrap(); + + #[derive(serde::Serialize)] + struct Cfg { + listen: Utf8PathBuf, + pemfile: Utf8PathBuf, + local_path: Utf8PathBuf, + r#type: String, + } + let cfg = Cfg { + listen: self.listen_addr(), + pemfile: parent.join(self.pemfile.clone()), + local_path: parent.join(OBJECT_STORAGE_REMOTE_STORAGE_DIR), + r#type: "LocalFs".to_string(), + }; + std::fs::create_dir_all(self.config_path().parent().unwrap())?; + std::fs::write(self.config_path(), serde_json::to_string(&cfg)?) 
+ .context("write object storage config")?; + Ok(()) + } + + pub async fn start(&self, retry_timeout: &Duration) -> Result<()> { + println!("Starting s3 proxy at {}", self.listen_addr()); + std::io::stdout().flush().context("flush stdout")?; + + let process_status_check = || async { + tokio::time::sleep(Duration::from_millis(500)).await; + let res = reqwest::Client::new() + .get(format!("http://{}/metrics", self.listen_addr())) + .send() + .await; + match res { + Ok(response) if response.status().is_success() => Ok(true), + Ok(_) => Err(anyhow!("Failed to query /metrics")), + Err(e) => Err(anyhow!("Failed to check node status: {e}")), + } + }; + + let res = start_process( + "object_storage", + &self.data_dir.clone().into_std_path_buf(), + &self.bin.clone().into_std_path_buf(), + vec![self.config_path().to_string()], + vec![("RUST_LOG".into(), "debug".into())], + background_process::InitialPidFile::Create(self.pid_file()), + retry_timeout, + process_status_check, + ) + .await; + if res.is_err() { + eprintln!("Logs:\n{}", std::fs::read_to_string(self.log_file())?); + } + + res + } + + pub fn stop(&self, immediate: bool) -> anyhow::Result<()> { + stop_process(immediate, "object_storage", &self.pid_file()) + } + + fn log_file(&self) -> Utf8PathBuf { + self.data_dir.join("object_storage.log") + } + + fn pid_file(&self) -> Utf8PathBuf { + self.data_dir.join("object_storage.pid") + } +} diff --git a/control_plane/src/pageserver.rs b/control_plane/src/pageserver.rs index 591eb3728b1d..5c985e6dc831 100644 --- a/control_plane/src/pageserver.rs +++ b/control_plane/src/pageserver.rs @@ -535,6 +535,11 @@ impl PageServerNode { .map(|x| x.parse::()) .transpose() .context("Failed to parse 'gc_compaction_enabled' as bool")?, + gc_compaction_verification: settings + .remove("gc_compaction_verification") + .map(|x| x.parse::()) + .transpose() + .context("Failed to parse 'gc_compaction_verification' as bool")?, gc_compaction_initial_threshold_kb: settings 
.remove("gc_compaction_initial_threshold_kb") .map(|x| x.parse::()) diff --git a/control_plane/src/storage_controller.rs b/control_plane/src/storage_controller.rs index 8000576e87ad..a4b56ae5c01b 100644 --- a/control_plane/src/storage_controller.rs +++ b/control_plane/src/storage_controller.rs @@ -13,7 +13,9 @@ use pageserver_api::controller_api::{ NodeConfigureRequest, NodeDescribeResponse, NodeRegisterRequest, TenantCreateRequest, TenantCreateResponse, TenantLocateResponse, }; -use pageserver_api::models::{TenantConfigRequest, TimelineCreateRequest, TimelineInfo}; +use pageserver_api::models::{ + TenantConfig, TenantConfigRequest, TimelineCreateRequest, TimelineInfo, +}; use pageserver_api::shard::TenantShardId; use pageserver_client::mgmt_api::ResponseErrorMessageExt; use postgres_backend::AuthType; @@ -82,7 +84,8 @@ impl NeonStorageControllerStopArgs { pub struct AttachHookRequest { pub tenant_shard_id: TenantShardId, pub node_id: Option, - pub generation_override: Option, + pub generation_override: Option, // only new tenants + pub config: Option, // only new tenants } #[derive(Serialize, Deserialize)] @@ -805,6 +808,7 @@ impl StorageController { tenant_shard_id, node_id: Some(pageserver_id), generation_override: None, + config: None, }; let response = self diff --git a/control_plane/storcon_cli/src/main.rs b/control_plane/storcon_cli/src/main.rs index b7e479d90cbd..19c686dcfd31 100644 --- a/control_plane/storcon_cli/src/main.rs +++ b/control_plane/storcon_cli/src/main.rs @@ -941,7 +941,7 @@ async fn main() -> anyhow::Result<()> { let mut node_to_fill_descs = Vec::new(); for desc in node_descs { - let to_drain = nodes.iter().any(|id| *id == desc.id); + let to_drain = nodes.contains(&desc.id); if to_drain { node_to_drain_descs.push(desc); } else { diff --git a/docker-compose/compute_wrapper/shell/compute.sh b/docker-compose/compute_wrapper/shell/compute.sh index 418aaf876da2..9409e9d055ff 100755 --- a/docker-compose/compute_wrapper/shell/compute.sh +++ 
b/docker-compose/compute_wrapper/shell/compute.sh @@ -11,8 +11,8 @@ generate_id() { PG_VERSION=${PG_VERSION:-14} -SPEC_FILE_ORG=/var/db/postgres/specs/spec.json -SPEC_FILE=/tmp/spec.json +CONFIG_FILE_ORG=/var/db/postgres/configs/config.json +CONFIG_FILE=/tmp/config.json echo "Waiting pageserver become ready." while ! nc -z pageserver 6400; do @@ -20,7 +20,7 @@ while ! nc -z pageserver 6400; do done echo "Page server is ready." -cp ${SPEC_FILE_ORG} ${SPEC_FILE} +cp ${CONFIG_FILE_ORG} ${CONFIG_FILE} if [ -n "${TENANT_ID:-}" ] && [ -n "${TIMELINE_ID:-}" ]; then tenant_id=${TENANT_ID} @@ -73,17 +73,27 @@ else ulid_extension=ulid fi echo "Adding pgx_ulid" -shared_libraries=$(jq -r '.cluster.settings[] | select(.name=="shared_preload_libraries").value' ${SPEC_FILE}) -sed -i "s/${shared_libraries}/${shared_libraries},${ulid_extension}/" ${SPEC_FILE} +shared_libraries=$(jq -r '.spec.cluster.settings[] | select(.name=="shared_preload_libraries").value' ${CONFIG_FILE}) +sed -i "s/${shared_libraries}/${shared_libraries},${ulid_extension}/" ${CONFIG_FILE} echo "Overwrite tenant id and timeline id in spec file" -sed -i "s/TENANT_ID/${tenant_id}/" ${SPEC_FILE} -sed -i "s/TIMELINE_ID/${timeline_id}/" ${SPEC_FILE} +sed -i "s/TENANT_ID/${tenant_id}/" ${CONFIG_FILE} +sed -i "s/TIMELINE_ID/${timeline_id}/" ${CONFIG_FILE} -cat ${SPEC_FILE} +cat ${CONFIG_FILE} + +# TODO(tristan957): Remove these workarounds for backwards compatibility after +# the next compute release. That includes these next few lines and the +# --spec-path in the compute_ctl invocation. 
+if compute_ctl --help | grep --quiet -- '--config'; then + SPEC_PATH="$CONFIG_FILE" +else + jq '.spec' < "$CONFIG_FILE" > /tmp/spec.json + SPEC_PATH=/tmp/spec.json +fi echo "Start compute node" /usr/local/bin/compute_ctl --pgdata /var/db/postgres/compute \ -C "postgresql://cloud_admin@localhost:55433/postgres" \ -b /usr/local/bin/postgres \ --compute-id "compute-$RANDOM" \ - -S ${SPEC_FILE} + --spec-path "$SPEC_PATH" diff --git a/docker-compose/compute_wrapper/var/db/postgres/configs/config.json b/docker-compose/compute_wrapper/var/db/postgres/configs/config.json new file mode 100644 index 000000000000..3ddf96512a3a --- /dev/null +++ b/docker-compose/compute_wrapper/var/db/postgres/configs/config.json @@ -0,0 +1,148 @@ +{ + "spec": { + "format_version": 1.0, + + "timestamp": "2022-10-12T18:00:00.000Z", + "operation_uuid": "0f657b36-4b0f-4a2d-9c2e-1dcd615e7d8c", + + "cluster": { + "cluster_id": "docker_compose", + "name": "docker_compose_test", + "state": "restarted", + "roles": [ + { + "name": "cloud_admin", + "encrypted_password": "b093c0d3b281ba6da1eacc608620abd8", + "options": null + } + ], + "databases": [ + ], + "settings": [ + { + "name": "fsync", + "value": "off", + "vartype": "bool" + }, + { + "name": "wal_level", + "value": "logical", + "vartype": "enum" + }, + { + "name": "wal_log_hints", + "value": "on", + "vartype": "bool" + }, + { + "name": "log_connections", + "value": "on", + "vartype": "bool" + }, + { + "name": "port", + "value": "55433", + "vartype": "integer" + }, + { + "name": "shared_buffers", + "value": "1MB", + "vartype": "string" + }, + { + "name": "max_connections", + "value": "100", + "vartype": "integer" + }, + { + "name": "listen_addresses", + "value": "0.0.0.0", + "vartype": "string" + }, + { + "name": "max_wal_senders", + "value": "10", + "vartype": "integer" + }, + { + "name": "max_replication_slots", + "value": "10", + "vartype": "integer" + }, + { + "name": "wal_sender_timeout", + "value": "5s", + "vartype": "string" + }, + { + 
"name": "wal_keep_size", + "value": "0", + "vartype": "integer" + }, + { + "name": "password_encryption", + "value": "md5", + "vartype": "enum" + }, + { + "name": "restart_after_crash", + "value": "off", + "vartype": "bool" + }, + { + "name": "synchronous_standby_names", + "value": "walproposer", + "vartype": "string" + }, + { + "name": "shared_preload_libraries", + "value": "neon,pg_cron,timescaledb,pg_stat_statements", + "vartype": "string" + }, + { + "name": "neon.safekeepers", + "value": "safekeeper1:5454,safekeeper2:5454,safekeeper3:5454", + "vartype": "string" + }, + { + "name": "neon.timeline_id", + "value": "TIMELINE_ID", + "vartype": "string" + }, + { + "name": "neon.tenant_id", + "value": "TENANT_ID", + "vartype": "string" + }, + { + "name": "neon.pageserver_connstring", + "value": "host=pageserver port=6400", + "vartype": "string" + }, + { + "name": "max_replication_write_lag", + "value": "500MB", + "vartype": "string" + }, + { + "name": "max_replication_flush_lag", + "value": "10GB", + "vartype": "string" + }, + { + "name": "cron.database", + "value": "postgres", + "vartype": "string" + } + ] + }, + + "delta_operations": [ + ] + }, + "compute_ctl_config": { + "jwks": { + "keys": [] + } + } +} diff --git a/docker-compose/compute_wrapper/var/db/postgres/specs/spec.json b/docker-compose/compute_wrapper/var/db/postgres/specs/spec.json deleted file mode 100644 index 0308cab4515a..000000000000 --- a/docker-compose/compute_wrapper/var/db/postgres/specs/spec.json +++ /dev/null @@ -1,141 +0,0 @@ -{ - "format_version": 1.0, - - "timestamp": "2022-10-12T18:00:00.000Z", - "operation_uuid": "0f657b36-4b0f-4a2d-9c2e-1dcd615e7d8c", - - "cluster": { - "cluster_id": "docker_compose", - "name": "docker_compose_test", - "state": "restarted", - "roles": [ - { - "name": "cloud_admin", - "encrypted_password": "b093c0d3b281ba6da1eacc608620abd8", - "options": null - } - ], - "databases": [ - ], - "settings": [ - { - "name": "fsync", - "value": "off", - "vartype": "bool" - }, - 
{ - "name": "wal_level", - "value": "logical", - "vartype": "enum" - }, - { - "name": "wal_log_hints", - "value": "on", - "vartype": "bool" - }, - { - "name": "log_connections", - "value": "on", - "vartype": "bool" - }, - { - "name": "port", - "value": "55433", - "vartype": "integer" - }, - { - "name": "shared_buffers", - "value": "1MB", - "vartype": "string" - }, - { - "name": "max_connections", - "value": "100", - "vartype": "integer" - }, - { - "name": "listen_addresses", - "value": "0.0.0.0", - "vartype": "string" - }, - { - "name": "max_wal_senders", - "value": "10", - "vartype": "integer" - }, - { - "name": "max_replication_slots", - "value": "10", - "vartype": "integer" - }, - { - "name": "wal_sender_timeout", - "value": "5s", - "vartype": "string" - }, - { - "name": "wal_keep_size", - "value": "0", - "vartype": "integer" - }, - { - "name": "password_encryption", - "value": "md5", - "vartype": "enum" - }, - { - "name": "restart_after_crash", - "value": "off", - "vartype": "bool" - }, - { - "name": "synchronous_standby_names", - "value": "walproposer", - "vartype": "string" - }, - { - "name": "shared_preload_libraries", - "value": "neon,pg_cron,timescaledb,pg_stat_statements", - "vartype": "string" - }, - { - "name": "neon.safekeepers", - "value": "safekeeper1:5454,safekeeper2:5454,safekeeper3:5454", - "vartype": "string" - }, - { - "name": "neon.timeline_id", - "value": "TIMELINE_ID", - "vartype": "string" - }, - { - "name": "neon.tenant_id", - "value": "TENANT_ID", - "vartype": "string" - }, - { - "name": "neon.pageserver_connstring", - "value": "host=pageserver port=6400", - "vartype": "string" - }, - { - "name": "max_replication_write_lag", - "value": "500MB", - "vartype": "string" - }, - { - "name": "max_replication_flush_lag", - "value": "10GB", - "vartype": "string" - }, - { - "name": "cron.database", - "value": "postgres", - "vartype": "string" - } - ] - }, - - "delta_operations": [ - ] -} diff --git a/docker-compose/docker-compose.yml 
b/docker-compose/docker-compose.yml index 493a0a552334..fd3ad1fffcad 100644 --- a/docker-compose/docker-compose.yml +++ b/docker-compose/docker-compose.yml @@ -159,7 +159,7 @@ services: #- RUST_BACKTRACE=1 # Mount the test files directly, for faster editing cycle. volumes: - - ./compute_wrapper/var/db/postgres/specs/:/var/db/postgres/specs/ + - ./compute_wrapper/var/db/postgres/configs/:/var/db/postgres/configs/ - ./compute_wrapper/shell/:/shell/ ports: - 55433:55433 # pg protocol handler diff --git a/docs/storage_controller.md b/docs/storage_controller.md index ac4aca4219ac..d761210033ba 100644 --- a/docs/storage_controller.md +++ b/docs/storage_controller.md @@ -151,7 +151,7 @@ Example body: ``` { "tenant_id": "1f359dd625e519a1a4e8d7509690f6fc", - "stripe_size": 32768, + "stripe_size": 2048, "shards": [ {"node_id": 344, "shard_number": 0}, {"node_id": 722, "shard_number": 1}, diff --git a/libs/compute_api/src/requests.rs b/libs/compute_api/src/requests.rs index d88451c5495e..98f2fc297cfe 100644 --- a/libs/compute_api/src/requests.rs +++ b/libs/compute_api/src/requests.rs @@ -5,6 +5,14 @@ use crate::privilege::Privilege; use crate::responses::ComputeCtlConfig; use crate::spec::{ComputeSpec, ExtVersion, PgIdent}; +/// When making requests to the `compute_ctl` external HTTP server, the client +/// must specify a set of claims in `Authorization` header JWTs such that +/// `compute_ctl` can authorize the request. 
+#[derive(Clone, Debug, Deserialize, Serialize)] +pub struct ComputeClaims { + pub compute_id: String, +} + /// Request of the /configure API /// /// We now pass only `spec` in the configuration request, but later we can @@ -30,9 +38,3 @@ pub struct SetRoleGrantsRequest { pub privileges: Vec, pub role: PgIdent, } - -/// Request of the /configure_telemetry API -#[derive(Debug, Deserialize, Serialize)] -pub struct ConfigureTelemetryRequest { - pub logs_export_host: Option, -} diff --git a/libs/compute_api/src/responses.rs b/libs/compute_api/src/responses.rs index c8f6019c5cf1..353949736b4c 100644 --- a/libs/compute_api/src/responses.rs +++ b/libs/compute_api/src/responses.rs @@ -14,6 +14,32 @@ pub struct GenericAPIError { pub error: String, } +/// All configuration parameters necessary for a compute. When +/// [`ComputeConfig::spec`] is provided, it means that the compute is attached +/// to a tenant. [`ComputeConfig::compute_ctl_config`] will always be provided +/// and contains parameters necessary for operating `compute_ctl` independently +/// of whether a tenant is attached to the compute or not. +/// +/// This also happens to be the body of `compute_ctl`'s /configure request. +#[derive(Debug, Deserialize, Serialize)] +pub struct ComputeConfig { + /// The compute spec + pub spec: Option, + + /// The compute_ctl configuration + #[allow(dead_code)] + pub compute_ctl_config: ComputeCtlConfig, +} + +impl From for ComputeConfig { + fn from(value: ControlPlaneConfigResponse) -> Self { + Self { + spec: value.spec, + compute_ctl_config: value.compute_ctl_config, + } + } +} + #[derive(Debug, Clone, Serialize)] pub struct ExtensionInstallResponse { pub extension: PgIdent, @@ -161,7 +187,7 @@ pub struct TlsConfig { /// Response of the `/computes/{compute_id}/spec` control-plane API. 
#[derive(Deserialize, Debug)] -pub struct ControlPlaneSpecResponse { +pub struct ControlPlaneConfigResponse { pub spec: Option, pub status: ControlPlaneComputeStatus, pub compute_ctl_config: ComputeCtlConfig, diff --git a/libs/compute_api/src/spec.rs b/libs/compute_api/src/spec.rs index cff1f4c89a6d..5e67ccce0018 100644 --- a/libs/compute_api/src/spec.rs +++ b/libs/compute_api/src/spec.rs @@ -1,8 +1,8 @@ -//! `ComputeSpec` represents the contents of the spec.json file. -//! -//! The spec.json file is used to pass information to 'compute_ctl'. It contains -//! all the information needed to start up the right version of PostgreSQL, -//! and connect it to the storage nodes. +//! The ComputeSpec contains all the information needed to start up +//! the right version of PostgreSQL, and connect it to the storage nodes. +//! It can be passed as part of the `config.json`, or the control plane can +//! provide it by calling the compute_ctl's `/compute_ctl` endpoint, or +//! compute_ctl can fetch it by calling the control plane's API. use std::collections::HashMap; use indexmap::IndexMap; @@ -104,6 +104,12 @@ pub struct ComputeSpec { pub timeline_id: Option, pub pageserver_connstring: Option, + // More neon ids that we expose to the compute_ctl + // and to postgres as neon extension GUCs. + pub project_id: Option, + pub branch_id: Option, + pub endpoint_id: Option, + /// Safekeeper membership config generation. It is put in /// neon.safekeepers GUC and serves two purposes: /// 1) Non zero value forces walproposer to use membership configurations. @@ -159,15 +165,13 @@ pub struct ComputeSpec { #[serde(default)] // Default false pub drop_subscriptions_before_start: bool, - /// Log level for audit logging: - /// - /// Disabled - no audit logging. This is the default. 
- /// log - log masked statements to the postgres log using pgaudit extension - /// hipaa - log unmasked statements to the file using pgaudit and pgauditlogtofile extension - /// - /// Extensions should be present in shared_preload_libraries + /// Log level for compute audit logging #[serde(default)] pub audit_log_level: ComputeAudit, + + /// Hostname and the port of the otel collector. Leave empty to disable Postgres logs forwarding. + /// Example: config-shy-breeze-123-collector-monitoring.neon-telemetry.svc.cluster.local:10514 + pub logs_export_host: Option, } /// Feature flag to signal `compute_ctl` to enable certain experimental functionality. @@ -179,9 +183,6 @@ pub enum ComputeFeature { /// track short-lived connections as user activity. ActivityMonitorExperimental, - /// Allow to configure rsyslog for Postgres logs export - PostgresLogsExport, - /// This is a special feature flag that is used to represent unknown feature flags. /// Basically all unknown to enum flags are represented as this one. See unit test /// `parse_unknown_features()` for more details. @@ -288,14 +289,25 @@ impl ComputeMode { } /// Log level for audit logging -/// Disabled, log, hipaa -/// Default is Disabled #[derive(Clone, Debug, Default, Eq, PartialEq, Deserialize, Serialize)] pub enum ComputeAudit { #[default] Disabled, + // Deprecated, use Base instead Log, + // (pgaudit.log = 'ddl', pgaudit.log_parameter='off') + // logged to the standard postgresql log stream + Base, + // Deprecated, use Full or Extended instead Hipaa, + // (pgaudit.log = 'all, -misc', pgaudit.log_parameter='off') + // logged to separate files collected by rsyslog + // into dedicated log storage with strict access + Extended, + // (pgaudit.log='all', pgaudit.log_parameter='on'), + // logged to separate files collected by rsyslog + // into dedicated log storage with strict access. 
+ Full, } #[derive(Clone, Debug, Default, Deserialize, Serialize, PartialEq, Eq)] diff --git a/libs/http-utils/Cargo.toml b/libs/http-utils/Cargo.toml index 6d24ee352a13..5f6578f76e83 100644 --- a/libs/http-utils/Cargo.toml +++ b/libs/http-utils/Cargo.toml @@ -30,6 +30,7 @@ tokio.workspace = true tracing.workspace = true url.workspace = true uuid.workspace = true +x509-cert.workspace = true # to use tokio channels as streams, this is faster to compile than async_stream # why is it only here? no other crate should use it, streams are rarely needed. diff --git a/libs/http-utils/src/server.rs b/libs/http-utils/src/server.rs index 07fd56ac0123..f93f71c9622d 100644 --- a/libs/http-utils/src/server.rs +++ b/libs/http-utils/src/server.rs @@ -4,6 +4,8 @@ use futures::StreamExt; use futures::stream::FuturesUnordered; use hyper0::Body; use hyper0::server::conn::Http; +use metrics::{IntCounterVec, register_int_counter_vec}; +use once_cell::sync::Lazy; use routerify::{RequestService, RequestServiceBuilder}; use tokio::io::{AsyncRead, AsyncWrite}; use tokio_rustls::TlsAcceptor; @@ -26,6 +28,24 @@ pub struct Server { tls_acceptor: Option, } +static CONNECTION_STARTED_COUNT: Lazy = Lazy::new(|| { + register_int_counter_vec!( + "http_server_connection_started_total", + "Number of established http/https connections", + &["scheme"] + ) + .expect("failed to define a metric") +}); + +static CONNECTION_ERROR_COUNT: Lazy = Lazy::new(|| { + register_int_counter_vec!( + "http_server_connection_errors_total", + "Number of occured connection errors by type", + &["type"] + ) + .expect("failed to define a metric") +}); + impl Server { pub fn new( request_service: Arc>, @@ -60,6 +80,15 @@ impl Server { false } + let tcp_error_cnt = CONNECTION_ERROR_COUNT.with_label_values(&["tcp"]); + let tls_error_cnt = CONNECTION_ERROR_COUNT.with_label_values(&["tls"]); + let http_error_cnt = CONNECTION_ERROR_COUNT.with_label_values(&["http"]); + let https_error_cnt = 
CONNECTION_ERROR_COUNT.with_label_values(&["https"]); + let panic_error_cnt = CONNECTION_ERROR_COUNT.with_label_values(&["panic"]); + + let http_connection_cnt = CONNECTION_STARTED_COUNT.with_label_values(&["http"]); + let https_connection_cnt = CONNECTION_STARTED_COUNT.with_label_values(&["https"]); + let mut connections = FuturesUnordered::new(); loop { tokio::select! { @@ -67,6 +96,7 @@ impl Server { let (tcp_stream, remote_addr) = match stream { Ok(stream) => stream, Err(err) => { + tcp_error_cnt.inc(); if !suppress_io_error(&err) { info!("Failed to accept TCP connection: {err:#}"); } @@ -78,11 +108,18 @@ impl Server { let tls_acceptor = self.tls_acceptor.clone(); let cancel = cancel.clone(); + let tls_error_cnt = tls_error_cnt.clone(); + let http_error_cnt = http_error_cnt.clone(); + let https_error_cnt = https_error_cnt.clone(); + let http_connection_cnt = http_connection_cnt.clone(); + let https_connection_cnt = https_connection_cnt.clone(); + connections.push(tokio::spawn( async move { match tls_acceptor { Some(tls_acceptor) => { // Handle HTTPS connection. + https_connection_cnt.inc(); let tls_stream = tokio::select! { tls_stream = tls_acceptor.accept(tcp_stream) => tls_stream, _ = cancel.cancelled() => return, @@ -90,6 +127,7 @@ impl Server { let tls_stream = match tls_stream { Ok(tls_stream) => tls_stream, Err(err) => { + tls_error_cnt.inc(); if !suppress_io_error(&err) { info!(%remote_addr, "Failed to accept TLS connection: {err:#}"); } @@ -97,6 +135,7 @@ impl Server { } }; if let Err(err) = Self::serve_connection(tls_stream, service, cancel).await { + https_error_cnt.inc(); if !suppress_hyper_error(&err) { info!(%remote_addr, "Failed to serve HTTPS connection: {err:#}"); } @@ -104,7 +143,9 @@ impl Server { } None => { // Handle HTTP connection. 
+ http_connection_cnt.inc(); if let Err(err) = Self::serve_connection(tcp_stream, service, cancel).await { + http_error_cnt.inc(); if !suppress_hyper_error(&err) { info!(%remote_addr, "Failed to serve HTTP connection: {err:#}"); } @@ -115,6 +156,7 @@ impl Server { } Some(conn) = connections.next() => { if let Err(err) = conn { + panic_error_cnt.inc(); error!("Connection panicked: {err:#}"); } } @@ -122,6 +164,7 @@ impl Server { // Wait for graceful shutdown of all connections. while let Some(conn) = connections.next().await { if let Err(err) = conn { + panic_error_cnt.inc(); error!("Connection panicked: {err:#}"); } } diff --git a/libs/http-utils/src/tls_certs.rs b/libs/http-utils/src/tls_certs.rs index 0c18d84d987d..2799db78a600 100644 --- a/libs/http-utils/src/tls_certs.rs +++ b/libs/http-utils/src/tls_certs.rs @@ -3,11 +3,14 @@ use std::{sync::Arc, time::Duration}; use anyhow::Context; use arc_swap::ArcSwap; use camino::Utf8Path; +use metrics::{IntCounterVec, UIntGaugeVec, register_int_counter_vec, register_uint_gauge_vec}; +use once_cell::sync::Lazy; use rustls::{ - pki_types::{CertificateDer, PrivateKeyDer}, + pki_types::{CertificateDer, PrivateKeyDer, UnixTime}, server::{ClientHello, ResolvesServerCert}, sign::CertifiedKey, }; +use x509_cert::der::Reader; pub async fn load_cert_chain(filename: &Utf8Path) -> anyhow::Result>> { let cert_data = tokio::fs::read(filename) @@ -53,6 +56,76 @@ pub async fn load_certified_key( Ok(certified_key) } +/// rustls's CertifiedKey with extra parsed fields used for metrics. +struct ParsedCertifiedKey { + certified_key: CertifiedKey, + expiration_time: UnixTime, +} + +/// Parse expiration time from an X509 certificate. +fn parse_expiration_time(cert: &CertificateDer<'_>) -> anyhow::Result { + let parsed_cert = x509_cert::der::SliceReader::new(cert) + .context("Failed to parse cerficiate")? 
+ .decode::() + .context("Failed to parse cerficiate")?; + + Ok(UnixTime::since_unix_epoch( + parsed_cert + .tbs_certificate + .validity + .not_after + .to_unix_duration(), + )) +} + +async fn load_and_parse_certified_key( + key_filename: &Utf8Path, + cert_filename: &Utf8Path, +) -> anyhow::Result { + let certified_key = load_certified_key(key_filename, cert_filename).await?; + let expiration_time = parse_expiration_time(certified_key.end_entity_cert()?)?; + Ok(ParsedCertifiedKey { + certified_key, + expiration_time, + }) +} + +static CERT_EXPIRATION_TIME: Lazy = Lazy::new(|| { + register_uint_gauge_vec!( + "tls_certs_expiration_time_seconds", + "Expiration time of the loaded certificate since unix epoch in seconds", + &["resolver_name"] + ) + .expect("failed to define a metric") +}); + +static CERT_RELOAD_STARTED_COUNTER: Lazy = Lazy::new(|| { + register_int_counter_vec!( + "tls_certs_reload_started_total", + "Number of certificate reload loop iterations started", + &["resolver_name"] + ) + .expect("failed to define a metric") +}); + +static CERT_RELOAD_UPDATED_COUNTER: Lazy = Lazy::new(|| { + register_int_counter_vec!( + "tls_certs_reload_updated_total", + "Number of times the certificate was updated to the new one", + &["resolver_name"] + ) + .expect("failed to define a metric") +}); + +static CERT_RELOAD_FAILED_COUNTER: Lazy = Lazy::new(|| { + register_int_counter_vec!( + "tls_certs_reload_failed_total", + "Number of times the certificate reload failed", + &["resolver_name"] + ) + .expect("failed to define a metric") +}); + /// Implementation of [`rustls::server::ResolvesServerCert`] which reloads certificates from /// the disk periodically. #[derive(Debug)] @@ -63,16 +136,28 @@ pub struct ReloadingCertificateResolver { impl ReloadingCertificateResolver { /// Creates a new Resolver by loading certificate and private key from FS and /// creating tokio::task to reload them with provided reload_period. + /// resolver_name is used as metric's label. 
pub async fn new( + resolver_name: &str, key_filename: &Utf8Path, cert_filename: &Utf8Path, reload_period: Duration, ) -> anyhow::Result> { + // Create metrics for current resolver. + let cert_expiration_time = CERT_EXPIRATION_TIME.with_label_values(&[resolver_name]); + let cert_reload_started_counter = + CERT_RELOAD_STARTED_COUNTER.with_label_values(&[resolver_name]); + let cert_reload_updated_counter = + CERT_RELOAD_UPDATED_COUNTER.with_label_values(&[resolver_name]); + let cert_reload_failed_counter = + CERT_RELOAD_FAILED_COUNTER.with_label_values(&[resolver_name]); + + let parsed_key = load_and_parse_certified_key(key_filename, cert_filename).await?; + let this = Arc::new(Self { - certified_key: ArcSwap::from_pointee( - load_certified_key(key_filename, cert_filename).await?, - ), + certified_key: ArcSwap::from_pointee(parsed_key.certified_key), }); + cert_expiration_time.set(parsed_key.expiration_time.as_secs()); tokio::spawn({ let weak_this = Arc::downgrade(&this); @@ -88,17 +173,22 @@ impl ReloadingCertificateResolver { Some(this) => this, None => break, // Resolver has been destroyed, exit. 
}; - match load_certified_key(&key_filename, &cert_filename).await { - Ok(new_certified_key) => { - if new_certified_key.cert == this.certified_key.load().cert { + cert_reload_started_counter.inc(); + + match load_and_parse_certified_key(&key_filename, &cert_filename).await { + Ok(parsed_key) => { + if parsed_key.certified_key.cert == this.certified_key.load().cert { tracing::debug!("Certificate has not changed since last reloading"); } else { tracing::info!("Certificate has been reloaded"); - this.certified_key.store(Arc::new(new_certified_key)); + this.certified_key.store(Arc::new(parsed_key.certified_key)); + cert_expiration_time.set(parsed_key.expiration_time.as_secs()); + cert_reload_updated_counter.inc(); } last_reload_failed = false; } Err(err) => { + cert_reload_failed_counter.inc(); // Note: Reloading certs may fail if it conflicts with the script updating // the files at the same time. Warn only if the error is persistent. if last_reload_failed { diff --git a/libs/pageserver_api/src/config.rs b/libs/pageserver_api/src/config.rs index 8f56d60a4af9..53b68afb0f51 100644 --- a/libs/pageserver_api/src/config.rs +++ b/libs/pageserver_api/src/config.rs @@ -180,6 +180,7 @@ pub struct ConfigToml { #[serde(skip_serializing_if = "Option::is_none")] pub generate_unarchival_heatmap: Option, pub tracing: Option, + pub enable_tls_page_service_api: bool, } #[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)] @@ -206,6 +207,10 @@ pub struct PageServicePipeliningConfigPipelined { /// Causes runtime errors if larger than max get_vectored batch size. pub max_batch_size: NonZeroUsize, pub execution: PageServiceProtocolPipelinedExecutionStrategy, + // The default below is such that new versions of the software can start + // with the old configuration. 
+ #[serde(default)] + pub batching: PageServiceProtocolPipelinedBatchingStrategy, } #[derive(Debug, Clone, Copy, PartialEq, Eq, serde::Serialize, serde::Deserialize)] @@ -215,6 +220,19 @@ pub enum PageServiceProtocolPipelinedExecutionStrategy { Tasks, } +#[derive(Default, Debug, Clone, Copy, PartialEq, Eq, serde::Serialize, serde::Deserialize)] +#[serde(rename_all = "kebab-case")] +pub enum PageServiceProtocolPipelinedBatchingStrategy { + /// All get page requests in a batch will be at the same LSN + #[default] + UniformLsn, + /// Get page requests in a batch may be at different LSN + /// + /// One key cannot be present more than once at different LSNs in + /// the same batch. + ScatteredLsn, +} + #[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)] #[serde(tag = "mode", rename_all = "kebab-case")] pub enum GetVectoredConcurrentIo { @@ -451,6 +469,8 @@ pub struct TenantConfigToml { // gc-compaction related configs /// Enable automatic gc-compaction trigger on this tenant. pub gc_compaction_enabled: bool, + /// Enable verification of gc-compaction results. + pub gc_compaction_verification: bool, /// The initial threshold for gc-compaction in KB. Once the total size of layers below the gc-horizon is above this threshold, /// gc-compaction will be triggered. pub gc_compaction_initial_threshold_kb: u64, @@ -612,9 +632,12 @@ impl Default for ConfigToml { page_service_pipelining: if !cfg!(test) { PageServicePipeliningConfig::Serial } else { + // Do not turn this into the default until scattered reads have been + // validated and rolled-out fully. 
PageServicePipeliningConfig::Pipelined(PageServicePipeliningConfigPipelined { max_batch_size: NonZeroUsize::new(32).unwrap(), execution: PageServiceProtocolPipelinedExecutionStrategy::ConcurrentFutures, + batching: PageServiceProtocolPipelinedBatchingStrategy::ScatteredLsn, }) }, get_vectored_concurrent_io: if !cfg!(test) { @@ -631,6 +654,7 @@ impl Default for ConfigToml { load_previous_heatmap: None, generate_unarchival_heatmap: None, tracing: None, + enable_tls_page_service_api: false, } } } @@ -690,6 +714,7 @@ pub mod tenant_conf_defaults { // image layers should be created. pub const DEFAULT_IMAGE_LAYER_CREATION_CHECK_THRESHOLD: u8 = 2; pub const DEFAULT_GC_COMPACTION_ENABLED: bool = false; + pub const DEFAULT_GC_COMPACTION_VERIFICATION: bool = true; pub const DEFAULT_GC_COMPACTION_INITIAL_THRESHOLD_KB: u64 = 5 * 1024 * 1024; // 5GB pub const DEFAULT_GC_COMPACTION_RATIO_PERCENT: u64 = 100; } @@ -744,6 +769,7 @@ impl Default for TenantConfigToml { wal_receiver_protocol_override: None, rel_size_v2_enabled: false, gc_compaction_enabled: DEFAULT_GC_COMPACTION_ENABLED, + gc_compaction_verification: DEFAULT_GC_COMPACTION_VERIFICATION, gc_compaction_initial_threshold_kb: DEFAULT_GC_COMPACTION_INITIAL_THRESHOLD_KB, gc_compaction_ratio_percent: DEFAULT_GC_COMPACTION_RATIO_PERCENT, sampling_ratio: None, diff --git a/libs/pageserver_api/src/controller_api.rs b/libs/pageserver_api/src/controller_api.rs index 3cb62f9d180b..91f9c03ba4f6 100644 --- a/libs/pageserver_api/src/controller_api.rs +++ b/libs/pageserver_api/src/controller_api.rs @@ -7,7 +7,8 @@ use std::time::{Duration, Instant}; /// API (`/control/v1` prefix). 
Implemented by the server /// in [`storage_controller::http`] use serde::{Deserialize, Serialize}; -use utils::id::{NodeId, TenantId}; +use utils::id::{NodeId, TenantId, TimelineId}; +use utils::lsn::Lsn; use crate::models::{PageserverUtilization, ShardParameters, TenantConfig}; use crate::shard::{ShardStripeSize, TenantShardId}; @@ -499,6 +500,15 @@ pub struct SafekeeperSchedulingPolicyRequest { pub scheduling_policy: SkSchedulingPolicy, } +/// Import request for safekeeper timelines. +#[derive(Serialize, Deserialize, Clone)] +pub struct TimelineImportRequest { + pub tenant_id: TenantId, + pub timeline_id: TimelineId, + pub start_lsn: Lsn, + pub sk_set: Vec, +} + #[cfg(test)] mod test { use serde_json; diff --git a/libs/pageserver_api/src/key.rs b/libs/pageserver_api/src/key.rs index 8836e7ec8729..0c4d7fd4cb70 100644 --- a/libs/pageserver_api/src/key.rs +++ b/libs/pageserver_api/src/key.rs @@ -927,7 +927,7 @@ impl Key { /// Guaranteed to return `Ok()` if [`Self::is_rel_block_key`] returns `true` for `key`. 
#[inline(always)] - pub fn to_rel_block(self) -> anyhow::Result<(RelTag, BlockNumber)> { + pub fn to_rel_block(self) -> Result<(RelTag, BlockNumber), ToRelBlockError> { Ok(match self.field1 { 0x00 => ( RelTag { @@ -938,7 +938,7 @@ impl Key { }, self.field6, ), - _ => anyhow::bail!("unexpected value kind 0x{:02x}", self.field1), + _ => return Err(ToRelBlockError(self.field1)), }) } } @@ -951,6 +951,17 @@ impl std::str::FromStr for Key { } } +#[derive(Debug)] +pub struct ToRelBlockError(u8); + +impl fmt::Display for ToRelBlockError { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!(f, "unexpected value kind 0x{:02x}", self.0) + } +} + +impl std::error::Error for ToRelBlockError {} + #[cfg(test)] mod tests { use std::str::FromStr; diff --git a/libs/pageserver_api/src/keyspace.rs b/libs/pageserver_api/src/keyspace.rs index e505f23e49ed..79e3ef553b97 100644 --- a/libs/pageserver_api/src/keyspace.rs +++ b/libs/pageserver_api/src/keyspace.rs @@ -613,8 +613,7 @@ mod tests { use rand::{RngCore, SeedableRng}; use super::*; - use crate::models::ShardParameters; - use crate::shard::{ShardCount, ShardNumber}; + use crate::shard::{DEFAULT_STRIPE_SIZE, ShardCount, ShardNumber, ShardStripeSize}; // Helper function to create a key range. 
// @@ -964,12 +963,8 @@ mod tests { } #[test] fn sharded_range_relation_gap() { - let shard_identity = ShardIdentity::new( - ShardNumber(0), - ShardCount::new(4), - ShardParameters::DEFAULT_STRIPE_SIZE, - ) - .unwrap(); + let shard_identity = + ShardIdentity::new(ShardNumber(0), ShardCount::new(4), DEFAULT_STRIPE_SIZE).unwrap(); let range = ShardedRange::new( Range { @@ -985,12 +980,8 @@ mod tests { #[test] fn shard_identity_keyspaces_single_key() { - let shard_identity = ShardIdentity::new( - ShardNumber(1), - ShardCount::new(4), - ShardParameters::DEFAULT_STRIPE_SIZE, - ) - .unwrap(); + let shard_identity = + ShardIdentity::new(ShardNumber(1), ShardCount::new(4), DEFAULT_STRIPE_SIZE).unwrap(); let range = ShardedRange::new( Range { @@ -1034,12 +1025,8 @@ mod tests { #[test] fn shard_identity_keyspaces_forkno_gap() { - let shard_identity = ShardIdentity::new( - ShardNumber(1), - ShardCount::new(4), - ShardParameters::DEFAULT_STRIPE_SIZE, - ) - .unwrap(); + let shard_identity = + ShardIdentity::new(ShardNumber(1), ShardCount::new(4), DEFAULT_STRIPE_SIZE).unwrap(); let range = ShardedRange::new( Range { @@ -1061,7 +1048,7 @@ mod tests { let shard_identity = ShardIdentity::new( ShardNumber(shard_number), ShardCount::new(4), - ShardParameters::DEFAULT_STRIPE_SIZE, + DEFAULT_STRIPE_SIZE, ) .unwrap(); @@ -1144,37 +1131,44 @@ mod tests { /// for a single tenant. 
#[test] fn sharded_range_fragment_simple() { + const SHARD_COUNT: u8 = 4; + const STRIPE_SIZE: u32 = DEFAULT_STRIPE_SIZE.0; + let shard_identity = ShardIdentity::new( ShardNumber(0), - ShardCount::new(4), - ShardParameters::DEFAULT_STRIPE_SIZE, + ShardCount::new(SHARD_COUNT), + ShardStripeSize(STRIPE_SIZE), ) .unwrap(); // A range which we happen to know covers exactly one stripe which belongs to this shard let input_start = Key::from_hex("000000067f00000001000000ae0000000000").unwrap(); - let input_end = Key::from_hex("000000067f00000001000000ae0000008000").unwrap(); + let mut input_end = input_start; + input_end.field6 += STRIPE_SIZE; // field6 is block number // Ask for stripe_size blocks, we get the whole stripe assert_eq!( - do_fragment(input_start, input_end, &shard_identity, 32768), - (32768, vec![(32768, input_start..input_end)]) + do_fragment(input_start, input_end, &shard_identity, STRIPE_SIZE), + (STRIPE_SIZE, vec![(STRIPE_SIZE, input_start..input_end)]) ); // Ask for more, we still get the whole stripe assert_eq!( - do_fragment(input_start, input_end, &shard_identity, 10000000), - (32768, vec![(32768, input_start..input_end)]) + do_fragment(input_start, input_end, &shard_identity, 10 * STRIPE_SIZE), + (STRIPE_SIZE, vec![(STRIPE_SIZE, input_start..input_end)]) ); // Ask for target_nblocks of half the stripe size, we get two halves assert_eq!( - do_fragment(input_start, input_end, &shard_identity, 16384), + do_fragment(input_start, input_end, &shard_identity, STRIPE_SIZE / 2), ( - 32768, + STRIPE_SIZE, vec![ - (16384, input_start..input_start.add(16384)), - (16384, input_start.add(16384)..input_end) + ( + STRIPE_SIZE / 2, + input_start..input_start.add(STRIPE_SIZE / 2) + ), + (STRIPE_SIZE / 2, input_start.add(STRIPE_SIZE / 2)..input_end) ] ) ); @@ -1182,40 +1176,53 @@ mod tests { #[test] fn sharded_range_fragment_multi_stripe() { + const SHARD_COUNT: u8 = 4; + const STRIPE_SIZE: u32 = DEFAULT_STRIPE_SIZE.0; + const RANGE_SIZE: u32 = SHARD_COUNT as u32 * 
STRIPE_SIZE; + let shard_identity = ShardIdentity::new( ShardNumber(0), - ShardCount::new(4), - ShardParameters::DEFAULT_STRIPE_SIZE, + ShardCount::new(SHARD_COUNT), + ShardStripeSize(STRIPE_SIZE), ) .unwrap(); // A range which covers multiple stripes, exactly one of which belongs to the current shard. let input_start = Key::from_hex("000000067f00000001000000ae0000000000").unwrap(); - let input_end = Key::from_hex("000000067f00000001000000ae0000020000").unwrap(); + let mut input_end = input_start; + input_end.field6 += RANGE_SIZE; // field6 is block number + // Ask for all the blocks, get a fragment that covers the whole range but reports // its size to be just the blocks belonging to our shard. assert_eq!( - do_fragment(input_start, input_end, &shard_identity, 131072), - (32768, vec![(32768, input_start..input_end)]) + do_fragment(input_start, input_end, &shard_identity, RANGE_SIZE), + (STRIPE_SIZE, vec![(STRIPE_SIZE, input_start..input_end)]) ); - // Ask for a sub-stripe quantity + // Ask for a sub-stripe quantity that results in 3 fragments. 
+ let limit = STRIPE_SIZE / 3 + 1; assert_eq!( - do_fragment(input_start, input_end, &shard_identity, 16000), + do_fragment(input_start, input_end, &shard_identity, limit), ( - 32768, + STRIPE_SIZE, vec![ - (16000, input_start..input_start.add(16000)), - (16000, input_start.add(16000)..input_start.add(32000)), - (768, input_start.add(32000)..input_end), + (limit, input_start..input_start.add(limit)), + (limit, input_start.add(limit)..input_start.add(2 * limit)), + ( + STRIPE_SIZE - 2 * limit, + input_start.add(2 * limit)..input_end + ), ] ) ); // Try on a range that starts slightly after our owned stripe assert_eq!( - do_fragment(input_start.add(1), input_end, &shard_identity, 131072), - (32767, vec![(32767, input_start.add(1)..input_end)]) + do_fragment(input_start.add(1), input_end, &shard_identity, RANGE_SIZE), + ( + STRIPE_SIZE - 1, + vec![(STRIPE_SIZE - 1, input_start.add(1)..input_end)] + ) ); } @@ -1223,32 +1230,40 @@ mod tests { /// a previous relation. #[test] fn sharded_range_fragment_starting_from_logical_size() { + const SHARD_COUNT: u8 = 4; + const STRIPE_SIZE: u32 = DEFAULT_STRIPE_SIZE.0; + const RANGE_SIZE: u32 = SHARD_COUNT as u32 * STRIPE_SIZE; + let input_start = Key::from_hex("000000067f00000001000000ae00ffffffff").unwrap(); - let input_end = Key::from_hex("000000067f00000001000000ae0100008000").unwrap(); + let mut input_end = Key::from_hex("000000067f00000001000000ae0100000000").unwrap(); + input_end.field6 += RANGE_SIZE; // field6 is block number // Shard 0 owns the first stripe in the relation, and the preceding logical size is shard local too let shard_identity = ShardIdentity::new( ShardNumber(0), - ShardCount::new(4), - ShardParameters::DEFAULT_STRIPE_SIZE, + ShardCount::new(SHARD_COUNT), + ShardStripeSize(STRIPE_SIZE), ) .unwrap(); assert_eq!( - do_fragment(input_start, input_end, &shard_identity, 0x10000), - (0x8001, vec![(0x8001, input_start..input_end)]) + do_fragment(input_start, input_end, &shard_identity, 2 * STRIPE_SIZE), + ( + 
STRIPE_SIZE + 1, + vec![(STRIPE_SIZE + 1, input_start..input_end)] + ) ); // Shard 1 does not own the first stripe in the relation, but it does own the logical size (all shards // store all logical sizes) let shard_identity = ShardIdentity::new( ShardNumber(1), - ShardCount::new(4), - ShardParameters::DEFAULT_STRIPE_SIZE, + ShardCount::new(SHARD_COUNT), + ShardStripeSize(STRIPE_SIZE), ) .unwrap(); assert_eq!( - do_fragment(input_start, input_end, &shard_identity, 0x10000), - (0x1, vec![(0x1, input_start..input_end)]) + do_fragment(input_start, input_end, &shard_identity, 2 * STRIPE_SIZE), + (1, vec![(1, input_start..input_end)]) ); } @@ -1284,12 +1299,8 @@ mod tests { ); // Same, but using a sharded identity - let shard_identity = ShardIdentity::new( - ShardNumber(0), - ShardCount::new(4), - ShardParameters::DEFAULT_STRIPE_SIZE, - ) - .unwrap(); + let shard_identity = + ShardIdentity::new(ShardNumber(0), ShardCount::new(4), DEFAULT_STRIPE_SIZE).unwrap(); assert_eq!( do_fragment(input_start, input_end, &shard_identity, 0x8000), (u32::MAX, vec![(u32::MAX, input_start..input_end),]) @@ -1331,7 +1342,7 @@ mod tests { ShardIdentity::new( ShardNumber((prng.next_u32() % shard_count) as u8), ShardCount::new(shard_count as u8), - ShardParameters::DEFAULT_STRIPE_SIZE, + DEFAULT_STRIPE_SIZE, ) .unwrap() }; diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs index 2ffff676882f..f491ed10e1a6 100644 --- a/libs/pageserver_api/src/models.rs +++ b/libs/pageserver_api/src/models.rs @@ -26,7 +26,7 @@ use utils::{completion, serde_system_time}; use crate::config::Ratio; use crate::key::{CompactKey, Key}; use crate::reltag::RelTag; -use crate::shard::{ShardCount, ShardStripeSize, TenantShardId}; +use crate::shard::{DEFAULT_STRIPE_SIZE, ShardCount, ShardStripeSize, TenantShardId}; /// The state of a tenant in this pageserver. 
/// @@ -438,8 +438,6 @@ pub struct ShardParameters { } impl ShardParameters { - pub const DEFAULT_STRIPE_SIZE: ShardStripeSize = ShardStripeSize(256 * 1024 / 8); - pub fn is_unsharded(&self) -> bool { self.count.is_unsharded() } @@ -449,7 +447,7 @@ impl Default for ShardParameters { fn default() -> Self { Self { count: ShardCount::new(0), - stripe_size: Self::DEFAULT_STRIPE_SIZE, + stripe_size: DEFAULT_STRIPE_SIZE, } } } @@ -578,6 +576,8 @@ pub struct TenantConfigPatch { #[serde(skip_serializing_if = "FieldPatch::is_noop")] pub gc_compaction_enabled: FieldPatch, #[serde(skip_serializing_if = "FieldPatch::is_noop")] + pub gc_compaction_verification: FieldPatch, + #[serde(skip_serializing_if = "FieldPatch::is_noop")] pub gc_compaction_initial_threshold_kb: FieldPatch, #[serde(skip_serializing_if = "FieldPatch::is_noop")] pub gc_compaction_ratio_percent: FieldPatch, @@ -698,6 +698,9 @@ pub struct TenantConfig { #[serde(skip_serializing_if = "Option::is_none")] pub gc_compaction_enabled: Option, + #[serde(skip_serializing_if = "Option::is_none")] + pub gc_compaction_verification: Option, + #[serde(skip_serializing_if = "Option::is_none")] pub gc_compaction_initial_threshold_kb: Option, @@ -746,6 +749,7 @@ impl TenantConfig { mut wal_receiver_protocol_override, mut rel_size_v2_enabled, mut gc_compaction_enabled, + mut gc_compaction_verification, mut gc_compaction_initial_threshold_kb, mut gc_compaction_ratio_percent, mut sampling_ratio, @@ -837,6 +841,9 @@ impl TenantConfig { patch .gc_compaction_enabled .apply(&mut gc_compaction_enabled); + patch + .gc_compaction_verification + .apply(&mut gc_compaction_verification); patch .gc_compaction_initial_threshold_kb .apply(&mut gc_compaction_initial_threshold_kb); @@ -878,6 +885,7 @@ impl TenantConfig { wal_receiver_protocol_override, rel_size_v2_enabled, gc_compaction_enabled, + gc_compaction_verification, gc_compaction_initial_threshold_kb, gc_compaction_ratio_percent, sampling_ratio, @@ -976,6 +984,9 @@ impl TenantConfig { 
gc_compaction_enabled: self .gc_compaction_enabled .unwrap_or(global_conf.gc_compaction_enabled), + gc_compaction_verification: self + .gc_compaction_verification + .unwrap_or(global_conf.gc_compaction_verification), gc_compaction_initial_threshold_kb: self .gc_compaction_initial_threshold_kb .unwrap_or(global_conf.gc_compaction_initial_threshold_kb), @@ -1680,6 +1691,7 @@ pub struct SecondaryProgress { pub struct TenantScanRemoteStorageShard { pub tenant_shard_id: TenantShardId, pub generation: Option, + pub stripe_size: Option, } #[derive(Serialize, Deserialize, Debug, Default)] diff --git a/libs/pageserver_api/src/record.rs b/libs/pageserver_api/src/record.rs index fda504a26ef3..73516c52203e 100644 --- a/libs/pageserver_api/src/record.rs +++ b/libs/pageserver_api/src/record.rs @@ -58,6 +58,8 @@ pub enum NeonWalRecord { /// to true. This record does not need the history WALs to reconstruct. See [`NeonWalRecord::will_init`] and /// its references in `timeline.rs`. will_init: bool, + /// Only append the record if the current image is the same as the one specified in this field. 
+ only_if: Option, }, } @@ -81,6 +83,17 @@ impl NeonWalRecord { append: s.as_ref().to_string(), clear: false, will_init: false, + only_if: None, + } + } + + #[cfg(feature = "testing")] + pub fn wal_append_conditional(s: impl AsRef, only_if: impl AsRef) -> Self { + Self::Test { + append: s.as_ref().to_string(), + clear: false, + will_init: false, + only_if: Some(only_if.as_ref().to_string()), } } @@ -90,6 +103,7 @@ impl NeonWalRecord { append: s.as_ref().to_string(), clear: true, will_init: false, + only_if: None, } } @@ -99,6 +113,7 @@ impl NeonWalRecord { append: s.as_ref().to_string(), clear: true, will_init: true, + only_if: None, } } } diff --git a/libs/pageserver_api/src/shard.rs b/libs/pageserver_api/src/shard.rs index 8386d6e586f6..feb59f5070cb 100644 --- a/libs/pageserver_api/src/shard.rs +++ b/libs/pageserver_api/src/shard.rs @@ -78,6 +78,12 @@ impl Default for ShardStripeSize { } } +impl std::fmt::Display for ShardStripeSize { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + self.0.fmt(f) + } +} + /// Layout version: for future upgrades where we might change how the key->shard mapping works #[derive(Clone, Copy, Serialize, Deserialize, Eq, PartialEq, Hash, Debug)] pub struct ShardLayout(u8); @@ -86,8 +92,11 @@ const LAYOUT_V1: ShardLayout = ShardLayout(1); /// ShardIdentity uses a magic layout value to indicate if it is unusable const LAYOUT_BROKEN: ShardLayout = ShardLayout(255); -/// Default stripe size in pages: 256MiB divided by 8kiB page size. -const DEFAULT_STRIPE_SIZE: ShardStripeSize = ShardStripeSize(256 * 1024 / 8); +/// The default stripe size in pages. 16 MiB divided by 8 kiB page size. +/// +/// A lower stripe size distributes ingest load better across shards, but reduces IO amortization. +/// 16 MiB appears to be a reasonable balance: . 
+pub const DEFAULT_STRIPE_SIZE: ShardStripeSize = ShardStripeSize(16 * 1024 / 8); #[derive(thiserror::Error, Debug, PartialEq, Eq)] pub enum ShardConfigError { @@ -537,7 +546,7 @@ mod tests { field6: 0x7d06, }; - let shard = key_to_shard_number(ShardCount(10), DEFAULT_STRIPE_SIZE, &key); + let shard = key_to_shard_number(ShardCount(10), ShardStripeSize(32768), &key); assert_eq!(shard, ShardNumber(8)); } diff --git a/libs/postgres_backend/src/lib.rs b/libs/postgres_backend/src/lib.rs index a0a891f0dc1a..654dde8da642 100644 --- a/libs/postgres_backend/src/lib.rs +++ b/libs/postgres_backend/src/lib.rs @@ -5,7 +5,6 @@ #![deny(unsafe_code)] #![deny(clippy::undocumented_unsafe_blocks)] use std::future::Future; -use std::io::ErrorKind; use std::net::SocketAddr; use std::os::fd::{AsRawFd, RawFd}; use std::pin::Pin; @@ -227,7 +226,7 @@ impl MaybeWriteOnly { match self { MaybeWriteOnly::Full(framed) => framed.read_startup_message().await, MaybeWriteOnly::WriteOnly(_) => { - Err(io::Error::new(ErrorKind::Other, "reading from write only half").into()) + Err(io::Error::other("reading from write only half").into()) } MaybeWriteOnly::Broken => panic!("IO on invalid MaybeWriteOnly"), } @@ -237,7 +236,7 @@ impl MaybeWriteOnly { match self { MaybeWriteOnly::Full(framed) => framed.read_message().await, MaybeWriteOnly::WriteOnly(_) => { - Err(io::Error::new(ErrorKind::Other, "reading from write only half").into()) + Err(io::Error::other("reading from write only half").into()) } MaybeWriteOnly::Broken => panic!("IO on invalid MaybeWriteOnly"), } @@ -975,7 +974,7 @@ impl AsyncWrite for CopyDataWriter<'_, IO> { .write_message_noflush(&BeMessage::CopyData(buf)) // write_message only writes to the buffer, so it can fail iff the // message is invaid, but CopyData can't be invalid. 
- .map_err(|_| io::Error::new(ErrorKind::Other, "failed to serialize CopyData"))?; + .map_err(|_| io::Error::other("failed to serialize CopyData"))?; Poll::Ready(Ok(buf.len())) } diff --git a/libs/postgres_backend/tests/simple_select.rs b/libs/postgres_backend/tests/simple_select.rs index 907ef9eed3b4..75ca12301463 100644 --- a/libs/postgres_backend/tests/simple_select.rs +++ b/libs/postgres_backend/tests/simple_select.rs @@ -85,8 +85,8 @@ static KEY: Lazy> = Lazy::new(|| { static CERT: Lazy> = Lazy::new(|| { let mut cursor = Cursor::new(include_bytes!("cert.pem")); - let cert = rustls_pemfile::certs(&mut cursor).next().unwrap().unwrap(); - cert + + rustls_pemfile::certs(&mut cursor).next().unwrap().unwrap() }); // test that basic select with ssl works diff --git a/libs/pq_proto/src/framed.rs b/libs/pq_proto/src/framed.rs index 8e216d0f44ad..4e5e48ecf585 100644 --- a/libs/pq_proto/src/framed.rs +++ b/libs/pq_proto/src/framed.rs @@ -35,7 +35,7 @@ impl ConnectionError { pub fn into_io_error(self) -> io::Error { match self { ConnectionError::Io(io) => io, - ConnectionError::Protocol(pe) => io::Error::new(io::ErrorKind::Other, pe.to_string()), + ConnectionError::Protocol(pe) => io::Error::other(pe.to_string()), } } } diff --git a/libs/pq_proto/src/lib.rs b/libs/pq_proto/src/lib.rs index e435ffbf7e05..e7afc6456401 100644 --- a/libs/pq_proto/src/lib.rs +++ b/libs/pq_proto/src/lib.rs @@ -257,7 +257,7 @@ pub enum ProtocolError { impl ProtocolError { /// Proxy stream.rs uses only io::Error; provide it. 
pub fn into_io_error(self) -> io::Error { - io::Error::new(io::ErrorKind::Other, self.to_string()) + io::Error::other(self.to_string()) } } diff --git a/libs/proxy/postgres-protocol2/src/authentication/sasl.rs b/libs/proxy/postgres-protocol2/src/authentication/sasl.rs index 27e05e24ec4a..2daf9a80d453 100644 --- a/libs/proxy/postgres-protocol2/src/authentication/sasl.rs +++ b/libs/proxy/postgres-protocol2/src/authentication/sasl.rs @@ -212,7 +212,7 @@ impl ScramSha256 { password, channel_binding, } => (nonce, password, channel_binding), - _ => return Err(io::Error::new(io::ErrorKind::Other, "invalid SCRAM state")), + _ => return Err(io::Error::other("invalid SCRAM state")), }; let message = @@ -291,7 +291,7 @@ impl ScramSha256 { server_key, auth_message, } => (server_key, auth_message), - _ => return Err(io::Error::new(io::ErrorKind::Other, "invalid SCRAM state")), + _ => return Err(io::Error::other("invalid SCRAM state")), }; let message = @@ -301,10 +301,7 @@ impl ScramSha256 { let verifier = match parsed { ServerFinalMessage::Error(e) => { - return Err(io::Error::new( - io::ErrorKind::Other, - format!("SCRAM error: {}", e), - )); + return Err(io::Error::other(format!("SCRAM error: {}", e))); } ServerFinalMessage::Verifier(verifier) => verifier, }; diff --git a/libs/remote_storage/Cargo.toml b/libs/remote_storage/Cargo.toml index 7bdf340f74b7..bd18d80915a7 100644 --- a/libs/remote_storage/Cargo.toml +++ b/libs/remote_storage/Cargo.toml @@ -28,7 +28,7 @@ toml_edit.workspace = true tracing.workspace = true scopeguard.workspace = true metrics.workspace = true -utils.workspace = true +utils = { path = "../utils", default-features = false } pin-project-lite.workspace = true azure_core.workspace = true diff --git a/libs/remote_storage/src/azure_blob.rs b/libs/remote_storage/src/azure_blob.rs index dee61a410d7d..18146c5464d5 100644 --- a/libs/remote_storage/src/azure_blob.rs +++ b/libs/remote_storage/src/azure_blob.rs @@ -801,8 +801,7 @@ where // that support needs to be 
hacked in. // // including {self:?} into the message would be useful, but unsure how to unproject. - _ => std::task::Poll::Ready(Err(std::io::Error::new( - std::io::ErrorKind::Other, + _ => std::task::Poll::Ready(Err(std::io::Error::other( "cloned or initial values cannot be read", ))), } @@ -855,7 +854,7 @@ where }; Err(azure_core::error::Error::new( azure_core::error::ErrorKind::Io, - std::io::Error::new(std::io::ErrorKind::Other, msg), + std::io::Error::other(msg), )) } diff --git a/libs/utils/Cargo.toml b/libs/utils/Cargo.toml index 4180602ac78f..fd2fa63fd09c 100644 --- a/libs/utils/Cargo.toml +++ b/libs/utils/Cargo.toml @@ -5,7 +5,8 @@ edition.workspace = true license.workspace = true [features] -default = [] +default = ["rename_noreplace"] +rename_noreplace = [] # Enables test-only APIs, incuding failpoints. In particular, enables the `fail_point!` macro, # which adds some runtime cost to run tests on outage conditions testing = ["fail/failpoints"] @@ -35,7 +36,7 @@ serde_with.workspace = true serde_json.workspace = true signal-hook.workspace = true thiserror.workspace = true -tokio.workspace = true +tokio = { workspace = true, features = ["signal"] } tokio-tar.workspace = true tokio-util.workspace = true toml_edit = { workspace = true, features = ["serde"] } diff --git a/libs/utils/src/auth.rs b/libs/utils/src/auth.rs index cc5b0b1d1393..db4fc5685c10 100644 --- a/libs/utils/src/auth.rs +++ b/libs/utils/src/auth.rs @@ -173,7 +173,7 @@ impl std::fmt::Debug for JwtAuth { } // this function is used only for testing purposes in CLI e g generate tokens during init -pub fn encode_from_key_file(claims: &Claims, key_data: &[u8]) -> Result { +pub fn encode_from_key_file(claims: &S, key_data: &[u8]) -> Result { let key = EncodingKey::from_ed_pem(key_data)?; Ok(encode(&Header::new(STORAGE_TOKEN_ALGORITHM), claims, &key)?) 
} diff --git a/libs/utils/src/crashsafe.rs b/libs/utils/src/crashsafe.rs index 290a5b26863a..215fa36df49b 100644 --- a/libs/utils/src/crashsafe.rs +++ b/libs/utils/src/crashsafe.rs @@ -81,12 +81,9 @@ pub fn path_with_suffix_extension( } pub fn fsync_file_and_parent(file_path: &Utf8Path) -> io::Result<()> { - let parent = file_path.parent().ok_or_else(|| { - io::Error::new( - io::ErrorKind::Other, - format!("File {file_path:?} has no parent"), - ) - })?; + let parent = file_path + .parent() + .ok_or_else(|| io::Error::other(format!("File {file_path:?} has no parent")))?; fsync(file_path)?; fsync(parent)?; diff --git a/libs/utils/src/fs_ext.rs b/libs/utils/src/fs_ext.rs index a406ab0378e7..e16edaaa9a96 100644 --- a/libs/utils/src/fs_ext.rs +++ b/libs/utils/src/fs_ext.rs @@ -3,7 +3,9 @@ use std::{fs, io, path::Path}; use anyhow::Context; +#[cfg(feature = "rename_noreplace")] mod rename_noreplace; +#[cfg(feature = "rename_noreplace")] pub use rename_noreplace::rename_noreplace; pub trait PathExt { diff --git a/libs/utils/src/fs_ext/rename_noreplace.rs b/libs/utils/src/fs_ext/rename_noreplace.rs index fc6f794b57f8..d0c07353d022 100644 --- a/libs/utils/src/fs_ext/rename_noreplace.rs +++ b/libs/utils/src/fs_ext/rename_noreplace.rs @@ -8,7 +8,7 @@ pub fn rename_noreplace( dst: &P2, ) -> nix::Result<()> { { - #[cfg(target_os = "linux")] + #[cfg(all(target_os = "linux", target_env = "gnu"))] { nix::fcntl::renameat2( None, @@ -29,7 +29,7 @@ pub fn rename_noreplace( })??; nix::errno::Errno::result(res).map(drop) } - #[cfg(not(any(target_os = "linux", target_os = "macos")))] + #[cfg(not(any(all(target_os = "linux", target_env = "gnu"), target_os = "macos")))] { std::compile_error!("OS does not support no-replace renames"); } diff --git a/libs/utils/src/signals.rs b/libs/utils/src/signals.rs index f2be1957c42a..426bb659167b 100644 --- a/libs/utils/src/signals.rs +++ b/libs/utils/src/signals.rs @@ -1,6 +1,8 @@ pub use signal_hook::consts::TERM_SIGNALS; pub use 
signal_hook::consts::signal::*; use signal_hook::iterator::Signals; +use tokio::signal::unix::{SignalKind, signal}; +use tracing::info; pub enum Signal { Quit, @@ -36,3 +38,30 @@ impl ShutdownSignals { Ok(()) } } + +/// Runs in a loop since we want to be responsive to multiple signals +/// even after triggering shutdown (e.g. a SIGQUIT after a slow SIGTERM shutdown) +/// +pub async fn signal_handler(token: tokio_util::sync::CancellationToken) { + let mut sigint = signal(SignalKind::interrupt()).unwrap(); + let mut sigterm = signal(SignalKind::terminate()).unwrap(); + let mut sigquit = signal(SignalKind::quit()).unwrap(); + + loop { + let signal = tokio::select! { + _ = sigquit.recv() => { + info!("Got signal SIGQUIT. Terminating in immediate shutdown mode."); + std::process::exit(111); + } + _ = sigint.recv() => "SIGINT", + _ = sigterm.recv() => "SIGTERM", + }; + + if !token.is_cancelled() { + info!("Got signal {signal}. Terminating gracefully in fast shutdown mode."); + token.cancel(); + } else { + info!("Got signal {signal}. 
Already shutting down."); + } + } +} diff --git a/object_storage/Cargo.toml b/object_storage/Cargo.toml new file mode 100644 index 000000000000..17fbaefe6f37 --- /dev/null +++ b/object_storage/Cargo.toml @@ -0,0 +1,28 @@ +[package] +name = "object_storage" +version = "0.0.1" +edition.workspace = true +license.workspace = true +[dependencies] +anyhow.workspace = true +axum-extra.workspace = true +axum.workspace = true +camino.workspace = true +futures.workspace = true +jsonwebtoken.workspace = true +prometheus.workspace = true +remote_storage.workspace = true +serde.workspace = true +serde_json.workspace = true +tokio-util.workspace = true +tokio.workspace = true +tracing.workspace = true +utils = { path = "../libs/utils", default-features = false } +workspace_hack.workspace = true +[dev-dependencies] +camino-tempfile.workspace = true +http-body-util.workspace = true +itertools.workspace = true +rand.workspace = true +test-log.workspace = true +tower.workspace = true diff --git a/object_storage/src/app.rs b/object_storage/src/app.rs new file mode 100644 index 000000000000..7b5627f0db95 --- /dev/null +++ b/object_storage/src/app.rs @@ -0,0 +1,561 @@ +use anyhow::anyhow; +use axum::body::{Body, Bytes}; +use axum::response::{IntoResponse, Response}; +use axum::{Router, http::StatusCode}; +use object_storage::{PrefixS3Path, S3Path, Storage, bad_request, internal_error, not_found, ok}; +use remote_storage::TimeoutOrCancel; +use remote_storage::{DownloadError, DownloadOpts, GenericRemoteStorage, RemotePath}; +use std::{sync::Arc, time::SystemTime, time::UNIX_EPOCH}; +use tokio_util::sync::CancellationToken; +use tracing::{error, info}; +use utils::backoff::retry; + +pub fn app(state: Arc) -> Router<()> { + use axum::routing::{delete as _delete, get as _get}; + let delete_prefix = _delete(delete_prefix); + Router::new() + .route( + "/{tenant_id}/{timeline_id}/{endpoint_id}/{*path}", + _get(get).put(set).delete(delete), + ) + .route( + 
"/{tenant_id}/{timeline_id}/{endpoint_id}", + delete_prefix.clone(), + ) + .route("/{tenant_id}/{timeline_id}", delete_prefix.clone()) + .route("/{tenant_id}", delete_prefix) + .route("/metrics", _get(metrics)) + .route("/status", _get(async || StatusCode::OK.into_response())) + .with_state(state) +} + +type Result = anyhow::Result; +type State = axum::extract::State>; + +const CONTENT_TYPE: &str = "content-type"; +const APPLICATION_OCTET_STREAM: &str = "application/octet-stream"; +const WARN_THRESHOLD: u32 = 3; +const MAX_RETRIES: u32 = 10; + +async fn metrics() -> Result { + prometheus::TextEncoder::new() + .encode_to_string(&prometheus::gather()) + .map(|s| s.into_response()) + .map_err(|e| internal_error(e, "/metrics", "collecting metrics")) +} + +async fn get(S3Path { path }: S3Path, state: State) -> Result { + info!(%path, "downloading"); + let download_err = |e| { + if let DownloadError::NotFound = e { + info!(%path, %e, "downloading"); // 404 is not an issue of _this_ service + return not_found(&path); + } + internal_error(e, &path, "downloading") + }; + let cancel = state.cancel.clone(); + let opts = &DownloadOpts::default(); + + let stream = retry( + async || state.storage.download(&path, opts, &cancel).await, + DownloadError::is_permanent, + WARN_THRESHOLD, + MAX_RETRIES, + "downloading", + &cancel, + ) + .await + .unwrap_or(Err(DownloadError::Cancelled)) + .map_err(download_err)? + .download_stream; + + Response::builder() + .status(StatusCode::OK) + .header(CONTENT_TYPE, APPLICATION_OCTET_STREAM) + .body(Body::from_stream(stream)) + .map_err(|e| internal_error(e, path, "reading response")) +} + +// Best solution for files is multipart upload, but remote_storage doesn't support it, +// so we can either read Bytes in memory and push at once or forward BodyDataStream to +// remote_storage. The latter may seem more peformant, but BodyDataStream doesn't have a +// guaranteed size() which may produce issues while uploading to s3. 
+// So, currently we're going with an in-memory copy plus a boundary to prevent uploading +// very large files. +async fn set(S3Path { path }: S3Path, state: State, bytes: Bytes) -> Result { + info!(%path, "uploading"); + let request_len = bytes.len(); + let max_len = state.max_upload_file_limit; + if request_len > max_len { + return Err(bad_request( + anyhow!("File size {request_len} exceeds max {max_len}"), + "uploading", + )); + } + + let cancel = state.cancel.clone(); + let fun = async || { + let stream = bytes_to_stream(bytes.clone()); + state + .storage + .upload(stream, request_len, &path, None, &cancel) + .await + }; + retry( + fun, + TimeoutOrCancel::caused_by_cancel, + WARN_THRESHOLD, + MAX_RETRIES, + "uploading", + &cancel, + ) + .await + .unwrap_or(Err(anyhow!("uploading cancelled"))) + .map_err(|e| internal_error(e, path, "reading response"))?; + Ok(ok()) +} + +async fn delete(S3Path { path }: S3Path, state: State) -> Result { + info!(%path, "deleting"); + let cancel = state.cancel.clone(); + retry( + async || state.storage.delete(&path, &cancel).await, + TimeoutOrCancel::caused_by_cancel, + WARN_THRESHOLD, + MAX_RETRIES, + "deleting", + &cancel, + ) + .await + .unwrap_or(Err(anyhow!("deleting cancelled"))) + .map_err(|e| internal_error(e, path, "deleting"))?; + Ok(ok()) +} + +async fn delete_prefix(PrefixS3Path { path }: PrefixS3Path, state: State) -> Result { + info!(%path, "deleting prefix"); + let cancel = state.cancel.clone(); + retry( + async || state.storage.delete_prefix(&path, &cancel).await, + TimeoutOrCancel::caused_by_cancel, + WARN_THRESHOLD, + MAX_RETRIES, + "deleting prefix", + &cancel, + ) + .await + .unwrap_or(Err(anyhow!("deleting prefix cancelled"))) + .map_err(|e| internal_error(e, path, "deleting prefix"))?; + Ok(ok()) +} + +pub async fn check_storage_permissions( + client: &GenericRemoteStorage, + cancel: CancellationToken, +) -> anyhow::Result<()> { + info!("storage permissions check"); + + // as_nanos() as multiple instances 
proxying same bucket may be started at once + let now = SystemTime::now() + .duration_since(UNIX_EPOCH)? + .as_nanos() + .to_string(); + + let path = RemotePath::from_string(&format!("write_access_{now}"))?; + info!(%path, "uploading"); + + let body = now.to_string(); + let stream = bytes_to_stream(Bytes::from(body.clone())); + client + .upload(stream, body.len(), &path, None, &cancel) + .await?; + + use tokio::io::AsyncReadExt; + info!(%path, "downloading"); + let download_opts = DownloadOpts { + kind: remote_storage::DownloadKind::Small, + ..Default::default() + }; + let mut body_read_buf = Vec::new(); + let stream = client + .download(&path, &download_opts, &cancel) + .await? + .download_stream; + tokio_util::io::StreamReader::new(stream) + .read_to_end(&mut body_read_buf) + .await?; + let body_read = String::from_utf8(body_read_buf)?; + if body != body_read { + error!(%body, %body_read, "File contents do not match"); + anyhow::bail!("Read back file doesn't match original") + } + + info!(%path, "removing"); + client.delete(&path, &cancel).await +} + +fn bytes_to_stream(bytes: Bytes) -> impl futures::Stream> { + futures::stream::once(futures::future::ready(Ok(bytes))) +} + +#[cfg(test)] +mod tests { + use super::*; + use axum::{body::Body, extract::Request, response::Response}; + use http_body_util::BodyExt; + use itertools::iproduct; + use std::env::var; + use std::sync::Arc; + use std::time::Duration; + use test_log::test as testlog; + use tower::{Service, util::ServiceExt}; + use utils::id::{TenantId, TimelineId}; + + // see libs/remote_storage/tests/test_real_s3.rs + const REAL_S3_ENV: &str = "ENABLE_REAL_S3_REMOTE_STORAGE"; + const REAL_S3_BUCKET: &str = "REMOTE_STORAGE_S3_BUCKET"; + const REAL_S3_REGION: &str = "REMOTE_STORAGE_S3_REGION"; + + async fn proxy() -> (Storage, Option) { + let cancel = CancellationToken::new(); + let (dir, storage) = if var(REAL_S3_ENV).is_err() { + // tests execute in parallel and we need a new directory for each of them + let 
dir = camino_tempfile::tempdir().unwrap(); + let fs = + remote_storage::LocalFs::new(dir.path().into(), Duration::from_secs(5)).unwrap(); + (Some(dir), GenericRemoteStorage::LocalFs(fs)) + } else { + // test_real_s3::create_s3_client is hard to reference, reimplementing here + let millis = SystemTime::now() + .duration_since(UNIX_EPOCH) + .unwrap() + .as_millis(); + use rand::Rng; + let random = rand::thread_rng().r#gen::(); + + let s3_config = remote_storage::S3Config { + bucket_name: var(REAL_S3_BUCKET).unwrap(), + bucket_region: var(REAL_S3_REGION).unwrap(), + prefix_in_bucket: Some(format!("test_{millis}_{random:08x}/")), + endpoint: None, + concurrency_limit: std::num::NonZeroUsize::new(100).unwrap(), + max_keys_per_list_response: None, + upload_storage_class: None, + }; + let bucket = remote_storage::S3Bucket::new(&s3_config, Duration::from_secs(1)) + .await + .unwrap(); + (None, GenericRemoteStorage::AwsS3(Arc::new(bucket))) + }; + + let proxy = Storage { + auth: object_storage::JwtAuth::new(TEST_PUB_KEY_ED25519).unwrap(), + storage, + cancel: cancel.clone(), + max_upload_file_limit: usize::MAX, + }; + check_storage_permissions(&proxy.storage, cancel) + .await + .unwrap(); + (proxy, dir) + } + + // see libs/utils/src/auth.rs + const TEST_PUB_KEY_ED25519: &[u8] = b" +-----BEGIN PUBLIC KEY----- +MCowBQYDK2VwAyEARYwaNBayR+eGI0iXB4s3QxE3Nl2g1iWbr6KtLWeVD/w= +-----END PUBLIC KEY----- +"; + + const TEST_PRIV_KEY_ED25519: &[u8] = br#" +-----BEGIN PRIVATE KEY----- +MC4CAQAwBQYDK2VwBCIEID/Drmc1AA6U/znNRWpF3zEGegOATQxfkdWxitcOMsIH +-----END PRIVATE KEY----- +"#; + + async fn request(req: Request) -> Response { + let (proxy, _) = proxy().await; + app(Arc::new(proxy)) + .into_service() + .oneshot(req) + .await + .unwrap() + } + + #[testlog(tokio::test)] + async fn status() { + let res = Request::builder() + .uri("/status") + .body(Body::empty()) + .map(request) + .unwrap() + .await; + assert_eq!(res.status(), StatusCode::OK); + } + + fn routes() -> impl Iterator { + 
iproduct!( + vec!["/1", "/1/2", "/1/2/3", "/1/2/3/4"], + vec!["GET", "PUT", "DELETE"] + ) + } + + #[testlog(tokio::test)] + async fn no_token() { + for (uri, method) in routes() { + info!(%uri, %method); + let res = Request::builder() + .uri(uri) + .method(method) + .body(Body::empty()) + .map(request) + .unwrap() + .await; + assert!(matches!( + res.status(), + StatusCode::METHOD_NOT_ALLOWED | StatusCode::BAD_REQUEST + )); + } + } + + #[testlog(tokio::test)] + async fn invalid_token() { + for (uri, method) in routes() { + info!(%uri, %method); + let status = Request::builder() + .uri(uri) + .header("Authorization", "Bearer 123") + .method(method) + .body(Body::empty()) + .map(request) + .unwrap() + .await; + assert!(matches!( + status.status(), + StatusCode::METHOD_NOT_ALLOWED | StatusCode::BAD_REQUEST + )); + } + } + + const TENANT_ID: TenantId = + TenantId::from_array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 1, 2, 3, 4, 5, 6]); + const TIMELINE_ID: TimelineId = + TimelineId::from_array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 1, 2, 3, 4, 5, 7]); + const ENDPOINT_ID: &str = "ep-winter-frost-a662z3vg"; + fn token() -> String { + let claims = object_storage::Claims { + tenant_id: TENANT_ID, + timeline_id: TIMELINE_ID, + endpoint_id: ENDPOINT_ID.into(), + exp: u64::MAX, + }; + let key = jsonwebtoken::EncodingKey::from_ed_pem(TEST_PRIV_KEY_ED25519).unwrap(); + let header = jsonwebtoken::Header::new(object_storage::VALIDATION_ALGO); + jsonwebtoken::encode(&header, &claims, &key).unwrap() + } + + #[testlog(tokio::test)] + async fn unauthorized() { + let (proxy, _) = proxy().await; + let mut app = app(Arc::new(proxy)).into_service(); + let token = token(); + let args = itertools::iproduct!( + vec![TENANT_ID.to_string(), TenantId::generate().to_string()], + vec![TIMELINE_ID.to_string(), TimelineId::generate().to_string()], + vec![ENDPOINT_ID, "ep-ololo"] + ) + .skip(1); + + for ((uri, method), (tenant, timeline, endpoint)) in iproduct!(routes(), args) { + info!(%uri, %method, %tenant, 
%timeline, %endpoint); + let request = Request::builder() + .uri(format!("/{tenant}/{timeline}/{endpoint}/sub/path/key")) + .method(method) + .header("Authorization", format!("Bearer {}", token)) + .body(Body::empty()) + .unwrap(); + let status = ServiceExt::ready(&mut app) + .await + .unwrap() + .call(request) + .await + .unwrap() + .status(); + assert_eq!(status, StatusCode::UNAUTHORIZED); + } + } + + #[testlog(tokio::test)] + async fn method_not_allowed() { + let token = token(); + let iter = iproduct!(vec!["", "/.."], vec!["GET", "PUT"]); + for (key, method) in iter { + let status = Request::builder() + .uri(format!("/{TENANT_ID}/{TIMELINE_ID}/{ENDPOINT_ID}{key}")) + .method(method) + .header("Authorization", format!("Bearer {token}")) + .body(Body::empty()) + .map(request) + .unwrap() + .await + .status(); + assert!(matches!( + status, + StatusCode::BAD_REQUEST | StatusCode::METHOD_NOT_ALLOWED + )); + } + } + + async fn requests_chain( + chain: impl Iterator, + token: impl Fn(&str) -> String, + ) { + let (proxy, _) = proxy().await; + let mut app = app(Arc::new(proxy)).into_service(); + for (uri, method, body, expected_status, compare_body) in chain { + info!(%uri, %method, %body, %expected_status); + let bearer = format!("Bearer {}", token(&uri)); + let request = Request::builder() + .uri(uri) + .method(method) + .header("Authorization", &bearer) + .body(Body::from(body)) + .unwrap(); + let response = ServiceExt::ready(&mut app) + .await + .unwrap() + .call(request) + .await + .unwrap(); + assert_eq!(response.status(), expected_status); + if !compare_body { + continue; + } + let read_body = response.into_body().collect().await.unwrap().to_bytes(); + assert_eq!(body, read_body); + } + } + + #[testlog(tokio::test)] + async fn metrics() { + let uri = format!("/{TENANT_ID}/{TIMELINE_ID}/{ENDPOINT_ID}/key"); + let req = vec![ + (uri.clone(), "PUT", "body", StatusCode::OK, false), + (uri.clone(), "DELETE", "", StatusCode::OK, false), + ]; + 
requests_chain(req.into_iter(), |_| token()).await; + + let res = Request::builder() + .uri("/metrics") + .body(Body::empty()) + .map(request) + .unwrap() + .await; + assert_eq!(res.status(), StatusCode::OK); + let body = res.into_body().collect().await.unwrap().to_bytes(); + let body = String::from_utf8_lossy(&body); + tracing::debug!(%body); + // Storage metrics are not gathered for LocalFs + if var(REAL_S3_ENV).is_ok() { + assert!(body.contains("remote_storage_s3_deleted_objects_total")); + } + assert!(body.contains("process_threads")); + } + + #[testlog(tokio::test)] + async fn insert_retrieve_remove() { + let uri = format!("/{TENANT_ID}/{TIMELINE_ID}/{ENDPOINT_ID}/key"); + let chain = vec![ + (uri.clone(), "GET", "", StatusCode::NOT_FOUND, false), + (uri.clone(), "PUT", "пыщьпыщь", StatusCode::OK, false), + (uri.clone(), "GET", "пыщьпыщь", StatusCode::OK, true), + (uri.clone(), "DELETE", "", StatusCode::OK, false), + (uri, "GET", "", StatusCode::NOT_FOUND, false), + ]; + requests_chain(chain.into_iter(), |_| token()).await; + } + + fn delete_prefix_token(uri: &str) -> String { + use serde::Serialize; + let parts = uri.split("/").collect::>(); + #[derive(Serialize)] + struct PrefixClaims { + tenant_id: TenantId, + timeline_id: Option, + endpoint_id: Option, + exp: u64, + } + let claims = PrefixClaims { + tenant_id: parts.get(1).map(|c| c.parse().unwrap()).unwrap(), + timeline_id: parts.get(2).map(|c| c.parse().unwrap()), + endpoint_id: parts.get(3).map(ToString::to_string), + exp: u64::MAX, + }; + let key = jsonwebtoken::EncodingKey::from_ed_pem(TEST_PRIV_KEY_ED25519).unwrap(); + let header = jsonwebtoken::Header::new(object_storage::VALIDATION_ALGO); + jsonwebtoken::encode(&header, &claims, &key).unwrap() + } + + // Can't use single digit numbers as they won't be validated as TimelineId and EndpointId + #[testlog(tokio::test)] + async fn delete_prefix() { + let tenant_id = + TenantId::from_array([1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]).to_string(); + 
let t2 = TimelineId::from_array([2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]); + let t3 = TimelineId::from_array([3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]); + let t4 = TimelineId::from_array([4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]); + let f = |timeline, path| format!("/{tenant_id}/{timeline}{path}"); + // Why extra slash in string literals? Axum is weird with URIs: + // /1/2 and 1/2/ match different routes, thus first yields OK and second NOT_FOUND + // as it matches /tenant/timeline/endpoint, see https://stackoverflow.com/a/75355932 + // The cost of removing trailing slash is surprisingly hard: + // * Add tower dependency with NormalizePath layer + // * wrap Router<()> in this layer https://github.com/tokio-rs/axum/discussions/2377 + // * Rewrite make_service() -> into_make_service() + // * Rewrite oneshot() (not available for NormalizePath) + // I didn't manage to get it working correctly + let chain = vec![ + // create 1/2/3/4, 1/2/3/5, delete prefix 1/2/3 -> empty + (f(t2, "/3/4"), "PUT", "", StatusCode::OK, false), + (f(t2, "/3/4"), "PUT", "", StatusCode::OK, false), // we can override file contents + (f(t2, "/3/5"), "PUT", "", StatusCode::OK, false), + (f(t2, "/3"), "DELETE", "", StatusCode::OK, false), + (f(t2, "/3/4"), "GET", "", StatusCode::NOT_FOUND, false), + (f(t2, "/3/5"), "GET", "", StatusCode::NOT_FOUND, false), + // create 1/2/3/4, 1/2/5/6, delete prefix 1/2/3 -> 1/2/5/6 + (f(t2, "/3/4"), "PUT", "", StatusCode::OK, false), + (f(t2, "/5/6"), "PUT", "", StatusCode::OK, false), + (f(t2, "/3"), "DELETE", "", StatusCode::OK, false), + (f(t2, "/3/4"), "GET", "", StatusCode::NOT_FOUND, false), + (f(t2, "/5/6"), "GET", "", StatusCode::OK, false), + // create 1/2/3/4, 1/2/7/8, delete prefix 1/2 -> empty + (f(t2, "/3/4"), "PUT", "", StatusCode::OK, false), + (f(t2, "/7/8"), "PUT", "", StatusCode::OK, false), + (f(t2, ""), "DELETE", "", StatusCode::OK, false), + (f(t2, "/3/4"), "GET", "", StatusCode::NOT_FOUND, false), + (f(t2, "/7/8"), 
"GET", "", StatusCode::NOT_FOUND, false), + // create 1/2/3/4, 1/2/5/6, 1/3/8/9, delete prefix 1/2/3 -> 1/2/5/6, 1/3/8/9 + (f(t2, "/3/4"), "PUT", "", StatusCode::OK, false), + (f(t2, "/5/6"), "PUT", "", StatusCode::OK, false), + (f(t3, "/8/9"), "PUT", "", StatusCode::OK, false), + (f(t2, "/3"), "DELETE", "", StatusCode::OK, false), + (f(t2, "/3/4"), "GET", "", StatusCode::NOT_FOUND, false), + (f(t2, "/5/6"), "GET", "", StatusCode::OK, false), + (f(t3, "/8/9"), "GET", "", StatusCode::OK, false), + // create 1/4/5/6, delete prefix 1/2 -> 1/3/8/9, 1/4/5/6 + (f(t4, "/5/6"), "PUT", "", StatusCode::OK, false), + (f(t2, ""), "DELETE", "", StatusCode::OK, false), + (f(t2, "/3/4"), "GET", "", StatusCode::NOT_FOUND, false), + (f(t2, "/5/6"), "GET", "", StatusCode::NOT_FOUND, false), + (f(t3, "/8/9"), "GET", "", StatusCode::OK, false), + (f(t4, "/5/6"), "GET", "", StatusCode::OK, false), + // delete prefix 1 -> empty + (format!("/{tenant_id}"), "DELETE", "", StatusCode::OK, false), + (f(t2, "/3/4"), "GET", "", StatusCode::NOT_FOUND, false), + (f(t2, "/5/6"), "GET", "", StatusCode::NOT_FOUND, false), + (f(t3, "/8/9"), "GET", "", StatusCode::NOT_FOUND, false), + (f(t4, "/5/6"), "GET", "", StatusCode::NOT_FOUND, false), + ]; + requests_chain(chain.into_iter(), delete_prefix_token).await; + } +} diff --git a/object_storage/src/lib.rs b/object_storage/src/lib.rs new file mode 100644 index 000000000000..989afd4c25aa --- /dev/null +++ b/object_storage/src/lib.rs @@ -0,0 +1,344 @@ +use anyhow::Result; +use axum::extract::{FromRequestParts, Path}; +use axum::response::{IntoResponse, Response}; +use axum::{RequestPartsExt, http::StatusCode, http::request::Parts}; +use axum_extra::TypedHeader; +use axum_extra::headers::{Authorization, authorization::Bearer}; +use camino::Utf8PathBuf; +use jsonwebtoken::{DecodingKey, Validation}; +use remote_storage::{GenericRemoteStorage, RemotePath}; +use serde::{Deserialize, Serialize}; +use std::fmt::Display; +use std::result::Result as StdResult; 
+use std::sync::Arc; +use tokio_util::sync::CancellationToken; +use tracing::{debug, error}; +use utils::id::{TenantId, TimelineId}; + +// simplified version of utils::auth::JwtAuth +pub struct JwtAuth { + decoding_key: DecodingKey, + validation: Validation, +} + +pub const VALIDATION_ALGO: jsonwebtoken::Algorithm = jsonwebtoken::Algorithm::EdDSA; +impl JwtAuth { + pub fn new(key: &[u8]) -> Result { + Ok(Self { + decoding_key: DecodingKey::from_ed_pem(key)?, + validation: Validation::new(VALIDATION_ALGO), + }) + } + + pub fn decode(&self, token: &str) -> Result { + Ok(jsonwebtoken::decode(token, &self.decoding_key, &self.validation).map(|t| t.claims)?) + } +} + +fn normalize_key(key: &str) -> StdResult { + let key = clean_utf8(&Utf8PathBuf::from(key)); + if key.starts_with("..") || key == "." || key == "/" { + return Err(format!("invalid key {key}")); + } + match key.strip_prefix("/").map(Utf8PathBuf::from) { + Ok(p) => Ok(p), + _ => Ok(key), + } +} + +// Copied from path_clean crate with PathBuf->Utf8PathBuf +fn clean_utf8(path: &camino::Utf8Path) -> Utf8PathBuf { + use camino::Utf8Component as Comp; + let mut out = Vec::new(); + for comp in path.components() { + match comp { + Comp::CurDir => (), + Comp::ParentDir => match out.last() { + Some(Comp::RootDir) => (), + Some(Comp::Normal(_)) => { + out.pop(); + } + None | Some(Comp::CurDir) | Some(Comp::ParentDir) | Some(Comp::Prefix(_)) => { + out.push(comp) + } + }, + comp => out.push(comp), + } + } + if !out.is_empty() { + out.iter().collect() + } else { + Utf8PathBuf::from(".") + } +} + +pub struct Storage { + pub auth: JwtAuth, + pub storage: GenericRemoteStorage, + pub cancel: CancellationToken, + pub max_upload_file_limit: usize, +} + +pub type EndpointId = String; // If needed, reuse small string from proxy/src/types.rs + +#[derive(Deserialize, Serialize, PartialEq)] +pub struct Claims { + pub tenant_id: TenantId, + pub timeline_id: TimelineId, + pub endpoint_id: EndpointId, + pub exp: u64, +} + +impl Display 
for Claims { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!( + f, + "Claims(tenant_id {} timeline_id {} endpoint_id {} exp {})", + self.tenant_id, self.timeline_id, self.endpoint_id, self.exp + ) + } +} + +#[derive(Deserialize, Serialize)] +struct KeyRequest { + tenant_id: TenantId, + timeline_id: TimelineId, + endpoint_id: EndpointId, + path: String, +} + +#[derive(Debug, PartialEq)] +pub struct S3Path { + pub path: RemotePath, +} + +impl TryFrom<&KeyRequest> for S3Path { + type Error = String; + fn try_from(req: &KeyRequest) -> StdResult { + let KeyRequest { + tenant_id, + timeline_id, + endpoint_id, + path, + } = &req; + let prefix = format!("{tenant_id}/{timeline_id}/{endpoint_id}",); + let path = Utf8PathBuf::from(prefix).join(normalize_key(path)?); + let path = RemotePath::new(&path).unwrap(); // unwrap() because the path is already relative + Ok(S3Path { path }) + } +} + +fn unauthorized(route: impl Display, claims: impl Display) -> Response { + debug!(%route, %claims, "route doesn't match claims"); + StatusCode::UNAUTHORIZED.into_response() +} + +pub fn bad_request(err: impl Display, desc: &'static str) -> Response { + debug!(%err, desc); + (StatusCode::BAD_REQUEST, err.to_string()).into_response() +} + +pub fn ok() -> Response { + StatusCode::OK.into_response() +} + +pub fn internal_error(err: impl Display, path: impl Display, desc: &'static str) -> Response { + error!(%err, %path, desc); + StatusCode::INTERNAL_SERVER_ERROR.into_response() +} + +pub fn not_found(key: impl ToString) -> Response { + (StatusCode::NOT_FOUND, key.to_string()).into_response() +} + +impl FromRequestParts> for S3Path { + type Rejection = Response; + async fn from_request_parts( + parts: &mut Parts, + state: &Arc, + ) -> Result { + let Path(path): Path = parts + .extract() + .await + .map_err(|e| bad_request(e, "invalid route"))?; + let TypedHeader(Authorization(bearer)) = parts + .extract::>>() + .await + .map_err(|e| bad_request(e, "invalid 
token"))?; + let claims: Claims = state + .auth + .decode(bearer.token()) + .map_err(|e| bad_request(e, "decoding token"))?; + let route = Claims { + tenant_id: path.tenant_id, + timeline_id: path.timeline_id, + endpoint_id: path.endpoint_id.clone(), + exp: claims.exp, + }; + if route != claims { + return Err(unauthorized(route, claims)); + } + (&path) + .try_into() + .map_err(|e| bad_request(e, "invalid route")) + } +} + +#[derive(Deserialize, Serialize, PartialEq)] +pub struct PrefixKeyPath { + pub tenant_id: TenantId, + pub timeline_id: Option, + pub endpoint_id: Option, +} + +impl Display for PrefixKeyPath { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!( + f, + "PrefixKeyPath(tenant_id {} timeline_id {} endpoint_id {})", + self.tenant_id, + self.timeline_id + .as_ref() + .map(ToString::to_string) + .unwrap_or("".to_string()), + self.endpoint_id + .as_ref() + .map(ToString::to_string) + .unwrap_or("".to_string()) + ) + } +} + +#[derive(Debug, PartialEq)] +pub struct PrefixS3Path { + pub path: RemotePath, +} + +impl From<&PrefixKeyPath> for PrefixS3Path { + fn from(path: &PrefixKeyPath) -> Self { + let timeline_id = path + .timeline_id + .as_ref() + .map(ToString::to_string) + .unwrap_or("".to_string()); + let endpoint_id = path + .endpoint_id + .as_ref() + .map(ToString::to_string) + .unwrap_or("".to_string()); + let path = Utf8PathBuf::from(path.tenant_id.to_string()) + .join(timeline_id) + .join(endpoint_id); + let path = RemotePath::new(&path).unwrap(); // unwrap() because the path is already relative + PrefixS3Path { path } + } +} + +impl FromRequestParts> for PrefixS3Path { + type Rejection = Response; + async fn from_request_parts( + parts: &mut Parts, + state: &Arc, + ) -> Result { + let Path(path) = parts + .extract::>() + .await + .map_err(|e| bad_request(e, "invalid route"))?; + let TypedHeader(Authorization(bearer)) = parts + .extract::>>() + .await + .map_err(|e| bad_request(e, "invalid token"))?; + let claims: 
PrefixKeyPath = state + .auth + .decode(bearer.token()) + .map_err(|e| bad_request(e, "invalid token"))?; + if path != claims { + return Err(unauthorized(path, claims)); + } + Ok((&path).into()) + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn normalize_key() { + let f = super::normalize_key; + assert_eq!(f("hello/world/..").unwrap(), Utf8PathBuf::from("hello")); + assert_eq!( + f("ololo/1/../../not_ololo").unwrap(), + Utf8PathBuf::from("not_ololo") + ); + assert!(f("ololo/1/../../../").is_err()); + assert!(f(".").is_err()); + assert!(f("../").is_err()); + assert!(f("").is_err()); + assert_eq!(f("/1/2/3").unwrap(), Utf8PathBuf::from("1/2/3")); + assert!(f("/1/2/3/../../../").is_err()); + assert!(f("/1/2/3/../../../../").is_err()); + } + + const TENANT_ID: TenantId = + TenantId::from_array([1, 1, 2, 3, 4, 5, 6, 7, 8, 9, 1, 2, 3, 4, 5, 6]); + const TIMELINE_ID: TimelineId = + TimelineId::from_array([1, 1, 2, 3, 4, 5, 6, 7, 8, 9, 1, 2, 3, 4, 5, 7]); + const ENDPOINT_ID: &str = "ep-winter-frost-a662z3vg"; + + #[test] + fn s3_path() { + let auth = Claims { + tenant_id: TENANT_ID, + timeline_id: TIMELINE_ID, + endpoint_id: ENDPOINT_ID.into(), + exp: u64::MAX, + }; + let s3_path = |key| { + let path = &format!("{TENANT_ID}/{TIMELINE_ID}/{ENDPOINT_ID}/{key}"); + let path = RemotePath::from_string(path).unwrap(); + S3Path { path } + }; + + let path = "cache_key".to_string(); + let mut key_path = KeyRequest { + path, + tenant_id: auth.tenant_id, + timeline_id: auth.timeline_id, + endpoint_id: auth.endpoint_id, + }; + assert_eq!(S3Path::try_from(&key_path).unwrap(), s3_path(key_path.path)); + + key_path.path = "we/can/have/nested/paths".to_string(); + assert_eq!(S3Path::try_from(&key_path).unwrap(), s3_path(key_path.path)); + + key_path.path = "../error/hello/../".to_string(); + assert!(S3Path::try_from(&key_path).is_err()); + } + + #[test] + fn prefix_s3_path() { + let mut path = PrefixKeyPath { + tenant_id: TENANT_ID, + timeline_id: None, + endpoint_id: 
None, + }; + let prefix_path = |s: String| RemotePath::from_string(&s).unwrap(); + assert_eq!( + PrefixS3Path::from(&path).path, + prefix_path(format!("{TENANT_ID}")) + ); + + path.timeline_id = Some(TIMELINE_ID); + assert_eq!( + PrefixS3Path::from(&path).path, + prefix_path(format!("{TENANT_ID}/{TIMELINE_ID}")) + ); + + path.endpoint_id = Some(ENDPOINT_ID.into()); + assert_eq!( + PrefixS3Path::from(&path).path, + prefix_path(format!("{TENANT_ID}/{TIMELINE_ID}/{ENDPOINT_ID}")) + ); + } +} diff --git a/object_storage/src/main.rs b/object_storage/src/main.rs new file mode 100644 index 000000000000..40325db19de4 --- /dev/null +++ b/object_storage/src/main.rs @@ -0,0 +1,65 @@ +//! `object_storage` is a service which provides API for uploading and downloading +//! files. It is used by compute and control plane for accessing LFC prewarm data. +//! This service is deployed either as a separate component or as part of compute image +//! for large computes. +mod app; +use anyhow::Context; +use tracing::info; +use utils::logging; + +//see set() +const fn max_upload_file_limit() -> usize { + 100 * 1024 * 1024 +} + +#[derive(serde::Deserialize)] +#[serde(tag = "type")] +struct Config { + listen: std::net::SocketAddr, + pemfile: camino::Utf8PathBuf, + #[serde(flatten)] + storage_config: remote_storage::RemoteStorageConfig, + #[serde(default = "max_upload_file_limit")] + max_upload_file_limit: usize, +} + +#[tokio::main] +async fn main() -> anyhow::Result<()> { + logging::init( + logging::LogFormat::Plain, + logging::TracingErrorLayerEnablement::EnableWithRustLogFilter, + logging::Output::Stdout, + )?; + + let config: String = std::env::args().skip(1).take(1).collect(); + if config.is_empty() { + anyhow::bail!("Usage: object_storage config.json") + } + info!("Reading config from {config}"); + let config = std::fs::read_to_string(config.clone())?; + let config: Config = serde_json::from_str(&config).context("parsing config")?; + info!("Reading pemfile from {}", 
config.pemfile.clone()); + let pemfile = std::fs::read(config.pemfile.clone())?; + info!("Loading public key from {}", config.pemfile.clone()); + let auth = object_storage::JwtAuth::new(&pemfile)?; + + let listener = tokio::net::TcpListener::bind(config.listen).await.unwrap(); + info!("listening on {}", listener.local_addr().unwrap()); + + let storage = remote_storage::GenericRemoteStorage::from_config(&config.storage_config).await?; + let cancel = tokio_util::sync::CancellationToken::new(); + app::check_storage_permissions(&storage, cancel.clone()).await?; + + let proxy = std::sync::Arc::new(object_storage::Storage { + auth, + storage, + cancel: cancel.clone(), + max_upload_file_limit: config.max_upload_file_limit, + }); + + tokio::spawn(utils::signals::signal_handler(cancel.clone())); + axum::serve(listener, app::app(proxy)) + .with_graceful_shutdown(async move { cancel.cancelled().await }) + .await?; + Ok(()) +} diff --git a/pageserver/Cargo.toml b/pageserver/Cargo.toml index 56d97bf8a9d3..74f3fce6e5c3 100644 --- a/pageserver/Cargo.toml +++ b/pageserver/Cargo.toml @@ -10,6 +10,8 @@ default = [] # which adds some runtime cost to run tests on outage conditions testing = ["fail/failpoints", "pageserver_api/testing", "wal_decoder/testing", "pageserver_client/testing"] +fuzz-read-path = ["testing"] + [dependencies] anyhow.workspace = true arc-swap.workspace = true diff --git a/pageserver/benches/bench_ingest.rs b/pageserver/benches/bench_ingest.rs index 000938b18917..3108b5351f75 100644 --- a/pageserver/benches/bench_ingest.rs +++ b/pageserver/benches/bench_ingest.rs @@ -126,7 +126,7 @@ async fn ingest( max_concurrency: NonZeroUsize::new(1).unwrap(), }); let (_desc, path) = layer - .write_to_disk(&ctx, None, l0_flush_state.inner()) + .write_to_disk(&ctx, None, l0_flush_state.inner(), &gate, cancel.clone()) .await? 
.unwrap(); tokio::fs::remove_file(path).await?; diff --git a/pageserver/benches/bench_walredo.rs b/pageserver/benches/bench_walredo.rs index 77b3f90b3ea7..215682d90c04 100644 --- a/pageserver/benches/bench_walredo.rs +++ b/pageserver/benches/bench_walredo.rs @@ -65,7 +65,7 @@ use bytes::{Buf, Bytes}; use criterion::{BenchmarkId, Criterion}; use once_cell::sync::Lazy; use pageserver::config::PageServerConf; -use pageserver::walredo::PostgresRedoManager; +use pageserver::walredo::{PostgresRedoManager, RedoAttemptType}; use pageserver_api::key::Key; use pageserver_api::record::NeonWalRecord; use pageserver_api::shard::TenantShardId; @@ -223,7 +223,14 @@ impl Request { // TODO: avoid these clones manager - .request_redo(*key, *lsn, base_img.clone(), records.clone(), *pg_version) + .request_redo( + *key, + *lsn, + base_img.clone(), + records.clone(), + *pg_version, + RedoAttemptType::ReadPage, + ) .await .context("request_redo") } diff --git a/pageserver/src/basebackup.rs b/pageserver/src/basebackup.rs index de527e307b1d..3510ccb52915 100644 --- a/pageserver/src/basebackup.rs +++ b/pageserver/src/basebackup.rs @@ -34,7 +34,7 @@ use utils::lsn::Lsn; use crate::context::RequestContext; use crate::pgdatadir_mapping::Version; use crate::tenant::storage_layer::IoConcurrency; -use crate::tenant::timeline::GetVectoredError; +use crate::tenant::timeline::{GetVectoredError, VersionedKeySpaceQuery}; use crate::tenant::{PageReconstructError, Timeline}; #[derive(Debug, thiserror::Error)] @@ -353,9 +353,10 @@ where let mut slru_builder = SlruSegmentsBuilder::new(&mut self.ar); for part in slru_partitions.parts { + let query = VersionedKeySpaceQuery::uniform(part, self.lsn); let blocks = self .timeline - .get_vectored(part, self.lsn, self.io_concurrency.clone(), self.ctx) + .get_vectored(query, self.io_concurrency.clone(), self.ctx) .await?; for (key, block) in blocks { diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs index 9a8494292d5b..250d4180f5ce 
100644 --- a/pageserver/src/bin/pageserver.rs +++ b/pageserver/src/bin/pageserver.rs @@ -31,7 +31,6 @@ use pageserver::{ }; use postgres_backend::AuthType; use remote_storage::GenericRemoteStorage; -use tokio::signal::unix::SignalKind; use tokio::time::Instant; use tokio_util::sync::CancellationToken; use tracing::*; @@ -453,6 +452,24 @@ fn start_pageserver( info!("Using auth for http API: {:#?}", conf.http_auth_type); info!("Using auth for pg connections: {:#?}", conf.pg_auth_type); + let tls_server_config = if conf.listen_https_addr.is_some() || conf.enable_tls_page_service_api + { + let resolver = BACKGROUND_RUNTIME.block_on(ReloadingCertificateResolver::new( + "main", + &conf.ssl_key_file, + &conf.ssl_cert_file, + conf.ssl_cert_reload_period, + ))?; + + let server_config = rustls::ServerConfig::builder() + .with_no_client_auth() + .with_cert_resolver(resolver); + + Some(Arc::new(server_config)) + } else { + None + }; + match var("NEON_AUTH_TOKEN") { Ok(v) => { info!("Loaded JWT token for authentication with Safekeeper"); @@ -671,17 +688,11 @@ fn start_pageserver( let https_task = match https_listener { Some(https_listener) => { - let resolver = MGMT_REQUEST_RUNTIME.block_on(ReloadingCertificateResolver::new( - &conf.ssl_key_file, - &conf.ssl_cert_file, - conf.ssl_cert_reload_period, - ))?; - - let server_config = rustls::ServerConfig::builder() - .with_no_client_auth() - .with_cert_resolver(resolver); + let tls_server_config = tls_server_config + .clone() + .expect("tls_server_config is set earlier if https is enabled"); - let tls_acceptor = tokio_rustls::TlsAcceptor::from(Arc::new(server_config)); + let tls_acceptor = tokio_rustls::TlsAcceptor::from(tls_server_config); let server = http_utils::server::Server::new(service, https_listener, Some(tls_acceptor))?; @@ -737,6 +748,11 @@ fn start_pageserver( tokio::net::TcpListener::from_std(pageserver_listener) .context("create tokio listener")? 
}, + if conf.enable_tls_page_service_api { + tls_server_config + } else { + None + }, ); // All started up! Now just sit and wait for shutdown signal. @@ -744,32 +760,7 @@ fn start_pageserver( let signal_token = CancellationToken::new(); let signal_cancel = signal_token.child_token(); - // Spawn signal handlers. Runs in a loop since we want to be responsive to multiple signals - // even after triggering shutdown (e.g. a SIGQUIT after a slow SIGTERM shutdown). See: - // https://github.com/neondatabase/neon/issues/9740. - tokio::spawn(async move { - let mut sigint = tokio::signal::unix::signal(SignalKind::interrupt()).unwrap(); - let mut sigterm = tokio::signal::unix::signal(SignalKind::terminate()).unwrap(); - let mut sigquit = tokio::signal::unix::signal(SignalKind::quit()).unwrap(); - - loop { - let signal = tokio::select! { - _ = sigquit.recv() => { - info!("Got signal SIGQUIT. Terminating in immediate shutdown mode."); - std::process::exit(111); - } - _ = sigint.recv() => "SIGINT", - _ = sigterm.recv() => "SIGTERM", - }; - - if !signal_token.is_cancelled() { - info!("Got signal {signal}. Terminating gracefully in fast shutdown mode."); - signal_token.cancel(); - } else { - info!("Got signal {signal}. Already shutting down."); - } - } - }); + tokio::spawn(utils::signals::signal_handler(signal_token)); // Wait for cancellation signal and shut down the pageserver. // diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs index ccc29e59d4ba..26ae6af70e61 100644 --- a/pageserver/src/config.rs +++ b/pageserver/src/config.rs @@ -219,6 +219,11 @@ pub struct PageServerConf { pub generate_unarchival_heatmap: bool, pub tracing: Option, + + /// Enable TLS in page service API. + /// Does not force TLS: the client negotiates TLS usage during the handshake. + /// Uses key and certificate from ssl_key_file/ssl_cert_file. 
+ pub enable_tls_page_service_api: bool, } /// Token for authentication to safekeepers @@ -391,6 +396,7 @@ impl PageServerConf { load_previous_heatmap, generate_unarchival_heatmap, tracing, + enable_tls_page_service_api, } = config_toml; let mut conf = PageServerConf { @@ -441,6 +447,7 @@ impl PageServerConf { page_service_pipelining, get_vectored_concurrent_io, tracing, + enable_tls_page_service_api, // ------------------------------------------------------------ // fields that require additional validation or custom handling diff --git a/pageserver/src/http/openapi_spec.yml b/pageserver/src/http/openapi_spec.yml index 566086c5270f..7ea148971f4a 100644 --- a/pageserver/src/http/openapi_spec.yml +++ b/pageserver/src/http/openapi_spec.yml @@ -212,6 +212,12 @@ paths: schema: type: string format: date-time + "412": + description: No timestamp is found for given LSN, e.g. if there had been no commits till LSN + content: + application/json: + schema: + $ref: "#/components/schemas/PreconditionFailedError" /v1/tenant/{tenant_id}/timeline/{timeline_id}/get_lsn_by_timestamp: parameters: diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index cf67dc596ada..bbc4bfae1b16 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -67,7 +67,7 @@ use crate::tenant::mgr::{ }; use crate::tenant::remote_timeline_client::index::GcCompactionState; use crate::tenant::remote_timeline_client::{ - download_index_part, list_remote_tenant_shards, list_remote_timelines, + download_index_part, download_tenant_manifest, list_remote_tenant_shards, list_remote_timelines, }; use crate::tenant::secondary::SecondaryController; use crate::tenant::size::ModelInputs; @@ -989,7 +989,7 @@ async fn get_lsn_by_timestamp_handler( if !tenant_shard_id.is_shard_zero() { // Requires SLRU contents, which are only stored on shard zero return Err(ApiError::BadRequest(anyhow!( - "Size calculations are only available on shard zero" + "Lsn calculations by timestamp are 
only available on shard zero" ))); } @@ -1064,7 +1064,7 @@ async fn get_timestamp_of_lsn_handler( if !tenant_shard_id.is_shard_zero() { // Requires SLRU contents, which are only stored on shard zero return Err(ApiError::BadRequest(anyhow!( - "Size calculations are only available on shard zero" + "Timestamp calculations by lsn are only available on shard zero" ))); } @@ -1090,8 +1090,8 @@ async fn get_timestamp_of_lsn_handler( .to_string(); json_response(StatusCode::OK, time) } - None => Err(ApiError::NotFound( - anyhow::anyhow!("Timestamp for lsn {} not found", lsn).into(), + None => Err(ApiError::PreconditionFailed( + format!("Timestamp for lsn {} not found", lsn).into(), )), } } @@ -2274,6 +2274,7 @@ async fn timeline_compact_handler( if Some(true) == parse_query_param::<_, bool>(&request, "dry_run")? { flags |= CompactFlags::DryRun; } + // Manual compaction does not yield for L0. let wait_until_uploaded = parse_query_param::<_, bool>(&request, "wait_until_uploaded")?.unwrap_or(false); @@ -2911,9 +2912,22 @@ async fn tenant_scan_remote_handler( }; } + let result = + download_tenant_manifest(&state.remote_storage, &tenant_shard_id, generation, &cancel) + .instrument(info_span!("download_tenant_manifest", + tenant_id=%tenant_shard_id.tenant_id, + shard_id=%tenant_shard_id.shard_slug())) + .await; + let stripe_size = match result { + Ok((manifest, _, _)) => manifest.stripe_size, + Err(DownloadError::NotFound) => None, + Err(err) => return Err(ApiError::InternalServerError(anyhow!(err))), + }; + response.shards.push(TenantScanRemoteStorageShard { tenant_shard_id, generation: generation.into(), + stripe_size, }); } @@ -3239,7 +3253,7 @@ async fn ingest_aux_files( modification .put_file(&fname, content.as_bytes(), &ctx) .await - .map_err(ApiError::InternalServerError)?; + .map_err(|e| ApiError::InternalServerError(e.into()))?; } modification .commit(&ctx) @@ -3368,11 +3382,11 @@ async fn put_tenant_timeline_import_basebackup( let broker_client = 
state.broker_client.clone(); - let mut body = StreamReader::new(request.into_body().map(|res| { - res.map_err(|error| { - std::io::Error::new(std::io::ErrorKind::Other, anyhow::anyhow!(error)) - }) - })); + let mut body = StreamReader::new( + request + .into_body() + .map(|res| res.map_err(|error| std::io::Error::other(anyhow::anyhow!(error)))), + ); tenant.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await?; @@ -3446,7 +3460,7 @@ async fn put_tenant_timeline_import_wal( let mut body = StreamReader::new(request.into_body().map(|res| { res.map_err(|error| { - std::io::Error::new(std::io::ErrorKind::Other, anyhow::anyhow!(error)) + std::io::Error::other( anyhow::anyhow!(error)) }) })); diff --git a/pageserver/src/import_datadir.rs b/pageserver/src/import_datadir.rs index 6dd005de5019..911449c7c503 100644 --- a/pageserver/src/import_datadir.rs +++ b/pageserver/src/import_datadir.rs @@ -27,7 +27,7 @@ use crate::context::RequestContext; use crate::metrics::WAL_INGEST; use crate::pgdatadir_mapping::*; use crate::tenant::Timeline; -use crate::walingest::WalIngest; +use crate::walingest::{WalIngest, WalIngestErrorKind}; // Returns checkpoint LSN from controlfile pub fn get_lsn_from_controlfile(path: &Utf8Path) -> Result { @@ -157,9 +157,9 @@ async fn import_rel( .put_rel_creation(rel, nblocks as u32, ctx) .await { - match e { - RelationError::AlreadyExists => { - debug!("Relation {} already exist. We must be extending it.", rel) + match e.kind { + WalIngestErrorKind::RelationAlreadyExists(rel) => { + debug!("Relation {rel} already exists. 
We must be extending it.") } _ => return Err(e.into()), } diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs index 1fe51021fdeb..2a779b0daaf9 100644 --- a/pageserver/src/metrics.rs +++ b/pageserver/src/metrics.rs @@ -17,7 +17,7 @@ use metrics::{ use once_cell::sync::Lazy; use pageserver_api::config::{ PageServicePipeliningConfig, PageServicePipeliningConfigPipelined, - PageServiceProtocolPipelinedExecutionStrategy, + PageServiceProtocolPipelinedBatchingStrategy, PageServiceProtocolPipelinedExecutionStrategy, }; use pageserver_api::models::InMemoryLayerInfo; use pageserver_api::shard::TenantShardId; @@ -1714,6 +1714,28 @@ pub enum SmgrQueryType { Test, } +#[derive( + Debug, + Clone, + Copy, + IntoStaticStr, + strum_macros::EnumCount, + strum_macros::EnumIter, + strum_macros::FromRepr, + enum_map::Enum, +)] +#[strum(serialize_all = "snake_case")] +pub enum GetPageBatchBreakReason { + BatchFull, + NonBatchableRequest, + NonUniformLsn, + SamePageAtDifferentLsn, + NonUniformTimeline, + ExecutorSteal, + #[cfg(feature = "testing")] + NonUniformKey, +} + pub(crate) struct SmgrQueryTimePerTimeline { global_started: [IntCounter; SmgrQueryType::COUNT], global_latency: [Histogram; SmgrQueryType::COUNT], @@ -1725,6 +1747,8 @@ pub(crate) struct SmgrQueryTimePerTimeline { per_timeline_flush_in_progress_micros: IntCounter, global_batch_wait_time: Histogram, per_timeline_batch_wait_time: Histogram, + global_batch_break_reason: [IntCounter; GetPageBatchBreakReason::COUNT], + per_timeline_batch_break_reason: GetPageBatchBreakReasonTimelineMetrics, throttling: Arc, } @@ -1858,12 +1882,55 @@ static PAGE_SERVICE_BATCH_SIZE_PER_TENANT_TIMELINE: Lazy = Lazy::n .expect("failed to define a metric") }); +static PAGE_SERVICE_BATCH_BREAK_REASON_GLOBAL: Lazy = Lazy::new(|| { + register_int_counter_vec!( + // it's a counter, but, name is prepared to extend it to a histogram of queue depth + "pageserver_page_service_batch_break_reason_global", + "Reason for breaking batches of get 
page requests", + &["reason"], + ) + .expect("failed to define a metric") +}); + +struct GetPageBatchBreakReasonTimelineMetrics { + map: EnumMap, +} + +impl GetPageBatchBreakReasonTimelineMetrics { + fn new(tenant_id: &str, shard_slug: &str, timeline_id: &str) -> Self { + GetPageBatchBreakReasonTimelineMetrics { + map: EnumMap::from_array(std::array::from_fn(|reason_idx| { + let reason = GetPageBatchBreakReason::from_usize(reason_idx); + PAGE_SERVICE_BATCH_BREAK_REASON_PER_TENANT_TIMELINE.with_label_values(&[ + tenant_id, + shard_slug, + timeline_id, + reason.into(), + ]) + })), + } + } + + fn inc(&self, reason: GetPageBatchBreakReason) { + self.map[reason].inc() + } +} + +static PAGE_SERVICE_BATCH_BREAK_REASON_PER_TENANT_TIMELINE: Lazy = Lazy::new(|| { + register_int_counter_vec!( + "pageserver_page_service_batch_break_reason", + "Reason for breaking batches of get page requests", + &["tenant_id", "shard_id", "timeline_id", "reason"], + ) + .expect("failed to define a metric") +}); + pub(crate) static PAGE_SERVICE_CONFIG_MAX_BATCH_SIZE: Lazy = Lazy::new(|| { register_int_gauge_vec!( "pageserver_page_service_config_max_batch_size", "Configured maximum batch size for the server-side batching functionality of page_service. 
\ Labels expose more of the configuration parameters.", - &["mode", "execution"] + &["mode", "execution", "batching"] ) .expect("failed to define a metric") }); @@ -1871,10 +1938,11 @@ pub(crate) static PAGE_SERVICE_CONFIG_MAX_BATCH_SIZE: Lazy = Lazy:: fn set_page_service_config_max_batch_size(conf: &PageServicePipeliningConfig) { PAGE_SERVICE_CONFIG_MAX_BATCH_SIZE.reset(); let (label_values, value) = match conf { - PageServicePipeliningConfig::Serial => (["serial", "-"], 1), + PageServicePipeliningConfig::Serial => (["serial", "-", "-"], 1), PageServicePipeliningConfig::Pipelined(PageServicePipeliningConfigPipelined { max_batch_size, execution, + batching, }) => { let mode = "pipelined"; let execution = match execution { @@ -1883,7 +1951,12 @@ fn set_page_service_config_max_batch_size(conf: &PageServicePipeliningConfig) { } PageServiceProtocolPipelinedExecutionStrategy::Tasks => "tasks", }; - ([mode, execution], max_batch_size.get()) + let batching = match batching { + PageServiceProtocolPipelinedBatchingStrategy::UniformLsn => "uniform-lsn", + PageServiceProtocolPipelinedBatchingStrategy::ScatteredLsn => "scattered-lsn", + }; + + ([mode, execution, batching], max_batch_size.get()) } }; PAGE_SERVICE_CONFIG_MAX_BATCH_SIZE @@ -1979,6 +2052,15 @@ impl SmgrQueryTimePerTimeline { .get_metric_with_label_values(&[&tenant_id, &shard_slug, &timeline_id]) .unwrap(); + let global_batch_break_reason = std::array::from_fn(|i| { + let reason = GetPageBatchBreakReason::from_usize(i); + PAGE_SERVICE_BATCH_BREAK_REASON_GLOBAL + .get_metric_with_label_values(&[reason.into()]) + .unwrap() + }); + let per_timeline_batch_break_reason = + GetPageBatchBreakReasonTimelineMetrics::new(&tenant_id, &shard_slug, &timeline_id); + let global_flush_in_progress_micros = PAGE_SERVICE_SMGR_FLUSH_INPROGRESS_MICROS_GLOBAL.clone(); let per_timeline_flush_in_progress_micros = PAGE_SERVICE_SMGR_FLUSH_INPROGRESS_MICROS @@ -1996,6 +2078,8 @@ impl SmgrQueryTimePerTimeline { 
per_timeline_flush_in_progress_micros, global_batch_wait_time, per_timeline_batch_wait_time, + global_batch_break_reason, + per_timeline_batch_break_reason, throttling: pagestream_throttle_metrics, } } @@ -2024,9 +2108,16 @@ impl SmgrQueryTimePerTimeline { } /// TODO: do something about this? seems odd, we have a similar call on SmgrOpTimer - pub(crate) fn observe_getpage_batch_start(&self, batch_size: usize) { + pub(crate) fn observe_getpage_batch_start( + &self, + batch_size: usize, + break_reason: GetPageBatchBreakReason, + ) { self.global_batch_size.observe(batch_size as f64); self.per_timeline_batch_size.observe(batch_size as f64); + + self.global_batch_break_reason[break_reason.into_usize()].inc(); + self.per_timeline_batch_break_reason.inc(break_reason); } } @@ -3392,6 +3483,15 @@ impl TimelineMetrics { shard_id, timeline_id, ]); + + for reason in GetPageBatchBreakReason::iter() { + let _ = PAGE_SERVICE_BATCH_BREAK_REASON_PER_TENANT_TIMELINE.remove_label_values(&[ + tenant_id, + shard_id, + timeline_id, + reason.into(), + ]); + } } } @@ -4270,6 +4370,7 @@ pub fn preinitialize_metrics( [ &BACKGROUND_LOOP_PERIOD_OVERRUN_COUNT, &SMGR_QUERY_STARTED_GLOBAL, + &PAGE_SERVICE_BATCH_BREAK_REASON_GLOBAL, ] .into_iter() .for_each(|c| { diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index 7e3991dbdce7..7a62d8049ba3 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -18,7 +18,7 @@ use itertools::Itertools; use once_cell::sync::OnceCell; use pageserver_api::config::{ PageServicePipeliningConfig, PageServicePipeliningConfigPipelined, - PageServiceProtocolPipelinedExecutionStrategy, + PageServiceProtocolPipelinedBatchingStrategy, PageServiceProtocolPipelinedExecutionStrategy, }; use pageserver_api::key::rel_block_to_key; use pageserver_api::models::{ @@ -58,8 +58,8 @@ use crate::context::{ DownloadBehavior, PerfInstrumentFutureExt, RequestContext, RequestContextBuilder, }; use crate::metrics::{ - self, 
COMPUTE_COMMANDS_COUNTERS, ComputeCommandKind, LIVE_CONNECTIONS, SmgrOpTimer, - TimelineMetrics, + self, COMPUTE_COMMANDS_COUNTERS, ComputeCommandKind, GetPageBatchBreakReason, LIVE_CONNECTIONS, + SmgrOpTimer, TimelineMetrics, }; use crate::pgdatadir_mapping::Version; use crate::span::{ @@ -105,6 +105,7 @@ pub fn spawn( pg_auth: Option>, perf_trace_dispatch: Option, tcp_listener: tokio::net::TcpListener, + tls_config: Option>, ) -> Listener { let cancel = CancellationToken::new(); let libpq_ctx = RequestContext::todo_child( @@ -124,6 +125,7 @@ pub fn spawn( perf_trace_dispatch, tcp_listener, conf.pg_auth_type, + tls_config, conf.page_service_pipelining.clone(), libpq_ctx, cancel.clone(), @@ -181,6 +183,7 @@ pub async fn libpq_listener_main( perf_trace_dispatch: Option, listener: tokio::net::TcpListener, auth_type: AuthType, + tls_config: Option>, pipelining_config: PageServicePipeliningConfig, listener_ctx: RequestContext, listener_cancel: CancellationToken, @@ -223,6 +226,7 @@ pub async fn libpq_listener_main( local_auth, socket, auth_type, + tls_config.clone(), pipelining_config.clone(), connection_ctx, connections_cancel.child_token(), @@ -264,6 +268,7 @@ async fn page_service_conn_main( auth: Option>, socket: tokio::net::TcpStream, auth_type: AuthType, + tls_config: Option>, pipelining_config: PageServicePipeliningConfig, connection_ctx: RequestContext, cancel: CancellationToken, @@ -334,7 +339,8 @@ async fn page_service_conn_main( cancel.clone(), gate_guard, ); - let pgbackend = PostgresBackend::new_from_io(socket_fd, socket, peer_addr, auth_type, None)?; + let pgbackend = + PostgresBackend::new_from_io(socket_fd, socket, peer_addr, auth_type, tls_config)?; match pgbackend.run(&mut conn_handler, &cancel).await { Ok(()) => { @@ -635,6 +641,7 @@ impl std::fmt::Display for BatchedPageStreamError { struct BatchedGetPageRequest { req: PagestreamGetPageRequest, timer: SmgrOpTimer, + effective_request_lsn: Lsn, ctx: RequestContext, } @@ -664,8 +671,8 @@ enum 
BatchedFeMessage { GetPage { span: Span, shard: timeline::handle::WeakHandle, - effective_request_lsn: Lsn, pages: smallvec::SmallVec<[BatchedGetPageRequest; 1]>, + batch_break_reason: GetPageBatchBreakReason, }, DbSize { span: Span, @@ -718,6 +725,119 @@ impl BatchedFeMessage { BatchedFeMessage::RespondError { .. } => {} } } + + fn should_break_batch( + &self, + other: &BatchedFeMessage, + max_batch_size: NonZeroUsize, + batching_strategy: PageServiceProtocolPipelinedBatchingStrategy, + ) -> Option { + match (self, other) { + ( + BatchedFeMessage::GetPage { + shard: accum_shard, + pages: accum_pages, + .. + }, + BatchedFeMessage::GetPage { + shard: this_shard, + pages: this_pages, + .. + }, + ) => { + assert_eq!(this_pages.len(), 1); + if accum_pages.len() >= max_batch_size.get() { + trace!(%max_batch_size, "stopping batching because of batch size"); + assert_eq!(accum_pages.len(), max_batch_size.get()); + + return Some(GetPageBatchBreakReason::BatchFull); + } + if !accum_shard.is_same_handle_as(this_shard) { + trace!("stopping batching because timeline object mismatch"); + // TODO: we _could_ batch & execute each shard separately (and in parallel). + // But the current logic for keeping responses in order does not support that. + + return Some(GetPageBatchBreakReason::NonUniformTimeline); + } + + match batching_strategy { + PageServiceProtocolPipelinedBatchingStrategy::UniformLsn => { + if let Some(last_in_batch) = accum_pages.last() { + if last_in_batch.effective_request_lsn + != this_pages[0].effective_request_lsn + { + trace!( + accum_lsn = %last_in_batch.effective_request_lsn, + this_lsn = %this_pages[0].effective_request_lsn, + "stopping batching because LSN changed" + ); + + return Some(GetPageBatchBreakReason::NonUniformLsn); + } + } + } + PageServiceProtocolPipelinedBatchingStrategy::ScatteredLsn => { + // The read path doesn't currently support serving the same page at different LSNs.
+ // While technically possible, it's uncertain if the complexity is worth it. + // Break the batch if such a case is encountered. + let same_page_different_lsn = accum_pages.iter().any(|batched| { + batched.req.rel == this_pages[0].req.rel + && batched.req.blkno == this_pages[0].req.blkno + && batched.effective_request_lsn + != this_pages[0].effective_request_lsn + }); + + if same_page_different_lsn { + trace!( + rel=%this_pages[0].req.rel, + blkno=%this_pages[0].req.blkno, + lsn=%this_pages[0].effective_request_lsn, + "stopping batching because same page was requested at different LSNs" + ); + + return Some(GetPageBatchBreakReason::SamePageAtDifferentLsn); + } + } + } + + None + } + #[cfg(feature = "testing")] + ( + BatchedFeMessage::Test { + shard: accum_shard, + requests: accum_requests, + .. + }, + BatchedFeMessage::Test { + shard: this_shard, + requests: this_requests, + .. + }, + ) => { + assert!(this_requests.len() == 1); + if accum_requests.len() >= max_batch_size.get() { + trace!(%max_batch_size, "stopping batching because of batch size"); + assert_eq!(accum_requests.len(), max_batch_size.get()); + return Some(GetPageBatchBreakReason::BatchFull); + } + if !accum_shard.is_same_handle_as(this_shard) { + trace!("stopping batching because timeline object mismatch"); + // TODO: we _could_ batch & execute each shard separately (and in parallel). + // But the current logic for keeping responses in order does not support that.
+ return Some(GetPageBatchBreakReason::NonUniformTimeline); + } + let this_batch_key = this_requests[0].req.batch_key; + let accum_batch_key = accum_requests[0].req.batch_key; + if this_requests[0].req.batch_key != accum_requests[0].req.batch_key { + trace!(%accum_batch_key, %this_batch_key, "stopping batching because batch key changed"); + return Some(GetPageBatchBreakReason::NonUniformKey); + } + None + } + (_, _) => Some(GetPageBatchBreakReason::NonBatchableRequest), + } + } } impl PageServerHandler { @@ -1019,34 +1139,32 @@ impl PageServerHandler { .await?; // We're holding the Handle - // TODO: if we actually need to wait for lsn here, it delays the entire batch which doesn't need to wait - let res = Self::wait_or_get_last_lsn( + let effective_request_lsn = match Self::effective_request_lsn( &shard, + shard.get_last_record_lsn(), req.hdr.request_lsn, req.hdr.not_modified_since, &shard.get_applied_gc_cutoff_lsn(), - &ctx, - ) - .maybe_perf_instrument(&ctx, |current_perf_span| { - info_span!( - target: PERF_TRACE_TARGET, - parent: current_perf_span, - "WAIT_LSN", - ) - }) - .await; - - let effective_request_lsn = match res { + ) { Ok(lsn) => lsn, Err(e) => { return respond_error!(span, e); } }; + BatchedFeMessage::GetPage { span, shard: shard.downgrade(), - effective_request_lsn, - pages: smallvec::smallvec![BatchedGetPageRequest { req, timer, ctx }], + pages: smallvec::smallvec![BatchedGetPageRequest { + req, + timer, + effective_request_lsn, + ctx, + }], + // The executor grabs the batch when it becomes idle. + // Hence, [`GetPageBatchBreakReason::ExecutorSteal`] is the + // default reason for breaking the batch. 
+ batch_break_reason: GetPageBatchBreakReason::ExecutorSteal, } } #[cfg(feature = "testing")] @@ -1072,6 +1190,7 @@ impl PageServerHandler { #[instrument(skip_all, level = tracing::Level::TRACE)] #[allow(clippy::boxed_local)] fn pagestream_do_batch( + batching_strategy: PageServiceProtocolPipelinedBatchingStrategy, max_batch_size: NonZeroUsize, batch: &mut Result, this_msg: Result, @@ -1083,90 +1202,59 @@ impl PageServerHandler { Err(e) => return Err(Err(e)), }; - match (&mut *batch, this_msg) { - // something batched already, let's see if we can add this message to the batch - ( - Ok(BatchedFeMessage::GetPage { - span: _, - shard: accum_shard, - pages: accum_pages, - effective_request_lsn: accum_lsn, - }), - BatchedFeMessage::GetPage { - span: _, - shard: this_shard, - pages: this_pages, - effective_request_lsn: this_lsn, - }, - ) if (|| { - assert_eq!(this_pages.len(), 1); - if accum_pages.len() >= max_batch_size.get() { - trace!(%accum_lsn, %this_lsn, %max_batch_size, "stopping batching because of batch size"); - assert_eq!(accum_pages.len(), max_batch_size.get()); - return false; - } - if !accum_shard.is_same_handle_as(&this_shard) { - trace!(%accum_lsn, %this_lsn, "stopping batching because timeline object mismatch"); - // TODO: we _could_ batch & execute each shard seperately (and in parallel). - // But the current logic for keeping responses in order does not support that. - return false; - } - // the vectored get currently only supports a single LSN, so, bounce as soon - // as the effective request_lsn changes - if *accum_lsn != this_lsn { - trace!(%accum_lsn, %this_lsn, "stopping batching because LSN changed"); - return false; - } - true - })() => - { - // ok to batch - accum_pages.extend(this_pages); - Ok(()) + let eligible_batch = match batch { + Ok(b) => b, + Err(_) => { + return Err(Ok(this_msg)); } - #[cfg(feature = "testing")] - ( - Ok(BatchedFeMessage::Test { - shard: accum_shard, - requests: accum_requests, - .. 
- }), - BatchedFeMessage::Test { - shard: this_shard, - requests: this_requests, - .. - }, - ) if (|| { - assert!(this_requests.len() == 1); - if accum_requests.len() >= max_batch_size.get() { - trace!(%max_batch_size, "stopping batching because of batch size"); - assert_eq!(accum_requests.len(), max_batch_size.get()); - return false; - } - if !accum_shard.is_same_handle_as(&this_shard) { - trace!("stopping batching because timeline object mismatch"); - // TODO: we _could_ batch & execute each shard seperately (and in parallel). - // But the current logic for keeping responses in order does not support that. - return false; - } - let this_batch_key = this_requests[0].req.batch_key; - let accum_batch_key = accum_requests[0].req.batch_key; - if this_requests[0].req.batch_key != accum_requests[0].req.batch_key { - trace!(%accum_batch_key, %this_batch_key, "stopping batching because batch key changed"); - return false; + }; + + let batch_break = + eligible_batch.should_break_batch(&this_msg, max_batch_size, batching_strategy); + + match batch_break { + Some(reason) => { + if let BatchedFeMessage::GetPage { + batch_break_reason, .. + } = eligible_batch + { + *batch_break_reason = reason; } - true - })() => - { - // ok to batch - accum_requests.extend(this_requests); - Ok(()) - } - // something batched already but this message is unbatchable - (_, this_msg) => { - // by default, don't continue batching + Err(Ok(this_msg)) } + None => { + // ok to batch + match (eligible_batch, this_msg) { + ( + BatchedFeMessage::GetPage { + pages: accum_pages, .. + }, + BatchedFeMessage::GetPage { + pages: this_pages, .. + }, + ) => { + accum_pages.extend(this_pages); + Ok(()) + } + #[cfg(feature = "testing")] + ( + BatchedFeMessage::Test { + requests: accum_requests, + .. + }, + BatchedFeMessage::Test { + requests: this_requests, + .. 
+ }, + ) => { + accum_requests.extend(this_requests); + Ok(()) + } + // Shape guaranteed by [`BatchedFeMessage::should_break_batch`] + _ => unreachable!(), + } + } } } @@ -1387,8 +1475,8 @@ impl PageServerHandler { BatchedFeMessage::GetPage { span, shard, - effective_request_lsn, pages, + batch_break_reason, } => { fail::fail_point!("ps::handle-pagerequest-message::getpage"); let (shard, ctx) = upgrade_handle_and_set_context!(shard); @@ -1399,9 +1487,9 @@ impl PageServerHandler { let res = self .handle_get_page_at_lsn_request_batched( &shard, - effective_request_lsn, pages, io_concurrency, + batch_break_reason, &ctx, ) .instrument(span.clone()) @@ -1718,6 +1806,7 @@ impl PageServerHandler { let PageServicePipeliningConfigPipelined { max_batch_size, execution, + batching: batching_strategy, } = pipelining_config; // Macro to _define_ a pipeline stage. @@ -1769,7 +1858,7 @@ impl PageServerHandler { exit |= read_res.is_err(); let could_send = batch_tx .send(read_res, |batch, res| { - Self::pagestream_do_batch(max_batch_size, batch, res) + Self::pagestream_do_batch(batching_strategy, max_batch_size, batch, res) }) .await; exit |= could_send.is_err(); @@ -1865,7 +1954,39 @@ impl PageServerHandler { ctx: &RequestContext, ) -> Result { let last_record_lsn = timeline.get_last_record_lsn(); + let effective_request_lsn = Self::effective_request_lsn( + timeline, + last_record_lsn, + request_lsn, + not_modified_since, + latest_gc_cutoff_lsn, + )?; + + if effective_request_lsn > last_record_lsn { + timeline + .wait_lsn( + not_modified_since, + crate::tenant::timeline::WaitLsnWaiter::PageService, + timeline::WaitLsnTimeout::Default, + ctx, + ) + .await?; + + // Since we waited for 'effective_request_lsn' to arrive, that is now the last + // record LSN. 
(Or close enough for our purposes; the last-record LSN can + // advance immediately after we return anyway) + } + + Ok(effective_request_lsn) + } + fn effective_request_lsn( + timeline: &Timeline, + last_record_lsn: Lsn, + request_lsn: Lsn, + not_modified_since: Lsn, + latest_gc_cutoff_lsn: &RcuReadGuard, + ) -> Result { // Sanity check the request if request_lsn < not_modified_since { return Err(PageStreamError::BadRequest( @@ -1900,19 +2021,7 @@ impl PageServerHandler { } } - // Wait for WAL up to 'not_modified_since' to arrive, if necessary if not_modified_since > last_record_lsn { - timeline - .wait_lsn( - not_modified_since, - crate::tenant::timeline::WaitLsnWaiter::PageService, - timeline::WaitLsnTimeout::Default, - ctx, - ) - .await?; - // Since we waited for 'not_modified_since' to arrive, that is now the last - // record LSN. (Or close enough for our purposes; the last-record LSN can - // advance immediately after we return anyway) Ok(not_modified_since) } else { // It might be better to use max(not_modified_since, latest_gc_cutoff_lsn) @@ -2067,16 +2176,16 @@ impl PageServerHandler { async fn handle_get_page_at_lsn_request_batched( &mut self, timeline: &Timeline, - effective_lsn: Lsn, requests: smallvec::SmallVec<[BatchedGetPageRequest; 1]>, io_concurrency: IoConcurrency, + batch_break_reason: GetPageBatchBreakReason, ctx: &RequestContext, ) -> Vec> { debug_assert_current_span_has_tenant_and_timeline_id(); timeline .query_metrics - .observe_getpage_batch_start(requests.len()); + .observe_getpage_batch_start(requests.len(), batch_break_reason); // If a page trace is running, submit an event for this request. if let Some(page_trace) = timeline.page_trace.load().as_ref() { @@ -2086,20 +2195,81 @@ impl PageServerHandler { // Ignore error (trace buffer may be full or tracer may have disconnected). 
_ = page_trace.try_send(PageTraceEvent { key, - effective_lsn, + effective_lsn: batch.effective_request_lsn, time, }); } } + // If any request in the batch needs to wait for LSN, then do so now. + let mut perf_instrument = false; + let max_effective_lsn = requests + .iter() + .map(|req| { + if req.ctx.has_perf_span() { + perf_instrument = true; + } + + req.effective_request_lsn + }) + .max() + .expect("batch is never empty"); + + let ctx = match perf_instrument { + true => RequestContextBuilder::from(ctx) + .root_perf_span(|| { + info_span!( + target: PERF_TRACE_TARGET, + "GET_VECTORED", + tenant_id = %timeline.tenant_shard_id.tenant_id, + timeline_id = %timeline.timeline_id, + shard = %timeline.tenant_shard_id.shard_slug(), + %max_effective_lsn + ) + }) + .attached_child(), + false => ctx.attached_child(), + }; + + let last_record_lsn = timeline.get_last_record_lsn(); + if max_effective_lsn > last_record_lsn { + if let Err(e) = timeline + .wait_lsn( + max_effective_lsn, + crate::tenant::timeline::WaitLsnWaiter::PageService, + timeline::WaitLsnTimeout::Default, + &ctx, + ) + .maybe_perf_instrument(&ctx, |current_perf_span| { + info_span!( + target: PERF_TRACE_TARGET, + parent: current_perf_span, + "WAIT_LSN", + ) + }) + .await + { + return Vec::from_iter(requests.into_iter().map(|req| { + Err(BatchedPageStreamError { + err: PageStreamError::from(e.clone()), + req: req.req.hdr, + }) + })); + } + } + let results = timeline .get_rel_page_at_lsn_batched( - requests - .iter() - .map(|p| (&p.req.rel, &p.req.blkno, p.ctx.attached_child())), - effective_lsn, + requests.iter().map(|p| { + ( + &p.req.rel, + &p.req.blkno, + p.effective_request_lsn, + p.ctx.attached_child(), + ) + }), io_concurrency, - ctx, + &ctx, ) .await; assert_eq!(results.len(), requests.len()); diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs index e3e06ab91a6e..81e548a095dc 100644 --- a/pageserver/src/pgdatadir_mapping.rs +++ b/pageserver/src/pgdatadir_mapping.rs @@ 
-6,14 +6,14 @@ //! walingest.rs handles a few things like implicit relation creation and extension. //! Clarify that) //! -use std::collections::{BTreeMap, HashMap, HashSet, hash_map}; +use std::collections::{HashMap, HashSet, hash_map}; use std::ops::{ControlFlow, Range}; -use crate::PERF_TRACE_TARGET; -use anyhow::{Context, ensure}; +use crate::walingest::{WalIngestError, WalIngestErrorKind}; +use crate::{PERF_TRACE_TARGET, ensure_walingest}; +use anyhow::Context; use bytes::{Buf, Bytes, BytesMut}; use enum_map::Enum; -use itertools::Itertools; use pageserver_api::key::{ AUX_FILES_KEY, CHECKPOINT_KEY, CONTROLFILE_KEY, CompactKey, DBDIR_KEY, Key, RelDirExists, TWOPHASEDIR_KEY, dbdir_key_range, rel_block_to_key, rel_dir_to_key, rel_key_range, @@ -21,7 +21,7 @@ use pageserver_api::key::{ repl_origin_key, repl_origin_key_range, slru_block_to_key, slru_dir_to_key, slru_segment_key_range, slru_segment_size_to_key, twophase_file_key, twophase_key_range, }; -use pageserver_api::keyspace::SparseKeySpace; +use pageserver_api::keyspace::{KeySpaceRandomAccum, SparseKeySpace}; use pageserver_api::models::RelSizeMigration; use pageserver_api::record::NeonWalRecord; use pageserver_api::reltag::{BlockNumber, RelTag, SlruKind}; @@ -40,7 +40,7 @@ use wal_decoder::serialized_batch::{SerializedValueBatch, ValueMeta}; use super::tenant::{PageReconstructError, Timeline}; use crate::aux_file; -use crate::context::{PerfInstrumentFutureExt, RequestContext, RequestContextBuilder}; +use crate::context::{PerfInstrumentFutureExt, RequestContext}; use crate::keyspace::{KeySpace, KeySpaceAccum}; use crate::metrics::{ RELSIZE_CACHE_ENTRIES, RELSIZE_CACHE_HITS, RELSIZE_CACHE_MISSES, RELSIZE_CACHE_MISSES_OLD, @@ -50,7 +50,7 @@ use crate::span::{ debug_assert_current_span_has_tenant_and_timeline_id_no_shard_id, }; use crate::tenant::storage_layer::IoConcurrency; -use crate::tenant::timeline::GetVectoredError; +use crate::tenant::timeline::{GetVectoredError, VersionedKeySpaceQuery}; /// Max delta 
records appended to the AUX_FILES_KEY (for aux v1). The write path will write a full image once this threshold is reached. pub const MAX_AUX_FILE_DELTAS: usize = 1024; @@ -136,12 +136,8 @@ impl From for CalculateLogicalSizeError { #[derive(Debug, thiserror::Error)] pub enum RelationError { - #[error("Relation Already Exists")] - AlreadyExists, #[error("invalid relnode")] InvalidRelnode, - #[error(transparent)] - Other(#[from] anyhow::Error), } /// @@ -210,10 +206,9 @@ impl Timeline { let pages: smallvec::SmallVec<[_; 1]> = smallvec::smallvec![(tag, blknum)]; let res = self .get_rel_page_at_lsn_batched( - pages - .iter() - .map(|(tag, blknum)| (tag, blknum, ctx.attached_child())), - effective_lsn, + pages.iter().map(|(tag, blknum)| { + (tag, blknum, effective_lsn, ctx.attached_child()) + }), io_concurrency.clone(), ctx, ) @@ -251,8 +246,7 @@ impl Timeline { /// The ordering of the returned vec corresponds to the ordering of `pages`. pub(crate) async fn get_rel_page_at_lsn_batched( &self, - pages: impl ExactSizeIterator, - effective_lsn: Lsn, + pages: impl ExactSizeIterator, io_concurrency: IoConcurrency, ctx: &RequestContext, ) -> Vec> { @@ -265,11 +259,13 @@ impl Timeline { let mut result = Vec::with_capacity(pages.len()); let result_slots = result.spare_capacity_mut(); - let mut keys_slots: BTreeMap> = - BTreeMap::default(); + let mut keys_slots: HashMap> = + HashMap::with_capacity(pages.len()); + + let mut req_keyspaces: HashMap = + HashMap::with_capacity(pages.len()); - let mut perf_instrument = false; - for (response_slot_idx, (tag, blknum, ctx)) in pages.enumerate() { + for (response_slot_idx, (tag, blknum, lsn, ctx)) in pages.enumerate() { if tag.relnode == 0 { result_slots[response_slot_idx].write(Err(PageReconstructError::Other( RelationError::InvalidRelnode.into(), @@ -280,14 +276,14 @@ impl Timeline { } let nblocks = match self - .get_rel_size(*tag, Version::Lsn(effective_lsn), &ctx) + .get_rel_size(*tag, Version::Lsn(lsn), &ctx) 
.maybe_perf_instrument(&ctx, |crnt_perf_span| { info_span!( target: PERF_TRACE_TARGET, parent: crnt_perf_span, "GET_REL_SIZE", reltag=%tag, - lsn=%effective_lsn, + lsn=%lsn, ) }) .await @@ -303,7 +299,7 @@ impl Timeline { if *blknum >= nblocks { debug!( "read beyond EOF at {} blk {} at {}, size is {}: returning all-zeros page", - tag, blknum, effective_lsn, nblocks + tag, blknum, lsn, nblocks ); result_slots[response_slot_idx].write(Ok(ZERO_PAGE.clone())); slots_filled += 1; @@ -312,46 +308,29 @@ impl Timeline { let key = rel_block_to_key(*tag, *blknum); - if ctx.has_perf_span() { - perf_instrument = true; - } - let key_slots = keys_slots.entry(key).or_default(); key_slots.push((response_slot_idx, ctx)); - } - let keyspace = { - // add_key requires monotonicity - let mut acc = KeySpaceAccum::new(); - for key in keys_slots - .keys() - // in fact it requires strong monotonicity - .dedup() - { - acc.add_key(*key); - } - acc.to_keyspace() - }; + let acc = req_keyspaces.entry(lsn).or_default(); + acc.add_key(key); + } - let ctx = match perf_instrument { - true => RequestContextBuilder::from(ctx) - .root_perf_span(|| { - info_span!( - target: PERF_TRACE_TARGET, - "GET_VECTORED", - tenant_id = %self.tenant_shard_id.tenant_id, - timeline_id = %self.timeline_id, - lsn = %effective_lsn, - shard = %self.tenant_shard_id.shard_slug(), - ) - }) - .attached_child(), - false => ctx.attached_child(), - }; + let query: Vec<(Lsn, KeySpace)> = req_keyspaces + .into_iter() + .map(|(lsn, acc)| (lsn, acc.to_keyspace())) + .collect(); + let query = VersionedKeySpaceQuery::scattered(query); let res = self - .get_vectored(keyspace, effective_lsn, io_concurrency, &ctx) - .maybe_perf_instrument(&ctx, |current_perf_span| current_perf_span.clone()) + .get_vectored(query, io_concurrency, ctx) + .maybe_perf_instrument(ctx, |current_perf_span| { + info_span!( + target: PERF_TRACE_TARGET, + parent: current_perf_span, + "GET_BATCH", + batch_size = %page_count, + ) + }) .await; match res { @@ -381,12 
+360,12 @@ impl Timeline { // There is no standardized way to express that the batched span followed from N request spans. // So, abuse the system and mark the request contexts as follows_from the batch span, so we get // some linkage in our trace viewer. It allows us to answer: which GET_VECTORED did this GET_PAGE wait for. - req_ctx.perf_follows_from(&ctx); + req_ctx.perf_follows_from(ctx); slots_filled += 1; } result_slots[first_slot].write(res); - first_req_ctx.perf_follows_from(&ctx); + first_req_ctx.perf_follows_from(ctx); slots_filled += 1; } } @@ -425,7 +404,7 @@ impl Timeline { } }; - req_ctx.perf_follows_from(&ctx); + req_ctx.perf_follows_from(ctx); result_slots[*slot].write(err); } @@ -664,8 +643,9 @@ impl Timeline { let mut segment = BytesMut::with_capacity(n_blocks as usize * BLCKSZ as usize); for batch in batches.parts { + let query = VersionedKeySpaceQuery::uniform(batch, lsn); let blocks = self - .get_vectored(batch, lsn, io_concurrency.clone(), ctx) + .get_vectored(query, io_concurrency.clone(), ctx) .await?; for (_key, block) in blocks { @@ -691,7 +671,7 @@ impl Timeline { Ok(buf.get_u32_le()) } - /// Get size of an SLRU segment + /// Does the slru segment exist? pub(crate) async fn get_slru_segment_exists( &self, kind: SlruKind, @@ -844,9 +824,9 @@ impl Timeline { .await } - /// Obtain the possible timestamp range for the given lsn. + /// Obtain the timestamp for the given lsn. /// - /// If the lsn has no timestamps, returns None. returns `(min, max, median)` if it has timestamps. + /// If the lsn has no timestamps (e.g. no commits), returns None. 
pub(crate) async fn get_timestamp_for_lsn( &self, probe_lsn: Lsn, @@ -902,8 +882,9 @@ impl Timeline { ); for batch in batches.parts.into_iter().rev() { + let query = VersionedKeySpaceQuery::uniform(batch, probe_lsn); let blocks = self - .get_vectored(batch, probe_lsn, io_concurrency.clone(), ctx) + .get_vectored(query, io_concurrency.clone(), ctx) .await?; for (_key, clog_page) in blocks.into_iter().rev() { @@ -1478,8 +1459,8 @@ impl DatadirModification<'_> { } /// Set the current lsn - pub(crate) fn set_lsn(&mut self, lsn: Lsn) -> anyhow::Result<()> { - ensure!( + pub(crate) fn set_lsn(&mut self, lsn: Lsn) -> Result<(), WalIngestError> { + ensure_walingest!( lsn >= self.lsn, "setting an older lsn {} than {} is not allowed", lsn, @@ -1578,7 +1559,7 @@ impl DatadirModification<'_> { &mut self, rel: RelTag, ctx: &RequestContext, - ) -> Result { + ) -> Result { // Get current size and put rel creation if rel doesn't exist // // NOTE: we check the cache first even though get_rel_exists and get_rel_size would @@ -1593,14 +1574,13 @@ impl DatadirModification<'_> { .await? { // create it with 0 size initially, the logic below will extend it - self.put_rel_creation(rel, 0, ctx) - .await - .context("Relation Error")?; + self.put_rel_creation(rel, 0, ctx).await?; Ok(0) } else { - self.tline + Ok(self + .tline .get_rel_size(rel, Version::Modified(self), ctx) - .await + .await?) 
} } @@ -1637,11 +1617,14 @@ impl DatadirModification<'_> { // TODO(vlad): remove this argument and replace the shard check with is_key_local shard: &ShardIdentity, ctx: &RequestContext, - ) -> anyhow::Result<()> { + ) -> Result<(), WalIngestError> { let mut gaps_at_lsns = Vec::default(); for meta in batch.metadata.iter() { - let (rel, blkno) = Key::from_compact(meta.key()).to_rel_block()?; + let key = Key::from_compact(meta.key()); + let (rel, blkno) = key + .to_rel_block() + .map_err(|_| WalIngestErrorKind::InvalidKey(key, meta.lsn()))?; let new_nblocks = blkno + 1; let old_nblocks = self.create_relation_if_required(rel, ctx).await?; @@ -1683,8 +1666,8 @@ impl DatadirModification<'_> { rel: RelTag, blknum: BlockNumber, rec: NeonWalRecord, - ) -> anyhow::Result<()> { - anyhow::ensure!(rel.relnode != 0, RelationError::InvalidRelnode); + ) -> Result<(), WalIngestError> { + ensure_walingest!(rel.relnode != 0, RelationError::InvalidRelnode); self.put(rel_block_to_key(rel, blknum), Value::WalRecord(rec)); Ok(()) } @@ -1696,7 +1679,7 @@ impl DatadirModification<'_> { segno: u32, blknum: BlockNumber, rec: NeonWalRecord, - ) -> anyhow::Result<()> { + ) -> Result<(), WalIngestError> { if !self.tline.tenant_shard_id.is_shard_zero() { return Ok(()); } @@ -1714,14 +1697,11 @@ impl DatadirModification<'_> { rel: RelTag, blknum: BlockNumber, img: Bytes, - ) -> anyhow::Result<()> { - anyhow::ensure!(rel.relnode != 0, RelationError::InvalidRelnode); + ) -> Result<(), WalIngestError> { + ensure_walingest!(rel.relnode != 0, RelationError::InvalidRelnode); let key = rel_block_to_key(rel, blknum); if !key.is_valid_key_on_write_path() { - anyhow::bail!( - "the request contains data not supported by pageserver at {}", - key - ); + Err(WalIngestErrorKind::InvalidKey(key, self.lsn))?; } self.put(rel_block_to_key(rel, blknum), Value::Image(img)); Ok(()) @@ -1733,15 +1713,12 @@ impl DatadirModification<'_> { segno: u32, blknum: BlockNumber, img: Bytes, - ) -> anyhow::Result<()> { + ) -> 
Result<(), WalIngestError> { assert!(self.tline.tenant_shard_id.is_shard_zero()); let key = slru_block_to_key(kind, segno, blknum); if !key.is_valid_key_on_write_path() { - anyhow::bail!( - "the request contains data not supported by pageserver at {}", - key - ); + Err(WalIngestErrorKind::InvalidKey(key, self.lsn))?; } self.put(key, Value::Image(img)); Ok(()) @@ -1751,15 +1728,11 @@ impl DatadirModification<'_> { &mut self, rel: RelTag, blknum: BlockNumber, - ) -> anyhow::Result<()> { - anyhow::ensure!(rel.relnode != 0, RelationError::InvalidRelnode); + ) -> Result<(), WalIngestError> { + ensure_walingest!(rel.relnode != 0, RelationError::InvalidRelnode); let key = rel_block_to_key(rel, blknum); if !key.is_valid_key_on_write_path() { - anyhow::bail!( - "the request contains data not supported by pageserver: {} @ {}", - key, - self.lsn - ); + Err(WalIngestErrorKind::InvalidKey(key, self.lsn))?; } let batch = self @@ -1776,15 +1749,11 @@ impl DatadirModification<'_> { kind: SlruKind, segno: u32, blknum: BlockNumber, - ) -> anyhow::Result<()> { + ) -> Result<(), WalIngestError> { assert!(self.tline.tenant_shard_id.is_shard_zero()); let key = slru_block_to_key(kind, segno, blknum); if !key.is_valid_key_on_write_path() { - anyhow::bail!( - "the request contains data not supported by pageserver: {} @ {}", - key, - self.lsn - ); + Err(WalIngestErrorKind::InvalidKey(key, self.lsn))?; } let batch = self @@ -1832,8 +1801,10 @@ impl DatadirModification<'_> { dbnode: Oid, img: Bytes, ctx: &RequestContext, - ) -> anyhow::Result<()> { - let v2_enabled = self.maybe_enable_rel_size_v2()?; + ) -> Result<(), WalIngestError> { + let v2_enabled = self + .maybe_enable_rel_size_v2() + .map_err(WalIngestErrorKind::MaybeRelSizeV2Error)?; // Add it to the directory (if it doesn't exist already) let buf = self.get(DBDIR_KEY, ctx).await?; @@ -1874,13 +1845,13 @@ impl DatadirModification<'_> { xid: u64, img: Bytes, ctx: &RequestContext, - ) -> anyhow::Result<()> { + ) -> Result<(), 
WalIngestError> { // Add it to the directory entry let dirbuf = self.get(TWOPHASEDIR_KEY, ctx).await?; let newdirbuf = if self.tline.pg_version >= 17 { let mut dir = TwoPhaseDirectoryV17::des(&dirbuf)?; if !dir.xids.insert(xid) { - anyhow::bail!("twophase file for xid {} already exists", xid); + Err(WalIngestErrorKind::FileAlreadyExists(xid))?; } self.pending_directory_entries.push(( DirectoryKind::TwoPhase, @@ -1891,7 +1862,7 @@ impl DatadirModification<'_> { let xid = xid as u32; let mut dir = TwoPhaseDirectory::des(&dirbuf)?; if !dir.xids.insert(xid) { - anyhow::bail!("twophase file for xid {} already exists", xid); + Err(WalIngestErrorKind::FileAlreadyExists(xid.into()))?; } self.pending_directory_entries.push(( DirectoryKind::TwoPhase, @@ -1909,22 +1880,22 @@ impl DatadirModification<'_> { &mut self, origin_id: RepOriginId, origin_lsn: Lsn, - ) -> anyhow::Result<()> { + ) -> Result<(), WalIngestError> { let key = repl_origin_key(origin_id); self.put(key, Value::Image(origin_lsn.ser().unwrap().into())); Ok(()) } - pub async fn drop_replorigin(&mut self, origin_id: RepOriginId) -> anyhow::Result<()> { + pub async fn drop_replorigin(&mut self, origin_id: RepOriginId) -> Result<(), WalIngestError> { self.set_replorigin(origin_id, Lsn::INVALID).await } - pub fn put_control_file(&mut self, img: Bytes) -> anyhow::Result<()> { + pub fn put_control_file(&mut self, img: Bytes) -> Result<(), WalIngestError> { self.put(CONTROLFILE_KEY, Value::Image(img)); Ok(()) } - pub fn put_checkpoint(&mut self, img: Bytes) -> anyhow::Result<()> { + pub fn put_checkpoint(&mut self, img: Bytes) -> Result<(), WalIngestError> { self.put(CHECKPOINT_KEY, Value::Image(img)); Ok(()) } @@ -1934,7 +1905,7 @@ impl DatadirModification<'_> { spcnode: Oid, dbnode: Oid, ctx: &RequestContext, - ) -> anyhow::Result<()> { + ) -> Result<(), WalIngestError> { let total_blocks = self .tline .get_db_size(spcnode, dbnode, Version::Modified(self), ctx) @@ -1973,20 +1944,21 @@ impl DatadirModification<'_> { 
rel: RelTag, nblocks: BlockNumber, ctx: &RequestContext, - ) -> Result<(), RelationError> { + ) -> Result<(), WalIngestError> { if rel.relnode == 0 { - return Err(RelationError::InvalidRelnode); + Err(WalIngestErrorKind::LogicalError(anyhow::anyhow!( + "invalid relnode" + )))?; } // It's possible that this is the first rel for this db in this // tablespace. Create the reldir entry for it if so. - let mut dbdir = DbDirectory::des(&self.get(DBDIR_KEY, ctx).await.context("read db")?) - .context("deserialize db")?; + let mut dbdir = DbDirectory::des(&self.get(DBDIR_KEY, ctx).await?)?; let dbdir_exists = if let hash_map::Entry::Vacant(e) = dbdir.dbdirs.entry((rel.spcnode, rel.dbnode)) { // Didn't exist. Update dbdir e.insert(false); - let buf = DbDirectory::ser(&dbdir).context("serialize db")?; + let buf = DbDirectory::ser(&dbdir)?; self.pending_directory_entries.push(( DirectoryKind::Db, MetricsUpdate::Set(dbdir.dbdirs.len() as u64), @@ -2003,27 +1975,25 @@ impl DatadirModification<'_> { RelDirectory::default() } else { // reldir already exists, fetch it - RelDirectory::des(&self.get(rel_dir_key, ctx).await.context("read db")?) - .context("deserialize db")? + RelDirectory::des(&self.get(rel_dir_key, ctx).await?)? 
}; - let v2_enabled = self.maybe_enable_rel_size_v2()?; + let v2_enabled = self + .maybe_enable_rel_size_v2() + .map_err(WalIngestErrorKind::MaybeRelSizeV2Error)?; if v2_enabled { if rel_dir.rels.contains(&(rel.relnode, rel.forknum)) { - return Err(RelationError::AlreadyExists); + Err(WalIngestErrorKind::RelationAlreadyExists(rel))?; } let sparse_rel_dir_key = rel_tag_sparse_key(rel.spcnode, rel.dbnode, rel.relnode, rel.forknum); // check if the rel_dir_key exists in v2 - let val = self - .sparse_get(sparse_rel_dir_key, ctx) - .await - .map_err(|e| RelationError::Other(e.into()))?; + let val = self.sparse_get(sparse_rel_dir_key, ctx).await?; let val = RelDirExists::decode_option(val) - .map_err(|_| RelationError::Other(anyhow::anyhow!("invalid reldir key")))?; + .map_err(|_| WalIngestErrorKind::InvalidRelDirKey(sparse_rel_dir_key))?; if val == RelDirExists::Exists { - return Err(RelationError::AlreadyExists); + Err(WalIngestErrorKind::RelationAlreadyExists(rel))?; } self.put( sparse_rel_dir_key, @@ -2039,9 +2009,7 @@ impl DatadirModification<'_> { // will be key not found errors if we don't create an empty one for rel_size_v2. 
self.put( rel_dir_key, - Value::Image(Bytes::from( - RelDirectory::ser(&RelDirectory::default()).context("serialize")?, - )), + Value::Image(Bytes::from(RelDirectory::ser(&RelDirectory::default())?)), ); } self.pending_directory_entries @@ -2049,7 +2017,7 @@ impl DatadirModification<'_> { } else { // Add the new relation to the rel directory entry, and write it back if !rel_dir.rels.insert((rel.relnode, rel.forknum)) { - return Err(RelationError::AlreadyExists); + Err(WalIngestErrorKind::RelationAlreadyExists(rel))?; } if !dbdir_exists { self.pending_directory_entries @@ -2059,9 +2027,7 @@ impl DatadirModification<'_> { .push((DirectoryKind::Rel, MetricsUpdate::Add(1))); self.put( rel_dir_key, - Value::Image(Bytes::from( - RelDirectory::ser(&rel_dir).context("serialize")?, - )), + Value::Image(Bytes::from(RelDirectory::ser(&rel_dir)?)), ); } @@ -2086,8 +2052,8 @@ impl DatadirModification<'_> { rel: RelTag, nblocks: BlockNumber, ctx: &RequestContext, - ) -> anyhow::Result<()> { - anyhow::ensure!(rel.relnode != 0, RelationError::InvalidRelnode); + ) -> Result<(), WalIngestError> { + ensure_walingest!(rel.relnode != 0, RelationError::InvalidRelnode); if self .tline .get_rel_exists(rel, Version::Modified(self), ctx) @@ -2117,8 +2083,8 @@ impl DatadirModification<'_> { rel: RelTag, nblocks: BlockNumber, ctx: &RequestContext, - ) -> anyhow::Result<()> { - anyhow::ensure!(rel.relnode != 0, RelationError::InvalidRelnode); + ) -> Result<(), WalIngestError> { + ensure_walingest!(rel.relnode != 0, RelationError::InvalidRelnode); // Put size let size_key = rel_size_to_key(rel); @@ -2142,8 +2108,10 @@ impl DatadirModification<'_> { &mut self, drop_relations: HashMap<(u32, u32), Vec>, ctx: &RequestContext, - ) -> anyhow::Result<()> { - let v2_enabled = self.maybe_enable_rel_size_v2()?; + ) -> Result<(), WalIngestError> { + let v2_enabled = self + .maybe_enable_rel_size_v2() + .map_err(WalIngestErrorKind::MaybeRelSizeV2Error)?; for ((spc_node, db_node), rel_tags) in 
drop_relations { let dir_key = rel_dir_to_key(spc_node, db_node); let buf = self.get(dir_key, ctx).await?; @@ -2163,7 +2131,7 @@ impl DatadirModification<'_> { let key = rel_tag_sparse_key(spc_node, db_node, rel_tag.relnode, rel_tag.forknum); let val = RelDirExists::decode_option(self.sparse_get(key, ctx).await?) - .map_err(|_| RelationError::Other(anyhow::anyhow!("invalid reldir key")))?; + .map_err(|_| WalIngestErrorKind::InvalidKey(key, self.lsn))?; if val == RelDirExists::Exists { self.pending_directory_entries .push((DirectoryKind::RelV2, MetricsUpdate::Sub(1))); @@ -2206,7 +2174,7 @@ impl DatadirModification<'_> { segno: u32, nblocks: BlockNumber, ctx: &RequestContext, - ) -> anyhow::Result<()> { + ) -> Result<(), WalIngestError> { assert!(self.tline.tenant_shard_id.is_shard_zero()); // Add it to the directory entry @@ -2215,7 +2183,7 @@ impl DatadirModification<'_> { let mut dir = SlruSegmentDirectory::des(&buf)?; if !dir.segments.insert(segno) { - anyhow::bail!("slru segment {kind:?}/{segno} already exists"); + Err(WalIngestErrorKind::SlruAlreadyExists(kind, segno))?; } self.pending_directory_entries.push(( DirectoryKind::SlruSegment(kind), @@ -2242,7 +2210,7 @@ impl DatadirModification<'_> { kind: SlruKind, segno: u32, nblocks: BlockNumber, - ) -> anyhow::Result<()> { + ) -> Result<(), WalIngestError> { assert!(self.tline.tenant_shard_id.is_shard_zero()); // Put size @@ -2258,7 +2226,7 @@ impl DatadirModification<'_> { kind: SlruKind, segno: u32, ctx: &RequestContext, - ) -> anyhow::Result<()> { + ) -> Result<(), WalIngestError> { // Remove it from the directory entry let dir_key = slru_dir_to_key(kind); let buf = self.get(dir_key, ctx).await?; @@ -2283,7 +2251,7 @@ impl DatadirModification<'_> { } /// Drop a relmapper file (pg_filenode.map) - pub fn drop_relmap_file(&mut self, _spcnode: Oid, _dbnode: Oid) -> anyhow::Result<()> { + pub fn drop_relmap_file(&mut self, _spcnode: Oid, _dbnode: Oid) -> Result<(), WalIngestError> { // TODO Ok(()) } @@ -2293,7 
+2261,7 @@ impl DatadirModification<'_> { &mut self, xid: u64, ctx: &RequestContext, - ) -> anyhow::Result<()> { + ) -> Result<(), WalIngestError> { // Remove it from the directory entry let buf = self.get(TWOPHASEDIR_KEY, ctx).await?; let newdirbuf = if self.tline.pg_version >= 17 { @@ -2308,7 +2276,8 @@ impl DatadirModification<'_> { )); Bytes::from(TwoPhaseDirectoryV17::ser(&dir)?) } else { - let xid: u32 = u32::try_from(xid)?; + let xid: u32 = u32::try_from(xid) + .map_err(|e| WalIngestErrorKind::LogicalError(anyhow::Error::from(e)))?; let mut dir = TwoPhaseDirectory::des(&buf)?; if !dir.xids.remove(&xid) { @@ -2333,7 +2302,7 @@ impl DatadirModification<'_> { path: &str, content: &[u8], ctx: &RequestContext, - ) -> anyhow::Result<()> { + ) -> Result<(), WalIngestError> { let key = aux_file::encode_aux_file_key(path); // retrieve the key from the engine let old_val = match self.get(key, ctx).await { @@ -2342,7 +2311,7 @@ impl DatadirModification<'_> { Err(e) => return Err(e.into()), }; let files: Vec<(&str, &[u8])> = if let Some(ref old_val) = old_val { - aux_file::decode_file_value(old_val)? + aux_file::decode_file_value(old_val).map_err(WalIngestErrorKind::EncodeAuxFileError)? 
} else { Vec::new() }; @@ -2387,7 +2356,8 @@ impl DatadirModification<'_> { } (None, true) => warn!("removing non-existing aux file: {}", path), } - let new_val = aux_file::encode_file_value(&new_files)?; + let new_val = aux_file::encode_file_value(&new_files) + .map_err(WalIngestErrorKind::EncodeAuxFileError)?; self.put(key, Value::Image(new_val.into())); Ok(()) diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 0c399d4c913d..0ba70f45b2f4 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -100,7 +100,7 @@ use crate::tenant::timeline::delete::DeleteTimelineFlow; use crate::tenant::timeline::uninit::cleanup_timeline_directory; use crate::virtual_file::VirtualFile; use crate::walingest::WalLagCooldown; -use crate::walredo::PostgresRedoManager; +use crate::walredo::{PostgresRedoManager, RedoAttemptType}; use crate::{InitializationOrder, TEMP_FILE_SUFFIX, import_datadir, span, task_mgr, walredo}; static INIT_DB_SEMAPHORE: Lazy = Lazy::new(|| Semaphore::new(8)); @@ -473,15 +473,16 @@ impl WalRedoManager { base_img: Option<(Lsn, bytes::Bytes)>, records: Vec<(Lsn, pageserver_api::record::NeonWalRecord)>, pg_version: u32, + redo_attempt_type: RedoAttemptType, ) -> Result { match self { Self::Prod(_, mgr) => { - mgr.request_redo(key, lsn, base_img, records, pg_version) + mgr.request_redo(key, lsn, base_img, records, pg_version, redo_attempt_type) .await } #[cfg(test)] Self::Test(mgr) => { - mgr.request_redo(key, lsn, base_img, records, pg_version) + mgr.request_redo(key, lsn, base_img, records, pg_version, redo_attempt_type) .await } } @@ -920,6 +921,7 @@ enum StartCreatingTimelineResult { Idempotent(Arc), } +#[allow(clippy::large_enum_variant, reason = "TODO")] enum TimelineInitAndSyncResult { ReadyToActivate(Arc), NeedsSpawnImportPgdata(TimelineInitAndSyncNeedsSpawnImportPgdata), @@ -1006,6 +1008,7 @@ enum CreateTimelineCause { Delete, } +#[allow(clippy::large_enum_variant, reason = "TODO")] enum LoadTimelineCause { Attach, 
Unoffload, @@ -4079,6 +4082,7 @@ impl Tenant { TenantManifest { version: LATEST_TENANT_MANIFEST_VERSION, + stripe_size: Some(self.get_shard_stripe_size()), offloaded_timelines, } } @@ -4398,10 +4402,7 @@ impl Tenant { .to_string(); fail::fail_point!("tenant-config-before-write", |_| { - Err(std::io::Error::new( - std::io::ErrorKind::Other, - "tenant-config-before-write", - )) + Err(std::io::Error::other("tenant-config-before-write")) }); // Convert the config to a toml file. @@ -5879,6 +5880,7 @@ pub(crate) mod harness { base_img: Option<(Lsn, Bytes)>, records: Vec<(Lsn, NeonWalRecord)>, _pg_version: u32, + _redo_attempt_type: RedoAttemptType, ) -> Result { let records_neon = records.iter().all(|r| apply_neon::can_apply_in_neon(&r.1)); if records_neon { @@ -5931,12 +5933,20 @@ mod tests { use models::CompactLsnRange; use pageserver_api::key::{AUX_KEY_PREFIX, Key, NON_INHERITED_RANGE, RELATION_SIZE_PREFIX}; use pageserver_api::keyspace::KeySpace; + #[cfg(feature = "testing")] + use pageserver_api::keyspace::KeySpaceRandomAccum; use pageserver_api::models::{CompactionAlgorithm, CompactionAlgorithmSettings}; #[cfg(feature = "testing")] use pageserver_api::record::NeonWalRecord; use pageserver_api::value::Value; use pageserver_compaction::helpers::overlaps_with; + #[cfg(feature = "testing")] + use rand::SeedableRng; + #[cfg(feature = "testing")] + use rand::rngs::StdRng; use rand::{Rng, thread_rng}; + #[cfg(feature = "testing")] + use std::ops::Range; use storage_layer::{IoConcurrency, PersistentLayerKey}; use tests::storage_layer::ValuesReconstructState; use tests::timeline::{GetVectoredError, ShutdownMode}; @@ -5946,7 +5956,7 @@ mod tests { use timeline::InMemoryLayerTestDesc; #[cfg(feature = "testing")] use timeline::compaction::{KeyHistoryRetention, KeyLogAtLsn}; - use timeline::{CompactOptions, DeltaLayerTestDesc}; + use timeline::{CompactOptions, DeltaLayerTestDesc, VersionedKeySpaceQuery}; use utils::id::TenantId; use super::*; @@ -5958,6 +5968,318 @@ mod tests 
{ static TEST_KEY: Lazy = Lazy::new(|| Key::from_slice(&hex!("010000000033333333444444445500000001"))); + #[cfg(feature = "testing")] + struct TestTimelineSpecification { + start_lsn: Lsn, + last_record_lsn: Lsn, + + in_memory_layers_shape: Vec<(Range, Range)>, + delta_layers_shape: Vec<(Range, Range)>, + image_layers_shape: Vec<(Range, Lsn)>, + + gap_chance: u8, + will_init_chance: u8, + } + + #[cfg(feature = "testing")] + struct Storage { + storage: HashMap<(Key, Lsn), Value>, + start_lsn: Lsn, + } + + #[cfg(feature = "testing")] + impl Storage { + fn get(&self, key: Key, lsn: Lsn) -> Bytes { + use bytes::BufMut; + + let mut crnt_lsn = lsn; + let mut got_base = false; + + let mut acc = Vec::new(); + + while crnt_lsn >= self.start_lsn { + if let Some(value) = self.storage.get(&(key, crnt_lsn)) { + acc.push(value.clone()); + + match value { + Value::WalRecord(NeonWalRecord::Test { will_init, .. }) => { + if *will_init { + got_base = true; + break; + } + } + Value::Image(_) => { + got_base = true; + break; + } + _ => unreachable!(), + } + } + + crnt_lsn = crnt_lsn.checked_sub(1u64).unwrap(); + } + + assert!( + got_base, + "Input data was incorrect. No base image for {key}@{lsn}" + ); + + tracing::debug!("Wal redo depth for {key}@{lsn} is {}", acc.len()); + + let mut blob = BytesMut::new(); + for value in acc.into_iter().rev() { + match value { + Value::WalRecord(NeonWalRecord::Test { append, .. 
}) => { + blob.extend_from_slice(append.as_bytes()); + } + Value::Image(img) => { + blob.put(img); + } + _ => unreachable!(), + } + } + + blob.into() + } + } + + #[cfg(feature = "testing")] + #[allow(clippy::too_many_arguments)] + async fn randomize_timeline( + tenant: &Arc, + new_timeline_id: TimelineId, + pg_version: u32, + spec: TestTimelineSpecification, + random: &mut rand::rngs::StdRng, + ctx: &RequestContext, + ) -> anyhow::Result<(Arc, Storage, Vec)> { + let mut storage: HashMap<(Key, Lsn), Value> = HashMap::default(); + let mut interesting_lsns = vec![spec.last_record_lsn]; + + for (key_range, lsn_range) in spec.in_memory_layers_shape.iter() { + let mut lsn = lsn_range.start; + while lsn < lsn_range.end { + let mut key = key_range.start; + while key < key_range.end { + let gap = random.gen_range(1..=100) <= spec.gap_chance; + let will_init = random.gen_range(1..=100) <= spec.will_init_chance; + + if gap { + continue; + } + + let record = if will_init { + Value::WalRecord(NeonWalRecord::wal_init(format!("[wil_init {key}@{lsn}]"))) + } else { + Value::WalRecord(NeonWalRecord::wal_append(format!("[delta {key}@{lsn}]"))) + }; + + storage.insert((key, lsn), record); + + key = key.next(); + } + lsn = Lsn(lsn.0 + 1); + } + + // Stash some interesting LSN for future use + for offset in [0, 5, 100].iter() { + if *offset == 0 { + interesting_lsns.push(lsn_range.start); + } else { + let below = lsn_range.start.checked_sub(*offset); + match below { + Some(v) if v >= spec.start_lsn => { + interesting_lsns.push(v); + } + _ => {} + } + + let above = Lsn(lsn_range.start.0 + offset); + interesting_lsns.push(above); + } + } + } + + for (key_range, lsn_range) in spec.delta_layers_shape.iter() { + let mut lsn = lsn_range.start; + while lsn < lsn_range.end { + let mut key = key_range.start; + while key < key_range.end { + let gap = random.gen_range(1..=100) <= spec.gap_chance; + let will_init = random.gen_range(1..=100) <= spec.will_init_chance; + + if gap { + continue; + } + 
+ let record = if will_init { + Value::WalRecord(NeonWalRecord::wal_init(format!("[wil_init {key}@{lsn}]"))) + } else { + Value::WalRecord(NeonWalRecord::wal_append(format!("[delta {key}@{lsn}]"))) + }; + + storage.insert((key, lsn), record); + + key = key.next(); + } + lsn = Lsn(lsn.0 + 1); + } + + // Stash some interesting LSN for future use + for offset in [0, 5, 100].iter() { + if *offset == 0 { + interesting_lsns.push(lsn_range.start); + } else { + let below = lsn_range.start.checked_sub(*offset); + match below { + Some(v) if v >= spec.start_lsn => { + interesting_lsns.push(v); + } + _ => {} + } + + let above = Lsn(lsn_range.start.0 + offset); + interesting_lsns.push(above); + } + } + } + + for (key_range, lsn) in spec.image_layers_shape.iter() { + let mut key = key_range.start; + while key < key_range.end { + let blob = Bytes::from(format!("[image {key}@{lsn}]")); + let record = Value::Image(blob.clone()); + storage.insert((key, *lsn), record); + + key = key.next(); + } + + // Stash some interesting LSN for future use + for offset in [0, 5, 100].iter() { + if *offset == 0 { + interesting_lsns.push(*lsn); + } else { + let below = lsn.checked_sub(*offset); + match below { + Some(v) if v >= spec.start_lsn => { + interesting_lsns.push(v); + } + _ => {} + } + + let above = Lsn(lsn.0 + offset); + interesting_lsns.push(above); + } + } + } + + let in_memory_test_layers = { + let mut acc = Vec::new(); + + for (key_range, lsn_range) in spec.in_memory_layers_shape.iter() { + let mut data = Vec::new(); + + let mut lsn = lsn_range.start; + while lsn < lsn_range.end { + let mut key = key_range.start; + while key < key_range.end { + if let Some(record) = storage.get(&(key, lsn)) { + data.push((key, lsn, record.clone())); + } + + key = key.next(); + } + lsn = Lsn(lsn.0 + 1); + } + + acc.push(InMemoryLayerTestDesc { + data, + lsn_range: lsn_range.clone(), + is_open: false, + }) + } + + acc + }; + + let delta_test_layers = { + let mut acc = Vec::new(); + + for (key_range, 
lsn_range) in spec.delta_layers_shape.iter() { + let mut data = Vec::new(); + + let mut lsn = lsn_range.start; + while lsn < lsn_range.end { + let mut key = key_range.start; + while key < key_range.end { + if let Some(record) = storage.get(&(key, lsn)) { + data.push((key, lsn, record.clone())); + } + + key = key.next(); + } + lsn = Lsn(lsn.0 + 1); + } + + acc.push(DeltaLayerTestDesc { + data, + lsn_range: lsn_range.clone(), + key_range: key_range.clone(), + }) + } + + acc + }; + + let image_test_layers = { + let mut acc = Vec::new(); + + for (key_range, lsn) in spec.image_layers_shape.iter() { + let mut data = Vec::new(); + + let mut key = key_range.start; + while key < key_range.end { + if let Some(record) = storage.get(&(key, *lsn)) { + let blob = match record { + Value::Image(blob) => blob.clone(), + _ => unreachable!(), + }; + + data.push((key, blob)); + } + + key = key.next(); + } + + acc.push((*lsn, data)); + } + + acc + }; + + let tline = tenant + .create_test_timeline_with_layers( + new_timeline_id, + spec.start_lsn, + pg_version, + ctx, + in_memory_test_layers, + delta_test_layers, + image_test_layers, + spec.last_record_lsn, + ) + .await?; + + Ok(( + tline, + Storage { + storage, + start_lsn: spec.start_lsn, + }, + interesting_lsns, + )) + } + #[tokio::test] async fn test_basic() -> anyhow::Result<()> { let (tenant, ctx) = TenantHarness::create("test_basic").await?.load().await; @@ -6784,10 +7106,11 @@ mod tests { for read in reads { info!("Doing vectored read on {:?}", read); + let query = VersionedKeySpaceQuery::uniform(read.clone(), reads_lsn); + let vectored_res = tline .get_vectored_impl( - read.clone(), - reads_lsn, + query, &mut ValuesReconstructState::new(io_concurrency.clone()), &ctx, ) @@ -6866,10 +7189,11 @@ mod tests { }; let read_lsn = child_timeline.get_last_record_lsn(); + let query = VersionedKeySpaceQuery::uniform(aux_keyspace.clone(), read_lsn); + let vectored_res = child_timeline .get_vectored_impl( - aux_keyspace.clone(), - read_lsn, + 
query, &mut ValuesReconstructState::new(io_concurrency.clone()), &ctx, ) @@ -7015,10 +7339,12 @@ mod tests { let read = KeySpace { ranges: vec![key_near_gap..gap_at_key.next(), key_near_end..current_key], }; + + let query = VersionedKeySpaceQuery::uniform(read.clone(), current_lsn); + let results = child_timeline .get_vectored_impl( - read.clone(), - current_lsn, + query, &mut ValuesReconstructState::new(io_concurrency.clone()), &ctx, ) @@ -7149,12 +7475,16 @@ mod tests { } for query_lsn in query_lsns { + let query = VersionedKeySpaceQuery::uniform( + KeySpace { + ranges: vec![child_gap_at_key..child_gap_at_key.next()], + }, + query_lsn, + ); + let results = child_timeline .get_vectored_impl( - KeySpace { - ranges: vec![child_gap_at_key..child_gap_at_key.next()], - }, - query_lsn, + query, &mut ValuesReconstructState::new(io_concurrency.clone()), &ctx, ) @@ -7653,10 +7983,11 @@ mod tests { } let mut cnt = 0; + let query = VersionedKeySpaceQuery::uniform(keyspace.clone(), lsn); + for (key, value) in tline .get_vectored_impl( - keyspace.clone(), - lsn, + query, &mut ValuesReconstructState::new(io_concurrency.clone()), &ctx, ) @@ -7863,8 +8194,9 @@ mod tests { io_concurrency: IoConcurrency, ) -> anyhow::Result<(BTreeMap>, usize)> { let mut reconstruct_state = ValuesReconstructState::new(io_concurrency); + let query = VersionedKeySpaceQuery::uniform(keyspace.clone(), lsn); let res = tline - .get_vectored_impl(keyspace.clone(), lsn, &mut reconstruct_state, ctx) + .get_vectored_impl(query, &mut reconstruct_state, ctx) .await?; Ok((res, reconstruct_state.get_delta_layers_visited() as usize)) } @@ -8161,13 +8493,10 @@ mod tests { // test vectored scan on parent timeline let mut reconstruct_state = ValuesReconstructState::new(io_concurrency.clone()); + let query = + VersionedKeySpaceQuery::uniform(KeySpace::single(Key::metadata_key_range()), lsn); let res = tline - .get_vectored_impl( - KeySpace::single(Key::metadata_key_range()), - lsn, - &mut reconstruct_state, - &ctx, - 
) + .get_vectored_impl(query, &mut reconstruct_state, &ctx) .await?; assert_eq!( @@ -8187,13 +8516,10 @@ mod tests { // test vectored scan on child timeline let mut reconstruct_state = ValuesReconstructState::new(io_concurrency.clone()); + let query = + VersionedKeySpaceQuery::uniform(KeySpace::single(Key::metadata_key_range()), lsn); let res = child - .get_vectored_impl( - KeySpace::single(Key::metadata_key_range()), - lsn, - &mut reconstruct_state, - &ctx, - ) + .get_vectored_impl(query, &mut reconstruct_state, &ctx) .await?; assert_eq!( @@ -8227,13 +8553,9 @@ mod tests { let io_concurrency = IoConcurrency::spawn_from_conf(tline.conf, tline.gate.enter().unwrap()); let mut reconstruct_state = ValuesReconstructState::new(io_concurrency); + let query = VersionedKeySpaceQuery::uniform(KeySpace::single(key..key.next()), lsn); let mut res = tline - .get_vectored_impl( - KeySpace::single(key..key.next()), - lsn, - &mut reconstruct_state, - ctx, - ) + .get_vectored_impl(query, &mut reconstruct_state, ctx) .await?; Ok(res.pop_last().map(|(k, v)| { assert_eq!(k, key); @@ -8733,6 +9055,21 @@ mod tests { Lsn(0x20), Value::WalRecord(NeonWalRecord::wal_init("i")), ), + ( + get_key(4), + Lsn(0x30), + Value::WalRecord(NeonWalRecord::wal_append_conditional("j", "i")), + ), + ( + get_key(5), + Lsn(0x20), + Value::WalRecord(NeonWalRecord::wal_init("1")), + ), + ( + get_key(5), + Lsn(0x30), + Value::WalRecord(NeonWalRecord::wal_append_conditional("j", "2")), + ), ]; let image1 = vec![(get_key(1), "0x10".into())]; @@ -8763,8 +9100,18 @@ mod tests { // Need to remove the limit of "Neon WAL redo requires base image". 
- // assert_eq!(tline.get(get_key(3), Lsn(0x50), &ctx).await?, Bytes::new()); - // assert_eq!(tline.get(get_key(4), Lsn(0x50), &ctx).await?, Bytes::new()); + assert_eq!( + tline.get(get_key(3), Lsn(0x50), &ctx).await?, + Bytes::from_static(b"c") + ); + assert_eq!( + tline.get(get_key(4), Lsn(0x50), &ctx).await?, + Bytes::from_static(b"ij") + ); + + // Manual testing required: currently, read errors will panic the process in debug mode. So we + // cannot enable this assertion in the unit test. + // assert!(tline.get(get_key(5), Lsn(0x50), &ctx).await.is_err()); Ok(()) } @@ -9230,6 +9577,7 @@ mod tests { &[Lsn(0x20), Lsn(0x40), Lsn(0x50)], 3, None, + true, ) .await .unwrap(); @@ -9354,7 +9702,15 @@ mod tests { ), ]; let res = tline - .generate_key_retention(key, &history, Lsn(0x60), &[Lsn(0x40), Lsn(0x50)], 3, None) + .generate_key_retention( + key, + &history, + Lsn(0x60), + &[Lsn(0x40), Lsn(0x50)], + 3, + None, + true, + ) .await .unwrap(); let expected_res = KeyHistoryRetention { @@ -9433,6 +9789,7 @@ mod tests { &[], 3, Some((key, Lsn(0x10), Bytes::copy_from_slice(b"0x10"))), + true, ) .await .unwrap(); @@ -9481,6 +9838,7 @@ mod tests { &[Lsn(0x30)], 3, Some((key, Lsn(0x10), Bytes::copy_from_slice(b"0x10"))), + true, ) .await .unwrap(); @@ -10331,14 +10689,13 @@ mod tests { ) .await?; - let keyspace = KeySpace::single(get_key(0)..get_key(10)); + let query = VersionedKeySpaceQuery::uniform( + KeySpace::single(get_key(0)..get_key(10)), + delta_layer_end_lsn, + ); + let results = tline - .get_vectored( - keyspace, - delta_layer_end_lsn, - IoConcurrency::sequential(), - &ctx, - ) + .get_vectored(query, IoConcurrency::sequential(), &ctx) .await .expect("No vectored errors"); for (key, res) in results { @@ -10486,9 +10843,13 @@ mod tests { ) .await?; - let keyspace = KeySpace::single(get_key(0)..get_key(10)); + let query = VersionedKeySpaceQuery::uniform( + KeySpace::single(get_key(0)..get_key(10)), + last_record_lsn, + ); + let results = tline - 
.get_vectored(keyspace, last_record_lsn, IoConcurrency::sequential(), &ctx) + .get_vectored(query, IoConcurrency::sequential(), &ctx) .await .expect("No vectored errors"); for (key, res) in results { @@ -10502,6 +10863,214 @@ mod tests { Ok(()) } + // A randomized read path test. Generates a layer map according to a deterministic + // specification. Fills the (key, LSN) space in a random manner and then performs + // random scattered queries validating the results against in-memory storage. + // + // See this internal Notion page for a diagram of the layer map: + // https://www.notion.so/neondatabase/Read-Path-Unit-Testing-Fuzzing-1d1f189e0047806c8e5cd37781b0a350?pvs=4 + // + // A fuzzing mode is also supported. In this mode, the test will use a random + // seed instead of a hardcoded one. Use it in conjunction with `cargo stress` + // to run multiple instances in parallel: + // + // $ RUST_BACKTRACE=1 RUST_LOG=INFO \ + // cargo stress --package=pageserver --features=testing,fuzz-read-path --release -- test_read_path + #[cfg(feature = "testing")] + #[tokio::test] + async fn test_read_path() -> anyhow::Result<()> { + use rand::seq::SliceRandom; + + let seed = if cfg!(feature = "fuzz-read-path") { + let seed: u64 = thread_rng().r#gen(); + seed + } else { + // Use a hard-coded seed when not in fuzzing mode. + // Note that with the current approach results are not reproducible + // across platforms and Rust releases.
+ const SEED: u64 = 0; + SEED + }; + + let mut random = StdRng::seed_from_u64(seed); + + let (queries, will_init_chance, gap_chance) = if cfg!(feature = "fuzz-read-path") { + const QUERIES: u64 = 5000; + let will_init_chance: u8 = random.gen_range(0..=10); + let gap_chance: u8 = random.gen_range(0..=50); + + (QUERIES, will_init_chance, gap_chance) + } else { + const QUERIES: u64 = 1000; + const WILL_INIT_CHANCE: u8 = 1; + const GAP_CHANCE: u8 = 5; + + (QUERIES, WILL_INIT_CHANCE, GAP_CHANCE) + }; + + let harness = TenantHarness::create("test_read_path").await?; + let (tenant, ctx) = harness.load().await; + + tracing::info!("Using random seed: {seed}"); + tracing::info!(%will_init_chance, %gap_chance, "Fill params"); + + // Define the layer map shape. Note that this part is not randomized. + + const KEY_DIMENSION_SIZE: u32 = 99; + let start_key = Key::from_hex("110000000033333333444444445500000000").unwrap(); + let end_key = start_key.add(KEY_DIMENSION_SIZE); + let total_key_range = start_key..end_key; + let total_key_range_size = end_key.to_i128() - start_key.to_i128(); + let total_start_lsn = Lsn(104); + let last_record_lsn = Lsn(504); + + assert!(total_key_range_size % 3 == 0); + + let in_memory_layers_shape = vec![ + (total_key_range.clone(), Lsn(304)..Lsn(400)), + (total_key_range.clone(), Lsn(400)..last_record_lsn), + ]; + + let delta_layers_shape = vec![ + ( + start_key..(start_key.add((total_key_range_size / 3) as u32)), + Lsn(200)..Lsn(304), + ), + ( + (start_key.add((total_key_range_size / 3) as u32)) + ..(start_key.add((total_key_range_size * 2 / 3) as u32)), + Lsn(200)..Lsn(304), + ), + ( + (start_key.add((total_key_range_size * 2 / 3) as u32)) + ..(start_key.add(total_key_range_size as u32)), + Lsn(200)..Lsn(304), + ), + ]; + + let image_layers_shape = vec![ + ( + start_key.add((total_key_range_size * 2 / 3 - 10) as u32) + ..start_key.add((total_key_range_size * 2 / 3 + 10) as u32), + Lsn(456), + ), + ( + start_key.add((total_key_range_size / 3 - 10) as 
u32) + ..start_key.add((total_key_range_size / 3 + 10) as u32), + Lsn(256), + ), + (total_key_range.clone(), total_start_lsn), + ]; + + let specification = TestTimelineSpecification { + start_lsn: total_start_lsn, + last_record_lsn, + in_memory_layers_shape, + delta_layers_shape, + image_layers_shape, + gap_chance, + will_init_chance, + }; + + // Create and randomly fill in the layers according to the specification + let (tline, storage, interesting_lsns) = randomize_timeline( + &tenant, + TIMELINE_ID, + DEFAULT_PG_VERSION, + specification, + &mut random, + &ctx, + ) + .await?; + + // Now generate queries based on the interesting lsns that we've collected. + // + // While there's still room in the query, pick an interesting LSN and a random + // key. Then roll the dice to see if the next key should also be included in + // the query. When the roll fails, break the "batch" and pick another point in the + // (key, LSN) space. + + const PICK_NEXT_CHANCE: u8 = 50; + for _ in 0..queries { + let query = { + let mut keyspaces_at_lsn: HashMap = HashMap::default(); + let mut used_keys: HashSet = HashSet::default(); + + while used_keys.len() < Timeline::MAX_GET_VECTORED_KEYS as usize { + let selected_lsn = interesting_lsns.choose(&mut random).expect("not empty"); + let mut selected_key = start_key.add(random.gen_range(0..KEY_DIMENSION_SIZE)); + + while used_keys.len() < Timeline::MAX_GET_VECTORED_KEYS as usize { + if used_keys.contains(&selected_key) + || selected_key >= start_key.add(KEY_DIMENSION_SIZE) + { + break; + } + + keyspaces_at_lsn + .entry(*selected_lsn) + .or_default() + .add_key(selected_key); + used_keys.insert(selected_key); + + let pick_next = random.gen_range(0..=100) <= PICK_NEXT_CHANCE; + if pick_next { + selected_key = selected_key.next(); + } else { + break; + } + } + } + + VersionedKeySpaceQuery::scattered( + keyspaces_at_lsn + .into_iter() + .map(|(lsn, acc)| (lsn, acc.to_keyspace())) + .collect(), + ) + }; + + // Run the query and validate the
results + + let results = tline + .get_vectored(query.clone(), IoConcurrency::Sequential, &ctx) + .await; + + let blobs = match results { + Ok(ok) => ok, + Err(err) => { + panic!("seed={seed} Error returned for query {query}: {err}"); + } + }; + + for (key, key_res) in blobs.into_iter() { + match key_res { + Ok(blob) => { + let requested_at_lsn = query.map_key_to_lsn(&key); + let expected = storage.get(key, requested_at_lsn); + + if blob != expected { + tracing::error!( + "seed={seed} Mismatch for {key}@{requested_at_lsn} from query: {query}" + ); + } + + assert_eq!(blob, expected); + } + Err(err) => { + let requested_at_lsn = query.map_key_to_lsn(&key); + + panic!( + "seed={seed} Error returned for {key}@{requested_at_lsn} from query {query}: {err}" + ); + } + } + } + } + + Ok(()) + } + fn sort_layer_key(k1: &PersistentLayerKey, k2: &PersistentLayerKey) -> std::cmp::Ordering { ( k1.is_delta, @@ -11544,6 +12113,99 @@ mod tests { Ok(()) } + #[cfg(feature = "testing")] + #[tokio::test] + async fn test_bottom_most_compation_redo_failure() -> anyhow::Result<()> { + let harness = TenantHarness::create("test_bottom_most_compation_redo_failure").await?; + let (tenant, ctx) = harness.load().await; + + fn get_key(id: u32) -> Key { + // using aux key here b/c they are guaranteed to be inside `collect_keyspace`. 
+ let mut key = Key::from_hex("620000000033333333444444445500000000").unwrap(); + key.field6 = id; + key + } + + let img_layer = (0..10) + .map(|id| (get_key(id), Bytes::from(format!("value {id}@0x10")))) + .collect_vec(); + + let delta1 = vec![ + ( + get_key(1), + Lsn(0x20), + Value::WalRecord(NeonWalRecord::wal_append("@0x20")), + ), + ( + get_key(1), + Lsn(0x24), + Value::WalRecord(NeonWalRecord::wal_append("@0x24")), + ), + ( + get_key(1), + Lsn(0x28), + // This record will fail to redo + Value::WalRecord(NeonWalRecord::wal_append_conditional("@0x28", "???")), + ), + ]; + + let tline = tenant + .create_test_timeline_with_layers( + TIMELINE_ID, + Lsn(0x10), + DEFAULT_PG_VERSION, + &ctx, + vec![], // in-memory layers + vec![DeltaLayerTestDesc::new_with_inferred_key_range( + Lsn(0x20)..Lsn(0x30), + delta1, + )], // delta layers + vec![(Lsn(0x10), img_layer)], // image layers + Lsn(0x50), + ) + .await?; + { + tline + .applied_gc_cutoff_lsn + .lock_for_write() + .store_and_unlock(Lsn(0x30)) + .wait() + .await; + // Update GC info + let mut guard = tline.gc_info.write().unwrap(); + *guard = GcInfo { + retain_lsns: vec![], + cutoffs: GcCutoffs { + time: Lsn(0x30), + space: Lsn(0x30), + }, + leases: Default::default(), + within_ancestor_pitr: false, + }; + } + + let cancel = CancellationToken::new(); + + // Compaction will fail, but should not fire any critical error. + // Gc-compaction currently cannot figure out what keys are not in the keyspace during the compaction + // process. It will always try to redo the logs it reads and if it doesn't work, fail the entire + // compaction job. Tracked in . 
+ let res = tline + .compact_with_gc( + &cancel, + CompactOptions { + compact_key_range: None, + compact_lsn_range: None, + ..Default::default() + }, + &ctx, + ) + .await; + assert!(res.is_err()); + + Ok(()) + } + #[cfg(feature = "testing")] #[tokio::test] async fn test_synthetic_size_calculation_with_invisible_branches() -> anyhow::Result<()> { diff --git a/pageserver/src/tenant/blob_io.rs b/pageserver/src/tenant/blob_io.rs index ff9a7e57b61c..abeaa166a40b 100644 --- a/pageserver/src/tenant/blob_io.rs +++ b/pageserver/src/tenant/blob_io.rs @@ -15,13 +15,14 @@ //! len >= 128: 1CCCXXXX XXXXXXXX XXXXXXXX XXXXXXXX //! use std::cmp::min; -use std::io::{Error, ErrorKind}; +use std::io::Error; use async_compression::Level; use bytes::{BufMut, BytesMut}; use pageserver_api::models::ImageCompressionAlgorithm; use tokio::io::AsyncWriteExt; use tokio_epoll_uring::{BoundedBuf, IoBuf, Slice}; +use tokio_util::sync::CancellationToken; use tracing::warn; use crate::context::RequestContext; @@ -169,7 +170,13 @@ pub struct BlobWriter { } impl BlobWriter { - pub fn new(inner: VirtualFile, start_offset: u64) -> Self { + pub fn new( + inner: VirtualFile, + start_offset: u64, + _gate: &utils::sync::gate::Gate, + _cancel: CancellationToken, + _ctx: &RequestContext, + ) -> Self { Self { inner, offset: start_offset, @@ -331,10 +338,7 @@ impl BlobWriter { return ( ( io_buf.slice_len(), - Err(Error::new( - ErrorKind::Other, - format!("blob too large ({len} bytes)"), - )), + Err(Error::other(format!("blob too large ({len} bytes)"))), ), srcbuf, ); @@ -435,12 +439,14 @@ pub(crate) mod tests { ) -> Result<(Utf8TempDir, Utf8PathBuf, Vec), Error> { let temp_dir = camino_tempfile::tempdir()?; let pathbuf = temp_dir.path().join("file"); + let gate = utils::sync::gate::Gate::default(); + let cancel = CancellationToken::new(); // Write part (in block to drop the file) let mut offsets = Vec::new(); { let file = VirtualFile::create(pathbuf.as_path(), ctx).await?; - let mut wtr = 
BlobWriter::::new(file, 0); + let mut wtr = BlobWriter::::new(file, 0, &gate, cancel.clone(), ctx); for blob in blobs.iter() { let (_, res) = if compression { let res = wtr diff --git a/pageserver/src/tenant/block_io.rs b/pageserver/src/tenant/block_io.rs index 66c586daffdc..67231556262f 100644 --- a/pageserver/src/tenant/block_io.rs +++ b/pageserver/src/tenant/block_io.rs @@ -216,12 +216,8 @@ impl<'a> FileBlockReader<'a> { match cache .read_immutable_buf(self.file_id, blknum, ctx) .await - .map_err(|e| { - std::io::Error::new( - std::io::ErrorKind::Other, - format!("Failed to read immutable buf: {e:#}"), - ) - })? { + .map_err(|e| std::io::Error::other(format!("Failed to read immutable buf: {e:#}")))? + { ReadBufResult::Found(guard) => Ok(guard.into()), ReadBufResult::NotFound(write_guard) => { // Read the page from disk into the buffer diff --git a/pageserver/src/tenant/remote_timeline_client/manifest.rs b/pageserver/src/tenant/remote_timeline_client/manifest.rs index 0e07acfbc8c5..7dba4508e23f 100644 --- a/pageserver/src/tenant/remote_timeline_client/manifest.rs +++ b/pageserver/src/tenant/remote_timeline_client/manifest.rs @@ -1,4 +1,5 @@ use chrono::NaiveDateTime; +use pageserver_api::shard::ShardStripeSize; use serde::{Deserialize, Serialize}; use utils::id::TimelineId; use utils::lsn::Lsn; @@ -14,6 +15,12 @@ pub struct TenantManifest { /// allow release rollbacks. pub version: usize, + /// This tenant's stripe size. This is only advisory, and used to recover tenant data from + /// remote storage. The autoritative source is the storage controller. If None, assume the + /// original default value of 32768 blocks (256 MB). + #[serde(skip_serializing_if = "Option::is_none")] + pub stripe_size: Option, + /// The list of offloaded timelines together with enough information /// to not have to actually load them. /// @@ -42,7 +49,12 @@ pub struct OffloadedTimelineManifest { /// The newest manifest version. 
This should be incremented on changes, even non-breaking ones. We /// do not use deny_unknown_fields, so new fields are not breaking. -pub const LATEST_TENANT_MANIFEST_VERSION: usize = 1; +/// +/// 1: initial version +/// 2: +stripe_size +/// +/// When adding new versions, also add a parse_vX test case below. +pub const LATEST_TENANT_MANIFEST_VERSION: usize = 2; impl TenantManifest { /// Returns true if the manifests are equal, ignoring the version number. This avoids @@ -56,10 +68,11 @@ impl TenantManifest { // We could alternatively just clone and modify the version here. let Self { version: _, // ignore version + stripe_size, offloaded_timelines, } = self; - offloaded_timelines == &other.offloaded_timelines + stripe_size == &other.stripe_size && offloaded_timelines == &other.offloaded_timelines } /// Decodes a manifest from JSON. @@ -89,6 +102,7 @@ mod tests { }"#; let expected = TenantManifest { version: 0, + stripe_size: None, offloaded_timelines: Vec::new(), }; assert_eq!(expected, TenantManifest::from_json_bytes(json.as_bytes())?); @@ -104,6 +118,7 @@ mod tests { }"#; let expected = TenantManifest { version: 1, + stripe_size: None, offloaded_timelines: Vec::new(), }; assert_eq!(expected, TenantManifest::from_json_bytes(json.as_bytes())?); @@ -130,6 +145,50 @@ mod tests { }"#; let expected = TenantManifest { version: 1, + stripe_size: None, + offloaded_timelines: vec![ + OffloadedTimelineManifest { + timeline_id: TimelineId::from_str("5c4df612fd159e63c1b7853fe94d97da")?, + ancestor_timeline_id: None, + ancestor_retain_lsn: None, + archived_at: NaiveDateTime::from_str("2025-03-07T11:07:11.373105434")?, + }, + OffloadedTimelineManifest { + timeline_id: TimelineId::from_str("f3def5823ad7080d2ea538d8e12163fa")?, + ancestor_timeline_id: Some(TimelineId::from_str( + "5c4df612fd159e63c1b7853fe94d97da", + )?), + ancestor_retain_lsn: Some(Lsn::from_str("0/1F79038")?), + archived_at: NaiveDateTime::from_str("2025-03-05T11:10:22.257901390")?, + }, + ], + }; + 
assert_eq!(expected, TenantManifest::from_json_bytes(json.as_bytes())?); + Ok(()) + } + + /// v2 manifests should be parsed, for backwards compatibility. + #[test] + fn parse_v2() -> anyhow::Result<()> { + let json = r#"{ + "version": 2, + "stripe_size": 32768, + "offloaded_timelines": [ + { + "timeline_id": "5c4df612fd159e63c1b7853fe94d97da", + "archived_at": "2025-03-07T11:07:11.373105434" + }, + { + "timeline_id": "f3def5823ad7080d2ea538d8e12163fa", + "ancestor_timeline_id": "5c4df612fd159e63c1b7853fe94d97da", + "ancestor_retain_lsn": "0/1F79038", + "archived_at": "2025-03-05T11:10:22.257901390" + } + ] + }"#; + let expected = TenantManifest { + version: 2, + stripe_size: Some(ShardStripeSize(32768)), offloaded_timelines: vec![ OffloadedTimelineManifest { timeline_id: TimelineId::from_str("5c4df612fd159e63c1b7853fe94d97da")?, diff --git a/pageserver/src/tenant/storage_layer.rs b/pageserver/src/tenant/storage_layer.rs index 2ea0c1b97902..796ad01e5452 100644 --- a/pageserver/src/tenant/storage_layer.rs +++ b/pageserver/src/tenant/storage_layer.rs @@ -715,13 +715,34 @@ pub(crate) enum LayerId { } /// Uniquely identify a layer visit by the layer -/// and LSN floor (or start LSN) of the reads. -/// The layer itself is not enough since we may -/// have different LSN lower bounds for delta layer reads. +/// and LSN range of the reads. Note that the end of the range is exclusive. +/// +/// The layer itself is not enough since we may have different LSN lower +/// bounds for delta layer reads. Scenarios where this can happen are: +/// +/// 1. Layer overlaps: imagine an image layer inside and in-memory layer +/// and a query that only partially hits the image layer. Part of the query +/// needs to read the whole in-memory layer and the other part needs to read +/// only up to the image layer. Hence, they'll have different LSN floor values +/// for the read. +/// +/// 2. Scattered reads: the read path supports starting at different LSNs. 
Imagine +/// The start LSN for one range is inside a layer and the start LSN for another range +/// Is above the layer (includes all of it). Both ranges need to read the layer all the +/// Way to the end but starting at different points. Hence, they'll have different LSN +/// Ceil values. +/// +/// The implication is that we might visit the same layer multiple times +/// in order to read different LSN ranges from it. In practice, this isn't very concerning +/// because: +/// 1. Layer overlaps are rare and generally not intended +/// 2. Scattered reads will stabilise after the first few layers provided their starting LSNs +/// are grouped tightly enough (likely the case). #[derive(Debug, PartialEq, Eq, Clone, Hash)] struct LayerToVisitId { layer_id: LayerId, lsn_floor: Lsn, + lsn_ceil: Lsn, } #[derive(Debug, PartialEq, Eq, Hash)] @@ -805,6 +826,7 @@ impl LayerFringe { let layer_to_visit_id = LayerToVisitId { layer_id: layer.id(), lsn_floor: lsn_range.start, + lsn_ceil: lsn_range.end, }; let entry = self.visit_reads.entry(layer_to_visit_id.clone()); diff --git a/pageserver/src/tenant/storage_layer/batch_split_writer.rs b/pageserver/src/tenant/storage_layer/batch_split_writer.rs index fd50e4805de4..39cd02d101b4 100644 --- a/pageserver/src/tenant/storage_layer/batch_split_writer.rs +++ b/pageserver/src/tenant/storage_layer/batch_split_writer.rs @@ -5,6 +5,7 @@ use std::sync::Arc; use bytes::Bytes; use pageserver_api::key::{KEY_SIZE, Key}; use pageserver_api::value::Value; +use tokio_util::sync::CancellationToken; use utils::id::TimelineId; use utils::lsn::Lsn; use utils::shard::TenantShardId; @@ -179,7 +180,7 @@ impl BatchLayerWriter { /// An image writer that takes images and produces multiple image layers. 
#[must_use] -pub struct SplitImageLayerWriter { +pub struct SplitImageLayerWriter<'a> { inner: ImageLayerWriter, target_layer_size: u64, lsn: Lsn, @@ -188,9 +189,12 @@ pub struct SplitImageLayerWriter { tenant_shard_id: TenantShardId, batches: BatchLayerWriter, start_key: Key, + gate: &'a utils::sync::gate::Gate, + cancel: CancellationToken, } -impl SplitImageLayerWriter { +impl<'a> SplitImageLayerWriter<'a> { + #[allow(clippy::too_many_arguments)] pub async fn new( conf: &'static PageServerConf, timeline_id: TimelineId, @@ -198,6 +202,8 @@ impl SplitImageLayerWriter { start_key: Key, lsn: Lsn, target_layer_size: u64, + gate: &'a utils::sync::gate::Gate, + cancel: CancellationToken, ctx: &RequestContext, ) -> anyhow::Result { Ok(Self { @@ -208,6 +214,8 @@ impl SplitImageLayerWriter { tenant_shard_id, &(start_key..Key::MAX), lsn, + gate, + cancel.clone(), ctx, ) .await?, @@ -217,6 +225,8 @@ impl SplitImageLayerWriter { batches: BatchLayerWriter::new(conf).await?, lsn, start_key, + gate, + cancel, }) } @@ -239,6 +249,8 @@ impl SplitImageLayerWriter { self.tenant_shard_id, &(key..Key::MAX), self.lsn, + self.gate, + self.cancel.clone(), ctx, ) .await?; @@ -291,7 +303,7 @@ impl SplitImageLayerWriter { /// into a single file. This behavior might change in the future. For reference, the legacy compaction algorithm /// will split them into multiple files based on size. 
#[must_use] -pub struct SplitDeltaLayerWriter { +pub struct SplitDeltaLayerWriter<'a> { inner: Option<(Key, DeltaLayerWriter)>, target_layer_size: u64, conf: &'static PageServerConf, @@ -300,15 +312,19 @@ pub struct SplitDeltaLayerWriter { lsn_range: Range, last_key_written: Key, batches: BatchLayerWriter, + gate: &'a utils::sync::gate::Gate, + cancel: CancellationToken, } -impl SplitDeltaLayerWriter { +impl<'a> SplitDeltaLayerWriter<'a> { pub async fn new( conf: &'static PageServerConf, timeline_id: TimelineId, tenant_shard_id: TenantShardId, lsn_range: Range, target_layer_size: u64, + gate: &'a utils::sync::gate::Gate, + cancel: CancellationToken, ) -> anyhow::Result { Ok(Self { target_layer_size, @@ -319,6 +335,8 @@ impl SplitDeltaLayerWriter { lsn_range, last_key_written: Key::MIN, batches: BatchLayerWriter::new(conf).await?, + gate, + cancel, }) } @@ -344,6 +362,8 @@ impl SplitDeltaLayerWriter { self.tenant_shard_id, key, self.lsn_range.clone(), + self.gate, + self.cancel.clone(), ctx, ) .await?, @@ -362,11 +382,13 @@ impl SplitDeltaLayerWriter { self.tenant_shard_id, key, self.lsn_range.clone(), + self.gate, + self.cancel.clone(), ctx, ) .await?; let (start_key, prev_delta_writer) = - std::mem::replace(&mut self.inner, Some((key, next_delta_writer))).unwrap(); + self.inner.replace((key, next_delta_writer)).unwrap(); self.batches.add_unfinished_delta_writer( prev_delta_writer, start_key..key, @@ -469,6 +491,8 @@ mod tests { get_key(0), Lsn(0x18), 4 * 1024 * 1024, + &tline.gate, + tline.cancel.clone(), &ctx, ) .await @@ -480,6 +504,8 @@ mod tests { tenant.tenant_shard_id, Lsn(0x18)..Lsn(0x20), 4 * 1024 * 1024, + &tline.gate, + tline.cancel.clone(), ) .await .unwrap(); @@ -546,6 +572,8 @@ mod tests { get_key(0), Lsn(0x18), 4 * 1024 * 1024, + &tline.gate, + tline.cancel.clone(), &ctx, ) .await @@ -556,6 +584,8 @@ mod tests { tenant.tenant_shard_id, Lsn(0x18)..Lsn(0x20), 4 * 1024 * 1024, + &tline.gate, + tline.cancel.clone(), ) .await .unwrap(); @@ -643,6 +673,8 
@@ mod tests { get_key(0), Lsn(0x18), 4 * 1024, + &tline.gate, + tline.cancel.clone(), &ctx, ) .await @@ -654,6 +686,8 @@ mod tests { tenant.tenant_shard_id, Lsn(0x18)..Lsn(0x20), 4 * 1024, + &tline.gate, + tline.cancel.clone(), ) .await .unwrap(); @@ -730,6 +764,8 @@ mod tests { tenant.tenant_shard_id, Lsn(0x10)..Lsn(N as u64 * 16 + 0x10), 4 * 1024 * 1024, + &tline.gate, + tline.cancel.clone(), ) .await .unwrap(); diff --git a/pageserver/src/tenant/storage_layer/delta_layer.rs b/pageserver/src/tenant/storage_layer/delta_layer.rs index 05b0bc1a5c6c..4417b8aa5135 100644 --- a/pageserver/src/tenant/storage_layer/delta_layer.rs +++ b/pageserver/src/tenant/storage_layer/delta_layer.rs @@ -50,6 +50,7 @@ use rand::distributions::Alphanumeric; use serde::{Deserialize, Serialize}; use tokio::sync::OnceCell; use tokio_epoll_uring::IoBuf; +use tokio_util::sync::CancellationToken; use tracing::*; use utils::bin_ser::BeSer; use utils::id::{TenantId, TimelineId}; @@ -400,12 +401,15 @@ impl DeltaLayerWriterInner { /// /// Start building a new delta layer. /// + #[allow(clippy::too_many_arguments)] async fn new( conf: &'static PageServerConf, timeline_id: TimelineId, tenant_shard_id: TenantShardId, key_start: Key, lsn_range: Range, + gate: &utils::sync::gate::Gate, + cancel: CancellationToken, ctx: &RequestContext, ) -> anyhow::Result { // Create the file initially with a temporary filename. We don't know @@ -420,7 +424,7 @@ impl DeltaLayerWriterInner { let mut file = VirtualFile::create(&path, ctx).await?; // make room for the header block file.seek(SeekFrom::Start(PAGE_SZ as u64)).await?; - let blob_writer = BlobWriter::new(file, PAGE_SZ as u64); + let blob_writer = BlobWriter::new(file, PAGE_SZ as u64, gate, cancel, ctx); // Initialize the b-tree index builder let block_buf = BlockBuf::new(); @@ -628,12 +632,15 @@ impl DeltaLayerWriter { /// /// Start building a new delta layer. 
/// + #[allow(clippy::too_many_arguments)] pub async fn new( conf: &'static PageServerConf, timeline_id: TimelineId, tenant_shard_id: TenantShardId, key_start: Key, lsn_range: Range, + gate: &utils::sync::gate::Gate, + cancel: CancellationToken, ctx: &RequestContext, ) -> anyhow::Result { Ok(Self { @@ -644,6 +651,8 @@ impl DeltaLayerWriter { tenant_shard_id, key_start, lsn_range, + gate, + cancel, ctx, ) .await?, @@ -1885,6 +1894,8 @@ pub(crate) mod test { harness.tenant_shard_id, entries_meta.key_range.start, entries_meta.lsn_range.clone(), + &timeline.gate, + timeline.cancel.clone(), &ctx, ) .await?; @@ -2079,6 +2090,8 @@ pub(crate) mod test { tenant.tenant_shard_id, Key::MIN, Lsn(0x11)..truncate_at, + &branch.gate, + branch.cancel.clone(), ctx, ) .await @@ -2213,6 +2226,8 @@ pub(crate) mod test { tenant.tenant_shard_id, *key_start, (*lsn_min)..lsn_end, + &tline.gate, + tline.cancel.clone(), ctx, ) .await?; diff --git a/pageserver/src/tenant/storage_layer/image_layer.rs b/pageserver/src/tenant/storage_layer/image_layer.rs index 3243b7394257..3744d615f24e 100644 --- a/pageserver/src/tenant/storage_layer/image_layer.rs +++ b/pageserver/src/tenant/storage_layer/image_layer.rs @@ -48,6 +48,7 @@ use rand::distributions::Alphanumeric; use serde::{Deserialize, Serialize}; use tokio::sync::OnceCell; use tokio_stream::StreamExt; +use tokio_util::sync::CancellationToken; use tracing::*; use utils::bin_ser::BeSer; use utils::id::{TenantId, TimelineId}; @@ -748,12 +749,15 @@ impl ImageLayerWriterInner { /// /// Start building a new image layer. /// + #[allow(clippy::too_many_arguments)] async fn new( conf: &'static PageServerConf, timeline_id: TimelineId, tenant_shard_id: TenantShardId, key_range: &Range, lsn: Lsn, + gate: &utils::sync::gate::Gate, + cancel: CancellationToken, ctx: &RequestContext, ) -> anyhow::Result { // Create the file initially with a temporary filename. 
@@ -780,7 +784,7 @@ impl ImageLayerWriterInner { }; // make room for the header block file.seek(SeekFrom::Start(PAGE_SZ as u64)).await?; - let blob_writer = BlobWriter::new(file, PAGE_SZ as u64); + let blob_writer = BlobWriter::new(file, PAGE_SZ as u64, gate, cancel, ctx); // Initialize the b-tree index builder let block_buf = BlockBuf::new(); @@ -988,18 +992,30 @@ impl ImageLayerWriter { /// /// Start building a new image layer. /// + #[allow(clippy::too_many_arguments)] pub async fn new( conf: &'static PageServerConf, timeline_id: TimelineId, tenant_shard_id: TenantShardId, key_range: &Range, lsn: Lsn, + gate: &utils::sync::gate::Gate, + cancel: CancellationToken, ctx: &RequestContext, ) -> anyhow::Result { Ok(Self { inner: Some( - ImageLayerWriterInner::new(conf, timeline_id, tenant_shard_id, key_range, lsn, ctx) - .await?, + ImageLayerWriterInner::new( + conf, + timeline_id, + tenant_shard_id, + key_range, + lsn, + gate, + cancel, + ctx, + ) + .await?, ), }) } @@ -1192,7 +1208,7 @@ mod test { // This key range contains several 0x8000 page stripes, only one of which belongs to shard zero let input_start = Key::from_hex("000000067f00000001000000ae0000000000").unwrap(); - let input_end = Key::from_hex("000000067f00000001000000ae0000020000").unwrap(); + let input_end = Key::from_hex("000000067f00000001000000ae0000002000").unwrap(); let range = input_start..input_end; // Build an image layer to filter @@ -1203,6 +1219,8 @@ mod test { harness.tenant_shard_id, &range, lsn, + &timeline.gate, + timeline.cancel.clone(), &ctx, ) .await @@ -1235,7 +1253,7 @@ mod test { let shard_identity = ShardIdentity::new( ShardNumber(shard_number), shard_count, - ShardStripeSize(0x8000), + ShardStripeSize(0x800), ) .unwrap(); let harness = TenantHarness::create_custom( @@ -1268,6 +1286,8 @@ mod test { harness.tenant_shard_id, &range, lsn, + &timeline.gate, + timeline.cancel.clone(), &ctx, ) .await @@ -1287,12 +1307,12 @@ mod test { // This exact size and those below will need updating 
as/when the layer encoding changes, but // should be deterministic for a given version of the format, as we used no randomness generating the input. - assert_eq!(original_size, 1597440); + assert_eq!(original_size, 122880); match shard_number { 0 => { // We should have written out just one stripe for our shard identity - assert_eq!(wrote_keys, 0x8000); + assert_eq!(wrote_keys, 0x800); let replacement = replacement.unwrap(); // We should have dropped some of the data @@ -1300,7 +1320,7 @@ mod test { assert!(replacement.metadata().file_size > 0); // Assert that we dropped ~3/4 of the data. - assert_eq!(replacement.metadata().file_size, 417792); + assert_eq!(replacement.metadata().file_size, 49152); } 1 => { // Shard 1 has no keys in our input range @@ -1309,19 +1329,19 @@ mod test { } 2 => { // Shard 2 has one stripes in the input range - assert_eq!(wrote_keys, 0x8000); + assert_eq!(wrote_keys, 0x800); let replacement = replacement.unwrap(); assert!(replacement.metadata().file_size < original_size); assert!(replacement.metadata().file_size > 0); - assert_eq!(replacement.metadata().file_size, 417792); + assert_eq!(replacement.metadata().file_size, 49152); } 3 => { // Shard 3 has two stripes in the input range - assert_eq!(wrote_keys, 0x10000); + assert_eq!(wrote_keys, 0x1000); let replacement = replacement.unwrap(); assert!(replacement.metadata().file_size < original_size); assert!(replacement.metadata().file_size > 0); - assert_eq!(replacement.metadata().file_size, 811008); + assert_eq!(replacement.metadata().file_size, 73728); } _ => unreachable!(), } @@ -1346,6 +1366,8 @@ mod test { tenant.tenant_shard_id, &key_range, lsn, + &tline.gate, + tline.cancel.clone(), ctx, ) .await?; diff --git a/pageserver/src/tenant/storage_layer/inmemory_layer.rs b/pageserver/src/tenant/storage_layer/inmemory_layer.rs index 388ed3201c20..5d558e66cc7e 100644 --- a/pageserver/src/tenant/storage_layer/inmemory_layer.rs +++ b/pageserver/src/tenant/storage_layer/inmemory_layer.rs @@ -719,6 
+719,8 @@ impl InMemoryLayer { ctx: &RequestContext, key_range: Option>, l0_flush_global_state: &l0_flush::Inner, + gate: &utils::sync::gate::Gate, + cancel: CancellationToken, ) -> Result> { // Grab the lock in read-mode. We hold it over the I/O, but because this // layer is not writeable anymore, no one should be trying to acquire the @@ -759,6 +761,8 @@ impl InMemoryLayer { self.tenant_shard_id, Key::MIN, self.start_lsn..end_lsn, + gate, + cancel, ctx, ) .await?; diff --git a/pageserver/src/tenant/storage_layer/inmemory_layer/vectored_dio_read.rs b/pageserver/src/tenant/storage_layer/inmemory_layer/vectored_dio_read.rs index 90455fd0cabd..ea354fc716d6 100644 --- a/pageserver/src/tenant/storage_layer/inmemory_layer/vectored_dio_read.rs +++ b/pageserver/src/tenant/storage_layer/inmemory_layer/vectored_dio_read.rs @@ -766,7 +766,7 @@ mod tests { rand::Rng::fill(&mut rand::thread_rng(), &mut dst_slice[len..]); // to discover bugs Ok((dst, len)) } - Err(e) => Err(std::io::Error::new(std::io::ErrorKind::Other, e)), + Err(e) => Err(std::io::Error::other(e)), } } } diff --git a/pageserver/src/tenant/storage_layer/merge_iterator.rs b/pageserver/src/tenant/storage_layer/merge_iterator.rs index 76cdddd06a8b..55db9fe06a32 100644 --- a/pageserver/src/tenant/storage_layer/merge_iterator.rs +++ b/pageserver/src/tenant/storage_layer/merge_iterator.rs @@ -59,6 +59,7 @@ impl LayerIterRef<'_> { /// 1. Unified iterator for image and delta layers. /// 2. `Ord` for use in [`MergeIterator::heap`] (for the k-merge). /// 3. Lazy creation of the real delta/image iterator. 
+#[allow(clippy::large_enum_variant, reason = "TODO")] pub(crate) enum IteratorWrapper<'a> { NotLoaded { ctx: &'a RequestContext, diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 5174da0f4384..c27a4b62da9d 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -24,6 +24,7 @@ use std::sync::{Arc, Mutex, OnceLock, RwLock, Weak}; use std::time::{Duration, Instant, SystemTime}; use crate::PERF_TRACE_TARGET; +use crate::walredo::RedoAttemptType; use anyhow::{Context, Result, anyhow, bail, ensure}; use arc_swap::{ArcSwap, ArcSwapOption}; use bytes::Bytes; @@ -115,7 +116,7 @@ use crate::pgdatadir_mapping::{ use crate::task_mgr::TaskKind; use crate::tenant::config::AttachmentMode; use crate::tenant::gc_result::GcResult; -use crate::tenant::layer_map::{LayerMap, SearchResult}; +use crate::tenant::layer_map::LayerMap; use crate::tenant::metadata::TimelineMetadata; use crate::tenant::storage_layer::delta_layer::DeltaEntry; use crate::tenant::storage_layer::inmemory_layer::IndexEntry; @@ -584,7 +585,7 @@ pub(crate) enum PageReconstructError { WalRedo(anyhow::Error), #[error("{0}")] - MissingKey(MissingKeyError), + MissingKey(Box), } impl From for PageReconstructError { @@ -689,16 +690,23 @@ impl std::fmt::Display for ReadPath { #[derive(thiserror::Error)] pub struct MissingKeyError { - key: Key, + keyspace: KeySpace, shard: ShardNumber, - cont_lsn: Lsn, - request_lsn: Lsn, + query: Option, + // This is largest request LSN from the get page request batch + original_hwm_lsn: Lsn, ancestor_lsn: Option, /// Debug information about the read path if there's an error read_path: Option, backtrace: Option, } +impl MissingKeyError { + fn enrich(&mut self, query: VersionedKeySpaceQuery) { + self.query = Some(query); + } +} + impl std::fmt::Debug for MissingKeyError { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { write!(f, "{}", self) @@ -709,14 +717,18 @@ impl std::fmt::Display for 
MissingKeyError { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { write!( f, - "could not find data for key {} (shard {:?}) at LSN {}, request LSN {}", - self.key, self.shard, self.cont_lsn, self.request_lsn + "could not find data for key {} (shard {:?}), original HWM LSN {}", + self.keyspace, self.shard, self.original_hwm_lsn )?; if let Some(ref ancestor_lsn) = self.ancestor_lsn { write!(f, ", ancestor {}", ancestor_lsn)?; } + if let Some(ref query) = self.query { + write!(f, ", query {}", query)?; + } + if let Some(ref read_path) = self.read_path { write!(f, "\n{}", read_path)?; } @@ -816,7 +828,7 @@ pub(crate) enum GetVectoredError { InvalidLsn(Lsn), #[error("requested key not found: {0}")] - MissingKey(MissingKeyError), + MissingKey(Box), #[error("ancestry walk")] GetReadyAncestorError(#[source] GetReadyAncestorError), @@ -927,7 +939,7 @@ impl std::fmt::Debug for Timeline { } } -#[derive(thiserror::Error, Debug)] +#[derive(thiserror::Error, Debug, Clone)] pub(crate) enum WaitLsnError { // Called on a timeline which is shutting down #[error("Shutdown")] @@ -1039,6 +1051,7 @@ pub(crate) enum ShutdownMode { Hard, } +#[allow(clippy::large_enum_variant, reason = "TODO")] enum ImageLayerCreationOutcome { /// We generated an image layer Generated { @@ -1126,14 +1139,12 @@ impl Timeline { // page_service. 
debug_assert!(!self.shard_identity.is_key_disposable(&key)); - let keyspace = KeySpace { - ranges: vec![key..key.next()], - }; - let mut reconstruct_state = ValuesReconstructState::new(IoConcurrency::sequential()); + let query = VersionedKeySpaceQuery::uniform(KeySpace::single(key..key.next()), lsn); + let vectored_res = self - .get_vectored_impl(keyspace.clone(), lsn, &mut reconstruct_state, ctx) + .get_vectored_impl(query, &mut reconstruct_state, ctx) .await; let key_value = vectored_res?.pop_first(); @@ -1151,15 +1162,17 @@ impl Timeline { value } } - None => Err(PageReconstructError::MissingKey(MissingKeyError { - key, - shard: self.shard_identity.get_shard_number(&key), - cont_lsn: Lsn(0), - request_lsn: lsn, - ancestor_lsn: None, - backtrace: None, - read_path: None, - })), + None => Err(PageReconstructError::MissingKey(Box::new( + MissingKeyError { + keyspace: KeySpace::single(key..key.next()), + shard: self.shard_identity.get_shard_number(&key), + original_hwm_lsn: lsn, + ancestor_lsn: None, + backtrace: None, + read_path: None, + query: None, + }, + ))), } } @@ -1172,21 +1185,18 @@ impl Timeline { /// which actually vectorizes the read path. 
pub(crate) async fn get_vectored( &self, - keyspace: KeySpace, - lsn: Lsn, + query: VersionedKeySpaceQuery, io_concurrency: super::storage_layer::IoConcurrency, ctx: &RequestContext, ) -> Result>, GetVectoredError> { - if !lsn.is_valid() { - return Err(GetVectoredError::InvalidLsn(lsn)); - } + let total_keyspace = query.total_keyspace(); - let key_count = keyspace.total_raw_size().try_into().unwrap(); + let key_count = total_keyspace.total_raw_size().try_into().unwrap(); if key_count > Timeline::MAX_GET_VECTORED_KEYS { return Err(GetVectoredError::Oversized(key_count)); } - for range in &keyspace.ranges { + for range in &total_keyspace.ranges { let mut key = range.start; while key != range.end { assert!(!self.shard_identity.is_key_disposable(&key)); @@ -1195,9 +1205,8 @@ impl Timeline { } trace!( - "get vectored request for {:?}@{} from task kind {:?}", - keyspace, - lsn, + "get vectored query {} from task kind {:?}", + query, ctx.task_kind(), ); @@ -1206,12 +1215,7 @@ impl Timeline { .map(|metric| (metric, Instant::now())); let res = self - .get_vectored_impl( - keyspace.clone(), - lsn, - &mut ValuesReconstructState::new(io_concurrency), - ctx, - ) + .get_vectored_impl(query, &mut ValuesReconstructState::new(io_concurrency), ctx) .await; if let Some((metric, start)) = start { @@ -1262,13 +1266,10 @@ impl Timeline { .for_task_kind(ctx.task_kind()) .map(ScanLatencyOngoingRecording::start_recording); + let query = VersionedKeySpaceQuery::uniform(keyspace, lsn); + let vectored_res = self - .get_vectored_impl( - keyspace.clone(), - lsn, - &mut ValuesReconstructState::new(io_concurrency), - ctx, - ) + .get_vectored_impl(query, &mut ValuesReconstructState::new(io_concurrency), ctx) .await; if let Some(recording) = start { @@ -1280,18 +1281,27 @@ impl Timeline { pub(super) async fn get_vectored_impl( &self, - keyspace: KeySpace, - lsn: Lsn, + query: VersionedKeySpaceQuery, reconstruct_state: &mut ValuesReconstructState, ctx: &RequestContext, ) -> Result>, 
GetVectoredError> { let read_path = if self.conf.enable_read_path_debugging || ctx.read_path_debug() { - Some(ReadPath::new(keyspace.clone(), lsn)) + Some(ReadPath::new( + query.total_keyspace(), + query.high_watermark_lsn()?, + )) } else { None }; + reconstruct_state.read_path = read_path; + let redo_attempt_type = if ctx.task_kind() == TaskKind::Compaction { + RedoAttemptType::LegacyCompaction + } else { + RedoAttemptType::ReadPage + }; + let traversal_res: Result<(), _> = { let ctx = RequestContextBuilder::from(ctx) .perf_span(|crnt_perf_span| { @@ -1303,7 +1313,7 @@ impl Timeline { }) .attached_child(); - self.get_vectored_reconstruct_data(keyspace.clone(), lsn, reconstruct_state, &ctx) + self.get_vectored_reconstruct_data(query.clone(), reconstruct_state, &ctx) .maybe_perf_instrument(&ctx, |crnt_perf_span| crnt_perf_span.clone()) .await }; @@ -1316,6 +1326,13 @@ impl Timeline { .map(|state| state.collect_pending_ios()) .collect::>(); while collect_futs.next().await.is_some() {} + + // Enrich the missing key error with the original query. + if let GetVectoredError::MissingKey(mut missing_err) = err { + missing_err.enrich(query.clone()); + return Err(GetVectoredError::MissingKey(missing_err)); + } + return Err(err); }; @@ -1333,6 +1350,8 @@ impl Timeline { let futs = FuturesUnordered::new(); for (key, state) in std::mem::take(&mut reconstruct_state.keys) { + let req_lsn_for_key = query.map_key_to_lsn(&key); + futs.push({ let walredo_self = self.myself.upgrade().expect("&self method holds the arc"); let ctx = RequestContextBuilder::from(&ctx) @@ -1379,7 +1398,7 @@ impl Timeline { let walredo_deltas = converted.num_deltas(); let walredo_res = walredo_self - .reconstruct_value(key, lsn, converted) + .reconstruct_value(key, req_lsn_for_key, converted, redo_attempt_type) .maybe_perf_instrument(&ctx, |crnt_perf_span| { info_span!( target: PERF_TRACE_TARGET, @@ -1406,15 +1425,18 @@ impl Timeline { // to avoid infinite results. 
if !results.is_empty() { if layers_visited >= Self::LAYERS_VISITED_WARN_THRESHOLD { + let total_keyspace = query.total_keyspace(); + let max_request_lsn = query.high_watermark_lsn().expect("Validated previously"); + static LOG_PACER: Lazy> = Lazy::new(|| Mutex::new(RateLimit::new(Duration::from_secs(60)))); LOG_PACER.lock().unwrap().call(|| { - let num_keys = keyspace.total_raw_size(); + let num_keys = total_keyspace.total_raw_size(); let num_pages = results.len(); tracing::info!( shard_id = %self.tenant_shard_id.shard_slug(), - lsn = %lsn, - "Vectored read for {keyspace} visited {layers_visited} layers. Returned {num_pages}/{num_keys} pages.", + lsn = %max_request_lsn, + "Vectored read for {total_keyspace} visited {layers_visited} layers. Returned {num_pages}/{num_keys} pages.", ); }); } @@ -2715,6 +2737,10 @@ impl Timeline { .tenant_conf .gc_compaction_enabled .unwrap_or(self.conf.default_tenant_conf.gc_compaction_enabled); + let gc_compaction_verification = tenant_conf + .tenant_conf + .gc_compaction_verification + .unwrap_or(self.conf.default_tenant_conf.gc_compaction_verification); let gc_compaction_initial_threshold_kb = tenant_conf .tenant_conf .gc_compaction_initial_threshold_kb @@ -2729,6 +2755,7 @@ impl Timeline { .unwrap_or(self.conf.default_tenant_conf.gc_compaction_ratio_percent); GcCompactionCombinedSettings { gc_compaction_enabled, + gc_compaction_verification, gc_compaction_initial_threshold_kb, gc_compaction_ratio_percent, } @@ -3927,6 +3954,154 @@ impl Timeline { } } +#[derive(Clone)] +/// Type representing a query in the ([`Lsn`], [`Key`]) space. +/// In other words, a set of segments in a 2D space. +/// +/// This representation has the advatange of avoiding hash map +/// allocations for uniform queries. 
+pub(crate) enum VersionedKeySpaceQuery { + /// Variant for queries at a single [`Lsn`] + Uniform { keyspace: KeySpace, lsn: Lsn }, + /// Variant for queries at multiple [`Lsn`]s + Scattered { + keyspaces_at_lsn: Vec<(Lsn, KeySpace)>, + }, +} + +impl VersionedKeySpaceQuery { + pub(crate) fn uniform(keyspace: KeySpace, lsn: Lsn) -> Self { + Self::Uniform { keyspace, lsn } + } + + pub(crate) fn scattered(keyspaces_at_lsn: Vec<(Lsn, KeySpace)>) -> Self { + Self::Scattered { keyspaces_at_lsn } + } + + /// Returns the most recent (largest) LSN included in the query. + /// If any of the LSNs included in the query are invalid, returns + /// an error instead. + fn high_watermark_lsn(&self) -> Result { + match self { + Self::Uniform { lsn, .. } => { + if !lsn.is_valid() { + return Err(GetVectoredError::InvalidLsn(*lsn)); + } + + Ok(*lsn) + } + Self::Scattered { keyspaces_at_lsn } => { + let mut max_lsn = None; + for (lsn, _keyspace) in keyspaces_at_lsn.iter() { + if !lsn.is_valid() { + return Err(GetVectoredError::InvalidLsn(*lsn)); + } + max_lsn = std::cmp::max(max_lsn, Some(lsn)); + } + + if let Some(computed) = max_lsn { + Ok(*computed) + } else { + Err(GetVectoredError::Other(anyhow!("empty input"))) + } + } + } + } + + /// Returns the total keyspace being queried: the result of projecting + /// everything in the key dimensions onto the key axis. + fn total_keyspace(&self) -> KeySpace { + match self { + Self::Uniform { keyspace, .. } => keyspace.clone(), + Self::Scattered { keyspaces_at_lsn } => keyspaces_at_lsn + .iter() + .map(|(_lsn, keyspace)| keyspace) + .fold(KeySpace::default(), |mut acc, v| { + acc.merge(v); + acc + }), + } + } + + /// Returns LSN for a specific key. + /// + /// Invariant: requested key must be part of [`Self::total_keyspace`] + pub(super) fn map_key_to_lsn(&self, key: &Key) -> Lsn { + match self { + Self::Uniform { lsn, .. 
} => *lsn, + Self::Scattered { keyspaces_at_lsn } => { + keyspaces_at_lsn + .iter() + .find(|(_lsn, keyspace)| keyspace.contains(key)) + .expect("Returned key was requested") + .0 + } + } + } + + /// Remove any parts of the query (segments) which overlap with the provided + /// key space (also segments). + fn remove_overlapping_with(&mut self, to_remove: &KeySpace) -> KeySpace { + match self { + Self::Uniform { keyspace, .. } => keyspace.remove_overlapping_with(to_remove), + Self::Scattered { keyspaces_at_lsn } => { + let mut removed_accum = KeySpaceRandomAccum::new(); + keyspaces_at_lsn.iter_mut().for_each(|(_lsn, keyspace)| { + let removed = keyspace.remove_overlapping_with(to_remove); + removed_accum.add_keyspace(removed); + }); + + removed_accum.to_keyspace() + } + } + } + + fn is_empty(&self) -> bool { + match self { + Self::Uniform { keyspace, .. } => keyspace.is_empty(), + Self::Scattered { keyspaces_at_lsn } => keyspaces_at_lsn + .iter() + .all(|(_lsn, keyspace)| keyspace.is_empty()), + } + } + + /// "Lower" the query on the LSN dimension + fn lower(&mut self, to: Lsn) { + match self { + Self::Uniform { lsn, .. } => { + // If the originally requested LSN is smaller than the starting + // LSN of the ancestor we are descending into, we need to respect that. + // Hence the min. 
+ *lsn = std::cmp::min(*lsn, to); + } + Self::Scattered { keyspaces_at_lsn } => { + keyspaces_at_lsn.iter_mut().for_each(|(lsn, _keyspace)| { + *lsn = std::cmp::min(*lsn, to); + }); + } + } + } +} + +impl std::fmt::Display for VersionedKeySpaceQuery { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "[")?; + + match self { + VersionedKeySpaceQuery::Uniform { keyspace, lsn } => { + write!(f, "{keyspace} @ {lsn}")?; + } + VersionedKeySpaceQuery::Scattered { keyspaces_at_lsn } => { + for (lsn, keyspace) in keyspaces_at_lsn.iter() { + write!(f, "{keyspace} @ {lsn},")?; + } + } + } + + write!(f, "]") + } +} + impl Timeline { #[allow(clippy::doc_lazy_continuation)] /// Get the data needed to reconstruct all keys in the provided keyspace @@ -3941,16 +4116,15 @@ impl Timeline { /// 2.4. If the fringe is empty, go back to 1 async fn get_vectored_reconstruct_data( &self, - mut keyspace: KeySpace, - request_lsn: Lsn, + mut query: VersionedKeySpaceQuery, reconstruct_state: &mut ValuesReconstructState, ctx: &RequestContext, ) -> Result<(), GetVectoredError> { + let original_hwm_lsn = query.high_watermark_lsn().unwrap(); + let mut timeline_owned: Arc; let mut timeline = self; - let mut cont_lsn = Lsn(request_lsn.0 + 1); - let missing_keyspace = loop { if self.cancel.is_cancelled() { return Err(GetVectoredError::Cancelled); @@ -3967,15 +4141,14 @@ impl Timeline { parent: crnt_perf_span, "PLAN_IO_TIMELINE", timeline = %timeline.timeline_id, - lsn = %cont_lsn, + high_watermark_lsn = %query.high_watermark_lsn().unwrap(), ) }) .attached_child(); Self::get_vectored_reconstruct_data_timeline( timeline, - keyspace.clone(), - cont_lsn, + &query, reconstruct_state, &self.cancel, &ctx, @@ -3984,23 +4157,23 @@ impl Timeline { .await? }; - keyspace.remove_overlapping_with(&completed); + query.remove_overlapping_with(&completed); // Do not descend into the ancestor timeline for aux files. 
// We don't return a blanket [`GetVectoredError::MissingKey`] to avoid // stalling compaction. - keyspace.remove_overlapping_with(&KeySpace { + query.remove_overlapping_with(&KeySpace { ranges: vec![NON_INHERITED_RANGE, Key::sparse_non_inherited_keyspace()], }); // Keyspace is fully retrieved - if keyspace.is_empty() { + if query.is_empty() { break None; } let Some(ancestor_timeline) = timeline.ancestor_timeline.as_ref() else { // Not fully retrieved but no ancestor timeline. - break Some(keyspace); + break Some(query.total_keyspace()); }; // Now we see if there are keys covered by the image layer but does not exist in the @@ -4011,7 +4184,7 @@ impl Timeline { // keys from `keyspace`, we expect there to be no overlap between it and the image covered key // space. If that's not the case, we had at least one key encounter a gap in the image layer // and stop the search as a result of that. - let mut removed = keyspace.remove_overlapping_with(&image_covered_keyspace); + let mut removed = query.remove_overlapping_with(&image_covered_keyspace); // Do not fire missing key error and end early for sparse keys. Note that we hava already removed // non-inherited keyspaces before, so we can safely do a full `SPARSE_RANGE` remove instead of // figuring out what is the inherited key range and do a fine-grained pruning. @@ -4021,11 +4194,11 @@ impl Timeline { if !removed.is_empty() { break Some(removed); } - // If we reached this point, `remove_overlapping_with` should not have made any change to the - // keyspace. - // Take the min to avoid reconstructing a page with data newer than request Lsn. - cont_lsn = std::cmp::min(Lsn(request_lsn.0 + 1), Lsn(timeline.ancestor_lsn.0 + 1)); + // Each key range in the original query is at some point in the LSN space. + // When descending into the ancestor, lower all ranges in the LSN space + // such that new changes on the parent timeline are not visible. 
+ query.lower(timeline.ancestor_lsn); let ctx = RequestContextBuilder::from(ctx) .perf_span(|crnt_perf_span| { @@ -4034,7 +4207,6 @@ impl Timeline { parent: crnt_perf_span, "GET_ANCESTOR", timeline = %timeline.timeline_id, - lsn = %cont_lsn, ancestor = %ancestor_timeline.timeline_id, ancestor_lsn = %timeline.ancestor_lsn ) @@ -4064,22 +4236,47 @@ impl Timeline { }; if let Some(missing_keyspace) = missing_keyspace { - return Err(GetVectoredError::MissingKey(MissingKeyError { - key: missing_keyspace.start().unwrap(), /* better if we can store the full keyspace */ - shard: self - .shard_identity - .get_shard_number(&missing_keyspace.start().unwrap()), - cont_lsn, - request_lsn, + return Err(GetVectoredError::MissingKey(Box::new(MissingKeyError { + keyspace: missing_keyspace, /* better if we can store the full keyspace */ + shard: self.shard_identity.number, + original_hwm_lsn, ancestor_lsn: Some(timeline.ancestor_lsn), backtrace: None, read_path: std::mem::take(&mut reconstruct_state.read_path), - })); + query: None, + }))); } Ok(()) } + async fn get_vectored_init_fringe( + &self, + query: &VersionedKeySpaceQuery, + ) -> Result { + let mut fringe = LayerFringe::new(); + let guard = self.layers.read().await; + + match query { + VersionedKeySpaceQuery::Uniform { keyspace, lsn } => { + // LSNs requested by the compute or determined by the pageserver + // are inclusive. Queries to the layer map use exclusive LSNs. + // Hence, bump the value before the query - same in the other + // match arm. + let cont_lsn = Lsn(lsn.0 + 1); + guard.update_search_fringe(keyspace, cont_lsn, &mut fringe)?; + } + VersionedKeySpaceQuery::Scattered { keyspaces_at_lsn } => { + for (lsn, keyspace) in keyspaces_at_lsn.iter() { + let cont_lsn_for_keyspace = Lsn(lsn.0 + 1); + guard.update_search_fringe(keyspace, cont_lsn_for_keyspace, &mut fringe)?; + } + } + } + + Ok(fringe) + } + /// Collect the reconstruct data for a keyspace from the specified timeline. 
/// /// Maintain a fringe [`LayerFringe`] which tracks all the layers that intersect @@ -4098,18 +4295,11 @@ impl Timeline { /// decides how to deal with these two keyspaces. async fn get_vectored_reconstruct_data_timeline( timeline: &Timeline, - keyspace: KeySpace, - mut cont_lsn: Lsn, + query: &VersionedKeySpaceQuery, reconstruct_state: &mut ValuesReconstructState, cancel: &CancellationToken, ctx: &RequestContext, ) -> Result { - let mut unmapped_keyspace = keyspace.clone(); - let mut fringe = LayerFringe::new(); - - let mut completed_keyspace = KeySpace::default(); - let mut image_covered_keyspace = KeySpaceRandomAccum::new(); - // Prevent GC from progressing while visiting the current timeline. // If we are GC-ing because a new image layer was added while traversing // the timeline, then it will remove layers that are required for fulfilling @@ -4120,11 +4310,37 @@ impl Timeline { // See `compaction::compact_with_gc` for why we need this. let _guard = timeline.gc_compaction_layer_update_lock.read().await; - loop { + // Initialize the fringe + let mut fringe = timeline.get_vectored_init_fringe(query).await?; + + let mut completed_keyspace = KeySpace::default(); + let mut image_covered_keyspace = KeySpaceRandomAccum::new(); + + while let Some((layer_to_read, keyspace_to_read, lsn_range)) = fringe.next_layer() { if cancel.is_cancelled() { return Err(GetVectoredError::Cancelled); } + if let Some(ref mut read_path) = reconstruct_state.read_path { + read_path.record_layer_visit(&layer_to_read, &keyspace_to_read, &lsn_range); + } + + // Visit the layer and plan IOs for it + let next_cont_lsn = lsn_range.start; + layer_to_read + .get_values_reconstruct_data( + keyspace_to_read.clone(), + lsn_range, + reconstruct_state, + ctx, + ) + .await?; + + let mut unmapped_keyspace = keyspace_to_read; + let cont_lsn = next_cont_lsn; + + reconstruct_state.on_layer_visited(&layer_to_read); + let (keys_done_last_step, keys_with_image_coverage) = reconstruct_state.consume_done_keys(); 
unmapped_keyspace.remove_overlapping_with(&keys_done_last_step); @@ -4135,31 +4351,15 @@ impl Timeline { image_covered_keyspace.add_range(keys_with_image_coverage); } + // Query the layer map for the next layers to read. + // // Do not descent any further if the last layer we visited // completed all keys in the keyspace it inspected. This is not // required for correctness, but avoids visiting extra layers // which turns out to be a perf bottleneck in some cases. if !unmapped_keyspace.is_empty() { let guard = timeline.layers.read().await; - let layers = guard.layer_map()?; - - for range in unmapped_keyspace.ranges.iter() { - let results = layers.range_search(range.clone(), cont_lsn); - - results - .found - .into_iter() - .map(|(SearchResult { layer, lsn_floor }, keyspace_accum)| { - ( - guard.upgrade(layer), - keyspace_accum.to_keyspace(), - lsn_floor..cont_lsn, - ) - }) - .for_each(|(layer, keyspace, lsn_range)| { - fringe.update(layer, keyspace, lsn_range) - }); - } + guard.update_search_fringe(&unmapped_keyspace, cont_lsn, &mut fringe)?; // It's safe to drop the layer map lock after planning the next round of reads. // The fringe keeps readable handles for the layers which are safe to read even @@ -4173,28 +4373,6 @@ impl Timeline { // at two different time points. 
drop(guard); } - - if let Some((layer_to_read, keyspace_to_read, lsn_range)) = fringe.next_layer() { - if let Some(ref mut read_path) = reconstruct_state.read_path { - read_path.record_layer_visit(&layer_to_read, &keyspace_to_read, &lsn_range); - } - let next_cont_lsn = lsn_range.start; - layer_to_read - .get_values_reconstruct_data( - keyspace_to_read.clone(), - lsn_range, - reconstruct_state, - ctx, - ) - .await?; - - unmapped_keyspace = keyspace_to_read; - cont_lsn = next_cont_lsn; - - reconstruct_state.on_layer_visited(&layer_to_read); - } else { - break; - } } Ok(TimelineVisitOutcome { @@ -4808,7 +4986,13 @@ impl Timeline { let ctx = ctx.attached_child(); let work = async move { let Some((desc, path)) = frozen_layer - .write_to_disk(&ctx, key_range, self_clone.l0_flush_global_state.inner()) + .write_to_disk( + &ctx, + key_range, + self_clone.l0_flush_global_state.inner(), + &self_clone.gate, + self_clone.cancel.clone(), + ) .await? else { return Ok(None); @@ -4994,13 +5178,11 @@ impl Timeline { if key_request_accum.raw_size() >= Timeline::MAX_GET_VECTORED_KEYS || (last_key_in_range && key_request_accum.raw_size() > 0) { + let query = + VersionedKeySpaceQuery::uniform(key_request_accum.consume_keyspace(), lsn); + let results = self - .get_vectored( - key_request_accum.consume_keyspace(), - lsn, - io_concurrency.clone(), - ctx, - ) + .get_vectored(query, io_concurrency.clone(), ctx) .await?; if self.cancel.is_cancelled() { @@ -5089,7 +5271,11 @@ impl Timeline { // Directly use `get_vectored_impl` to skip the max_vectored_read_key limit check. Note that the keyspace should // not contain too many keys, otherwise this takes a lot of memory. 
let data = self - .get_vectored_impl(partition.clone(), lsn, &mut reconstruct_state, ctx) + .get_vectored_impl( + VersionedKeySpaceQuery::uniform(partition.clone(), lsn), + &mut reconstruct_state, + ctx, + ) .await?; let (data, total_kb_retrieved, total_keys_retrieved) = { let mut new_data = BTreeMap::new(); @@ -5346,6 +5532,8 @@ impl Timeline { self.tenant_shard_id, &img_range, lsn, + &self.gate, + self.cancel.clone(), ctx, ) .await?; @@ -6353,37 +6541,21 @@ impl Timeline { /// Reconstruct a value, using the given base image and WAL records in 'data'. async fn reconstruct_value( - &self, - key: Key, - request_lsn: Lsn, - data: ValueReconstructState, - ) -> Result { - self.reconstruct_value_inner(key, request_lsn, data, false) - .await - } - - /// Reconstruct a value, using the given base image and WAL records in 'data'. It does not fire critical errors because - /// sometimes it is expected to fail due to unreplayable history described in . - async fn reconstruct_value_wo_critical_error( - &self, - key: Key, - request_lsn: Lsn, - data: ValueReconstructState, - ) -> Result { - self.reconstruct_value_inner(key, request_lsn, data, true) - .await - } - - async fn reconstruct_value_inner( &self, key: Key, request_lsn: Lsn, mut data: ValueReconstructState, - no_critical_error: bool, + redo_attempt_type: RedoAttemptType, ) -> Result { // Perform WAL redo if needed data.records.reverse(); + let fire_critical_error = match redo_attempt_type { + RedoAttemptType::ReadPage => true, + RedoAttemptType::LegacyCompaction => true, + RedoAttemptType::GcCompaction => false, + }; + // If we have a page image, and no WAL, we're all set if data.records.is_empty() { if let Some((img_lsn, img)) = &data.img { @@ -6430,13 +6602,20 @@ impl Timeline { .as_ref() .context("timeline has no walredo manager") .map_err(PageReconstructError::WalRedo)? 
- .request_redo(key, request_lsn, data.img, data.records, self.pg_version) + .request_redo( + key, + request_lsn, + data.img, + data.records, + self.pg_version, + redo_attempt_type, + ) .await; let img = match res { Ok(img) => img, Err(walredo::Error::Cancelled) => return Err(PageReconstructError::Cancelled), Err(walredo::Error::Other(err)) => { - if !no_critical_error { + if fire_critical_error { critical!("walredo failure during page reconstruction: {err:?}"); } return Err(PageReconstructError::WalRedo( @@ -6719,6 +6898,8 @@ impl Timeline { self.tenant_shard_id, &(min_key..end_key), lsn, + &self.gate, + self.cancel.clone(), ctx, ) .await?; @@ -6780,6 +6961,8 @@ impl Timeline { self.tenant_shard_id, deltas.key_range.start, deltas.lsn_range, + &self.gate, + self.cancel.clone(), ctx, ) .await?; diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs index 8403c0a7d9c2..91cc8ca10c9f 100644 --- a/pageserver/src/tenant/timeline/compaction.rs +++ b/pageserver/src/tenant/timeline/compaction.rs @@ -7,7 +7,7 @@ use std::collections::{BinaryHeap, HashMap, HashSet, VecDeque}; use std::ops::{Deref, Range}; use std::sync::Arc; -use std::time::Instant; +use std::time::{Duration, Instant}; use super::layer_manager::LayerManager; use super::{ @@ -16,6 +16,8 @@ use super::{ Timeline, }; +use crate::tenant::timeline::DeltaEntry; +use crate::walredo::RedoAttemptType; use anyhow::{Context, anyhow}; use bytes::Bytes; use enumset::EnumSet; @@ -78,6 +80,7 @@ impl std::fmt::Display for GcCompactionJobId { pub struct GcCompactionCombinedSettings { pub gc_compaction_enabled: bool, + pub gc_compaction_verification: bool, pub gc_compaction_initial_threshold_kb: u64, pub gc_compaction_ratio_percent: u64, } @@ -223,6 +226,7 @@ impl GcCompactionQueue { gc_compaction_enabled, gc_compaction_initial_threshold_kb, gc_compaction_ratio_percent, + .. 
} = timeline.get_gc_compaction_settings(); if !gc_compaction_enabled { return Ok(()); @@ -315,6 +319,9 @@ impl GcCompactionQueue { flags: { let mut flags = EnumSet::new(); flags |= CompactFlags::EnhancedGcBottomMostCompaction; + if timeline.get_compaction_l0_first() { + flags |= CompactFlags::YieldForL0; + } flags }, sub_compaction: true, @@ -742,8 +749,8 @@ impl KeyHistoryRetention { async fn pipe_to( self, key: Key, - delta_writer: &mut SplitDeltaLayerWriter, - mut image_writer: Option<&mut SplitImageLayerWriter>, + delta_writer: &mut SplitDeltaLayerWriter<'_>, + mut image_writer: Option<&mut SplitImageLayerWriter<'_>>, stat: &mut CompactionStatistics, ctx: &RequestContext, ) -> anyhow::Result<()> { @@ -783,6 +790,114 @@ impl KeyHistoryRetention { } Ok(()) } + + /// Verify if every key in the retention is readable by replaying the logs. + async fn verify( + &self, + key: Key, + base_img_from_ancestor: &Option<(Key, Lsn, Bytes)>, + full_history: &[(Key, Lsn, Value)], + tline: &Arc, + ) -> anyhow::Result<()> { + // Usually the min_lsn should be the first record but we do a full iteration to be safe. + let Some(min_lsn) = full_history.iter().map(|(_, lsn, _)| *lsn).min() else { + // This should never happen b/c if we don't have any history of a key, we won't even do `generate_key_retention`. + return Ok(()); + }; + let Some(max_lsn) = full_history.iter().map(|(_, lsn, _)| *lsn).max() else { + // This should never happen b/c if we don't have any history of a key, we won't even do `generate_key_retention`. 
+ return Ok(()); + }; + let mut base_img = base_img_from_ancestor + .as_ref() + .map(|(_, lsn, img)| (*lsn, img)); + let mut history = Vec::new(); + + async fn collect_and_verify( + key: Key, + lsn: Lsn, + base_img: &Option<(Lsn, &Bytes)>, + history: &[(Lsn, &NeonWalRecord)], + tline: &Arc, + ) -> anyhow::Result<()> { + let mut records = history + .iter() + .map(|(lsn, val)| (*lsn, (*val).clone())) + .collect::>(); + + // WAL redo requires records in the reverse LSN order + records.reverse(); + let data = ValueReconstructState { + img: base_img.as_ref().map(|(lsn, img)| (*lsn, (*img).clone())), + records, + }; + + tline + .reconstruct_value(key, lsn, data, RedoAttemptType::GcCompaction) + .await + .with_context(|| format!("verification failed for key {} at lsn {}", key, lsn))?; + + Ok(()) + } + + for (retain_lsn, KeyLogAtLsn(logs)) in &self.below_horizon { + for (lsn, val) in logs { + match val { + Value::Image(img) => { + base_img = Some((*lsn, img)); + history.clear(); + } + Value::WalRecord(rec) if val.will_init() => { + base_img = None; + history.clear(); + history.push((*lsn, rec)); + } + Value::WalRecord(rec) => { + history.push((*lsn, rec)); + } + } + } + if *retain_lsn >= min_lsn { + // Only verify after the key appears in the full history for the first time. + + if base_img.is_none() && history.is_empty() { + anyhow::bail!( + "verificatoin failed: key {} has no history at {}", + key, + retain_lsn + ); + }; + // We don't modify history: in theory, we could replace the history with a single + // image as in `generate_key_retention` to make redos at later LSNs faster. But we + // want to verify everything as if they are read from the real layer map. + collect_and_verify(key, *retain_lsn, &base_img, &history, tline).await?; + } + } + + for (lsn, val) in &self.above_horizon.0 { + match val { + Value::Image(img) => { + // Above the GC horizon, we verify every time we see an image. 
+ collect_and_verify(key, *lsn, &base_img, &history, tline).await?; + base_img = Some((*lsn, img)); + history.clear(); + } + Value::WalRecord(rec) if val.will_init() => { + // Above the GC horizon, we verify every time we see an init record. + collect_and_verify(key, *lsn, &base_img, &history, tline).await?; + base_img = None; + history.clear(); + history.push((*lsn, rec)); + } + Value::WalRecord(rec) => { + history.push((*lsn, rec)); + } + } + } + // Ensure the latest record is readable. + collect_and_verify(key, max_lsn, &base_img, &history, tline).await?; + Ok(()) + } } #[derive(Debug, Serialize, Default)] @@ -819,15 +934,16 @@ pub struct CompactionStatistics { time_acquire_lock_secs: f64, time_analyze_secs: f64, time_download_layer_secs: f64, + time_to_first_kv_pair_secs: f64, time_main_loop_secs: f64, time_final_phase_secs: f64, time_total_secs: f64, // Summary - /// Ratio of the key-value size before/after gc-compaction. - uncompressed_size_ratio: f64, - /// Ratio of the physical size before/after gc-compaction. - physical_size_ratio: f64, + /// Ratio of the key-value size after/before gc-compaction. + uncompressed_retention_ratio: f64, + /// Ratio of the physical size after/before gc-compaction. 
+ compressed_retention_ratio: f64, } impl CompactionStatistics { @@ -896,15 +1012,15 @@ impl CompactionStatistics { fn finalize(&mut self) { let original_key_value_size = self.image_keys_visited.size + self.wal_keys_visited.size; let produced_key_value_size = self.image_produced.size + self.wal_produced.size; - self.uncompressed_size_ratio = - original_key_value_size as f64 / (produced_key_value_size as f64 + 1.0); // avoid div by 0 + self.uncompressed_retention_ratio = + produced_key_value_size as f64 / (original_key_value_size as f64 + 1.0); // avoid div by 0 let original_physical_size = self.image_layer_visited.size + self.delta_layer_visited.size; let produced_physical_size = self.image_layer_produced.size + self.delta_layer_produced.size + self.image_layer_discarded.size + self.delta_layer_discarded.size; // Also include the discarded layers to make the ratio accurate - self.physical_size_ratio = - original_physical_size as f64 / (produced_physical_size as f64 + 1.0); // avoid div by 0 + self.compressed_retention_ratio = + produced_physical_size as f64 / (original_physical_size as f64 + 1.0); // avoid div by 0 } } @@ -1113,7 +1229,17 @@ impl Timeline { // being potentially much longer. 
let rewrite_max = partition_count; - self.compact_shard_ancestors(rewrite_max, ctx).await?; + let outcome = self + .compact_shard_ancestors( + rewrite_max, + options.flags.contains(CompactFlags::YieldForL0), + ctx, + ) + .await?; + match outcome { + CompactionOutcome::Pending | CompactionOutcome::YieldForL0 => return Ok(outcome), + CompactionOutcome::Done | CompactionOutcome::Skipped => {} + } } Ok(CompactionOutcome::Done) @@ -1130,8 +1256,10 @@ impl Timeline { async fn compact_shard_ancestors( self: &Arc, rewrite_max: usize, + yield_for_l0: bool, ctx: &RequestContext, - ) -> Result<(), CompactionError> { + ) -> Result { + let mut outcome = CompactionOutcome::Done; let mut drop_layers = Vec::new(); let mut layers_to_rewrite: Vec = Vec::new(); @@ -1142,12 +1270,7 @@ impl Timeline { // Holding this read guard also blocks [`Self::gc_timeline`] from entering while we // are rewriting layers. let latest_gc_cutoff = self.get_applied_gc_cutoff_lsn(); - - tracing::info!( - "starting shard ancestor compaction, latest_gc_cutoff: {}, pitr cutoff {}", - *latest_gc_cutoff, - self.gc_info.read().unwrap().cutoffs.time - ); + let pitr_cutoff = self.gc_info.read().unwrap().cutoffs.time; let layers = self.layers.read().await; for layer_desc in layers.layer_map()?.iter_historic_layers() { @@ -1165,8 +1288,8 @@ impl Timeline { // This ancestral layer only covers keys that belong to other shards. // We include the full metadata in the log: if we had some critical bug that caused // us to incorrectly drop layers, this would simplify manually debugging + reinstating those layers. 
- info!(%layer, old_metadata=?layer.metadata(), - "dropping layer after shard split, contains no keys for this shard.", + debug!(%layer, old_metadata=?layer.metadata(), + "dropping layer after shard split, contains no keys for this shard", ); if cfg!(debug_assertions) { @@ -1228,19 +1351,35 @@ impl Timeline { } if layers_to_rewrite.len() >= rewrite_max { - tracing::info!(%layer, "Will rewrite layer on a future compaction, already rewrote {}", + debug!(%layer, "Will rewrite layer on a future compaction, already rewrote {}", layers_to_rewrite.len() ); - continue; + outcome = CompactionOutcome::Pending; + break; } // Fall through: all our conditions for doing a rewrite passed. layers_to_rewrite.push(layer); } - // Drop read lock on layer map before we start doing time-consuming I/O + // Drop read lock on layer map before we start doing time-consuming I/O. drop(layers); + // Drop out early if there's nothing to do. + if layers_to_rewrite.is_empty() && drop_layers.is_empty() { + return Ok(CompactionOutcome::Done); + } + + info!( + "starting shard ancestor compaction, rewriting {} layers and dropping {} layers \ + (latest_gc_cutoff={} pitr_cutoff={})", + layers_to_rewrite.len(), + drop_layers.len(), + *latest_gc_cutoff, + pitr_cutoff, + ); + let started = Instant::now(); + let mut replace_image_layers = Vec::new(); for layer in layers_to_rewrite { @@ -1248,13 +1387,15 @@ impl Timeline { return Err(CompactionError::ShuttingDown); } - tracing::info!(layer=%layer, "Rewriting layer after shard split..."); + info!(layer=%layer, "rewriting layer after shard split"); let mut image_layer_writer = ImageLayerWriter::new( self.conf, self.timeline_id, self.tenant_shard_id, &layer.layer_desc().key_range, layer.layer_desc().image_layer_lsn(), + &self.gate, + self.cancel.clone(), ctx, ) .await @@ -1286,7 +1427,7 @@ impl Timeline { .map_err(CompactionError::Other)?; let new_layer = Layer::finish_creating(self.conf, self, desc, &path) .map_err(CompactionError::Other)?; - 
tracing::info!(layer=%new_layer, "Rewrote layer, {} -> {} bytes", + info!(layer=%new_layer, "rewrote layer, {} -> {} bytes", layer.metadata().file_size, new_layer.metadata().file_size); @@ -1296,6 +1437,26 @@ impl Timeline { // the layer has no data for us with the ShardedRange check above, but drop_layers.push(layer); } + + // Yield for L0 compaction if necessary, but make sure we update the layer map below + // with the work we've already done. + if yield_for_l0 + && self + .l0_compaction_trigger + .notified() + .now_or_never() + .is_some() + { + info!("shard ancestor compaction yielding for L0 compaction"); + outcome = CompactionOutcome::YieldForL0; + break; + } + } + + for layer in &drop_layers { + info!(%layer, old_metadata=?layer.metadata(), + "dropping layer after shard split (no keys for this shard)", + ); } // At this point, we have replaced local layer files with their rewritten form, but not yet uploaded @@ -1313,17 +1474,36 @@ impl Timeline { // necessary for correctness, but it simplifies testing, and avoids proceeding with another // Timeline's compaction while this timeline's uploads may be generating lots of disk I/O // load. - match self.remote_client.wait_completion().await { - Ok(()) => (), - Err(WaitCompletionError::NotInitialized(ni)) => return Err(CompactionError::from(ni)), - Err(WaitCompletionError::UploadQueueShutDownOrStopped) => { - return Err(CompactionError::ShuttingDown); + if outcome != CompactionOutcome::YieldForL0 { + info!("shard ancestor compaction waiting for uploads"); + tokio::select! { + result = self.remote_client.wait_completion() => match result { + Ok(()) => {}, + Err(WaitCompletionError::NotInitialized(ni)) => return Err(CompactionError::from(ni)), + Err(WaitCompletionError::UploadQueueShutDownOrStopped) => { + return Err(CompactionError::ShuttingDown); + } + }, + // Don't wait if there's L0 compaction to do. We don't need to update the outcome + // here, because we've already done the actual work. 
+ _ = self.l0_compaction_trigger.notified(), if yield_for_l0 => {}, } } + info!( + "shard ancestor compaction done in {:.3}s{}", + started.elapsed().as_secs_f64(), + match outcome { + CompactionOutcome::Pending => + format!(", with pending work (rewrite_max={rewrite_max})"), + CompactionOutcome::YieldForL0 => String::from(", yielding for L0 compaction"), + CompactionOutcome::Skipped | CompactionOutcome::Done => String::new(), + } + ); + fail::fail_point!("compact-shard-ancestors-persistent"); - Ok(()) + Ok(outcome) } /// Update the LayerVisibilityHint of layers covered by image layers, based on whether there is @@ -1855,6 +2035,8 @@ impl Timeline { debug!("Create new layer {}..{}", lsn_range.start, lsn_range.end); lsn_range.clone() }, + &self.gate, + self.cancel.clone(), ctx, ) .await @@ -2142,6 +2324,7 @@ impl Timeline { /// ``` /// /// Note that `accumulated_values` must be sorted by LSN and should belong to a single key. + #[allow(clippy::too_many_arguments)] pub(crate) async fn generate_key_retention( self: &Arc, key: Key, @@ -2150,6 +2333,7 @@ impl Timeline { retain_lsn_below_horizon: &[Lsn], delta_threshold_cnt: usize, base_img_from_ancestor: Option<(Key, Lsn, Bytes)>, + verification: bool, ) -> anyhow::Result { // Pre-checks for the invariants @@ -2236,8 +2420,8 @@ impl Timeline { "should have at least below + above horizon batches" ); let mut replay_history: Vec<(Key, Lsn, Value)> = Vec::new(); - if let Some((key, lsn, img)) = base_img_from_ancestor { - replay_history.push((key, lsn, Value::Image(img))); + if let Some((key, lsn, ref img)) = base_img_from_ancestor { + replay_history.push((key, lsn, Value::Image(img.clone()))); } /// Generate debug information for the replay history @@ -2351,22 +2535,15 @@ impl Timeline { // Whether to reconstruct the image. In debug mode, we will generate an image // at every retain_lsn to ensure data is not corrupted, but we won't put the // image into the final layer. 
- let generate_image = produce_image || debug_mode; - if produce_image { + let img_and_lsn = if produce_image { records_since_last_image = 0; - } - let img_and_lsn = if generate_image { let replay_history_for_debug = if debug_mode { Some(replay_history.clone()) } else { None }; let replay_history_for_debug_ref = replay_history_for_debug.as_deref(); - let history = if produce_image { - std::mem::take(&mut replay_history) - } else { - replay_history.clone() - }; + let history = std::mem::take(&mut replay_history); let mut img = None; let mut records = Vec::with_capacity(history.len()); if let (_, lsn, Value::Image(val)) = history.first().as_ref().unwrap() { @@ -2401,6 +2578,7 @@ impl Timeline { records.push((lsn, rec)); } } + // WAL redo requires records in the reverse LSN order records.reverse(); let state = ValueReconstructState { img, records }; // last batch does not generate image so i is always in range, unless we force generate @@ -2411,7 +2589,7 @@ impl Timeline { lsn_split_points[i] }; let img = self - .reconstruct_value_wo_critical_error(key, request_lsn, state) + .reconstruct_value(key, request_lsn, state, RedoAttemptType::GcCompaction) .await?; Some((request_lsn, img)) } else { @@ -2433,10 +2611,16 @@ impl Timeline { assert_eq!(retention.len(), lsn_split_points.len() + 1); for (idx, logs) in retention.into_iter().enumerate() { if idx == lsn_split_points.len() { - return Ok(KeyHistoryRetention { + let retention = KeyHistoryRetention { below_horizon: result, above_horizon: KeyLogAtLsn(logs), - }); + }; + if verification { + retention + .verify(key, &base_img_from_ancestor, full_history, self) + .await?; + } + return Ok(retention); } else { result.push((lsn_split_points[idx], KeyLogAtLsn(logs))); } @@ -2903,6 +3087,9 @@ impl Timeline { } (false, res) }; + + let verification = self.get_gc_compaction_settings().gc_compaction_verification; + info!( "picked {} layers for compaction ({} layers need rewriting) with max_layer_lsn={} min_layer_lsn={} gc_cutoff={} 
lowest_retain_lsn={}, key_range={}..{}, has_data_below={}", job_desc.selected_layers.len(), @@ -3032,7 +3219,7 @@ impl Timeline { .map_err(CompactionError::Other)?; let time_download_layer = timer.elapsed(); - let timer = Instant::now(); + let mut timer = Instant::now(); // Step 2: Produce images+deltas. let mut accumulated_values = Vec::new(); @@ -3049,6 +3236,8 @@ impl Timeline { job_desc.compaction_key_range.start, lowest_retain_lsn, self.get_compaction_target_size(), + &self.gate, + self.cancel.clone(), ctx, ) .await @@ -3065,6 +3254,8 @@ impl Timeline { self.tenant_shard_id, lowest_retain_lsn..end_lsn, self.get_compaction_target_size(), + &self.gate, + self.cancel.clone(), ) .await .context("failed to create delta layer writer") @@ -3107,6 +3298,7 @@ impl Timeline { // Actually, we can decide not to write to the image layer at all at this point because // the key and LSN range are determined. However, to keep things simple here, we still // create this writer, and discard the writer in the end. + let mut time_to_first_kv_pair = None; while let Some(((key, lsn, val), desc)) = merge_iter .next_with_trace() @@ -3114,6 +3306,11 @@ impl Timeline { .context("failed to get next key-value pair") .map_err(CompactionError::Other)? 
{ + if time_to_first_kv_pair.is_none() { + time_to_first_kv_pair = Some(timer.elapsed()); + timer = Instant::now(); + } + if cancel.is_cancelled() { return Err(CompactionError::ShuttingDown); } @@ -3155,6 +3352,8 @@ impl Timeline { self.tenant_shard_id, desc.key_range.start, desc.lsn_range.clone(), + &self.gate, + self.cancel.clone(), ctx, ) .await @@ -3172,6 +3371,8 @@ impl Timeline { self.tenant_shard_id, job_desc.compaction_key_range.end, desc.lsn_range.clone(), + &self.gate, + self.cancel.clone(), ctx, ) .await @@ -3213,6 +3414,7 @@ impl Timeline { .await .context("failed to get ancestor image") .map_err(CompactionError::Other)?, + verification, ) .await .context("failed to generate key retention") @@ -3253,6 +3455,7 @@ impl Timeline { .await .context("failed to get ancestor image") .map_err(CompactionError::Other)?, + verification, ) .await .context("failed to generate key retention") @@ -3449,6 +3652,9 @@ impl Timeline { let time_final_phase = timer.elapsed(); stat.time_final_phase_secs = time_final_phase.as_secs_f64(); + stat.time_to_first_kv_pair_secs = time_to_first_kv_pair + .unwrap_or(Duration::ZERO) + .as_secs_f64(); stat.time_main_loop_secs = time_main_loop.as_secs_f64(); stat.time_acquire_lock_secs = time_acquire_lock.as_secs_f64(); stat.time_download_layer_secs = time_download_layer.as_secs_f64(); @@ -3738,6 +3944,8 @@ impl CompactionJobExecutor for TimelineAdaptor { self.timeline.tenant_shard_id, key_range.start, lsn_range.clone(), + &self.timeline.gate, + self.timeline.cancel.clone(), ctx, ) .await?; @@ -3813,6 +4021,8 @@ impl TimelineAdaptor { self.timeline.tenant_shard_id, key_range, lsn, + &self.timeline.gate, + self.timeline.cancel.clone(), ctx, ) .await?; @@ -3909,8 +4119,6 @@ impl CompactionLayer for OwnArc { } } -use crate::tenant::timeline::DeltaEntry; - impl CompactionLayer for ResidentDeltaLayer { fn key_range(&self) -> &Range { &self.0.layer_desc().key_range diff --git a/pageserver/src/tenant/timeline/detach_ancestor.rs 
b/pageserver/src/tenant/timeline/detach_ancestor.rs index 1b0d22dc82b1..a841cc55f011 100644 --- a/pageserver/src/tenant/timeline/detach_ancestor.rs +++ b/pageserver/src/tenant/timeline/detach_ancestor.rs @@ -30,6 +30,7 @@ use crate::tenant::storage_layer::{ AsLayerDesc as _, DeltaLayerWriter, ImageLayerWriter, IoConcurrency, Layer, ResidentLayer, ValuesReconstructState, }; +use crate::tenant::timeline::VersionedKeySpaceQuery; use crate::virtual_file::{MaybeFatalIo, VirtualFile}; #[derive(Debug, thiserror::Error)] @@ -212,13 +213,9 @@ async fn generate_tombstone_image_layer( } } + let query = VersionedKeySpaceQuery::uniform(KeySpace::single(key_range.clone()), image_lsn); let data = ancestor - .get_vectored_impl( - KeySpace::single(key_range.clone()), - image_lsn, - &mut reconstruct_state, - ctx, - ) + .get_vectored_impl(query, &mut reconstruct_state, ctx) .await .context("failed to retrieve aux keys") .map_err(|e| Error::launder(e, Error::Prepare))?; @@ -231,6 +228,8 @@ async fn generate_tombstone_image_layer( detached.tenant_shard_id, &key_range, image_lsn, + &detached.gate, + detached.cancel.clone(), ctx, ) .await @@ -779,6 +778,8 @@ async fn copy_lsn_prefix( target_timeline.tenant_shard_id, layer.layer_desc().key_range.start, layer.layer_desc().lsn_range.start..end_lsn, + &target_timeline.gate, + target_timeline.cancel.clone(), ctx, ) .await diff --git a/pageserver/src/tenant/timeline/import_pgdata/flow.rs b/pageserver/src/tenant/timeline/import_pgdata/flow.rs index 3ef82b36588a..c6d2944769fe 100644 --- a/pageserver/src/tenant/timeline/import_pgdata/flow.rs +++ b/pageserver/src/tenant/timeline/import_pgdata/flow.rs @@ -738,6 +738,8 @@ impl ChunkProcessingJob { self.timeline.tenant_shard_id, &self.range, self.pgdata_lsn, + &self.timeline.gate, + self.timeline.cancel.clone(), ctx, ) .await?; diff --git a/pageserver/src/tenant/timeline/layer_manager.rs b/pageserver/src/tenant/timeline/layer_manager.rs index ed92ea28ce20..ae898260d2df 100644 --- 
a/pageserver/src/tenant/timeline/layer_manager.rs +++ b/pageserver/src/tenant/timeline/layer_manager.rs @@ -3,17 +3,18 @@ use std::sync::Arc; use anyhow::{Context, bail, ensure}; use itertools::Itertools; +use pageserver_api::keyspace::KeySpace; use pageserver_api::shard::TenantShardId; use tokio_util::sync::CancellationToken; use tracing::trace; use utils::id::TimelineId; use utils::lsn::{AtomicLsn, Lsn}; -use super::{ReadableLayer, TimelineWriterState}; +use super::{LayerFringe, ReadableLayer, TimelineWriterState}; use crate::config::PageServerConf; use crate::context::RequestContext; use crate::metrics::TimelineMetrics; -use crate::tenant::layer_map::{BatchedUpdates, LayerMap}; +use crate::tenant::layer_map::{BatchedUpdates, LayerMap, SearchResult}; use crate::tenant::storage_layer::{ AsLayerDesc, InMemoryLayer, Layer, LayerVisibilityHint, PersistentLayerDesc, PersistentLayerKey, ReadableLayerWeak, ResidentLayer, @@ -38,7 +39,7 @@ impl Default for LayerManager { } impl LayerManager { - pub(crate) fn upgrade(&self, weak: ReadableLayerWeak) -> ReadableLayer { + fn upgrade(&self, weak: ReadableLayerWeak) -> ReadableLayer { match weak { ReadableLayerWeak::PersistentLayer(desc) => { ReadableLayer::PersistentLayer(self.get_from_desc(&desc)) @@ -147,6 +148,36 @@ impl LayerManager { self.layers().keys().cloned().collect_vec() } + /// Update the [`LayerFringe`] of a read request + /// + /// Take a key space at a given LSN and query the layer map below each range + /// of the key space to find the next layers to visit. 
+ pub(crate) fn update_search_fringe( + &self, + keyspace: &KeySpace, + cont_lsn: Lsn, + fringe: &mut LayerFringe, + ) -> Result<(), Shutdown> { + let map = self.layer_map()?; + + for range in keyspace.ranges.iter() { + let results = map.range_search(range.clone(), cont_lsn); + results + .found + .into_iter() + .map(|(SearchResult { layer, lsn_floor }, keyspace_accum)| { + ( + self.upgrade(layer), + keyspace_accum.to_keyspace(), + lsn_floor..cont_lsn, + ) + }) + .for_each(|(layer, keyspace, lsn_range)| fringe.update(layer, keyspace, lsn_range)); + } + + Ok(()) + } + fn layers(&self) -> &HashMap { use LayerManager::*; match self { diff --git a/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs b/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs index df2663f6bb04..3c3608d1bd1f 100644 --- a/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs +++ b/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs @@ -580,6 +580,7 @@ impl ConnectionManagerState { ); Ok(()) } + WalReceiverError::Cancelled => Ok(()), WalReceiverError::Other(e) => { // give out an error to have task_mgr give it a really verbose logging if cancellation.is_cancelled() { diff --git a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs index f41a9cfe82b9..52259f205bb8 100644 --- a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs +++ b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs @@ -73,6 +73,7 @@ pub(super) enum WalReceiverError { /// Generic error Other(anyhow::Error), ClosedGate, + Cancelled, } impl From for WalReceiverError { @@ -200,6 +201,9 @@ pub(super) async fn handle_walreceiver_connection( // with a similar error. 
}, WalReceiverError::SuccessfulCompletion(_) => {} + WalReceiverError::Cancelled => { + debug!("Connection cancelled") + } WalReceiverError::ClosedGate => { // doesn't happen at runtime } @@ -273,7 +277,12 @@ pub(super) async fn handle_walreceiver_connection( let mut waldecoder = WalStreamDecoder::new(startpoint, timeline.pg_version); - let mut walingest = WalIngest::new(timeline.as_ref(), startpoint, &ctx).await?; + let mut walingest = WalIngest::new(timeline.as_ref(), startpoint, &ctx) + .await + .map_err(|e| match e.kind { + crate::walingest::WalIngestErrorKind::Cancelled => WalReceiverError::Cancelled, + _ => WalReceiverError::Other(e.into()), + })?; let shard = vec![*timeline.get_shard_identity()]; @@ -445,7 +454,7 @@ pub(super) async fn handle_walreceiver_connection( .inspect_err(|err| { // TODO: we can't differentiate cancellation errors with // anyhow::Error, so just ignore it if we're cancelled. - if !cancellation.is_cancelled() { + if !cancellation.is_cancelled() && !timeline.is_stopping() { critical!("{err:?}") } })?; @@ -577,7 +586,7 @@ pub(super) async fn handle_walreceiver_connection( .inspect_err(|err| { // TODO: we can't differentiate cancellation errors with // anyhow::Error, so just ignore it if we're cancelled. 
- if !cancellation.is_cancelled() { + if !cancellation.is_cancelled() && !timeline.is_stopping() { critical!("{err:?}") } })?; diff --git a/pageserver/src/tenant/upload_queue.rs b/pageserver/src/tenant/upload_queue.rs index d5dc9666ce6b..be1b55ffa3ef 100644 --- a/pageserver/src/tenant/upload_queue.rs +++ b/pageserver/src/tenant/upload_queue.rs @@ -302,6 +302,7 @@ pub struct UploadQueueStoppedDeletable { pub(super) deleted_at: SetDeletedFlagProgress, } +#[allow(clippy::large_enum_variant, reason = "TODO")] pub enum UploadQueueStopped { Deletable(UploadQueueStoppedDeletable), Uninitialized, diff --git a/pageserver/src/walingest.rs b/pageserver/src/walingest.rs index 18df065f7646..e60c590f876c 100644 --- a/pageserver/src/walingest.rs +++ b/pageserver/src/walingest.rs @@ -21,13 +21,13 @@ //! redo Postgres process, but some records it can handle directly with //! bespoken Rust code. +use std::backtrace::Backtrace; use std::collections::HashMap; use std::sync::{Arc, OnceLock}; use std::time::{Duration, Instant, SystemTime}; -use anyhow::{Result, bail}; use bytes::{Buf, Bytes}; -use pageserver_api::key::rel_block_to_key; +use pageserver_api::key::{Key, rel_block_to_key}; use pageserver_api::record::NeonWalRecord; use pageserver_api::reltag::{BlockNumber, RelTag, SlruKind}; use pageserver_api::shard::ShardIdentity; @@ -38,7 +38,7 @@ use postgres_ffi::{ fsm_logical_to_physical, pg_constants, }; use tracing::*; -use utils::bin_ser::SerializeError; +use utils::bin_ser::{DeserializeError, SerializeError}; use utils::lsn::Lsn; use utils::rate_limit::RateLimit; use utils::{critical, failpoint_support}; @@ -104,12 +104,101 @@ struct WarnIngestLag { timestamp_invalid_msg_ratelimit: RateLimit, } +pub struct WalIngestError { + pub backtrace: std::backtrace::Backtrace, + pub kind: WalIngestErrorKind, +} + +#[derive(thiserror::Error, Debug)] +pub enum WalIngestErrorKind { + #[error(transparent)] + #[allow(private_interfaces)] + PageReconstructError(#[from] PageReconstructError), + 
#[error(transparent)] + DeserializationFailure(#[from] DeserializeError), + #[error(transparent)] + SerializationFailure(#[from] SerializeError), + #[error("the request contains data not supported by pageserver: {0} @ {1}")] + InvalidKey(Key, Lsn), + #[error("twophase file for xid {0} already exists")] + FileAlreadyExists(u64), + #[error("slru segment {0:?}/{1} already exists")] + SlruAlreadyExists(SlruKind, u32), + #[error("relation already exists")] + RelationAlreadyExists(RelTag), + #[error("invalid reldir key {0}")] + InvalidRelDirKey(Key), + + #[error(transparent)] + LogicalError(anyhow::Error), + #[error(transparent)] + EncodeAuxFileError(anyhow::Error), + #[error(transparent)] + MaybeRelSizeV2Error(anyhow::Error), + + #[error("timeline shutting down")] + Cancelled, +} + +impl From for WalIngestError +where + WalIngestErrorKind: From, +{ + fn from(value: T) -> Self { + WalIngestError { + backtrace: Backtrace::capture(), + kind: WalIngestErrorKind::from(value), + } + } +} + +impl std::error::Error for WalIngestError { + fn source(&self) -> Option<&(dyn std::error::Error + 'static)> { + self.kind.source() + } +} + +impl core::fmt::Display for WalIngestError { + fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { + self.kind.fmt(f) + } +} + +impl core::fmt::Debug for WalIngestError { + fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { + if f.alternate() { + f.debug_map() + .key(&"backtrace") + .value(&self.backtrace) + .key(&"kind") + .value(&self.kind) + .finish() + } else { + writeln!(f, "Error: {:?}", self.kind)?; + if self.backtrace.status() == std::backtrace::BacktraceStatus::Captured { + writeln!(f, "Stack backtrace: {:?}", self.backtrace)?; + } + Ok(()) + } + } +} + +#[macro_export] +macro_rules! 
ensure_walingest { + ($($t:tt)*) => { + _ = || -> Result<(), anyhow::Error> { + anyhow::ensure!($($t)*); + Ok(()) + }().map_err(WalIngestErrorKind::LogicalError)?; + }; +} + impl WalIngest { pub async fn new( timeline: &Timeline, startpoint: Lsn, ctx: &RequestContext, - ) -> anyhow::Result { + ) -> Result { // Fetch the latest checkpoint into memory, so that we can compare with it // quickly in `ingest_record` and update it when it changes. let checkpoint_bytes = timeline.get_checkpoint(startpoint, ctx).await?; @@ -145,7 +234,7 @@ impl WalIngest { interpreted: InterpretedWalRecord, modification: &mut DatadirModification<'_>, ctx: &RequestContext, - ) -> anyhow::Result { + ) -> Result { WAL_INGEST.records_received.inc(); let prev_len = modification.len(); @@ -288,7 +377,7 @@ impl WalIngest { } /// This is the same as AdjustToFullTransactionId(xid) in PostgreSQL - fn adjust_to_full_transaction_id(&self, xid: TransactionId) -> Result { + fn adjust_to_full_transaction_id(&self, xid: TransactionId) -> Result { let next_full_xid = enum_pgversion_dispatch!(&self.checkpoint, CheckPoint, cp, { cp.nextXid.value }); @@ -298,9 +387,9 @@ impl WalIngest { if xid > next_xid { // Wraparound occurred, must be from a prev epoch. 
if epoch == 0 { - bail!( + Err(WalIngestErrorKind::LogicalError(anyhow::anyhow!( "apparent XID wraparound with prepared transaction XID {xid}, nextXid is {next_full_xid}" - ); + )))?; } epoch -= 1; } @@ -313,7 +402,7 @@ impl WalIngest { clear_vm_bits: ClearVmBits, modification: &mut DatadirModification<'_>, ctx: &RequestContext, - ) -> anyhow::Result<()> { + ) -> Result<(), WalIngestError> { let ClearVmBits { new_heap_blkno, old_heap_blkno, @@ -402,7 +491,7 @@ impl WalIngest { create: DbaseCreate, modification: &mut DatadirModification<'_>, ctx: &RequestContext, - ) -> anyhow::Result<()> { + ) -> Result<(), WalIngestError> { let DbaseCreate { db_id, tablespace_id, @@ -505,7 +594,7 @@ impl WalIngest { dbase_drop: DbaseDrop, modification: &mut DatadirModification<'_>, ctx: &RequestContext, - ) -> anyhow::Result<()> { + ) -> Result<(), WalIngestError> { let DbaseDrop { db_id, tablespace_ids, @@ -523,7 +612,7 @@ impl WalIngest { create: SmgrCreate, modification: &mut DatadirModification<'_>, ctx: &RequestContext, - ) -> anyhow::Result<()> { + ) -> Result<(), WalIngestError> { let SmgrCreate { rel } = create; self.put_rel_creation(modification, rel, ctx).await?; Ok(()) @@ -537,7 +626,7 @@ impl WalIngest { truncate: XlSmgrTruncate, modification: &mut DatadirModification<'_>, ctx: &RequestContext, - ) -> anyhow::Result<()> { + ) -> Result<(), WalIngestError> { let XlSmgrTruncate { blkno, rnode, @@ -689,7 +778,7 @@ impl WalIngest { record: XactRecord, modification: &mut DatadirModification<'_>, ctx: &RequestContext, - ) -> anyhow::Result<()> { + ) -> Result<(), WalIngestError> { let (xact_common, is_commit, is_prepared) = match record { XactRecord::Prepare(XactPrepare { xl_xid, data }) => { let xid: u64 = if modification.tline.pg_version >= 17 { @@ -813,7 +902,7 @@ impl WalIngest { truncate: ClogTruncate, modification: &mut DatadirModification<'_>, ctx: &RequestContext, - ) -> anyhow::Result<()> { + ) -> Result<(), WalIngestError> { let ClogTruncate { pageno, oldest_xid, 
@@ -889,7 +978,7 @@ impl WalIngest { zero_page: ClogZeroPage, modification: &mut DatadirModification<'_>, ctx: &RequestContext, - ) -> anyhow::Result<()> { + ) -> Result<(), WalIngestError> { let ClogZeroPage { segno, rpageno } = zero_page; self.put_slru_page_image( @@ -907,7 +996,7 @@ impl WalIngest { &mut self, modification: &mut DatadirModification, xlrec: &XlMultiXactCreate, - ) -> Result<()> { + ) -> Result<(), WalIngestError> { // Create WAL record for updating the multixact-offsets page let pageno = xlrec.mid / pg_constants::MULTIXACT_OFFSETS_PER_PAGE as u32; let segno = pageno / pg_constants::SLRU_PAGES_PER_SEGMENT; @@ -1010,7 +1099,7 @@ impl WalIngest { modification: &mut DatadirModification<'_>, xlrec: &XlMultiXactTruncate, ctx: &RequestContext, - ) -> Result<()> { + ) -> Result<(), WalIngestError> { let (maxsegment, startsegment, endsegment) = enum_pgversion_dispatch!(&mut self.checkpoint, CheckPoint, cp, { cp.oldestMulti = xlrec.end_trunc_off; @@ -1058,7 +1147,7 @@ impl WalIngest { zero_page: MultiXactZeroPage, modification: &mut DatadirModification<'_>, ctx: &RequestContext, - ) -> Result<()> { + ) -> Result<(), WalIngestError> { let MultiXactZeroPage { slru_kind, segno, @@ -1080,7 +1169,7 @@ impl WalIngest { update: RelmapUpdate, modification: &mut DatadirModification<'_>, ctx: &RequestContext, - ) -> Result<()> { + ) -> Result<(), WalIngestError> { let RelmapUpdate { update, buf } = update; modification @@ -1093,7 +1182,7 @@ impl WalIngest { raw_record: RawXlogRecord, modification: &mut DatadirModification<'_>, ctx: &RequestContext, - ) -> Result<()> { + ) -> Result<(), WalIngestError> { let RawXlogRecord { info, lsn, mut buf } = raw_record; let pg_version = modification.tline.pg_version; @@ -1235,12 +1324,12 @@ impl WalIngest { put: PutLogicalMessage, modification: &mut DatadirModification<'_>, ctx: &RequestContext, - ) -> Result<()> { + ) -> Result<(), WalIngestError> { let PutLogicalMessage { path, buf } = put; modification.put_file(path.as_str(), 
&buf, ctx).await } - fn ingest_standby_record(&mut self, record: StandbyRecord) -> Result<()> { + fn ingest_standby_record(&mut self, record: StandbyRecord) -> Result<(), WalIngestError> { match record { StandbyRecord::RunningXacts(running_xacts) => { enum_pgversion_dispatch!(&mut self.checkpoint, CheckPoint, cp, { @@ -1258,7 +1347,7 @@ impl WalIngest { &mut self, record: ReploriginRecord, modification: &mut DatadirModification<'_>, - ) -> Result<()> { + ) -> Result<(), WalIngestError> { match record { ReploriginRecord::Set(set) => { modification @@ -1278,7 +1367,7 @@ impl WalIngest { modification: &mut DatadirModification<'_>, rel: RelTag, ctx: &RequestContext, - ) -> Result<()> { + ) -> Result<(), WalIngestError> { modification.put_rel_creation(rel, 0, ctx).await?; Ok(()) } @@ -1291,7 +1380,7 @@ impl WalIngest { blknum: BlockNumber, img: Bytes, ctx: &RequestContext, - ) -> Result<(), PageReconstructError> { + ) -> Result<(), WalIngestError> { self.handle_rel_extend(modification, rel, blknum, ctx) .await?; modification.put_rel_page_image(rel, blknum, img)?; @@ -1305,7 +1394,7 @@ impl WalIngest { blknum: BlockNumber, rec: NeonWalRecord, ctx: &RequestContext, - ) -> Result<()> { + ) -> Result<(), WalIngestError> { self.handle_rel_extend(modification, rel, blknum, ctx) .await?; modification.put_rel_wal_record(rel, blknum, rec)?; @@ -1318,7 +1407,7 @@ impl WalIngest { rel: RelTag, nblocks: BlockNumber, ctx: &RequestContext, - ) -> anyhow::Result<()> { + ) -> Result<(), WalIngestError> { modification.put_rel_truncation(rel, nblocks, ctx).await?; Ok(()) } @@ -1329,7 +1418,7 @@ impl WalIngest { rel: RelTag, blknum: BlockNumber, ctx: &RequestContext, - ) -> Result<(), PageReconstructError> { + ) -> Result<(), WalIngestError> { let new_nblocks = blknum + 1; // Check if the relation exists. We implicitly create relations on first // record. 
@@ -1423,7 +1512,7 @@ impl WalIngest { blknum: BlockNumber, img: Bytes, ctx: &RequestContext, - ) -> Result<()> { + ) -> Result<(), WalIngestError> { if !self.shard.is_shard_zero() { return Ok(()); } @@ -1441,7 +1530,7 @@ impl WalIngest { segno: u32, blknum: BlockNumber, ctx: &RequestContext, - ) -> anyhow::Result<()> { + ) -> Result<(), WalIngestError> { // we don't use a cache for this like we do for relations. SLRUS are explcitly // extended with ZEROPAGE records, not with commit records, so it happens // a lot less frequently. @@ -1509,6 +1598,7 @@ async fn get_relsize( #[allow(clippy::bool_assert_comparison)] #[cfg(test)] mod tests { + use anyhow::Result; use postgres_ffi::RELSEG_SIZE; use super::*; @@ -1530,7 +1620,7 @@ mod tests { } #[tokio::test] - async fn test_zeroed_checkpoint_decodes_correctly() -> Result<()> { + async fn test_zeroed_checkpoint_decodes_correctly() -> Result<(), anyhow::Error> { for i in 14..=16 { dispatch_pgversion!(i, { pgv::CheckPoint::decode(&pgv::ZERO_CHECKPOINT)?; diff --git a/pageserver/src/walredo.rs b/pageserver/src/walredo.rs index 22d8d8381128..ed8a95436902 100644 --- a/pageserver/src/walredo.rs +++ b/pageserver/src/walredo.rs @@ -136,6 +136,16 @@ macro_rules! bail { } } +#[derive(Debug, Clone, Copy)] +pub enum RedoAttemptType { + /// Used for the read path. Will fire critical errors and retry twice if failure. + ReadPage, + // Used for legacy compaction (only used in image compaction). Will fire critical errors and retry once if failure. + LegacyCompaction, + // Used for gc compaction. Will not fire critical errors and not retry. 
+ GcCompaction, +} + /// /// Public interface of WAL redo manager /// @@ -156,11 +166,18 @@ impl PostgresRedoManager { base_img: Option<(Lsn, Bytes)>, records: Vec<(Lsn, NeonWalRecord)>, pg_version: u32, + redo_attempt_type: RedoAttemptType, ) -> Result { if records.is_empty() { bail!("invalid WAL redo request with no records"); } + let max_retry_attempts = match redo_attempt_type { + RedoAttemptType::ReadPage => 2, + RedoAttemptType::LegacyCompaction => 1, + RedoAttemptType::GcCompaction => 0, + }; + let base_img_lsn = base_img.as_ref().map(|p| p.0).unwrap_or(Lsn::INVALID); let mut img = base_img.map(|p| p.1); let mut batch_neon = apply_neon::can_apply_in_neon(&records[0].1); @@ -180,6 +197,7 @@ impl PostgresRedoManager { &records[batch_start..i], self.conf.wal_redo_timeout, pg_version, + max_retry_attempts, ) .await }; @@ -201,6 +219,7 @@ impl PostgresRedoManager { &records[batch_start..], self.conf.wal_redo_timeout, pg_version, + max_retry_attempts, ) .await } @@ -424,11 +443,11 @@ impl PostgresRedoManager { records: &[(Lsn, NeonWalRecord)], wal_redo_timeout: Duration, pg_version: u32, + max_retry_attempts: u32, ) -> Result { *(self.last_redo_at.lock().unwrap()) = Some(Instant::now()); let (rel, blknum) = key.to_rel_block().context("invalid record")?; - const MAX_RETRY_ATTEMPTS: u32 = 1; let mut n_attempts = 0u32; loop { let base_img = &base_img; @@ -486,7 +505,7 @@ impl PostgresRedoManager { info!(n_attempts, "retried walredo succeeded"); } n_attempts += 1; - if n_attempts > MAX_RETRY_ATTEMPTS || result.is_ok() { + if n_attempts > max_retry_attempts || result.is_ok() { return result; } } @@ -560,6 +579,7 @@ mod tests { use super::PostgresRedoManager; use crate::config::PageServerConf; + use crate::walredo::RedoAttemptType; #[tokio::test] async fn test_ping() { @@ -593,6 +613,7 @@ mod tests { None, short_records(), 14, + RedoAttemptType::ReadPage, ) .instrument(h.span()) .await @@ -621,6 +642,7 @@ mod tests { None, short_records(), 14, + 
RedoAttemptType::ReadPage, ) .instrument(h.span()) .await @@ -642,6 +664,7 @@ mod tests { None, short_records(), 16, /* 16 currently produces stderr output on startup, which adds a nice extra edge */ + RedoAttemptType::ReadPage, ) .instrument(h.span()) .await diff --git a/pageserver/src/walredo/apply_neon.rs b/pageserver/src/walredo/apply_neon.rs index 61ae1eb97078..a3840f1f6f72 100644 --- a/pageserver/src/walredo/apply_neon.rs +++ b/pageserver/src/walredo/apply_neon.rs @@ -276,6 +276,7 @@ pub(crate) fn apply_in_neon( append, clear, will_init, + only_if, } => { use bytes::BufMut; if *will_init { @@ -288,6 +289,13 @@ pub(crate) fn apply_in_neon( if *clear { page.clear(); } + if let Some(only_if) = only_if { + if page != only_if.as_bytes() { + return Err(anyhow::anyhow!( + "the current image does not match the expected image, cannot append" + )); + } + } page.put_slice(append.as_bytes()); } } diff --git a/pgxn/neon/Makefile b/pgxn/neon/Makefile index 8259d24359f6..426b176af94d 100644 --- a/pgxn/neon/Makefile +++ b/pgxn/neon/Makefile @@ -4,6 +4,7 @@ MODULE_big = neon OBJS = \ $(WIN32RES) \ + communicator.o \ extension_server.o \ file_cache.o \ hll.o \ diff --git a/pgxn/neon/communicator.c b/pgxn/neon/communicator.c new file mode 100644 index 000000000000..932034e22e14 --- /dev/null +++ b/pgxn/neon/communicator.c @@ -0,0 +1,2504 @@ +/*------------------------------------------------------------------------- + * + * communicator.c + * Functions for communicating with remote pageservers. + * + * This is the so-called "legacy" communicator. It consists of functions that + * are called from the smgr implementation, in pagestore_smgr.c. There are + * plans to replace this with a different implementation, see RFC. + * + * The communicator is a collection of functions that are called in each + * backend, when the backend needs to read a page or other information. It + * does not spawn background threads or anything like that. 
To process + * responses to prefetch requests in a timely fashion, however, it registers + * a ProcessInterrupts hook that gets called periodically from any + * CHECK_FOR_INTERRUPTS() point in the backend. + * + * By the time the functions in this file are called, the caller has already + * established that a request to the pageserver is necessary. The functions + * are only called for permanent relations (i.e. not temp or unlogged tables). + * Before making a call to the communicator, the caller has already checked + * the relation size or local file cache. + * + * However, when processing responses to getpage requests, the communicator + * writes pages directly to the LFC. + * + * The communicator functions take request LSNs as arguments; the caller is + * responsible for determining the correct LSNs to use. There's one exception + * to that, in prefetch_do_request(); it sometimes calls back to + * neon_get_request_lsns(). That's because sometimes a suitable response is + * found in the prefetch buffer and the request LSNs are not needed, and the + * caller doesn't know whether they're needed or not.
+ * + * The main interface consists of the following "synchronous" calls: + * + * communicator_exists - Returns true if a relation file exists + * communicator_nblocks - Returns a relation's size + * communicator_dbsize - Returns a database's total size + * communicator_read_at_lsnv - Read contents of one relation block + * communicator_read_slru_segment - Read contents of one SLRU segment + * + * In addition, there are functions related to prefetching: + * communicator_prefetch_register_bufferv - Start prefetching a page + * communicator_prefetch_lookupv - Check if a page is already in prefetch queue + * + * Misc other functions: + * - communicator_init - Initialize the module at startup + * - communicator_prefetch_pump_state - Called periodically to advance the state + * + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "access/xlog.h" +#include "access/xlogdefs.h" +#include "access/xlog_internal.h" +#include "access/xlogutils.h" +#include "common/hashfn.h" +#include "executor/instrument.h" +#include "libpq/pqformat.h" +#include "miscadmin.h" +#include "port/pg_iovec.h" +#include "postmaster/interrupt.h" +#include "replication/walsender.h" +#include "utils/timeout.h" + +#include "bitmap.h" +#include "communicator.h" +#include "file_cache.h" +#include "neon.h" +#include "neon_perf_counters.h" +#include "pagestore_client.h" + +#if PG_VERSION_NUM >= 150000 +#include "access/xlogrecovery.h" +#endif + +#if PG_VERSION_NUM < 160000 +typedef PGAlignedBlock PGIOAlignedBlock; +#endif + +#define NEON_PANIC_CONNECTION_STATE(shard_no, elvl, message, ...)
\ + neon_shard_log(shard_no, elvl, "Broken connection state: " message, \ + ##__VA_ARGS__) + +page_server_api *page_server; + +static uint32 local_request_counter; +#define GENERATE_REQUEST_ID() (((NeonRequestId)MyProcPid << 32) | ++local_request_counter) + +/* + * Various settings related to prompt (fast) handling of PageStream responses + * at any CHECK_FOR_INTERRUPTS point. + */ +int readahead_getpage_pull_timeout_ms = 0; +static int PS_TIMEOUT_ID = 0; +static bool timeout_set = false; +static bool timeout_signaled = false; + +/* + * We have a CHECK_FOR_INTERRUPTS in page_server->receive(), and we don't want + * that to handle any getpage responses if we're already working on the + * backlog of those, as we'd hit issues with determining which prefetch slot + * we just got a response for. + * + * To protect against that, we have this variable that's set whenever we start + * receiving data for prefetch slots, so that we don't get confused. + * + * Note that in certain error cases during readpage we may leak r_r_g=true, + * which results in a failure to pick up further responses until we first + * actively try to receive new getpage responses. + */ +static bool readpage_reentrant_guard = false; + +static void pagestore_timeout_handler(void); + +#define START_PREFETCH_RECEIVE_WORK() \ + do { \ + readpage_reentrant_guard = true; \ + } while (false) + +#define END_PREFETCH_RECEIVE_WORK() \ + do { \ + readpage_reentrant_guard = false; \ + if (unlikely(timeout_signaled && !InterruptPending)) \ + InterruptPending = true; \ + } while (false) + +/* + * Prefetch implementation: + * + * Prefetch is performed locally by each backend. + * + * There can be up to readahead_buffer_size active IO requests registered at + * any time. Requests using smgr_prefetch are sent to the pageserver, but we + * don't wait on the response. 
Requests using smgr_read are either read from + * the buffer, or (if that's not possible) we wait on the response to arrive - + * this also will allow us to receive other prefetched pages. + * Each request is immediately written to the output buffer of the pageserver + * connection, but may not be flushed if smgr_prefetch is used: pageserver + * flushes sent requests on manual flush, or every neon.flush_output_after + * unflushed requests; which is not necessarily always and all the time. + * + * Once we have received a response, this value will be stored in the response + * buffer, indexed in a hash table. This allows us to retain our buffered + * prefetch responses even when we have cache misses. + * + * Reading of prefetch responses is delayed until they are actually needed + * (smgr_read). In case of prefetch miss or any other SMGR request other than + * smgr_read, all prefetch responses in the pipeline will need to be read from + * the connection; the responses are stored for later use. + * + * NOTE: The current implementation of the prefetch system implements a ring + * buffer of up to readahead_buffer_size requests. If there are more _read and + * _prefetch requests between the initial _prefetch and the _read of a buffer, + * the prefetch request will have been dropped from this prefetch buffer, and + * your prefetch was wasted. + */ + +/* + * State machine: + * + * not in hash : in hash + * : + * UNUSED ------> REQUESTED --> RECEIVED + * ^ : | | + * | : v | + * | : TAG_REMAINS | + * | : | | + * +----------------+------------+ + * : + */ +typedef enum PrefetchStatus +{ + PRFS_UNUSED = 0, /* unused slot */ + PRFS_REQUESTED, /* request was written to the sendbuffer to + * PS, but not necessarily flushed.
all fields + * except response valid */ + PRFS_RECEIVED, /* all fields valid */ + PRFS_TAG_REMAINS, /* only buftag and my_ring_index are still + * valid */ +} PrefetchStatus; + +/* must fit in uint8; bits 0x1 are used */ +typedef enum { + PRFSF_NONE = 0x0, + PRFSF_LFC = 0x1 /* received prefetch result is stored in LFC */ +} PrefetchRequestFlags; + +typedef struct PrefetchRequest +{ + BufferTag buftag; /* must be first entry in the struct */ + shardno_t shard_no; + uint8 status; /* see PrefetchStatus for valid values */ + uint8 flags; /* see PrefetchRequestFlags */ + neon_request_lsns request_lsns; + NeonRequestId reqid; + NeonResponse *response; /* may be null */ + uint64 my_ring_index; +} PrefetchRequest; + +/* prefetch buffer lookup hash table */ + +typedef struct PrfHashEntry +{ + PrefetchRequest *slot; + uint32 status; + uint32 hash; +} PrfHashEntry; + +#define SH_PREFIX prfh +#define SH_ELEMENT_TYPE PrfHashEntry +#define SH_KEY_TYPE PrefetchRequest * +#define SH_KEY slot +#define SH_STORE_HASH +#define SH_GET_HASH(tb, a) ((a)->hash) +#define SH_HASH_KEY(tb, key) hash_bytes( \ + ((const unsigned char *) &(key)->buftag), \ + sizeof(BufferTag) \ +) + +#define SH_EQUAL(tb, a, b) (BufferTagsEqual(&(a)->buftag, &(b)->buftag)) +#define SH_SCOPE static inline +#define SH_DEFINE +#define SH_DECLARE +#include "lib/simplehash.h" + +/* + * PrefetchState maintains the state of (prefetch) getPage@LSN requests. + * It maintains a (ring) buffer of in-flight requests and responses. + * + * We maintain several indexes into the ring buffer: + * ring_unused >= ring_flush >= ring_receive >= ring_last >= 0 + * + * ring_unused points to the first unused slot of the buffer + * ring_receive is the next request that is to be received + * ring_last is the oldest received entry in the buffer + * + * Apart from being an entry in the ring buffer of prefetch requests, each + * PrefetchRequest that is not UNUSED is indexed in prf_hash by buftag. 
+ */
+typedef struct PrefetchState
+{
+	MemoryContext bufctx;		/* context for prf_buffer[].response
+								 * allocations */
+	MemoryContext errctx;		/* context for prf_buffer[].response
+								 * allocations */
+	/*
+	 * NOTE(review): bufctx and errctx carry the same description. In this
+	 * file errctx is the context made current around page_server->receive()
+	 * and ->try_receive(); no use of bufctx is visible here -- confirm which
+	 * description is stale.
+	 */
+	MemoryContext hashctx;		/* context for prf_buffer */
+	/* hashctx is also what prfh_create() is given for prf_hash */
+
+	/* buffer indexes */
+	uint64		ring_unused;	/* first unused slot */
+	uint64		ring_flush;		/* next request to flush */
+	uint64		ring_receive;	/* next slot that is to receive a response */
+	uint64		ring_last;		/* min slot with a response value */
+
+	/* metrics / statistics */
+	int			n_responses_buffered;	/* count of PS responses not yet in
+										 * buffers */
+	/*
+	 * NOTE(review): n_responses_buffered is incremented when a response is
+	 * attached to a slot and decremented when a RECEIVED slot is released, so
+	 * it counts responses currently held in prf_buffer.
+	 */
+	int			n_requests_inflight;	/* count of PS requests considered in
+										 * flight */
+	int			n_unused;		/* count of buffers < unused, > last, that are
+								 * also unused */
+
+	/* the buffers */
+	prfh_hash  *prf_hash;		/* buftag -> slot lookup */
+	int			max_shard_no;	/* 1 + highest shard number set in
+								 * shard_bitmap; 0 when nothing to flush */
+	/* Mark shards involved in prefetch */
+	uint8		shard_bitmap[(MAX_SHARDS + 7)/8];
+	PrefetchRequest prf_buffer[];	/* prefetch buffers */
+} PrefetchState;
+
+/* Backend-local prefetch state; NULL until initialized. */
+static PrefetchState *MyPState;
+
+/* Map a ring index to its slot, without checking that the index is live. */
+#define GetPrfSlotNoCheck(ring_index) ( \
+	&MyPState->prf_buffer[((ring_index) % readahead_buffer_size)] \
+)
+
+/*
+ * Map a ring index to its slot; asserts ring_last <= ring_index < ring_unused.
+ * Note that the argument is evaluated more than once.
+ */
+#define GetPrfSlot(ring_index) ( \
+	( \
+		AssertMacro((ring_index) < MyPState->ring_unused && \
+					(ring_index) >= MyPState->ring_last), \
+		GetPrfSlotNoCheck(ring_index) \
+	) \
+)
+
+/*
+ * True when the number of slots in [ring_last, ring_receive) that do not
+ * hold a buffered response exceeds 1/8th of the buffered responses.
+ */
+#define ReceiveBufferNeedsCompaction() (\
+	(MyPState->n_responses_buffered / 8) < ( \
+		MyPState->ring_receive - \
+		MyPState->ring_last - \
+		MyPState->n_responses_buffered \
+	) \
+)
+
+/* Interrupt callback that was installed before ours. */
+static process_interrupts_callback_t prev_interrupt_cb;
+
+static bool compact_prefetch_buffers(void);
+static void consume_prefetch_responses(void);
+static uint64 prefetch_register_bufferv(BufferTag tag, neon_request_lsns *frlsns,
+										BlockNumber nblocks, const bits8 *mask,
+										bool is_prefetch);
+static bool prefetch_read(PrefetchRequest *slot);
+static void prefetch_do_request(PrefetchRequest *slot, neon_request_lsns *force_request_lsns);
+static bool prefetch_wait_for(uint64 ring_index);
+static void prefetch_cleanup_trailing_unused(void);
+static inline void prefetch_set_unused(uint64 ring_index);
+
+static bool neon_prefetch_response_usable(neon_request_lsns *request_lsns,
+										  PrefetchRequest *slot);
+static bool communicator_processinterrupts(void);
+
+/*
+ * Install communicator_processinterrupts as ProcessInterruptsCallback,
+ * remembering the previously installed callback in prev_interrupt_cb.
+ */
+void
+pg_init_communicator(void)
+{
+	prev_interrupt_cb = ProcessInterruptsCallback;
+	ProcessInterruptsCallback = communicator_processinterrupts;
+}
+
+/*
+ * Pack the received responses in [ring_last, ring_receive) towards
+ * ring_receive, so that the unused slots collect at the ring_last end, and
+ * then advance ring_last over them.
+ *
+ * Returns true if at least one slot was moved, false otherwise.
+ *
+ * Moved slots are re-inserted into prf_hash, so pointers into the hash table
+ * obtained before the call must not be used afterwards.
+ */
+static bool
+compact_prefetch_buffers(void)
+{
+	uint64		empty_ring_index = MyPState->ring_last;
+	uint64		search_ring_index = MyPState->ring_receive;
+	int			n_moved = 0;
+
+	/* nothing between ring_last and ring_receive: nothing to compact */
+	if (MyPState->ring_receive == MyPState->ring_last)
+		return false;
+
+	/* scan downwards from ring_receive for the first unused slot */
+	while (search_ring_index > MyPState->ring_last)
+	{
+		search_ring_index--;
+		if (GetPrfSlot(search_ring_index)->status == PRFS_UNUSED)
+		{
+			empty_ring_index = search_ring_index;
+			break;
+		}
+	}
+
+	/*
+	 * Here we have established:
+	 * - slots < search_ring_index have an unknown state (not scanned)
+	 * - slots >= search_ring_index and <= empty_ring_index are unused
+	 * - slots > empty_ring_index are in use, or outside our buffer's range.
+	 * ... unless search_ring_index <= ring_last
+	 *
+	 * Therefore, there is a gap of at least one unused item between
+	 * search_ring_index and empty_ring_index (both inclusive), which grows as
+	 * we hit more unused items while moving backwards through the array.
+	 */
+
+	while (search_ring_index > MyPState->ring_last)
+	{
+		PrefetchRequest *source_slot;
+		PrefetchRequest *target_slot;
+		bool		found;
+
+		/* update search index to an unprocessed entry */
+		search_ring_index--;
+
+		source_slot = GetPrfSlot(search_ring_index);
+
+		if (source_slot->status == PRFS_UNUSED)
+			continue;
+
+		/* slot is used -- start moving slot */
+		target_slot = GetPrfSlot(empty_ring_index);
+
+		/* everything below ring_receive that is still in use must be RECEIVED */
+		Assert(source_slot->status == PRFS_RECEIVED);
+		Assert(target_slot->status == PRFS_UNUSED);
+
+		target_slot->buftag = source_slot->buftag;
+		target_slot->shard_no = source_slot->shard_no;
+		target_slot->status = source_slot->status;
+		target_slot->flags = source_slot->flags;
+		target_slot->response = source_slot->response;
+		target_slot->reqid = source_slot->reqid;
+		target_slot->request_lsns = source_slot->request_lsns;
+		target_slot->my_ring_index = empty_ring_index;
+
+		/*
+		 * The hash entry points at the slot itself, so it has to be replaced:
+		 * delete while the source still carries the tag, then insert the
+		 * target (same tag, new address).
+		 */
+		prfh_delete(MyPState->prf_hash, source_slot);
+		prfh_insert(MyPState->prf_hash, target_slot, &found);
+
+		Assert(!found);
+
+		/* Adjust the location of our known-empty slot */
+		empty_ring_index--;
+
+		/* empty the moved slot */
+		source_slot->status = PRFS_UNUSED;
+		source_slot->buftag = (BufferTag)
+		{
+			0
+		};
+		source_slot->response = NULL;
+		source_slot->my_ring_index = 0;
+		source_slot->request_lsns = (neon_request_lsns) {
+			InvalidXLogRecPtr, InvalidXLogRecPtr, InvalidXLogRecPtr
+		};
+
+		/* update bookkeeping */
+		n_moved++;
+	}
+
+	/*
+	 * Only when we've moved slots we can expect trailing unused slots, so
+	 * only then we clean up trailing unused slots.
+	 */
+	if (n_moved > 0)
+	{
+		prefetch_cleanup_trailing_unused();
+		return true;
+	}
+
+	return false;
+}
+
+/*
+ * If there might be responses still in the TCP buffer, then we should try to
+ * use those, to reduce any TCP backpressure on the OS/PS side.
+ *
+ * This procedure handles that.
+ *
+ * Note that this works because we don't pipeline non-getPage requests.
+ * + * NOTE: This procedure is not allowed to throw errors that should be handled + * by SMGR-related code, as this can be called from every CHECK_FOR_INTERRUPTS + * point inside and outside PostgreSQL. + * + * This still does throw errors when it receives malformed responses from PS. + * + * When we're not called from CHECK_FOR_INTERRUPTS (indicated by + * IsHandlingInterrupts) we also report we've ended prefetch receive work, + * just in case state tracking was lost due to an error in the sync getPage + * response code. + */ +void +communicator_prefetch_pump_state(bool IsHandlingInterrupts) +{ + while (MyPState->ring_receive != MyPState->ring_flush) + { + NeonResponse *response; + PrefetchRequest *slot; + MemoryContext old; + + slot = GetPrfSlot(MyPState->ring_receive); + + old = MemoryContextSwitchTo(MyPState->errctx); + response = page_server->try_receive(slot->shard_no); + MemoryContextSwitchTo(old); + + if (response == NULL) + break; + + /* The slot should still be valid */ + if (slot->status != PRFS_REQUESTED || + slot->response != NULL || + slot->my_ring_index != MyPState->ring_receive) + neon_shard_log(slot->shard_no, ERROR, + "Incorrect prefetch slot state after receive: status=%d response=%p my=%lu receive=%lu", + slot->status, slot->response, + (long) slot->my_ring_index, (long) MyPState->ring_receive); + + /* update prefetch state */ + MyPState->n_responses_buffered += 1; + MyPState->n_requests_inflight -= 1; + MyPState->ring_receive += 1; + MyNeonCounters->getpage_prefetches_buffered = + MyPState->n_responses_buffered; + + /* update slot state */ + slot->status = PRFS_RECEIVED; + slot->response = response; + + if (response->tag == T_NeonGetPageResponse && !(slot->flags & PRFSF_LFC) && lfc_store_prefetch_result) + { + /* + * Store prefetched result in LFC (please read comments to lfc_prefetch + * explaining why it can be done without holding shared buffer lock + */ + if (lfc_prefetch(BufTagGetNRelFileInfo(slot->buftag), slot->buftag.forkNum, 
slot->buftag.blockNum, ((NeonGetPageResponse*)response)->page, slot->request_lsns.not_modified_since)) + { + slot->flags |= PRFSF_LFC; + } + } + } + + /* We never pump the prefetch state while handling other pages */ + if (!IsHandlingInterrupts) + END_PREFETCH_RECEIVE_WORK(); + + communicator_reconfigure_timeout_if_needed(); +} + +void +readahead_buffer_resize(int newsize, void *extra) +{ + uint64 end, + nfree = newsize; + PrefetchState *newPState; + Size newprfs_size = offsetof(PrefetchState, prf_buffer) + + (sizeof(PrefetchRequest) * newsize); + + /* don't try to re-initialize if we haven't initialized yet */ + if (MyPState == NULL) + return; + + /* + * Make sure that we don't lose track of active prefetch requests by + * ensuring we have received all but the last n requests (n = newsize). + */ + if (MyPState->n_requests_inflight > newsize) + { + prefetch_wait_for(MyPState->ring_unused - newsize - 1); + Assert(MyPState->n_requests_inflight <= newsize); + } + + /* construct the new PrefetchState, and copy over the memory contexts */ + newPState = MemoryContextAllocZero(TopMemoryContext, newprfs_size); + + newPState->bufctx = MyPState->bufctx; + newPState->errctx = MyPState->errctx; + newPState->hashctx = MyPState->hashctx; + newPState->prf_hash = prfh_create(MyPState->hashctx, newsize, NULL); + newPState->n_unused = newsize; + newPState->n_requests_inflight = 0; + newPState->n_responses_buffered = 0; + newPState->ring_last = newsize; + newPState->ring_unused = newsize; + newPState->ring_receive = newsize; + newPState->max_shard_no = MyPState->max_shard_no; + memcpy(newPState->shard_bitmap, MyPState->shard_bitmap, sizeof(MyPState->shard_bitmap)); + + /* + * Copy over the prefetches. + * + * We populate the prefetch array from the end; to retain the most recent + * prefetches, but this has the benefit of only needing to do one + * iteration on the dataset, and trivial compaction. 
+ */ + for (end = MyPState->ring_unused - 1; + end >= MyPState->ring_last && end != UINT64_MAX && nfree != 0; + end -= 1) + { + PrefetchRequest *slot = GetPrfSlot(end); + PrefetchRequest *newslot; + bool found; + + if (slot->status == PRFS_UNUSED) + continue; + + nfree -= 1; + + newslot = &newPState->prf_buffer[nfree]; + *newslot = *slot; + newslot->my_ring_index = nfree; + + prfh_insert(newPState->prf_hash, newslot, &found); + + Assert(!found); + + switch (newslot->status) + { + case PRFS_UNUSED: + pg_unreachable(); + case PRFS_REQUESTED: + newPState->n_requests_inflight += 1; + newPState->ring_receive -= 1; + newPState->ring_last -= 1; + break; + case PRFS_RECEIVED: + newPState->n_responses_buffered += 1; + newPState->ring_last -= 1; + break; + case PRFS_TAG_REMAINS: + newPState->ring_last -= 1; + break; + } + newPState->n_unused -= 1; + } + newPState->ring_flush = newPState->ring_receive; + + MyNeonCounters->getpage_prefetches_buffered = + MyPState->n_responses_buffered; + MyNeonCounters->pageserver_open_requests = + MyPState->n_requests_inflight; + + for (; end >= MyPState->ring_last && end != UINT64_MAX; end -= 1) + { + PrefetchRequest *slot = GetPrfSlot(end); + Assert(slot->status != PRFS_REQUESTED); + if (slot->status == PRFS_RECEIVED) + { + pfree(slot->response); + } + } + + prfh_destroy(MyPState->prf_hash); + pfree(MyPState); + MyPState = newPState; +} + + + +/* + * Make sure that there are no responses still in the buffer. + * + * This function may indirectly update MyPState->pfs_hash; which invalidates + * any active pointers into the hash table. 
+ */ +static void +consume_prefetch_responses(void) +{ + if (MyPState->ring_receive < MyPState->ring_unused) + prefetch_wait_for(MyPState->ring_unused - 1); +} + +static void +prefetch_cleanup_trailing_unused(void) +{ + uint64 ring_index; + PrefetchRequest *slot; + + while (MyPState->ring_last < MyPState->ring_receive) + { + ring_index = MyPState->ring_last; + slot = GetPrfSlot(ring_index); + + if (slot->status == PRFS_UNUSED) + MyPState->ring_last += 1; + else + break; + } +} + + +static bool +prefetch_flush_requests(void) +{ + for (shardno_t shard_no = 0; shard_no < MyPState->max_shard_no; shard_no++) + { + if (BITMAP_ISSET(MyPState->shard_bitmap, shard_no)) + { + if (!page_server->flush(shard_no)) + return false; + BITMAP_CLR(MyPState->shard_bitmap, shard_no); + } + } + MyPState->max_shard_no = 0; + return true; +} + +/* + * Wait for slot of ring_index to have received its response. + * The caller is responsible for making sure the request buffer is flushed. + * + * NOTE: this function may indirectly update MyPState->pfs_hash; which + * invalidates any active pointers into the hash table. + * NOTE: callers should make sure they can handle query cancellations in this + * function's call path. + */ +static bool +prefetch_wait_for(uint64 ring_index) +{ + PrefetchRequest *entry; + bool result = true; + + if (MyPState->ring_flush <= ring_index && + MyPState->ring_unused > MyPState->ring_flush) + { + if (!prefetch_flush_requests()) + return false; + MyPState->ring_flush = MyPState->ring_unused; + } + + Assert(MyPState->ring_unused > ring_index); + + while (MyPState->ring_receive <= ring_index) + { + START_PREFETCH_RECEIVE_WORK(); + entry = GetPrfSlot(MyPState->ring_receive); + + Assert(entry->status == PRFS_REQUESTED); + if (!prefetch_read(entry)) + { + result = false; + break; + } + + END_PREFETCH_RECEIVE_WORK(); + CHECK_FOR_INTERRUPTS(); + } + + return result; +} + +/* + * Read the response of a prefetch request into its slot. 
+ *
+ * The caller is responsible for making sure that the request for this buffer
+ * was flushed to the PageServer.
+ *
+ * Returns true when a response was received and stored in the slot, false
+ * when the receive call returned no response.
+ *
+ * NOTE: this function may indirectly update MyPState->prf_hash; which
+ * invalidates any active pointers into the hash table.
+ *
+ * NOTE: this does IO, and can get canceled out-of-line.
+ */
+static bool
+prefetch_read(PrefetchRequest *slot)
+{
+	NeonResponse *response;
+	MemoryContext old;
+	BufferTag	buftag;
+	shardno_t	shard_no;
+	uint64		my_ring_index;
+
+	Assert(slot->status == PRFS_REQUESTED);
+	Assert(slot->response == NULL);
+	Assert(slot->my_ring_index == MyPState->ring_receive);
+
+	/* same checks again, so that they also fire without assertions enabled */
+	if (slot->status != PRFS_REQUESTED ||
+		slot->response != NULL ||
+		slot->my_ring_index != MyPState->ring_receive)
+		neon_shard_log(slot->shard_no, ERROR,
+					   "Incorrect prefetch read: status=%d response=%p my=%lu receive=%lu",
+					   slot->status, slot->response,
+					   (long)slot->my_ring_index, (long)MyPState->ring_receive);
+
+	/*
+	 * Copy the request info so that if an error happens and the prefetch
+	 * queue is flushed during the receive call, we can print the original
+	 * values in the error message
+	 */
+	buftag = slot->buftag;
+	shard_no = slot->shard_no;
+	my_ring_index = slot->my_ring_index;
+
+	old = MemoryContextSwitchTo(MyPState->errctx);
+	response = (NeonResponse *) page_server->receive(shard_no);
+	MemoryContextSwitchTo(old);
+	if (response)
+	{
+		/* The slot should still be valid */
+		if (slot->status != PRFS_REQUESTED ||
+			slot->response != NULL ||
+			slot->my_ring_index != MyPState->ring_receive)
+			neon_shard_log(shard_no, ERROR,
+						   "Incorrect prefetch slot state after receive: status=%d response=%p my=%lu receive=%lu",
+						   slot->status, slot->response,
+						   (long) slot->my_ring_index, (long) MyPState->ring_receive);
+
+		/* update prefetch state */
+		MyPState->n_responses_buffered += 1;
+		MyPState->n_requests_inflight -= 1;
+		MyPState->ring_receive += 1;
+		MyNeonCounters->getpage_prefetches_buffered =
+			MyPState->n_responses_buffered;
+
+		/* update slot state */
+		slot->status = PRFS_RECEIVED;
+		slot->response = response;
+
+		if (response->tag == T_NeonGetPageResponse && !(slot->flags & PRFSF_LFC) && lfc_store_prefetch_result)
+		{
+			/*
+			 * Store prefetched result in LFC (please read comments to lfc_prefetch
+			 * explaining why it can be done without holding shared buffer lock
+			 */
+			if (lfc_prefetch(BufTagGetNRelFileInfo(buftag), buftag.forkNum, buftag.blockNum, ((NeonGetPageResponse*)response)->page, slot->request_lsns.not_modified_since))
+			{
+				slot->flags |= PRFSF_LFC;
+			}
+		}
+		return true;
+	}
+	else
+	{
+		/*
+		 * Note: The slot might no longer be valid, if the connection was lost
+		 * and the prefetch queue was flushed during the receive call
+		 */
+		neon_shard_log(shard_no, LOG,
+					   "No response from reading prefetch entry %lu: %u/%u/%u.%u block %u. This can be caused by a concurrent disconnect",
+					   (long) my_ring_index,
+					   RelFileInfoFmt(BufTagGetNRelFileInfo(buftag)),
+					   buftag.forkNum, buftag.blockNum);
+		return false;
+	}
+}
+
+/*
+ * Disconnect hook - drop prefetches when the connection drops
+ *
+ * If we don't remove the failed prefetches, we'd be serving incorrect
+ * data to the smgr.
+ *
+ * Every request that has not received its response yet (ring_receive up to
+ * ring_unused) is discarded; responses that were already received stay
+ * buffered.
+ */
+void
+prefetch_on_ps_disconnect(void)
+{
+	/* nothing is left to flush: unflushed requests are discarded below */
+	MyPState->ring_flush = MyPState->ring_unused;
+
+	while (MyPState->ring_receive < MyPState->ring_unused)
+	{
+		PrefetchRequest *slot;
+		uint64		ring_index = MyPState->ring_receive;
+
+		slot = GetPrfSlot(ring_index);
+
+		Assert(slot->status == PRFS_REQUESTED);
+		Assert(slot->my_ring_index == ring_index);
+
+		/*
+		 * Drop connection to all shards which have prefetch requests.
+		 * It is not a problem to call disconnect multiple times on the same connection
+		 * because disconnect implementation in libpagestore.c will check if connection
+		 * is alive and do nothing if the connection was already dropped.
+		 */
+		page_server->disconnect(slot->shard_no);
+
+		/*
+		 * clean up the request: prefetch_set_unused() does not accept
+		 * REQUESTED slots, so go through TAG_REMAINS first
+		 */
+		slot->status = PRFS_TAG_REMAINS;
+		MyPState->n_requests_inflight -= 1;
+		MyPState->ring_receive += 1;
+
+		prefetch_set_unused(ring_index);
+		pgBufferUsage.prefetch.expired += 1;
+		MyNeonCounters->getpage_prefetch_discards_total += 1;
+	}
+
+	/*
+	 * We can have gone into retry due to network error, so update stats with
+	 * the latest available
+	 */
+	MyNeonCounters->pageserver_open_requests =
+		MyPState->n_requests_inflight;
+	MyNeonCounters->getpage_prefetches_buffered =
+		MyPState->n_responses_buffered;
+}
+
+/*
+ * prefetch_set_unused() - clear a received prefetch slot
+ *
+ * The slot at ring_index must be a current member of the ring buffer,
+ * and may not be in the PRFS_REQUESTED state.
+ *
+ * Indexes below ring_last and slots that are already unused are ignored.
+ * A buffered response is freed. Afterwards ring_last is advanced if this was
+ * the oldest slot, otherwise the buffer is compacted when
+ * ReceiveBufferNeedsCompaction() says so.
+ *
+ * NOTE: this function will update MyPState->prf_hash; which invalidates any
+ * active pointers into the hash table.
+ */
+static inline void
+prefetch_set_unused(uint64 ring_index)
+{
+	PrefetchRequest *slot;
+
+	if (ring_index < MyPState->ring_last)
+		return;					/* Should already be unused */
+
+	slot = GetPrfSlot(ring_index);
+	if (slot->status == PRFS_UNUSED)
+		return;
+
+	Assert(slot->status == PRFS_RECEIVED || slot->status == PRFS_TAG_REMAINS);
+
+	if (slot->status == PRFS_RECEIVED)
+	{
+		pfree(slot->response);
+		slot->response = NULL;
+
+		MyPState->n_responses_buffered -= 1;
+		/*
+		 * NOTE(review): n_unused is only incremented on this path, while
+		 * prefetch_do_request() decrements it for every request; slots
+		 * released via TAG_REMAINS are not counted back -- confirm intended.
+		 */
+		MyPState->n_unused += 1;
+
+		MyNeonCounters->getpage_prefetches_buffered =
+			MyPState->n_responses_buffered;
+	}
+	else
+	{
+		Assert(slot->response == NULL);
+	}
+
+	/* must happen before the tag is cleared: the hash is keyed on buftag */
+	prfh_delete(MyPState->prf_hash, slot);
+
+	/* clear all fields */
+	MemSet(slot, 0, sizeof(PrefetchRequest));
+	slot->status = PRFS_UNUSED;
+
+	/* run cleanup if we're holding back ring_last */
+	if (MyPState->ring_last == ring_index)
+		prefetch_cleanup_trailing_unused();
+
+	/*
+	 * ... and try to store the buffered responses more compactly if > 12.5%
+	 * of the buffer is gaps
+	 */
+	else if (ReceiveBufferNeedsCompaction())
+		compact_prefetch_buffers();
+}
+
+/*
+ * Send one prefetch request to the pageserver. To wait for the response, call
+ * prefetch_wait_for().
+ *
+ * The slot must be the one at ring_unused, with buftag, shard_no and
+ * my_ring_index already filled in. If force_request_lsns is not NULL those
+ * LSNs are used for the request, otherwise they are obtained from
+ * neon_get_request_lsns(). On return the slot is in state PRFS_REQUESTED and
+ * registered in prf_hash; the request is written but not necessarily flushed.
+ */
+static void
+prefetch_do_request(PrefetchRequest *slot, neon_request_lsns *force_request_lsns)
+{
+	bool		found;
+	uint64		mySlotNo PG_USED_FOR_ASSERTS_ONLY = slot->my_ring_index;
+
+	NeonGetPageRequest request = {
+		.hdr.tag = T_NeonGetPageRequest,
+		.hdr.reqid = GENERATE_REQUEST_ID(),
+		/* lsn and not_modified_since are filled in below */
+		.rinfo = BufTagGetNRelFileInfo(slot->buftag),
+		.forknum = slot->buftag.forkNum,
+		.blkno = slot->buftag.blockNum,
+	};
+
+	Assert(mySlotNo == MyPState->ring_unused);
+
+	slot->reqid = request.hdr.reqid;
+
+	if (force_request_lsns)
+		slot->request_lsns = *force_request_lsns;
+	else
+		neon_get_request_lsns(BufTagGetNRelFileInfo(slot->buftag),
+							  slot->buftag.forkNum, slot->buftag.blockNum,
+							  &slot->request_lsns, 1);
+	request.hdr.lsn = slot->request_lsns.request_lsn;
+	request.hdr.not_modified_since = slot->request_lsns.not_modified_since;
+
+	Assert(slot->response == NULL);
+	Assert(slot->my_ring_index == MyPState->ring_unused);
+
+	/* retry until the request has been written to the connection */
+	while (!page_server->send(slot->shard_no, (NeonRequest *) &request))
+	{
+		Assert(mySlotNo == MyPState->ring_unused);
+		/* loop */
+	}
+
+	/* update prefetch state */
+	MyPState->n_requests_inflight += 1;
+	MyPState->n_unused -= 1;
+	MyPState->ring_unused += 1;
+	/* remember that this shard has something to flush */
+	BITMAP_SET(MyPState->shard_bitmap, slot->shard_no);
+	MyPState->max_shard_no = Max(slot->shard_no+1, MyPState->max_shard_no);
+
+	/* update slot state */
+	slot->status = PRFS_REQUESTED;
+	prfh_insert(MyPState->prf_hash, slot, &found);
+	Assert(!found);
+}
+
+/*
+ * Lookup of already received prefetch requests. Only already received responses matching required LSNs are accepted.
+ * Present pages are marked in "mask" bitmap and total number of such pages is returned. + */ +int +communicator_prefetch_lookupv(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blocknum, + neon_request_lsns *lsns, BlockNumber nblocks, + void **buffers, bits8 *mask) +{ + int hits = 0; + PrefetchRequest hashkey; + + /* + * Use an intermediate PrefetchRequest struct as the hash key to ensure + * correct alignment and that the padding bytes are cleared. + */ + memset(&hashkey.buftag, 0, sizeof(BufferTag)); + CopyNRelFileInfoToBufTag(hashkey.buftag, rinfo); + hashkey.buftag.forkNum = forknum; + + for (int i = 0; i < nblocks; i++) + { + PrfHashEntry *entry; + + hashkey.buftag.blockNum = blocknum + i; + entry = prfh_lookup(MyPState->prf_hash, &hashkey); + + if (entry != NULL) + { + PrefetchRequest *slot = entry->slot; + uint64 ring_index = slot->my_ring_index; + Assert(slot == GetPrfSlot(ring_index)); + + Assert(slot->status != PRFS_UNUSED); + Assert(MyPState->ring_last <= ring_index && + ring_index < MyPState->ring_unused); + Assert(BufferTagsEqual(&slot->buftag, &hashkey.buftag)); + + if (slot->status != PRFS_RECEIVED) + continue; + + /* + * If the caller specified a request LSN to use, only accept + * prefetch responses that satisfy that request. + */ + if (!neon_prefetch_response_usable(&lsns[i], slot)) + continue; + + /* + * Ignore errors + */ + if (slot->response->tag != T_NeonGetPageResponse) + { + if (slot->response->tag != T_NeonErrorResponse) + { + NEON_PANIC_CONNECTION_STATE(slot->shard_no, PANIC, + "Expected GetPage (0x%02x) or Error (0x%02x) response to GetPageRequest, but got 0x%02x", + T_NeonGetPageResponse, T_NeonErrorResponse, slot->response->tag); + } + continue; + } + memcpy(buffers[i], ((NeonGetPageResponse*)slot->response)->page, BLCKSZ); + + + /* + * With lfc_store_prefetch_result=true prefetch result is stored in LFC in prefetch_pump_state when response is received + * from page server. 
But if lfc_store_prefetch_result=false then it is not yet stored in LFC and we have to do it here + * under buffer lock. + */ + if (!lfc_store_prefetch_result) + lfc_write(rinfo, forknum, blocknum + i, buffers[i]); + + prefetch_set_unused(ring_index); + BITMAP_SET(mask, i); + + hits += 1; + inc_getpage_wait(0); + } + } + pgBufferUsage.prefetch.hits += hits; + return hits; +} + +/* + * prefetch_register_bufferv() - register and prefetch buffers + * + * Register that we may want the contents of BufferTag in the near future. + * This is used when issuing a speculative prefetch request, but also when + * performing a synchronous request and need the buffer right now. + * + * If force_request_lsns is not NULL, those values are sent to the + * pageserver. If NULL, we utilize the lastWrittenLsn -infrastructure + * to calculate the LSNs to send. + * + * Bits set in *mask (if present) indicate pages already read; i.e. pages we + * can skip in this process. + * + * When performing a prefetch rather than a synchronous request, + * is_prefetch==true. Currently, it only affects how the request is accounted + * in the perf counters. + * + * NOTE: this function may indirectly update MyPState->pfs_hash; which + * invalidates any active pointers into the hash table. + */ +void +communicator_prefetch_register_bufferv(BufferTag tag, neon_request_lsns *frlsns, + BlockNumber nblocks, const bits8 *mask) +{ + uint64 ring_index PG_USED_FOR_ASSERTS_ONLY; + + ring_index = prefetch_register_bufferv(tag, frlsns, nblocks, mask, true); + + Assert(ring_index < MyPState->ring_unused && + MyPState->ring_last <= ring_index); +} + +/* internal version. Returns the ring index */ +static uint64 +prefetch_register_bufferv(BufferTag tag, neon_request_lsns *frlsns, + BlockNumber nblocks, const bits8 *mask, + bool is_prefetch) +{ + uint64 min_ring_index; + PrefetchRequest hashkey; +#ifdef USE_ASSERT_CHECKING + bool any_hits = false; +#endif + /* We will never read further ahead than our buffer can store. 
*/ + nblocks = Max(1, Min(nblocks, readahead_buffer_size)); + + /* + * Use an intermediate PrefetchRequest struct as the hash key to ensure + * correct alignment and that the padding bytes are cleared. + */ + memset(&hashkey.buftag, 0, sizeof(BufferTag)); + hashkey.buftag = tag; + +Retry: + /* + * We can have gone into retry due to network error, so update stats with + * the latest available + */ + MyNeonCounters->pageserver_open_requests = + MyPState->ring_unused - MyPState->ring_receive; + MyNeonCounters->getpage_prefetches_buffered = + MyPState->n_responses_buffered; + + min_ring_index = UINT64_MAX; + for (int i = 0; i < nblocks; i++) + { + PrefetchRequest *slot = NULL; + PrfHashEntry *entry = NULL; + uint64 ring_index; + neon_request_lsns *lsns; + + if (PointerIsValid(mask) && BITMAP_ISSET(mask, i)) + continue; + + if (frlsns) + lsns = &frlsns[i]; + else + lsns = NULL; + +#ifdef USE_ASSERT_CHECKING + any_hits = true; +#endif + + slot = NULL; + entry = NULL; + + hashkey.buftag.blockNum = tag.blockNum + i; + entry = prfh_lookup(MyPState->prf_hash, &hashkey); + + if (entry != NULL) + { + slot = entry->slot; + ring_index = slot->my_ring_index; + Assert(slot == GetPrfSlot(ring_index)); + + Assert(slot->status != PRFS_UNUSED); + Assert(MyPState->ring_last <= ring_index && + ring_index < MyPState->ring_unused); + Assert(BufferTagsEqual(&slot->buftag, &hashkey.buftag)); + + /* + * If the caller specified a request LSN to use, only accept + * prefetch responses that satisfy that request. + */ + if (!is_prefetch) + { + if (!neon_prefetch_response_usable(lsns, slot)) + { + /* Wait for the old request to finish and discard it */ + if (!prefetch_wait_for(ring_index)) + goto Retry; + prefetch_set_unused(ring_index); + entry = NULL; + slot = NULL; + pgBufferUsage.prefetch.expired += 1; + MyNeonCounters->getpage_prefetch_discards_total += 1; + } + } + + if (entry != NULL) + { + /* + * We received a prefetch for a page that was recently read + * and removed from the buffers. 
Remove that request from the + * buffers. + */ + if (slot->status == PRFS_TAG_REMAINS) + { + prefetch_set_unused(ring_index); + entry = NULL; + slot = NULL; + } + else + { + min_ring_index = Min(min_ring_index, ring_index); + /* The buffered request is good enough, return that index */ + if (is_prefetch) + pgBufferUsage.prefetch.duplicates++; + continue; + } + } + } + else if (!is_prefetch) + { + pgBufferUsage.prefetch.misses += 1; + MyNeonCounters->getpage_prefetch_misses_total++; + } + /* + * We can only leave the block above by finding that there's + * no entry that can satisfy this request, either because there + * was no entry, or because the entry was invalid or didn't satisfy + * the LSNs provided. + * + * The code should've made sure to clear up the data. + */ + Assert(entry == NULL); + Assert(slot == NULL); + + /* There should be no buffer overflow */ + Assert(MyPState->ring_last + readahead_buffer_size >= MyPState->ring_unused); + + /* + * If the prefetch queue is full, we need to make room by clearing the + * oldest slot. If the oldest slot holds a buffer that was already + * received, we can just throw it away; we fetched the page + * unnecessarily in that case. If the oldest slot holds a request that + * we haven't received a response for yet, we have to wait for the + * response to that before we can continue. We might not have even + * flushed the request to the pageserver yet, it might be just sitting + * in the output buffer. In that case, we flush it and wait for the + * response. 
(We could decide not to send it, but it's hard to abort + * when the request is already in the output buffer, and 'not sending' + * a prefetch request kind of goes against the principles of + * prefetching) + */ + if (MyPState->ring_last + readahead_buffer_size == MyPState->ring_unused) + { + uint64 cleanup_index = MyPState->ring_last; + + slot = GetPrfSlot(cleanup_index); + + Assert(slot->status != PRFS_UNUSED); + + /* + * If there is good reason to run compaction on the prefetch buffers, + * try to do that. + */ + if (ReceiveBufferNeedsCompaction() && compact_prefetch_buffers()) + { + Assert(slot->status == PRFS_UNUSED); + } + else + { + /* + * We have the slot for ring_last, so that must still be in + * progress + */ + switch (slot->status) + { + case PRFS_REQUESTED: + Assert(MyPState->ring_receive == cleanup_index); + if (!prefetch_wait_for(cleanup_index)) + goto Retry; + prefetch_set_unused(cleanup_index); + pgBufferUsage.prefetch.expired += 1; + MyNeonCounters->getpage_prefetch_discards_total += 1; + break; + case PRFS_RECEIVED: + case PRFS_TAG_REMAINS: + prefetch_set_unused(cleanup_index); + pgBufferUsage.prefetch.expired += 1; + MyNeonCounters->getpage_prefetch_discards_total += 1; + break; + default: + pg_unreachable(); + } + } + } + + /* + * The next buffer pointed to by `ring_unused` is now definitely empty, so + * we can insert the new request to it. + */ + ring_index = MyPState->ring_unused; + + Assert(MyPState->ring_last <= ring_index && + ring_index <= MyPState->ring_unused); + + slot = GetPrfSlotNoCheck(ring_index); + + Assert(slot->status == PRFS_UNUSED); + + /* + * We must update the slot data before insertion, because the hash + * function reads the buffer tag from the slot. 
+ */ + slot->buftag = hashkey.buftag; + slot->shard_no = get_shard_number(&tag); + slot->my_ring_index = ring_index; + slot->flags = 0; + + min_ring_index = Min(min_ring_index, ring_index); + + if (is_prefetch) + MyNeonCounters->getpage_prefetch_requests_total++; + else + MyNeonCounters->getpage_sync_requests_total++; + + prefetch_do_request(slot, lsns); + } + + MyNeonCounters->pageserver_open_requests = + MyPState->ring_unused - MyPState->ring_receive; + + Assert(any_hits); + + Assert(GetPrfSlot(min_ring_index)->status == PRFS_REQUESTED || + GetPrfSlot(min_ring_index)->status == PRFS_RECEIVED); + Assert(MyPState->ring_last <= min_ring_index && + min_ring_index < MyPState->ring_unused); + + if (flush_every_n_requests > 0 && + MyPState->ring_unused - MyPState->ring_flush >= flush_every_n_requests) + { + if (!prefetch_flush_requests()) + { + /* + * Prefetch set is reset in case of error, so we should try to + * register our request once again + */ + goto Retry; + } + MyPState->ring_flush = MyPState->ring_unused; + } + + return min_ring_index; +} + +static bool +equal_requests(NeonRequest* a, NeonRequest* b) +{ + return a->reqid == b->reqid && a->lsn == b->lsn && a->not_modified_since == b->not_modified_since; +} + + +/* + * Note: this function can get canceled and use a long jump to the next catch + * context. Take care. 
+ */ +static NeonResponse * +page_server_request(void const *req) +{ + NeonResponse *resp; + BufferTag tag = {0}; + shardno_t shard_no; + + switch (messageTag(req)) + { + case T_NeonExistsRequest: + CopyNRelFileInfoToBufTag(tag, ((NeonExistsRequest *) req)->rinfo); + break; + case T_NeonNblocksRequest: + CopyNRelFileInfoToBufTag(tag, ((NeonNblocksRequest *) req)->rinfo); + break; + case T_NeonDbSizeRequest: + NInfoGetDbOid(BufTagGetNRelFileInfo(tag)) = ((NeonDbSizeRequest *) req)->dbNode; + break; + case T_NeonGetPageRequest: + CopyNRelFileInfoToBufTag(tag, ((NeonGetPageRequest *) req)->rinfo); + tag.blockNum = ((NeonGetPageRequest *) req)->blkno; + break; + default: + neon_log(ERROR, "Unexpected request tag: %d", messageTag(req)); + } + shard_no = get_shard_number(&tag); + + /* + * Current sharding model assumes that all metadata is present only at shard 0. + * We still need to call get_shard_no() to check if shard map is up-to-date. + */ + if (((NeonRequest *) req)->tag != T_NeonGetPageRequest) + { + shard_no = 0; + } + + do + { + PG_TRY(); + { + while (!page_server->send(shard_no, (NeonRequest *) req) + || !page_server->flush(shard_no)) + { + /* do nothing */ + } + MyNeonCounters->pageserver_open_requests++; + consume_prefetch_responses(); + resp = page_server->receive(shard_no); + MyNeonCounters->pageserver_open_requests--; + } + PG_CATCH(); + { + /* + * Cancellation in this code needs to be handled better at some + * point, but this currently seems fine for now. + */ + page_server->disconnect(shard_no); + MyNeonCounters->pageserver_open_requests = 0; + + /* + * We know for sure we're not working on any prefetch pages after + * this. 
+ */ + END_PREFETCH_RECEIVE_WORK(); + + PG_RE_THROW(); + } + PG_END_TRY(); + + } while (resp == NULL); + + return resp; +} + + +StringInfoData +nm_pack_request(NeonRequest *msg) +{ + StringInfoData s; + + initStringInfo(&s); + + pq_sendbyte(&s, msg->tag); + if (neon_protocol_version >= 3) + { + pq_sendint64(&s, msg->reqid); + } + pq_sendint64(&s, msg->lsn); + pq_sendint64(&s, msg->not_modified_since); + + switch (messageTag(msg)) + { + /* pagestore_client -> pagestore */ + case T_NeonExistsRequest: + { + NeonExistsRequest *msg_req = (NeonExistsRequest *) msg; + + pq_sendint32(&s, NInfoGetSpcOid(msg_req->rinfo)); + pq_sendint32(&s, NInfoGetDbOid(msg_req->rinfo)); + pq_sendint32(&s, NInfoGetRelNumber(msg_req->rinfo)); + pq_sendbyte(&s, msg_req->forknum); + + break; + } + case T_NeonNblocksRequest: + { + NeonNblocksRequest *msg_req = (NeonNblocksRequest *) msg; + + pq_sendint32(&s, NInfoGetSpcOid(msg_req->rinfo)); + pq_sendint32(&s, NInfoGetDbOid(msg_req->rinfo)); + pq_sendint32(&s, NInfoGetRelNumber(msg_req->rinfo)); + pq_sendbyte(&s, msg_req->forknum); + + break; + } + case T_NeonDbSizeRequest: + { + NeonDbSizeRequest *msg_req = (NeonDbSizeRequest *) msg; + + pq_sendint32(&s, msg_req->dbNode); + + break; + } + case T_NeonGetPageRequest: + { + NeonGetPageRequest *msg_req = (NeonGetPageRequest *) msg; + + pq_sendint32(&s, NInfoGetSpcOid(msg_req->rinfo)); + pq_sendint32(&s, NInfoGetDbOid(msg_req->rinfo)); + pq_sendint32(&s, NInfoGetRelNumber(msg_req->rinfo)); + pq_sendbyte(&s, msg_req->forknum); + pq_sendint32(&s, msg_req->blkno); + + break; + } + + case T_NeonGetSlruSegmentRequest: + { + NeonGetSlruSegmentRequest *msg_req = (NeonGetSlruSegmentRequest *) msg; + + pq_sendbyte(&s, msg_req->kind); + pq_sendint32(&s, msg_req->segno); + + break; + } + + /* pagestore -> pagestore_client. We never need to create these. 
*/ + case T_NeonExistsResponse: + case T_NeonNblocksResponse: + case T_NeonGetPageResponse: + case T_NeonErrorResponse: + case T_NeonDbSizeResponse: + case T_NeonGetSlruSegmentResponse: + default: + neon_log(ERROR, "unexpected neon message tag 0x%02x", msg->tag); + break; + } + return s; +} + +NeonResponse * +nm_unpack_response(StringInfo s) +{ + NeonMessageTag tag = pq_getmsgbyte(s); + NeonResponse resp_hdr = {0}; /* make valgrind happy */ + NeonResponse *resp = NULL; + + resp_hdr.tag = tag; + if (neon_protocol_version >= 3) + { + resp_hdr.reqid = pq_getmsgint64(s); + resp_hdr.lsn = pq_getmsgint64(s); + resp_hdr.not_modified_since = pq_getmsgint64(s); + } + switch (tag) + { + /* pagestore -> pagestore_client */ + case T_NeonExistsResponse: + { + NeonExistsResponse *msg_resp = palloc0(sizeof(NeonExistsResponse)); + + if (neon_protocol_version >= 3) + { + NInfoGetSpcOid(msg_resp->req.rinfo) = pq_getmsgint(s, 4); + NInfoGetDbOid(msg_resp->req.rinfo) = pq_getmsgint(s, 4); + NInfoGetRelNumber(msg_resp->req.rinfo) = pq_getmsgint(s, 4); + msg_resp->req.forknum = pq_getmsgbyte(s); + } + msg_resp->req.hdr = resp_hdr; + msg_resp->exists = pq_getmsgbyte(s); + pq_getmsgend(s); + + resp = (NeonResponse *) msg_resp; + break; + } + + case T_NeonNblocksResponse: + { + NeonNblocksResponse *msg_resp = palloc0(sizeof(NeonNblocksResponse)); + + if (neon_protocol_version >= 3) + { + NInfoGetSpcOid(msg_resp->req.rinfo) = pq_getmsgint(s, 4); + NInfoGetDbOid(msg_resp->req.rinfo) = pq_getmsgint(s, 4); + NInfoGetRelNumber(msg_resp->req.rinfo) = pq_getmsgint(s, 4); + msg_resp->req.forknum = pq_getmsgbyte(s); + } + msg_resp->req.hdr = resp_hdr; + msg_resp->n_blocks = pq_getmsgint(s, 4); + pq_getmsgend(s); + + resp = (NeonResponse *) msg_resp; + break; + } + + case T_NeonGetPageResponse: + { + NeonGetPageResponse *msg_resp; + + msg_resp = MemoryContextAllocZero(MyPState->bufctx, PS_GETPAGERESPONSE_SIZE); + if (neon_protocol_version >= 3) + { + NInfoGetSpcOid(msg_resp->req.rinfo) = 
pq_getmsgint(s, 4); + NInfoGetDbOid(msg_resp->req.rinfo) = pq_getmsgint(s, 4); + NInfoGetRelNumber(msg_resp->req.rinfo) = pq_getmsgint(s, 4); + msg_resp->req.forknum = pq_getmsgbyte(s); + msg_resp->req.blkno = pq_getmsgint(s, 4); + } + msg_resp->req.hdr = resp_hdr; + /* XXX: should be varlena */ + memcpy(msg_resp->page, pq_getmsgbytes(s, BLCKSZ), BLCKSZ); + pq_getmsgend(s); + + Assert(msg_resp->req.hdr.tag == T_NeonGetPageResponse); + + resp = (NeonResponse *) msg_resp; + break; + } + + case T_NeonDbSizeResponse: + { + NeonDbSizeResponse *msg_resp = palloc0(sizeof(NeonDbSizeResponse)); + + if (neon_protocol_version >= 3) + { + msg_resp->req.dbNode = pq_getmsgint(s, 4); + } + msg_resp->req.hdr = resp_hdr; + msg_resp->db_size = pq_getmsgint64(s); + pq_getmsgend(s); + + resp = (NeonResponse *) msg_resp; + break; + } + + case T_NeonErrorResponse: + { + NeonErrorResponse *msg_resp; + size_t msglen; + const char *msgtext; + + msgtext = pq_getmsgrawstring(s); + msglen = strlen(msgtext); + + msg_resp = palloc0(sizeof(NeonErrorResponse) + msglen + 1); + msg_resp->req = resp_hdr; + memcpy(msg_resp->message, msgtext, msglen + 1); + pq_getmsgend(s); + + resp = (NeonResponse *) msg_resp; + break; + } + + case T_NeonGetSlruSegmentResponse: + { + NeonGetSlruSegmentResponse *msg_resp; + int n_blocks; + msg_resp = palloc0(sizeof(NeonGetSlruSegmentResponse)); + + if (neon_protocol_version >= 3) + { + msg_resp->req.kind = pq_getmsgbyte(s); + msg_resp->req.segno = pq_getmsgint(s, 4); + } + msg_resp->req.hdr = resp_hdr; + + n_blocks = pq_getmsgint(s, 4); + msg_resp->n_blocks = n_blocks; + memcpy(msg_resp->data, pq_getmsgbytes(s, n_blocks * BLCKSZ), n_blocks * BLCKSZ); + pq_getmsgend(s); + + resp = (NeonResponse *) msg_resp; + break; + } + + /* + * pagestore_client -> pagestore + * + * We create these ourselves, and don't need to decode them. 
+ */ + case T_NeonExistsRequest: + case T_NeonNblocksRequest: + case T_NeonGetPageRequest: + case T_NeonDbSizeRequest: + case T_NeonGetSlruSegmentRequest: + default: + neon_log(ERROR, "unexpected neon message tag 0x%02x", tag); + break; + } + + return resp; +} + +/* dump to json for debugging / error reporting purposes */ +char * +nm_to_string(NeonMessage *msg) +{ + StringInfoData s; + + initStringInfo(&s); + + switch (messageTag(msg)) + { + /* pagestore_client -> pagestore */ + case T_NeonExistsRequest: + { + NeonExistsRequest *msg_req = (NeonExistsRequest *) msg; + + appendStringInfoString(&s, "{\"type\": \"NeonExistsRequest\""); + appendStringInfo(&s, ", \"rinfo\": \"%u/%u/%u\"", RelFileInfoFmt(msg_req->rinfo)); + appendStringInfo(&s, ", \"forknum\": %d", msg_req->forknum); + appendStringInfo(&s, ", \"lsn\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->hdr.lsn)); + appendStringInfo(&s, ", \"not_modified_since\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->hdr.not_modified_since)); + appendStringInfoChar(&s, '}'); + break; + } + + case T_NeonNblocksRequest: + { + NeonNblocksRequest *msg_req = (NeonNblocksRequest *) msg; + + appendStringInfoString(&s, "{\"type\": \"NeonNblocksRequest\""); + appendStringInfo(&s, ", \"rinfo\": \"%u/%u/%u\"", RelFileInfoFmt(msg_req->rinfo)); + appendStringInfo(&s, ", \"forknum\": %d", msg_req->forknum); + appendStringInfo(&s, ", \"lsn\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->hdr.lsn)); + appendStringInfo(&s, ", \"not_modified_since\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->hdr.not_modified_since)); + appendStringInfoChar(&s, '}'); + break; + } + + case T_NeonGetPageRequest: + { + NeonGetPageRequest *msg_req = (NeonGetPageRequest *) msg; + + appendStringInfoString(&s, "{\"type\": \"NeonGetPageRequest\""); + appendStringInfo(&s, ", \"rinfo\": \"%u/%u/%u\"", RelFileInfoFmt(msg_req->rinfo)); + appendStringInfo(&s, ", \"forknum\": %d", msg_req->forknum); + appendStringInfo(&s, ", \"blkno\": %u", msg_req->blkno); + appendStringInfo(&s, ", \"lsn\": 
\"%X/%X\"", LSN_FORMAT_ARGS(msg_req->hdr.lsn)); + appendStringInfo(&s, ", \"not_modified_since\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->hdr.not_modified_since)); + appendStringInfoChar(&s, '}'); + break; + } + case T_NeonDbSizeRequest: + { + NeonDbSizeRequest *msg_req = (NeonDbSizeRequest *) msg; + + appendStringInfoString(&s, "{\"type\": \"NeonDbSizeRequest\""); + appendStringInfo(&s, ", \"dbnode\": \"%u\"", msg_req->dbNode); + appendStringInfo(&s, ", \"lsn\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->hdr.lsn)); + appendStringInfo(&s, ", \"not_modified_since\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->hdr.not_modified_since)); + appendStringInfoChar(&s, '}'); + break; + } + case T_NeonGetSlruSegmentRequest: + { + NeonGetSlruSegmentRequest *msg_req = (NeonGetSlruSegmentRequest *) msg; + + appendStringInfoString(&s, "{\"type\": \"NeonGetSlruSegmentRequest\""); + appendStringInfo(&s, ", \"kind\": %u", msg_req->kind); + appendStringInfo(&s, ", \"segno\": %u", msg_req->segno); + appendStringInfo(&s, ", \"lsn\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->hdr.lsn)); + appendStringInfo(&s, ", \"not_modified_since\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->hdr.not_modified_since)); + appendStringInfoChar(&s, '}'); + break; + } + /* pagestore -> pagestore_client */ + case T_NeonExistsResponse: + { + NeonExistsResponse *msg_resp = (NeonExistsResponse *) msg; + + appendStringInfoString(&s, "{\"type\": \"NeonExistsResponse\""); + appendStringInfo(&s, ", \"exists\": %d}", + msg_resp->exists); + appendStringInfoChar(&s, '}'); + + break; + } + case T_NeonNblocksResponse: + { + NeonNblocksResponse *msg_resp = (NeonNblocksResponse *) msg; + + appendStringInfoString(&s, "{\"type\": \"NeonNblocksResponse\""); + appendStringInfo(&s, ", \"n_blocks\": %u}", + msg_resp->n_blocks); + appendStringInfoChar(&s, '}'); + + break; + } + case T_NeonGetPageResponse: + { +#if 0 + NeonGetPageResponse *msg_resp = (NeonGetPageResponse *) msg; +#endif + + appendStringInfoString(&s, "{\"type\": \"NeonGetPageResponse\""); 
+ appendStringInfo(&s, ", \"page\": \"XXX\"}"); + appendStringInfoChar(&s, '}'); + break; + } + case T_NeonErrorResponse: + { + NeonErrorResponse *msg_resp = (NeonErrorResponse *) msg; + + /* FIXME: escape double-quotes in the message */ + appendStringInfoString(&s, "{\"type\": \"NeonErrorResponse\""); + appendStringInfo(&s, ", \"message\": \"%s\"}", msg_resp->message); + appendStringInfoChar(&s, '}'); + break; + } + case T_NeonDbSizeResponse: + { + NeonDbSizeResponse *msg_resp = (NeonDbSizeResponse *) msg; + + appendStringInfoString(&s, "{\"type\": \"NeonDbSizeResponse\""); + appendStringInfo(&s, ", \"db_size\": %ld}", + msg_resp->db_size); + appendStringInfoChar(&s, '}'); + + break; + } + case T_NeonGetSlruSegmentResponse: + { + NeonGetSlruSegmentResponse *msg_resp = (NeonGetSlruSegmentResponse *) msg; + + appendStringInfoString(&s, "{\"type\": \"NeonGetSlruSegmentResponse\""); + appendStringInfo(&s, ", \"n_blocks\": %u}", + msg_resp->n_blocks); + appendStringInfoChar(&s, '}'); + + break; + } + + default: + appendStringInfo(&s, "{\"type\": \"unknown 0x%02x\"", msg->tag); + } + return s.data; +} + +/* + * communicator_init() -- Initialize per-backend private state + */ +void +communicator_init(void) +{ + Size prfs_size; + + if (MyPState != NULL) + return; + + /* + * Sanity check that theperf counters array is sized correctly. We got + * this wrong once, and the formula for max number of backends and aux + * processes might well change in the future, so better safe than sorry. + * This is a very cheap check so we do it even without assertions. On + * v14, this gets called before initializing MyProc, so we cannot perform + * the check here. That's OK, we don't expect the logic to change in old + * releases. 
+ */ +#if PG_VERSION_NUM>=150000 + if (MyNeonCounters >= &neon_per_backend_counters_shared[NUM_NEON_PERF_COUNTER_SLOTS]) + elog(ERROR, "MyNeonCounters points past end of array"); +#endif + + prfs_size = offsetof(PrefetchState, prf_buffer) + + sizeof(PrefetchRequest) * readahead_buffer_size; + + MyPState = MemoryContextAllocZero(TopMemoryContext, prfs_size); + + MyPState->n_unused = readahead_buffer_size; + + MyPState->bufctx = SlabContextCreate(TopMemoryContext, + "NeonSMGR/prefetch", + SLAB_DEFAULT_BLOCK_SIZE * 17, + PS_GETPAGERESPONSE_SIZE); + MyPState->errctx = AllocSetContextCreate(TopMemoryContext, + "NeonSMGR/errors", + ALLOCSET_DEFAULT_SIZES); + MyPState->hashctx = AllocSetContextCreate(TopMemoryContext, + "NeonSMGR/prefetch", + ALLOCSET_DEFAULT_SIZES); + + MyPState->prf_hash = prfh_create(MyPState->hashctx, + readahead_buffer_size, NULL); +} + +/* + * neon_prefetch_response_usable -- Can a new request be satisfied by old one? + * + * This is used to check if the response to a prefetch request can be used to + * satisfy a page read now. + */ +static bool +neon_prefetch_response_usable(neon_request_lsns *request_lsns, + PrefetchRequest *slot) +{ + /* sanity check the LSN's on the old and the new request */ + Assert(request_lsns->request_lsn >= request_lsns->not_modified_since); + Assert(request_lsns->effective_request_lsn >= request_lsns->not_modified_since); + Assert(request_lsns->effective_request_lsn <= request_lsns->request_lsn); + Assert(slot->request_lsns.request_lsn >= slot->request_lsns.not_modified_since); + Assert(slot->request_lsns.effective_request_lsn >= slot->request_lsns.not_modified_since); + Assert(slot->request_lsns.effective_request_lsn <= slot->request_lsns.request_lsn); + Assert(slot->status != PRFS_UNUSED); + + /* + * The new request's LSN should never be older than the old one. 
This + * could be an Assert, except that for testing purposes, we do provide an + * interface in neon_test_utils to fetch pages at arbitary LSNs, which + * violates this. + * + * Similarly, the not_modified_since value calculated for a page should + * never move backwards. This assumption is a bit fragile; if we updated + * the last-written cache when we read in a page, for example, then it + * might. But as the code stands, it should not. + * + * (If two backends issue a request at the same time, they might race and + * calculate LSNs "out of order" with each other, but the prefetch queue + * is backend-private at the moment.) + */ + if (request_lsns->effective_request_lsn < slot->request_lsns.effective_request_lsn || + request_lsns->not_modified_since < slot->request_lsns.not_modified_since) + { + ereport(LOG, + (errcode(ERRCODE_IO_ERROR), + errmsg(NEON_TAG "request with unexpected LSN after prefetch"), + errdetail("Request %X/%X not_modified_since %X/%X, prefetch %X/%X not_modified_since %X/%X)", + LSN_FORMAT_ARGS(request_lsns->effective_request_lsn), + LSN_FORMAT_ARGS(request_lsns->not_modified_since), + LSN_FORMAT_ARGS(slot->request_lsns.effective_request_lsn), + LSN_FORMAT_ARGS(slot->request_lsns.not_modified_since)))); + return false; + } + + /*--- + * Each request to the pageserver has three LSN values associated with it: + * `not_modified_since`, `request_lsn`, and 'effective_request_lsn'. + * `not_modified_since` and `request_lsn` are sent to the pageserver, but + * in the primary node, we always use UINT64_MAX as the `request_lsn`, so + * we remember `effective_request_lsn` separately. In a primary, + * `effective_request_lsn` is the same as `not_modified_since`. + * See comments in neon_get_request_lsns why we can not use last flush WAL position here. 
+ * + * To determine whether a response to a GetPage request issued earlier is + * still valid to satisfy a new page read, we look at the + * (not_modified_since, effective_request_lsn] range of the request. It is + * effectively a claim that the page has not been modified between those + * LSNs. If the range of the old request in the queue overlaps with the + * new request, we know that the page hasn't been modified in the union of + * the ranges. We can use the response to old request to satisfy the new + * request in that case. For example: + * + * 100 500 + * Old request: +--------+ + * + * 400 800 + * New request: +--------+ + * + * The old request claims that the page was not modified between LSNs 100 + * and 500, and the second claims that it was not modified between 400 and + * 800. Together they mean that the page was not modified between 100 and + * 800. Therefore the response to the old request is also valid for the + * new request. + * + * This logic also holds at the boundary case that the old request's LSN + * matches the new request's not_modified_since LSN exactly: + * + * 100 500 + * Old request: +--------+ + * + * 500 900 + * New request: +--------+ + * + * The response to the old request is the page as it was at LSN 500, and + * the page hasn't been changed in the range (500, 900], therefore the + * response is valid also for the new request. + */ + + /* this follows from the checks above */ + Assert(request_lsns->effective_request_lsn >= slot->request_lsns.not_modified_since); + + return request_lsns->not_modified_since <= slot->request_lsns.effective_request_lsn; +} + +/* + * Does the physical file exist? 
+ */ +bool +communicator_exists(NRelFileInfo rinfo, ForkNumber forkNum, neon_request_lsns *request_lsns) +{ + bool exists; + NeonResponse *resp; + + { + NeonExistsRequest request = { + .hdr.tag = T_NeonExistsRequest, + .hdr.reqid = GENERATE_REQUEST_ID(), + .hdr.lsn = request_lsns->request_lsn, + .hdr.not_modified_since = request_lsns->not_modified_since, + .rinfo = rinfo, + .forknum = forkNum + }; + + resp = page_server_request(&request); + + switch (resp->tag) + { + case T_NeonExistsResponse: + { + NeonExistsResponse* exists_resp = (NeonExistsResponse *) resp; + if (neon_protocol_version >= 3) + { + if (!equal_requests(resp, &request.hdr) || + !RelFileInfoEquals(exists_resp->req.rinfo, request.rinfo) || + exists_resp->req.forknum != request.forknum) + { + NEON_PANIC_CONNECTION_STATE(-1, PANIC, + "Unexpect response {reqid=%lx,lsn=%X/%08X, since=%X/%08X, rel=%u/%u/%u.%u} to exits request {reqid=%lx,lsn=%X/%08X, since=%X/%08X, rel=%u/%u/%u.%u}", + resp->reqid, LSN_FORMAT_ARGS(resp->lsn), LSN_FORMAT_ARGS(resp->not_modified_since), RelFileInfoFmt(exists_resp->req.rinfo), exists_resp->req.forknum, + request.hdr.reqid, LSN_FORMAT_ARGS(request.hdr.lsn), LSN_FORMAT_ARGS(request.hdr.not_modified_since), RelFileInfoFmt(request.rinfo), request.forknum); + } + } + exists = exists_resp->exists; + break; + } + case T_NeonErrorResponse: + if (neon_protocol_version >= 3) + { + if (!equal_requests(resp, &request.hdr)) + { + elog(WARNING, NEON_TAG "Error message {reqid=%lx,lsn=%X/%08X, since=%X/%08X} doesn't match exists request {reqid=%lx,lsn=%X/%08X, since=%X/%08X}", + resp->reqid, LSN_FORMAT_ARGS(resp->lsn), LSN_FORMAT_ARGS(resp->not_modified_since), + request.hdr.reqid, LSN_FORMAT_ARGS(request.hdr.lsn), LSN_FORMAT_ARGS(request.hdr.not_modified_since)); + } + } + ereport(ERROR, + (errcode(ERRCODE_IO_ERROR), + errmsg(NEON_TAG "[reqid %lx] could not read relation existence of rel %u/%u/%u.%u from page server at lsn %X/%08X", + resp->reqid, + RelFileInfoFmt(rinfo), + forkNum, + 
LSN_FORMAT_ARGS(request_lsns->effective_request_lsn)), + errdetail("page server returned error: %s", + ((NeonErrorResponse *) resp)->message))); + break; + + default: + NEON_PANIC_CONNECTION_STATE(-1, PANIC, + "Expected Exists (0x%02x) or Error (0x%02x) response to ExistsRequest, but got 0x%02x", + T_NeonExistsResponse, T_NeonErrorResponse, resp->tag); + } + pfree(resp); + } + return exists; +} + +/* + * Read N pages at a specific LSN. + * + * *mask is set for pages read at a previous point in time, and which we + * should not touch, nor overwrite. + * New bits should be set in *mask for the pages we'successfully read. + * + * The offsets in request_lsns, buffers, and mask are linked. + */ +void +communicator_read_at_lsnv(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber base_blockno, + neon_request_lsns *request_lsns, + void **buffers, BlockNumber nblocks, const bits8 *mask) +{ + NeonResponse *resp; + uint64 ring_index; + PrfHashEntry *entry; + PrefetchRequest *slot; + PrefetchRequest hashkey; + + Assert(PointerIsValid(request_lsns)); + Assert(nblocks >= 1); + + /* + * Use an intermediate PrefetchRequest struct as the hash key to ensure + * correct alignment and that the padding bytes are cleared. + */ + memset(&hashkey.buftag, 0, sizeof(BufferTag)); + CopyNRelFileInfoToBufTag(hashkey.buftag, rinfo); + hashkey.buftag.forkNum = forkNum; + hashkey.buftag.blockNum = base_blockno; + + /* + * The redo process does not lock pages that it needs to replay but are + * not in the shared buffers, so a concurrent process may request the page + * after redo has decided it won't redo that page and updated the LwLSN + * for that page. If we're in hot standby we need to take care that we + * don't return until after REDO has finished replaying up to that LwLSN, + * as the page should have been locked up to that point. + * + * See also the description on neon_redo_read_buffer_filter below. 
+ * + * NOTE: It is possible that the WAL redo process will still do IO due to + * concurrent failed read IOs. Those IOs should never have a request_lsn + * that is as large as the WAL record we're currently replaying, if it + * weren't for the behaviour of the LwLsn cache that uses the highest + * value of the LwLsn cache when the entry is not found. + */ + (void) prefetch_register_bufferv(hashkey.buftag, request_lsns, nblocks, mask, false); + + for (int i = 0; i < nblocks; i++) + { + void *buffer = buffers[i]; + BlockNumber blockno = base_blockno + i; + neon_request_lsns *reqlsns = &request_lsns[i]; + TimestampTz start_ts, end_ts; + + if (PointerIsValid(mask) && BITMAP_ISSET(mask, i)) + continue; + + start_ts = GetCurrentTimestamp(); + + if (RecoveryInProgress() && MyBackendType != B_STARTUP) + XLogWaitForReplayOf(reqlsns->request_lsn); + + /* + * Try to find prefetched page in the list of received pages. + */ +Retry: + hashkey.buftag.blockNum = blockno; + entry = prfh_lookup(MyPState->prf_hash, &hashkey); + + if (entry != NULL) + { + slot = entry->slot; + if (neon_prefetch_response_usable(reqlsns, slot)) + { + ring_index = slot->my_ring_index; + } + else + { + /* + * Cannot use this prefetch, discard it + * + * We can't drop cache for not-yet-received requested items. It is + * unlikely this happens, but it can happen if prefetch distance + * is large enough and a backend didn't consume all prefetch + * requests. 
+ */ + if (slot->status == PRFS_REQUESTED) + { + if (!prefetch_wait_for(slot->my_ring_index)) + goto Retry; + } + /* drop caches */ + prefetch_set_unused(slot->my_ring_index); + pgBufferUsage.prefetch.expired += 1; + MyNeonCounters->getpage_prefetch_discards_total++; + /* make it look like a prefetch cache miss */ + entry = NULL; + } + } + + do + { + if (entry == NULL) + { + ring_index = prefetch_register_bufferv(hashkey.buftag, reqlsns, 1, NULL, false); + Assert(ring_index != UINT64_MAX); + slot = GetPrfSlot(ring_index); + } + else + { + /* + * Empty our reference to the prefetch buffer's hash entry. When + * we wait for prefetches, the entry reference is invalidated by + * potential updates to the hash, and when we reconnect to the + * pageserver the prefetch we're waiting for may be dropped, in + * which case we need to retry and take the branch above. + */ + entry = NULL; + } + + Assert(slot->my_ring_index == ring_index); + Assert(MyPState->ring_last <= ring_index && + MyPState->ring_unused > ring_index); + Assert(slot->status != PRFS_UNUSED); + Assert(GetPrfSlot(ring_index) == slot); + + } while (!prefetch_wait_for(ring_index)); + + Assert(slot->status == PRFS_RECEIVED); + Assert(memcmp(&hashkey.buftag, &slot->buftag, sizeof(BufferTag)) == 0); + Assert(hashkey.buftag.blockNum == base_blockno + i); + + resp = slot->response; + + switch (resp->tag) + { + case T_NeonGetPageResponse: + { + NeonGetPageResponse* getpage_resp = (NeonGetPageResponse *) resp; + if (neon_protocol_version >= 3) + { + if (resp->reqid != slot->reqid || + resp->lsn != slot->request_lsns.request_lsn || + resp->not_modified_since != slot->request_lsns.not_modified_since || + !RelFileInfoEquals(getpage_resp->req.rinfo, rinfo) || + getpage_resp->req.forknum != forkNum || + getpage_resp->req.blkno != base_blockno + i) + { + NEON_PANIC_CONNECTION_STATE(-1, PANIC, + "Unexpect response {reqid=%lx,lsn=%X/%08X, since=%X/%08X, rel=%u/%u/%u.%u, block=%u} to get page request {reqid=%lx,lsn=%X/%08X, 
since=%X/%08X, rel=%u/%u/%u.%u, block=%u}", + resp->reqid, LSN_FORMAT_ARGS(resp->lsn), LSN_FORMAT_ARGS(resp->not_modified_since), RelFileInfoFmt(getpage_resp->req.rinfo), getpage_resp->req.forknum, getpage_resp->req.blkno, + slot->reqid, LSN_FORMAT_ARGS(slot->request_lsns.request_lsn), LSN_FORMAT_ARGS(slot->request_lsns.not_modified_since), RelFileInfoFmt(rinfo), forkNum, base_blockno + i); + } + } + memcpy(buffer, getpage_resp->page, BLCKSZ); + + /* + * With lfc_store_prefetch_result=true prefetch result is stored in LFC in prefetch_pump_state when response is received + * from page server. But if lfc_store_prefetch_result=false then it is not yet stored in LFC and we have to do it here + * under buffer lock. + */ + if (!lfc_store_prefetch_result) + lfc_write(rinfo, forkNum, blockno, buffer); + break; + } + case T_NeonErrorResponse: + if (neon_protocol_version >= 3) + { + if (resp->reqid != slot->reqid || + resp->lsn != slot->request_lsns.request_lsn || + resp->not_modified_since != slot->request_lsns.not_modified_since) + { + elog(WARNING, NEON_TAG "Error message {reqid=%lx,lsn=%X/%08X, since=%X/%08X} doesn't match get relsize request {reqid=%lx,lsn=%X/%08X, since=%X/%08X}", + resp->reqid, LSN_FORMAT_ARGS(resp->lsn), LSN_FORMAT_ARGS(resp->not_modified_since), + slot->reqid, LSN_FORMAT_ARGS(slot->request_lsns.request_lsn), LSN_FORMAT_ARGS(slot->request_lsns.not_modified_since)); + } + } + ereport(ERROR, + (errcode(ERRCODE_IO_ERROR), + errmsg(NEON_TAG "[shard %d, reqid %lx] could not read block %u in rel %u/%u/%u.%u from page server at lsn %X/%08X", + slot->shard_no, resp->reqid, blockno, RelFileInfoFmt(rinfo), + forkNum, LSN_FORMAT_ARGS(reqlsns->effective_request_lsn)), + errdetail("page server returned error: %s", + ((NeonErrorResponse *) resp)->message))); + break; + default: + NEON_PANIC_CONNECTION_STATE(slot->shard_no, PANIC, + "Expected GetPage (0x%02x) or Error (0x%02x) response to GetPageRequest, but got 0x%02x", + T_NeonGetPageResponse, 
T_NeonErrorResponse, resp->tag); + } + + /* buffer was used, clean up for later reuse */ + prefetch_set_unused(ring_index); + prefetch_cleanup_trailing_unused(); + + end_ts = GetCurrentTimestamp(); + inc_getpage_wait(end_ts >= start_ts ? (end_ts - start_ts) : 0); + } +} + +/* + * neon_nblocks() -- Get the number of blocks stored in a relation. + */ +BlockNumber +communicator_nblocks(NRelFileInfo rinfo, ForkNumber forknum, neon_request_lsns *request_lsns) +{ + NeonResponse *resp; + BlockNumber n_blocks; + + { + NeonNblocksRequest request = { + .hdr.tag = T_NeonNblocksRequest, + .hdr.reqid = GENERATE_REQUEST_ID(), + .hdr.lsn = request_lsns->request_lsn, + .hdr.not_modified_since = request_lsns->not_modified_since, + .rinfo = rinfo, + .forknum = forknum, + }; + + resp = page_server_request(&request); + + switch (resp->tag) + { + case T_NeonNblocksResponse: + { + NeonNblocksResponse * relsize_resp = (NeonNblocksResponse *) resp; + if (neon_protocol_version >= 3) + { + if (!equal_requests(resp, &request.hdr) || + !RelFileInfoEquals(relsize_resp->req.rinfo, request.rinfo) || + relsize_resp->req.forknum != forknum) + { + NEON_PANIC_CONNECTION_STATE(-1, PANIC, + "Unexpect response {reqid=%lx,lsn=%X/%08X, since=%X/%08X, rel=%u/%u/%u.%u} to get relsize request {reqid=%lx,lsn=%X/%08X, since=%X/%08X, rel=%u/%u/%u.%u}", + resp->reqid, LSN_FORMAT_ARGS(resp->lsn), LSN_FORMAT_ARGS(resp->not_modified_since), RelFileInfoFmt(relsize_resp->req.rinfo), relsize_resp->req.forknum, + request.hdr.reqid, LSN_FORMAT_ARGS(request.hdr.lsn), LSN_FORMAT_ARGS(request.hdr.not_modified_since), RelFileInfoFmt(request.rinfo), forknum); + } + } + n_blocks = relsize_resp->n_blocks; + break; + } + case T_NeonErrorResponse: + if (neon_protocol_version >= 3) + { + if (!equal_requests(resp, &request.hdr)) + { + elog(WARNING, NEON_TAG "Error message {reqid=%lx,lsn=%X/%08X, since=%X/%08X} doesn't match get relsize request {reqid=%lx,lsn=%X/%08X, since=%X/%08X}", + resp->reqid, LSN_FORMAT_ARGS(resp->lsn), 
LSN_FORMAT_ARGS(resp->not_modified_since), + request.hdr.reqid, LSN_FORMAT_ARGS(request.hdr.lsn), LSN_FORMAT_ARGS(request.hdr.not_modified_since)); + } + } + ereport(ERROR, + (errcode(ERRCODE_IO_ERROR), + errmsg(NEON_TAG "[reqid %lx] could not read relation size of rel %u/%u/%u.%u from page server at lsn %X/%08X", + resp->reqid, + RelFileInfoFmt(rinfo), + forknum, + LSN_FORMAT_ARGS(request_lsns->effective_request_lsn)), + errdetail("page server returned error: %s", + ((NeonErrorResponse *) resp)->message))); + break; + + default: + NEON_PANIC_CONNECTION_STATE(-1, PANIC, + "Expected Nblocks (0x%02x) or Error (0x%02x) response to NblocksRequest, but got 0x%02x", + T_NeonNblocksResponse, T_NeonErrorResponse, resp->tag); + } + + pfree(resp); + } + return n_blocks; +} + +/* + * neon_db_size() -- Get the size of the database in bytes. + */ +int64 +communicator_dbsize(Oid dbNode, neon_request_lsns *request_lsns) +{ + NeonResponse *resp; + int64 db_size; + + { + NeonDbSizeRequest request = { + .hdr.tag = T_NeonDbSizeRequest, + .hdr.reqid = GENERATE_REQUEST_ID(), + .hdr.lsn = request_lsns->request_lsn, + .hdr.not_modified_since = request_lsns->not_modified_since, + .dbNode = dbNode, + }; + + resp = page_server_request(&request); + + switch (resp->tag) + { + case T_NeonDbSizeResponse: + { + NeonDbSizeResponse* dbsize_resp = (NeonDbSizeResponse *) resp; + if (neon_protocol_version >= 3) + { + if (!equal_requests(resp, &request.hdr) || + dbsize_resp->req.dbNode != dbNode) + { + NEON_PANIC_CONNECTION_STATE(-1, PANIC, + "Unexpect response {reqid=%lx,lsn=%X/%08X, since=%X/%08X, dbNode=%u} to get DB size request {reqid=%lx,lsn=%X/%08X, since=%X/%08X, dbNode=%u}", + resp->reqid, LSN_FORMAT_ARGS(resp->lsn), LSN_FORMAT_ARGS(resp->not_modified_since), dbsize_resp->req.dbNode, + request.hdr.reqid, LSN_FORMAT_ARGS(request.hdr.lsn), LSN_FORMAT_ARGS(request.hdr.not_modified_since), dbNode); + } + } + db_size = dbsize_resp->db_size; + break; + } + case T_NeonErrorResponse: + if 
(neon_protocol_version >= 3) + { + if (!equal_requests(resp, &request.hdr)) + { + elog(WARNING, NEON_TAG "Error message {reqid=%lx,lsn=%X/%08X, since=%X/%08X} doesn't match get DB size request {reqid=%lx,lsn=%X/%08X, since=%X/%08X}", + resp->reqid, LSN_FORMAT_ARGS(resp->lsn), LSN_FORMAT_ARGS(resp->not_modified_since), + request.hdr.reqid, LSN_FORMAT_ARGS(request.hdr.lsn), LSN_FORMAT_ARGS(request.hdr.not_modified_since)); + } + } + ereport(ERROR, + (errcode(ERRCODE_IO_ERROR), + errmsg(NEON_TAG "[reqid %lx] could not read db size of db %u from page server at lsn %X/%08X", + resp->reqid, + dbNode, LSN_FORMAT_ARGS(request_lsns->effective_request_lsn)), + errdetail("page server returned error: %s", + ((NeonErrorResponse *) resp)->message))); + break; + + default: + NEON_PANIC_CONNECTION_STATE(-1, PANIC, + "Expected DbSize (0x%02x) or Error (0x%02x) response to DbSizeRequest, but got 0x%02x", + T_NeonDbSizeResponse, T_NeonErrorResponse, resp->tag); + } + + pfree(resp); + } + return db_size; +} + +int +communicator_read_slru_segment(SlruKind kind, int64 segno, neon_request_lsns *request_lsns, + void *buffer) +{ + int n_blocks; + shardno_t shard_no = 0; /* All SLRUs are at shard 0 */ + NeonResponse *resp; + NeonGetSlruSegmentRequest request; + + request = (NeonGetSlruSegmentRequest) { + .hdr.tag = T_NeonGetSlruSegmentRequest, + .hdr.reqid = GENERATE_REQUEST_ID(), + .hdr.lsn = request_lsns->request_lsn, + .hdr.not_modified_since = request_lsns->not_modified_since, + .kind = kind, + .segno = segno + }; + + do + { + while (!page_server->send(shard_no, &request.hdr) || !page_server->flush(shard_no)); + + consume_prefetch_responses(); + + resp = page_server->receive(shard_no); + } while (resp == NULL); + + switch (resp->tag) + { + case T_NeonGetSlruSegmentResponse: + { + NeonGetSlruSegmentResponse* slru_resp = (NeonGetSlruSegmentResponse *) resp; + if (neon_protocol_version >= 3) + { + if (!equal_requests(resp, &request.hdr) || + slru_resp->req.kind != kind || + 
slru_resp->req.segno != segno) + { + NEON_PANIC_CONNECTION_STATE(-1, PANIC, + "Unexpect response {reqid=%lx,lsn=%X/%08X, since=%X/%08X, kind=%u, segno=%u} to get SLRU segment request {reqid=%lx,lsn=%X/%08X, since=%X/%08X, kind=%u, segno=%lluu}", + resp->reqid, LSN_FORMAT_ARGS(resp->lsn), LSN_FORMAT_ARGS(resp->not_modified_since), slru_resp->req.kind, slru_resp->req.segno, + request.hdr.reqid, LSN_FORMAT_ARGS(request.hdr.lsn), LSN_FORMAT_ARGS(request.hdr.not_modified_since), kind, (unsigned long long) segno); + } + } + n_blocks = slru_resp->n_blocks; + memcpy(buffer, slru_resp->data, n_blocks*BLCKSZ); + break; + } + case T_NeonErrorResponse: + if (neon_protocol_version >= 3) + { + if (!equal_requests(resp, &request.hdr)) + { + elog(WARNING, NEON_TAG "Error message {reqid=%lx,lsn=%X/%08X, since=%X/%08X} doesn't match get SLRU segment request {reqid=%lx,lsn=%X/%08X, since=%X/%08X}", + resp->reqid, LSN_FORMAT_ARGS(resp->lsn), LSN_FORMAT_ARGS(resp->not_modified_since), + request.hdr.reqid, LSN_FORMAT_ARGS(request.hdr.lsn), LSN_FORMAT_ARGS(request.hdr.not_modified_since)); + } + } + ereport(ERROR, + (errcode(ERRCODE_IO_ERROR), + errmsg(NEON_TAG "[reqid %lx] could not read SLRU %d segment %llu at lsn %X/%08X", + resp->reqid, + kind, + (unsigned long long) segno, + LSN_FORMAT_ARGS(request_lsns->request_lsn)), + errdetail("page server returned error: %s", + ((NeonErrorResponse *) resp)->message))); + break; + + default: + NEON_PANIC_CONNECTION_STATE(-1, PANIC, + "Expected GetSlruSegment (0x%02x) or Error (0x%02x) response to GetSlruSegmentRequest, but got 0x%02x", + T_NeonGetSlruSegmentResponse, T_NeonErrorResponse, resp->tag); + } + pfree(resp); + + communicator_reconfigure_timeout_if_needed(); + return n_blocks; +} + +void +communicator_reconfigure_timeout_if_needed(void) +{ + bool needs_set = MyPState->ring_receive != MyPState->ring_unused && + readahead_getpage_pull_timeout_ms > 0; + + if (needs_set != timeout_set) + { + /* The background writer doesn't (shouldn't) read
any pages */ + Assert(!AmBackgroundWriterProcess()); + /* The checkpointer doesn't (shouldn't) read any pages */ + Assert(!AmCheckpointerProcess()); + + if (unlikely(PS_TIMEOUT_ID == 0)) + { + PS_TIMEOUT_ID = RegisterTimeout(USER_TIMEOUT, pagestore_timeout_handler); + } + + if (needs_set) + { +#if PG_MAJORVERSION_NUM <= 14 + enable_timeout_after(PS_TIMEOUT_ID, readahead_getpage_pull_timeout_ms); +#else + enable_timeout_every( + PS_TIMEOUT_ID, + TimestampTzPlusMilliseconds(GetCurrentTimestamp(), + readahead_getpage_pull_timeout_ms), + readahead_getpage_pull_timeout_ms + ); +#endif + timeout_set = true; + } + else + { + Assert(timeout_set); + disable_timeout(PS_TIMEOUT_ID, false); + timeout_set = false; + } + } +} + +static void +pagestore_timeout_handler(void) +{ +#if PG_MAJORVERSION_NUM <= 14 + /* + * PG14: Setting a repeating timeout is not possible, so we record here + * that the timeout is no longer set; the next call to + * communicator_reconfigure_timeout_if_needed() re-arms it if still needed. + */ + timeout_set = false; +#endif + timeout_signaled = true; + InterruptPending = true; +} + +/* + * Process new data received in our active PageStream sockets. + * + * This relies on the invariant that all pipelined yet-to-be-received requests + * are getPage requests managed by MyPState. This is currently true, any + * modification will probably require some stuff to make it work again.
+ */ +static bool +communicator_processinterrupts(void) +{ + if (timeout_signaled) + { + if (!readpage_reentrant_guard && readahead_getpage_pull_timeout_ms > 0) + communicator_prefetch_pump_state(true); + + timeout_signaled = false; + communicator_reconfigure_timeout_if_needed(); + } + + if (!prev_interrupt_cb) + return false; + + return prev_interrupt_cb(); +} diff --git a/pgxn/neon/communicator.h b/pgxn/neon/communicator.h new file mode 100644 index 000000000000..72cba526c1a0 --- /dev/null +++ b/pgxn/neon/communicator.h @@ -0,0 +1,48 @@ +/*------------------------------------------------------------------------- + * + * communicator.h + * internal interface for communicating with remote pageservers + * + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + *------------------------------------------------------------------------- + */ +#ifndef COMMUNICATOR_h +#define COMMUNICATOR_h + +#include "neon_pgversioncompat.h" + +#include "storage/buf_internals.h" + +#include "pagestore_client.h" + +/* initialization at postmaster startup */ +extern void pg_init_communicator(void); + +/* initialization at backend startup */ +extern void communicator_init(void); + +extern bool communicator_exists(NRelFileInfo rinfo, ForkNumber forkNum, + neon_request_lsns *request_lsns); +extern BlockNumber communicator_nblocks(NRelFileInfo rinfo, ForkNumber forknum, + neon_request_lsns *request_lsns); +extern int64 communicator_dbsize(Oid dbNode, neon_request_lsns *request_lsns); +extern void communicator_read_at_lsnv(NRelFileInfo rinfo, ForkNumber forkNum, + BlockNumber base_blockno, neon_request_lsns *request_lsns, + void **buffers, BlockNumber nblocks, const bits8 *mask); +extern int communicator_prefetch_lookupv(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blocknum, + neon_request_lsns *lsns, + BlockNumber nblocks, void **buffers, bits8 *mask); +extern void 
communicator_prefetch_register_bufferv(BufferTag tag, neon_request_lsns *frlsns, + BlockNumber nblocks, const bits8 *mask); +extern int communicator_read_slru_segment(SlruKind kind, int64 segno, + neon_request_lsns *request_lsns, + void *buffer); + +extern void communicator_reconfigure_timeout_if_needed(void); +extern void communicator_prefetch_pump_state(bool IsHandlingInterrupts); + + +#endif diff --git a/pgxn/neon/file_cache.c b/pgxn/neon/file_cache.c index 2505fcb84723..8c2990e57aed 100644 --- a/pgxn/neon/file_cache.c +++ b/pgxn/neon/file_cache.c @@ -21,7 +21,6 @@ #include "access/xlog.h" #include "funcapi.h" #include "miscadmin.h" -#include "pagestore_client.h" #include "common/hashfn.h" #include "pgstat.h" #include "port/pg_iovec.h" @@ -43,6 +42,7 @@ #include "hll.h" #include "bitmap.h" +#include "file_cache.h" #include "neon.h" #include "neon_lwlsncache.h" #include "neon_perf_counters.h" diff --git a/pgxn/neon/file_cache.h b/pgxn/neon/file_cache.h new file mode 100644 index 000000000000..849558b83d9d --- /dev/null +++ b/pgxn/neon/file_cache.h @@ -0,0 +1,52 @@ +/*------------------------------------------------------------------------- + * + * file_cache.h + * Local File Cache definitions + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + *------------------------------------------------------------------------- + */ +#ifndef FILE_CACHE_h +#define FILE_CACHE_h + +#include "neon_pgversioncompat.h" + +/* GUCs */ +extern bool lfc_store_prefetch_result; + +/* functions for local file cache */ +extern void lfc_writev(NRelFileInfo rinfo, ForkNumber forkNum, + BlockNumber blkno, const void *const *buffers, + BlockNumber nblocks); +/* returns number of blocks read, with one bit set in *read for each */ +extern int lfc_readv_select(NRelFileInfo rinfo, ForkNumber forkNum, + BlockNumber blkno, void **buffers, + BlockNumber nblocks, bits8 *mask); + +extern bool 
lfc_cache_contains(NRelFileInfo rinfo, ForkNumber forkNum, + BlockNumber blkno); +extern int lfc_cache_containsv(NRelFileInfo rinfo, ForkNumber forkNum, + BlockNumber blkno, int nblocks, bits8 *bitmap); +extern void lfc_init(void); +extern bool lfc_prefetch(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blkno, + const void* buffer, XLogRecPtr lsn); + + +static inline bool +lfc_read(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, + void *buffer) +{ + bits8 rv = 0; + return lfc_readv_select(rinfo, forkNum, blkno, &buffer, 1, &rv) == 1; +} + +static inline void +lfc_write(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, + const void *buffer) +{ + return lfc_writev(rinfo, forkNum, blkno, &buffer, 1); +} + +#endif /* FILE_CACHE_H */ diff --git a/pgxn/neon/libpagestore.c b/pgxn/neon/libpagestore.c index 60b2249461a0..dfabb6919e5f 100644 --- a/pgxn/neon/libpagestore.c +++ b/pgxn/neon/libpagestore.c @@ -65,6 +65,9 @@ static const struct config_enum_entry neon_compute_modes[] = { /* GUCs */ char *neon_timeline; char *neon_tenant; +char *neon_project_id; +char *neon_branch_id; +char *neon_endpoint_id; int32 max_cluster_size; char *page_server_connstring; char *neon_auth_token; @@ -1352,6 +1355,31 @@ pg_init_libpagestore(void) 0, /* no flags required */ check_neon_id, NULL, NULL); + DefineCustomStringVariable("neon.project_id", + "Neon project_id the server is running on", + NULL, + &neon_project_id, + "", + PGC_POSTMASTER, + 0, /* no flags required */ + check_neon_id, NULL, NULL); + DefineCustomStringVariable("neon.branch_id", + "Neon branch_id the server is running on", + NULL, + &neon_branch_id, + "", + PGC_POSTMASTER, + 0, /* no flags required */ + check_neon_id, NULL, NULL); + DefineCustomStringVariable("neon.endpoint_id", + "Neon endpoint_id the server is running on", + NULL, + &neon_endpoint_id, + "", + PGC_POSTMASTER, + 0, /* no flags required */ + check_neon_id, NULL, NULL); + DefineCustomIntVariable("neon.stripe_size", "sharding stripe 
size", NULL, @@ -1475,6 +1503,4 @@ pg_init_libpagestore(void) } memset(page_servers, 0, sizeof(page_servers)); - - lfc_init(); } diff --git a/pgxn/neon/neon.c b/pgxn/neon/neon.c index 081025e2d52a..a6a70217566a 100644 --- a/pgxn/neon/neon.c +++ b/pgxn/neon/neon.c @@ -28,7 +28,9 @@ #include "utils/guc.h" #include "utils/guc_tables.h" +#include "communicator.h" #include "extension_server.h" +#include "file_cache.h" #include "neon.h" #include "neon_lwlsncache.h" #include "control_plane_connector.h" @@ -434,10 +436,11 @@ _PG_init(void) #endif pg_init_libpagestore(); + lfc_init(); pg_init_walproposer(); init_lwlsncache(); - pagestore_smgr_init(); + pg_init_communicator(); Custom_XLogReaderRoutines = NeonOnDemandXLogReaderRoutines; InitUnstableExtensionsSupport(); diff --git a/pgxn/neon/neon.h b/pgxn/neon/neon.h index e2fa136e37e9..a2e81feb5f75 100644 --- a/pgxn/neon/neon.h +++ b/pgxn/neon/neon.h @@ -47,9 +47,18 @@ extern uint32 WAIT_EVENT_NEON_WAL_DL; #define WAIT_EVENT_NEON_WAL_DL WAIT_EVENT_WAL_READ #endif + +#define NEON_TAG "[NEON_SMGR] " +#define neon_log(tag, fmt, ...) ereport(tag, \ + (errmsg(NEON_TAG fmt, ##__VA_ARGS__), \ + errhidestmt(true), errhidecontext(true), errposition(0), internalerrposition(0))) +#define neon_shard_log(shard_no, tag, fmt, ...) 
ereport(tag, \ + (errmsg(NEON_TAG "[shard %d] " fmt, shard_no, ##__VA_ARGS__), \ + errhidestmt(true), errhidecontext(true), errposition(0), internalerrposition(0))) + + extern void pg_init_libpagestore(void); extern void pg_init_walproposer(void); -extern void pagestore_smgr_init(void); extern uint64 BackpressureThrottlingTime(void); extern void SetNeonCurrentClusterSize(uint64 size); diff --git a/pgxn/neon/pagestore_client.h b/pgxn/neon/pagestore_client.h index a2e3d57e4743..0ab539fe5633 100644 --- a/pgxn/neon/pagestore_client.h +++ b/pgxn/neon/pagestore_client.h @@ -58,14 +58,6 @@ typedef struct #define messageTag(m) (((const NeonMessage *)(m))->tag) -#define NEON_TAG "[NEON_SMGR] " -#define neon_log(tag, fmt, ...) ereport(tag, \ - (errmsg(NEON_TAG fmt, ##__VA_ARGS__), \ - errhidestmt(true), errhidecontext(true), errposition(0), internalerrposition(0))) -#define neon_shard_log(shard_no, tag, fmt, ...) ereport(tag, \ - (errmsg(NEON_TAG "[shard %d] " fmt, shard_no, ##__VA_ARGS__), \ - errhidestmt(true), errhidecontext(true), errposition(0), internalerrposition(0))) - /* SLRUs downloadable from page server */ typedef enum { SLRU_CLOG, @@ -234,7 +226,6 @@ extern char *neon_timeline; extern char *neon_tenant; extern int32 max_cluster_size; extern int neon_protocol_version; -extern bool lfc_store_prefetch_result; extern shardno_t get_shard_number(BufferTag* tag); @@ -242,6 +233,7 @@ extern const f_smgr *smgr_neon(ProcNumber backend, NRelFileInfo rinfo); extern void smgr_init_neon(void); extern void readahead_buffer_resize(int newsize, void *extra); + /* * LSN values associated with each request to the pageserver */ @@ -278,6 +270,10 @@ extern PGDLLEXPORT void neon_read_at_lsn(NRelFileInfo rnode, ForkNumber forkNum, neon_request_lsns request_lsns, void *buffer); extern int64 neon_dbsize(Oid dbNode); +extern void neon_get_request_lsns(NRelFileInfo rinfo, ForkNumber forknum, + BlockNumber blkno, neon_request_lsns *output, + BlockNumber nblocks); + /* utils for neon 
relsize cache */ extern void relsize_hash_init(void); extern bool get_cached_relsize(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber *size); @@ -285,37 +281,4 @@ extern void set_cached_relsize(NRelFileInfo rinfo, ForkNumber forknum, BlockNumb extern void update_cached_relsize(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber size); extern void forget_cached_relsize(NRelFileInfo rinfo, ForkNumber forknum); -/* functions for local file cache */ -extern void lfc_writev(NRelFileInfo rinfo, ForkNumber forkNum, - BlockNumber blkno, const void *const *buffers, - BlockNumber nblocks); -/* returns number of blocks read, with one bit set in *read for each */ -extern int lfc_readv_select(NRelFileInfo rinfo, ForkNumber forkNum, - BlockNumber blkno, void **buffers, - BlockNumber nblocks, bits8 *mask); - -extern bool lfc_cache_contains(NRelFileInfo rinfo, ForkNumber forkNum, - BlockNumber blkno); -extern int lfc_cache_containsv(NRelFileInfo rinfo, ForkNumber forkNum, - BlockNumber blkno, int nblocks, bits8 *bitmap); -extern void lfc_init(void); -extern bool lfc_prefetch(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blkno, - const void* buffer, XLogRecPtr lsn); - - -static inline bool -lfc_read(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, - void *buffer) -{ - bits8 rv = 0; - return lfc_readv_select(rinfo, forkNum, blkno, &buffer, 1, &rv) == 1; -} - -static inline void -lfc_write(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, - const void *buffer) -{ - return lfc_writev(rinfo, forkNum, blkno, &buffer, 1); -} - #endif /* PAGESTORE_CLIENT_H */ diff --git a/pgxn/neon/pagestore_smgr.c b/pgxn/neon/pagestore_smgr.c index eb8df11923a1..ef6bd038bbd3 100644 --- a/pgxn/neon/pagestore_smgr.c +++ b/pgxn/neon/pagestore_smgr.c @@ -49,9 +49,6 @@ #include "access/xlog_internal.h" #include "access/xlogutils.h" #include "catalog/pg_class.h" -#include "common/hashfn.h" -#include "executor/instrument.h" -#include "libpq/pqformat.h" #include "pgstat.h" #include 
"postmaster/autovacuum.h" #include "postmaster/interrupt.h" @@ -62,9 +59,10 @@ #include "storage/fsm_internals.h" #include "storage/md.h" #include "storage/smgr.h" -#include "utils/timeout.h" #include "bitmap.h" +#include "communicator.h" +#include "file_cache.h" #include "neon.h" #include "neon_lwlsncache.h" #include "neon_perf_counters.h" @@ -101,1707 +99,22 @@ static char *hexdump_page(char *page); const int SmgrTrace = DEBUG5; -#define NEON_PANIC_CONNECTION_STATE(shard_no, elvl, message, ...) \ - neon_shard_log(shard_no, elvl, "Broken connection state: " message, \ - ##__VA_ARGS__) - -page_server_api *page_server; - /* unlogged relation build states */ typedef enum -{ - UNLOGGED_BUILD_NOT_IN_PROGRESS = 0, - UNLOGGED_BUILD_PHASE_1, - UNLOGGED_BUILD_PHASE_2, - UNLOGGED_BUILD_NOT_PERMANENT -} UnloggedBuildPhase; - -static SMgrRelation unlogged_build_rel = NULL; -static UnloggedBuildPhase unlogged_build_phase = UNLOGGED_BUILD_NOT_IN_PROGRESS; - -static bool neon_redo_read_buffer_filter(XLogReaderState *record, uint8 block_id); -static bool (*old_redo_read_buffer_filter) (XLogReaderState *record, uint8 block_id) = NULL; - -static BlockNumber neon_nblocks(SMgrRelation reln, ForkNumber forknum); - -static uint32 local_request_counter; -#define GENERATE_REQUEST_ID() (((NeonRequestId)MyProcPid << 32) | ++local_request_counter) - -/* - * Various settings related to prompt (fast) handling of PageStream responses - * at any CHECK_FOR_INTERRUPTS point. - */ -int readahead_getpage_pull_timeout_ms = 0; -static int PS_TIMEOUT_ID = 0; -static bool timeout_set = false; -static bool timeout_signaled = false; - -/* - * We have a CHECK_FOR_INTERRUPTS in page_server->receive(), and we don't want - * that to handle any getpage responses if we're already working on the - * backlog of those, as we'd hit issues with determining which prefetch slot - * we just got a response for. 
- * - * To protect against that, we have this variable that's set whenever we start - * receiving data for prefetch slots, so that we don't get confused. - * - * Note that in certain error cases during readpage we may leak r_r_g=true, - * which results in a failure to pick up further responses until we first - * actively try to receive new getpage responses. - */ -static bool readpage_reentrant_guard = false; - -static void reconfigure_timeout_if_needed(void); -static void pagestore_timeout_handler(void); - -#define START_PREFETCH_RECEIVE_WORK() \ - do { \ - readpage_reentrant_guard = true; \ - } while (false) - -#define END_PREFETCH_RECEIVE_WORK() \ - do { \ - readpage_reentrant_guard = false; \ - if (unlikely(timeout_signaled && !InterruptPending)) \ - InterruptPending = true; \ - } while (false) - -/* - * Prefetch implementation: - * - * Prefetch is performed locally by each backend. - * - * There can be up to readahead_buffer_size active IO requests registered at - * any time. Requests using smgr_prefetch are sent to the pageserver, but we - * don't wait on the response. Requests using smgr_read are either read from - * the buffer, or (if that's not possible) we wait on the response to arrive - - * this also will allow us to receive other prefetched pages. - * Each request is immediately written to the output buffer of the pageserver - * connection, but may not be flushed if smgr_prefetch is used: pageserver - * flushes sent requests on manual flush, or every neon.flush_output_after - * unflushed requests; which is not necessarily always and all the time. - * - * Once we have received a response, this value will be stored in the response - * buffer, indexed in a hash table. This allows us to retain our buffered - * prefetch responses even when we have cache misses. - * - * Reading of prefetch responses is delayed until them are actually needed - * (smgr_read). 
In case of prefetch miss or any other SMGR request other than - * smgr_read, all prefetch responses in the pipeline will need to be read from - * the connection; the responses are stored for later use. - * - * NOTE: The current implementation of the prefetch system implements a ring - * buffer of up to readahead_buffer_size requests. If there are more _read and - * _prefetch requests between the initial _prefetch and the _read of a buffer, - * the prefetch request will have been dropped from this prefetch buffer, and - * your prefetch was wasted. - */ - -/* - * State machine: - * - * not in hash : in hash - * : - * UNUSED ------> REQUESTED --> RECEIVED - * ^ : | | - * | : v | - * | : TAG_REMAINS | - * | : | | - * +----------------+------------+ - * : - */ -typedef enum PrefetchStatus -{ - PRFS_UNUSED = 0, /* unused slot */ - PRFS_REQUESTED, /* request was written to the sendbuffer to - * PS, but not necessarily flushed. all fields - * except response valid */ - PRFS_RECEIVED, /* all fields valid */ - PRFS_TAG_REMAINS, /* only buftag and my_ring_index are still - * valid */ -} PrefetchStatus; - -/* must fit in uint8; bits 0x1 are used */ -typedef enum { - PRFSF_NONE = 0x0, - PRFSF_LFC = 0x1 /* received prefetch result is stored in LFC */ -} PrefetchRequestFlags; - -typedef struct PrefetchRequest -{ - BufferTag buftag; /* must be first entry in the struct */ - shardno_t shard_no; - uint8 status; /* see PrefetchStatus for valid values */ - uint8 flags; /* see PrefetchRequestFlags */ - neon_request_lsns request_lsns; - NeonRequestId reqid; - NeonResponse *response; /* may be null */ - uint64 my_ring_index; -} PrefetchRequest; - -/* prefetch buffer lookup hash table */ - -typedef struct PrfHashEntry -{ - PrefetchRequest *slot; - uint32 status; - uint32 hash; -} PrfHashEntry; - -#define SH_PREFIX prfh -#define SH_ELEMENT_TYPE PrfHashEntry -#define SH_KEY_TYPE PrefetchRequest * -#define SH_KEY slot -#define SH_STORE_HASH -#define SH_GET_HASH(tb, a) ((a)->hash) -#define 
SH_HASH_KEY(tb, key) hash_bytes( \ - ((const unsigned char *) &(key)->buftag), \ - sizeof(BufferTag) \ -) - -#define SH_EQUAL(tb, a, b) (BufferTagsEqual(&(a)->buftag, &(b)->buftag)) -#define SH_SCOPE static inline -#define SH_DEFINE -#define SH_DECLARE -#include "lib/simplehash.h" - -/* - * PrefetchState maintains the state of (prefetch) getPage@LSN requests. - * It maintains a (ring) buffer of in-flight requests and responses. - * - * We maintain several indexes into the ring buffer: - * ring_unused >= ring_flush >= ring_receive >= ring_last >= 0 - * - * ring_unused points to the first unused slot of the buffer - * ring_receive is the next request that is to be received - * ring_last is the oldest received entry in the buffer - * - * Apart from being an entry in the ring buffer of prefetch requests, each - * PrefetchRequest that is not UNUSED is indexed in prf_hash by buftag. - */ -typedef struct PrefetchState -{ - MemoryContext bufctx; /* context for prf_buffer[].response - * allocations */ - MemoryContext errctx; /* context for prf_buffer[].response - * allocations */ - MemoryContext hashctx; /* context for prf_buffer */ - - /* buffer indexes */ - uint64 ring_unused; /* first unused slot */ - uint64 ring_flush; /* next request to flush */ - uint64 ring_receive; /* next slot that is to receive a response */ - uint64 ring_last; /* min slot with a response value */ - - /* metrics / statistics */ - int n_responses_buffered; /* count of PS responses not yet in - * buffers */ - int n_requests_inflight; /* count of PS requests considered in - * flight */ - int n_unused; /* count of buffers < unused, > last, that are - * also unused */ - - /* the buffers */ - prfh_hash *prf_hash; - int max_shard_no; - /* Mark shards involved in prefetch */ - uint8 shard_bitmap[(MAX_SHARDS + 7)/8]; - PrefetchRequest prf_buffer[]; /* prefetch buffers */ -} PrefetchState; - -static PrefetchState *MyPState; - -#define GetPrfSlotNoCheck(ring_index) ( \ - &MyPState->prf_buffer[((ring_index) % 
readahead_buffer_size)] \ -) - -#define GetPrfSlot(ring_index) ( \ - ( \ - AssertMacro((ring_index) < MyPState->ring_unused && \ - (ring_index) >= MyPState->ring_last), \ - GetPrfSlotNoCheck(ring_index) \ - ) \ -) - -#define ReceiveBufferNeedsCompaction() (\ - (MyPState->n_responses_buffered / 8) < ( \ - MyPState->ring_receive - \ - MyPState->ring_last - \ - MyPState->n_responses_buffered \ - ) \ -) - -static bool compact_prefetch_buffers(void); -static void consume_prefetch_responses(void); -static bool prefetch_read(PrefetchRequest *slot); -static void prefetch_do_request(PrefetchRequest *slot, neon_request_lsns *force_request_lsns); -static bool prefetch_wait_for(uint64 ring_index); -static void prefetch_cleanup_trailing_unused(void); -static inline void prefetch_set_unused(uint64 ring_index); - -static void -neon_get_request_lsns(NRelFileInfo rinfo, ForkNumber forknum, - BlockNumber blkno, neon_request_lsns *output, - BlockNumber nblocks); -static bool neon_prefetch_response_usable(neon_request_lsns *request_lsns, - PrefetchRequest *slot); - -static bool -compact_prefetch_buffers(void) -{ - uint64 empty_ring_index = MyPState->ring_last; - uint64 search_ring_index = MyPState->ring_receive; - int n_moved = 0; - - if (MyPState->ring_receive == MyPState->ring_last) - return false; - - while (search_ring_index > MyPState->ring_last) - { - search_ring_index--; - if (GetPrfSlot(search_ring_index)->status == PRFS_UNUSED) - { - empty_ring_index = search_ring_index; - break; - } - } - - /* - * Here we have established: slots < search_ring_index have an unknown - * state (not scanned) slots >= search_ring_index and <= empty_ring_index - * are unused slots > empty_ring_index are in use, or outside our buffer's - * range. ... 
unless search_ring_index <= ring_last - * - * Therefore, there is a gap of at least one unused items between - * search_ring_index and empty_ring_index (both inclusive), which grows as - * we hit more unused items while moving backwards through the array. - */ - - while (search_ring_index > MyPState->ring_last) - { - PrefetchRequest *source_slot; - PrefetchRequest *target_slot; - bool found; - - /* update search index to an unprocessed entry */ - search_ring_index--; - - source_slot = GetPrfSlot(search_ring_index); - - if (source_slot->status == PRFS_UNUSED) - continue; - - /* slot is used -- start moving slot */ - target_slot = GetPrfSlot(empty_ring_index); - - Assert(source_slot->status == PRFS_RECEIVED); - Assert(target_slot->status == PRFS_UNUSED); - - target_slot->buftag = source_slot->buftag; - target_slot->shard_no = source_slot->shard_no; - target_slot->status = source_slot->status; - target_slot->flags = source_slot->flags; - target_slot->response = source_slot->response; - target_slot->reqid = source_slot->reqid; - target_slot->request_lsns = source_slot->request_lsns; - target_slot->my_ring_index = empty_ring_index; - - prfh_delete(MyPState->prf_hash, source_slot); - prfh_insert(MyPState->prf_hash, target_slot, &found); - - Assert(!found); - - /* Adjust the location of our known-empty slot */ - empty_ring_index--; - - /* empty the moved slot */ - source_slot->status = PRFS_UNUSED; - source_slot->buftag = (BufferTag) - { - 0 - }; - source_slot->response = NULL; - source_slot->my_ring_index = 0; - source_slot->request_lsns = (neon_request_lsns) { - InvalidXLogRecPtr, InvalidXLogRecPtr, InvalidXLogRecPtr - }; - - /* update bookkeeping */ - n_moved++; - } - - /* - * Only when we've moved slots we can expect trailing unused slots, so - * only then we clean up trailing unused slots. 
- */ - if (n_moved > 0) - { - prefetch_cleanup_trailing_unused(); - return true; - } - - return false; -} - -/* - * If there might be responses still in the TCP buffer, then we should try to - * use those, to reduce any TCP backpressure on the OS/PS side. - * - * This procedure handles that. - * - * Note that this works because we don't pipeline non-getPage requests. - * - * NOTE: This procedure is not allowed to throw errors that should be handled - * by SMGR-related code, as this can be called from every CHECK_FOR_INTERRUPTS - * point inside and outside PostgreSQL. - * - * This still does throw errors when it receives malformed responses from PS. - * - * When we're not called from CHECK_FOR_INTERRUPTS (indicated by - * IsHandlingInterrupts) we also report we've ended prefetch receive work, - * just in case state tracking was lost due to an error in the sync getPage - * response code. - */ -static void -prefetch_pump_state(bool IsHandlingInterrupts) -{ - while (MyPState->ring_receive != MyPState->ring_flush) - { - NeonResponse *response; - PrefetchRequest *slot; - MemoryContext old; - - slot = GetPrfSlot(MyPState->ring_receive); - - old = MemoryContextSwitchTo(MyPState->errctx); - response = page_server->try_receive(slot->shard_no); - MemoryContextSwitchTo(old); - - if (response == NULL) - break; - - /* The slot should still be valid */ - if (slot->status != PRFS_REQUESTED || - slot->response != NULL || - slot->my_ring_index != MyPState->ring_receive) - neon_shard_log(slot->shard_no, ERROR, - "Incorrect prefetch slot state after receive: status=%d response=%p my=%lu receive=%lu", - slot->status, slot->response, - (long) slot->my_ring_index, (long) MyPState->ring_receive); - - /* update prefetch state */ - MyPState->n_responses_buffered += 1; - MyPState->n_requests_inflight -= 1; - MyPState->ring_receive += 1; - MyNeonCounters->getpage_prefetches_buffered = - MyPState->n_responses_buffered; - - /* update slot state */ - slot->status = PRFS_RECEIVED; - 
slot->response = response; - - if (response->tag == T_NeonGetPageResponse && !(slot->flags & PRFSF_LFC) && lfc_store_prefetch_result) - { - /* - * Store prefetched result in LFC (please read comments to lfc_prefetch - * explaining why it can be done without holding shared buffer lock - */ - if (lfc_prefetch(BufTagGetNRelFileInfo(slot->buftag), slot->buftag.forkNum, slot->buftag.blockNum, ((NeonGetPageResponse*)response)->page, slot->request_lsns.not_modified_since)) - { - slot->flags |= PRFSF_LFC; - } - } - } - - /* We never pump the prefetch state while handling other pages */ - if (!IsHandlingInterrupts) - END_PREFETCH_RECEIVE_WORK(); - - reconfigure_timeout_if_needed(); -} - -void -readahead_buffer_resize(int newsize, void *extra) -{ - uint64 end, - nfree = newsize; - PrefetchState *newPState; - Size newprfs_size = offsetof(PrefetchState, prf_buffer) + - (sizeof(PrefetchRequest) * newsize); - - /* don't try to re-initialize if we haven't initialized yet */ - if (MyPState == NULL) - return; - - /* - * Make sure that we don't lose track of active prefetch requests by - * ensuring we have received all but the last n requests (n = newsize). 
- */ - if (MyPState->n_requests_inflight > newsize) - { - prefetch_wait_for(MyPState->ring_unused - newsize - 1); - Assert(MyPState->n_requests_inflight <= newsize); - } - - /* construct the new PrefetchState, and copy over the memory contexts */ - newPState = MemoryContextAllocZero(TopMemoryContext, newprfs_size); - - newPState->bufctx = MyPState->bufctx; - newPState->errctx = MyPState->errctx; - newPState->hashctx = MyPState->hashctx; - newPState->prf_hash = prfh_create(MyPState->hashctx, newsize, NULL); - newPState->n_unused = newsize; - newPState->n_requests_inflight = 0; - newPState->n_responses_buffered = 0; - newPState->ring_last = newsize; - newPState->ring_unused = newsize; - newPState->ring_receive = newsize; - newPState->max_shard_no = MyPState->max_shard_no; - memcpy(newPState->shard_bitmap, MyPState->shard_bitmap, sizeof(MyPState->shard_bitmap)); - - /* - * Copy over the prefetches. - * - * We populate the prefetch array from the end; to retain the most recent - * prefetches, but this has the benefit of only needing to do one - * iteration on the dataset, and trivial compaction. 
- */ - for (end = MyPState->ring_unused - 1; - end >= MyPState->ring_last && end != UINT64_MAX && nfree != 0; - end -= 1) - { - PrefetchRequest *slot = GetPrfSlot(end); - PrefetchRequest *newslot; - bool found; - - if (slot->status == PRFS_UNUSED) - continue; - - nfree -= 1; - - newslot = &newPState->prf_buffer[nfree]; - *newslot = *slot; - newslot->my_ring_index = nfree; - - prfh_insert(newPState->prf_hash, newslot, &found); - - Assert(!found); - - switch (newslot->status) - { - case PRFS_UNUSED: - pg_unreachable(); - case PRFS_REQUESTED: - newPState->n_requests_inflight += 1; - newPState->ring_receive -= 1; - newPState->ring_last -= 1; - break; - case PRFS_RECEIVED: - newPState->n_responses_buffered += 1; - newPState->ring_last -= 1; - break; - case PRFS_TAG_REMAINS: - newPState->ring_last -= 1; - break; - } - newPState->n_unused -= 1; - } - newPState->ring_flush = newPState->ring_receive; - - MyNeonCounters->getpage_prefetches_buffered = - MyPState->n_responses_buffered; - MyNeonCounters->pageserver_open_requests = - MyPState->n_requests_inflight; - - for (; end >= MyPState->ring_last && end != UINT64_MAX; end -= 1) - { - PrefetchRequest *slot = GetPrfSlot(end); - Assert(slot->status != PRFS_REQUESTED); - if (slot->status == PRFS_RECEIVED) - { - pfree(slot->response); - } - } - - prfh_destroy(MyPState->prf_hash); - pfree(MyPState); - MyPState = newPState; -} - - - -/* - * Make sure that there are no responses still in the buffer. - * - * This function may indirectly update MyPState->pfs_hash; which invalidates - * any active pointers into the hash table. 
- */ -static void -consume_prefetch_responses(void) -{ - if (MyPState->ring_receive < MyPState->ring_unused) - prefetch_wait_for(MyPState->ring_unused - 1); -} - -static void -prefetch_cleanup_trailing_unused(void) -{ - uint64 ring_index; - PrefetchRequest *slot; - - while (MyPState->ring_last < MyPState->ring_receive) - { - ring_index = MyPState->ring_last; - slot = GetPrfSlot(ring_index); - - if (slot->status == PRFS_UNUSED) - MyPState->ring_last += 1; - else - break; - } -} - - -static bool -prefetch_flush_requests(void) -{ - for (shardno_t shard_no = 0; shard_no < MyPState->max_shard_no; shard_no++) - { - if (BITMAP_ISSET(MyPState->shard_bitmap, shard_no)) - { - if (!page_server->flush(shard_no)) - return false; - BITMAP_CLR(MyPState->shard_bitmap, shard_no); - } - } - MyPState->max_shard_no = 0; - return true; -} - -/* - * Wait for slot of ring_index to have received its response. - * The caller is responsible for making sure the request buffer is flushed. - * - * NOTE: this function may indirectly update MyPState->pfs_hash; which - * invalidates any active pointers into the hash table. - * NOTE: callers should make sure they can handle query cancellations in this - * function's call path. - */ -static bool -prefetch_wait_for(uint64 ring_index) -{ - PrefetchRequest *entry; - bool result = true; - - if (MyPState->ring_flush <= ring_index && - MyPState->ring_unused > MyPState->ring_flush) - { - if (!prefetch_flush_requests()) - return false; - MyPState->ring_flush = MyPState->ring_unused; - } - - Assert(MyPState->ring_unused > ring_index); - - while (MyPState->ring_receive <= ring_index) - { - START_PREFETCH_RECEIVE_WORK(); - entry = GetPrfSlot(MyPState->ring_receive); - - Assert(entry->status == PRFS_REQUESTED); - if (!prefetch_read(entry)) - { - result = false; - break; - } - - END_PREFETCH_RECEIVE_WORK(); - CHECK_FOR_INTERRUPTS(); - } - - return result; -} - -/* - * Read the response of a prefetch request into its slot. 
- * - * The caller is responsible for making sure that the request for this buffer - * was flushed to the PageServer. - * - * NOTE: this function may indirectly update MyPState->pfs_hash; which - * invalidates any active pointers into the hash table. - * - * NOTE: this does IO, and can get canceled out-of-line. - */ -static bool -prefetch_read(PrefetchRequest *slot) -{ - NeonResponse *response; - MemoryContext old; - BufferTag buftag; - shardno_t shard_no; - uint64 my_ring_index; - - Assert(slot->status == PRFS_REQUESTED); - Assert(slot->response == NULL); - Assert(slot->my_ring_index == MyPState->ring_receive); - - if (slot->status != PRFS_REQUESTED || - slot->response != NULL || - slot->my_ring_index != MyPState->ring_receive) - neon_shard_log(slot->shard_no, ERROR, - "Incorrect prefetch read: status=%d response=%p my=%lu receive=%lu", - slot->status, slot->response, - (long)slot->my_ring_index, (long)MyPState->ring_receive); - - /* - * Copy the request info so that if an error happens and the prefetch - * queue is flushed during the receive call, we can print the original - * values in the error message - */ - buftag = slot->buftag; - shard_no = slot->shard_no; - my_ring_index = slot->my_ring_index; - - old = MemoryContextSwitchTo(MyPState->errctx); - response = (NeonResponse *) page_server->receive(shard_no); - MemoryContextSwitchTo(old); - if (response) - { - /* The slot should still be valid */ - if (slot->status != PRFS_REQUESTED || - slot->response != NULL || - slot->my_ring_index != MyPState->ring_receive) - neon_shard_log(shard_no, ERROR, - "Incorrect prefetch slot state after receive: status=%d response=%p my=%lu receive=%lu", - slot->status, slot->response, - (long) slot->my_ring_index, (long) MyPState->ring_receive); - - /* update prefetch state */ - MyPState->n_responses_buffered += 1; - MyPState->n_requests_inflight -= 1; - MyPState->ring_receive += 1; - MyNeonCounters->getpage_prefetches_buffered = - MyPState->n_responses_buffered; - - /* update 
slot state */ - slot->status = PRFS_RECEIVED; - slot->response = response; - - if (response->tag == T_NeonGetPageResponse && !(slot->flags & PRFSF_LFC) && lfc_store_prefetch_result) - { - /* - * Store prefetched result in LFC (please read comments to lfc_prefetch - * explaining why it can be done without holding shared buffer lock - */ - if (lfc_prefetch(BufTagGetNRelFileInfo(buftag), buftag.forkNum, buftag.blockNum, ((NeonGetPageResponse*)response)->page, slot->request_lsns.not_modified_since)) - { - slot->flags |= PRFSF_LFC; - } - } - return true; - } - else - { - /* - * Note: The slot might no longer be valid, if the connection was lost - * and the prefetch queue was flushed during the receive call - */ - neon_shard_log(shard_no, LOG, - "No response from reading prefetch entry %lu: %u/%u/%u.%u block %u. This can be caused by a concurrent disconnect", - (long) my_ring_index, - RelFileInfoFmt(BufTagGetNRelFileInfo(buftag)), - buftag.forkNum, buftag.blockNum); - return false; - } -} - -/* - * Disconnect hook - drop prefetches when the connection drops - * - * If we don't remove the failed prefetches, we'd be serving incorrect - * data to the smgr. - */ -void -prefetch_on_ps_disconnect(void) -{ - MyPState->ring_flush = MyPState->ring_unused; - - while (MyPState->ring_receive < MyPState->ring_unused) - { - PrefetchRequest *slot; - uint64 ring_index = MyPState->ring_receive; - - slot = GetPrfSlot(ring_index); - - Assert(slot->status == PRFS_REQUESTED); - Assert(slot->my_ring_index == ring_index); - - /* - * Drop connection to all shards which have prefetch requests. - * It is not a problem to call disconnect multiple times on the same connection - * because disconnect implementation in libpagestore.c will check if connection - * is alive and do nothing of connection was already dropped. 
- */ - page_server->disconnect(slot->shard_no); - - /* clean up the request */ - slot->status = PRFS_TAG_REMAINS; - MyPState->n_requests_inflight -= 1; - MyPState->ring_receive += 1; - - prefetch_set_unused(ring_index); - pgBufferUsage.prefetch.expired += 1; - MyNeonCounters->getpage_prefetch_discards_total += 1; - } - - /* - * We can have gone into retry due to network error, so update stats with - * the latest available - */ - MyNeonCounters->pageserver_open_requests = - MyPState->n_requests_inflight; - MyNeonCounters->getpage_prefetches_buffered = - MyPState->n_responses_buffered; -} - -/* - * prefetch_set_unused() - clear a received prefetch slot - * - * The slot at ring_index must be a current member of the ring buffer, - * and may not be in the PRFS_REQUESTED state. - * - * NOTE: this function will update MyPState->pfs_hash; which invalidates any - * active pointers into the hash table. - */ -static inline void -prefetch_set_unused(uint64 ring_index) -{ - PrefetchRequest *slot; - - if (ring_index < MyPState->ring_last) - return; /* Should already be unused */ - - slot = GetPrfSlot(ring_index); - if (slot->status == PRFS_UNUSED) - return; - - Assert(slot->status == PRFS_RECEIVED || slot->status == PRFS_TAG_REMAINS); - - if (slot->status == PRFS_RECEIVED) - { - pfree(slot->response); - slot->response = NULL; - - MyPState->n_responses_buffered -= 1; - MyPState->n_unused += 1; - - MyNeonCounters->getpage_prefetches_buffered = - MyPState->n_responses_buffered; - } - else - { - Assert(slot->response == NULL); - } - - prfh_delete(MyPState->prf_hash, slot); - - /* clear all fields */ - MemSet(slot, 0, sizeof(PrefetchRequest)); - slot->status = PRFS_UNUSED; - - /* run cleanup if we're holding back ring_last */ - if (MyPState->ring_last == ring_index) - prefetch_cleanup_trailing_unused(); - - /* - * ... 
and try to store the buffered responses more compactly if > 12.5% - * of the buffer is gaps - */ - else if (ReceiveBufferNeedsCompaction()) - compact_prefetch_buffers(); -} - -/* - * Send one prefetch request to the pageserver. To wait for the response, call - * prefetch_wait_for(). - */ -static void -prefetch_do_request(PrefetchRequest *slot, neon_request_lsns *force_request_lsns) -{ - bool found; - uint64 mySlotNo PG_USED_FOR_ASSERTS_ONLY = slot->my_ring_index; - - NeonGetPageRequest request = { - .hdr.tag = T_NeonGetPageRequest, - .hdr.reqid = GENERATE_REQUEST_ID(), - /* lsn and not_modified_since are filled in below */ - .rinfo = BufTagGetNRelFileInfo(slot->buftag), - .forknum = slot->buftag.forkNum, - .blkno = slot->buftag.blockNum, - }; - - Assert(mySlotNo == MyPState->ring_unused); - - slot->reqid = request.hdr.reqid; - - if (force_request_lsns) - slot->request_lsns = *force_request_lsns; - else - neon_get_request_lsns(BufTagGetNRelFileInfo(slot->buftag), - slot->buftag.forkNum, slot->buftag.blockNum, - &slot->request_lsns, 1); - request.hdr.lsn = slot->request_lsns.request_lsn; - request.hdr.not_modified_since = slot->request_lsns.not_modified_since; - - Assert(slot->response == NULL); - Assert(slot->my_ring_index == MyPState->ring_unused); - - while (!page_server->send(slot->shard_no, (NeonRequest *) &request)) - { - Assert(mySlotNo == MyPState->ring_unused); - /* loop */ - } - - /* update prefetch state */ - MyPState->n_requests_inflight += 1; - MyPState->n_unused -= 1; - MyPState->ring_unused += 1; - BITMAP_SET(MyPState->shard_bitmap, slot->shard_no); - MyPState->max_shard_no = Max(slot->shard_no+1, MyPState->max_shard_no); - - /* update slot state */ - slot->status = PRFS_REQUESTED; - prfh_insert(MyPState->prf_hash, slot, &found); - Assert(!found); -} - -/* - * Lookup of already received prefetch requests. Only already received responses matching required LSNs are accepted. 
- * Present pages are marked in "mask" bitmap and total number of such pages is returned. - */ -static int -prefetch_lookupv(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blocknum, neon_request_lsns *lsns, - BlockNumber nblocks, void **buffers, bits8 *mask) -{ - int hits = 0; - PrefetchRequest hashkey; - - /* - * Use an intermediate PrefetchRequest struct as the hash key to ensure - * correct alignment and that the padding bytes are cleared. - */ - memset(&hashkey.buftag, 0, sizeof(BufferTag)); - CopyNRelFileInfoToBufTag(hashkey.buftag, rinfo); - hashkey.buftag.forkNum = forknum; - - for (int i = 0; i < nblocks; i++) - { - PrfHashEntry *entry; - - hashkey.buftag.blockNum = blocknum + i; - entry = prfh_lookup(MyPState->prf_hash, &hashkey); - - if (entry != NULL) - { - PrefetchRequest *slot = entry->slot; - uint64 ring_index = slot->my_ring_index; - Assert(slot == GetPrfSlot(ring_index)); - - Assert(slot->status != PRFS_UNUSED); - Assert(MyPState->ring_last <= ring_index && - ring_index < MyPState->ring_unused); - Assert(BufferTagsEqual(&slot->buftag, &hashkey.buftag)); - - if (slot->status != PRFS_RECEIVED) - continue; - - /* - * If the caller specified a request LSN to use, only accept - * prefetch responses that satisfy that request. - */ - if (!neon_prefetch_response_usable(&lsns[i], slot)) - continue; - - /* - * Ignore errors - */ - if (slot->response->tag != T_NeonGetPageResponse) - { - if (slot->response->tag != T_NeonErrorResponse) - { - NEON_PANIC_CONNECTION_STATE(slot->shard_no, PANIC, - "Expected GetPage (0x%02x) or Error (0x%02x) response to GetPageRequest, but got 0x%02x", - T_NeonGetPageResponse, T_NeonErrorResponse, slot->response->tag); - } - continue; - } - memcpy(buffers[i], ((NeonGetPageResponse*)slot->response)->page, BLCKSZ); - - - /* - * With lfc_store_prefetch_result=true prefetch result is stored in LFC in prefetch_pump_state when response is received - * from page server. 
But if lfc_store_prefetch_result=false then it is not yet stored in LFC and we have to do it here - * under buffer lock. - */ - if (!lfc_store_prefetch_result) - lfc_write(rinfo, forknum, blocknum + i, buffers[i]); - - prefetch_set_unused(ring_index); - BITMAP_SET(mask, i); - - hits += 1; - inc_getpage_wait(0); - } - } - pgBufferUsage.prefetch.hits += hits; - return hits; -} - -#if PG_MAJORVERSION_NUM < 17 -static bool -prefetch_lookup(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkn, neon_request_lsns *lsns, void *buffer) -{ - bits8 present = 0; - return prefetch_lookupv(rinfo, forkNum, blkn, lsns, 1, &buffer, &present) != 0; -} -#endif - -/* - * prefetch_register_bufferv() - register and prefetch buffers - * - * Register that we may want the contents of BufferTag in the near future. - * This is used when issuing a speculative prefetch request, but also when - * performing a synchronous request and need the buffer right now. - * - * If force_request_lsns is not NULL, those values are sent to the - * pageserver. If NULL, we utilize the lastWrittenLsn -infrastructure - * to calculate the LSNs to send. - * - * Bits set in *mask (if present) indicate pages already read; i.e. pages we - * can skip in this process. - * - * When performing a prefetch rather than a synchronous request, - * is_prefetch==true. Currently, it only affects how the request is accounted - * in the perf counters. - * - * NOTE: this function may indirectly update MyPState->pfs_hash; which - * invalidates any active pointers into the hash table. - */ -static uint64 -prefetch_register_bufferv(BufferTag tag, neon_request_lsns *frlsns, - BlockNumber nblocks, const bits8 *mask, - bool is_prefetch) -{ - uint64 min_ring_index; - PrefetchRequest hashkey; -#ifdef USE_ASSERT_CHECKING - bool any_hits = false; -#endif - /* We will never read further ahead than our buffer can store. 
*/ - nblocks = Max(1, Min(nblocks, readahead_buffer_size)); - - /* - * Use an intermediate PrefetchRequest struct as the hash key to ensure - * correct alignment and that the padding bytes are cleared. - */ - memset(&hashkey.buftag, 0, sizeof(BufferTag)); - hashkey.buftag = tag; - -Retry: - /* - * We can have gone into retry due to network error, so update stats with - * the latest available - */ - MyNeonCounters->pageserver_open_requests = - MyPState->ring_unused - MyPState->ring_receive; - MyNeonCounters->getpage_prefetches_buffered = - MyPState->n_responses_buffered; - - min_ring_index = UINT64_MAX; - for (int i = 0; i < nblocks; i++) - { - PrefetchRequest *slot = NULL; - PrfHashEntry *entry = NULL; - uint64 ring_index; - neon_request_lsns *lsns; - - if (PointerIsValid(mask) && BITMAP_ISSET(mask, i)) - continue; - - if (frlsns) - lsns = &frlsns[i]; - else - lsns = NULL; - -#ifdef USE_ASSERT_CHECKING - any_hits = true; -#endif - - slot = NULL; - entry = NULL; - - hashkey.buftag.blockNum = tag.blockNum + i; - entry = prfh_lookup(MyPState->prf_hash, &hashkey); - - if (entry != NULL) - { - slot = entry->slot; - ring_index = slot->my_ring_index; - Assert(slot == GetPrfSlot(ring_index)); - - Assert(slot->status != PRFS_UNUSED); - Assert(MyPState->ring_last <= ring_index && - ring_index < MyPState->ring_unused); - Assert(BufferTagsEqual(&slot->buftag, &hashkey.buftag)); - - /* - * If the caller specified a request LSN to use, only accept - * prefetch responses that satisfy that request. - */ - if (lsns) - { - if (!neon_prefetch_response_usable(lsns, slot)) - { - /* Wait for the old request to finish and discard it */ - if (!prefetch_wait_for(ring_index)) - goto Retry; - prefetch_set_unused(ring_index); - entry = NULL; - slot = NULL; - pgBufferUsage.prefetch.expired += 1; - MyNeonCounters->getpage_prefetch_discards_total += 1; - } - } - - if (entry != NULL) - { - /* - * We received a prefetch for a page that was recently read - * and removed from the buffers. 
Remove that request from the - * buffers. - */ - if (slot->status == PRFS_TAG_REMAINS) - { - prefetch_set_unused(ring_index); - entry = NULL; - slot = NULL; - } - else - { - min_ring_index = Min(min_ring_index, ring_index); - /* The buffered request is good enough, return that index */ - if (is_prefetch) - pgBufferUsage.prefetch.duplicates++; - continue; - } - } - } - else if (!is_prefetch) - { - pgBufferUsage.prefetch.misses += 1; - MyNeonCounters->getpage_prefetch_misses_total++; - } - /* - * We can only leave the block above by finding that there's - * no entry that can satisfy this request, either because there - * was no entry, or because the entry was invalid or didn't satisfy - * the LSNs provided. - * - * The code should've made sure to clear up the data. - */ - Assert(entry == NULL); - Assert(slot == NULL); - - /* There should be no buffer overflow */ - Assert(MyPState->ring_last + readahead_buffer_size >= MyPState->ring_unused); - - /* - * If the prefetch queue is full, we need to make room by clearing the - * oldest slot. If the oldest slot holds a buffer that was already - * received, we can just throw it away; we fetched the page - * unnecessarily in that case. If the oldest slot holds a request that - * we haven't received a response for yet, we have to wait for the - * response to that before we can continue. We might not have even - * flushed the request to the pageserver yet, it might be just sitting - * in the output buffer. In that case, we flush it and wait for the - * response. 
(We could decide not to send it, but it's hard to abort - * when the request is already in the output buffer, and 'not sending' - * a prefetch request kind of goes against the principles of - * prefetching) - */ - if (MyPState->ring_last + readahead_buffer_size == MyPState->ring_unused) - { - uint64 cleanup_index = MyPState->ring_last; - - slot = GetPrfSlot(cleanup_index); - - Assert(slot->status != PRFS_UNUSED); - - /* - * If there is good reason to run compaction on the prefetch buffers, - * try to do that. - */ - if (ReceiveBufferNeedsCompaction() && compact_prefetch_buffers()) - { - Assert(slot->status == PRFS_UNUSED); - } - else - { - /* - * We have the slot for ring_last, so that must still be in - * progress - */ - switch (slot->status) - { - case PRFS_REQUESTED: - Assert(MyPState->ring_receive == cleanup_index); - if (!prefetch_wait_for(cleanup_index)) - goto Retry; - prefetch_set_unused(cleanup_index); - pgBufferUsage.prefetch.expired += 1; - MyNeonCounters->getpage_prefetch_discards_total += 1; - break; - case PRFS_RECEIVED: - case PRFS_TAG_REMAINS: - prefetch_set_unused(cleanup_index); - pgBufferUsage.prefetch.expired += 1; - MyNeonCounters->getpage_prefetch_discards_total += 1; - break; - default: - pg_unreachable(); - } - } - } - - /* - * The next buffer pointed to by `ring_unused` is now definitely empty, so - * we can insert the new request to it. - */ - ring_index = MyPState->ring_unused; - - Assert(MyPState->ring_last <= ring_index && - ring_index <= MyPState->ring_unused); - - slot = GetPrfSlotNoCheck(ring_index); - - Assert(slot->status == PRFS_UNUSED); - - /* - * We must update the slot data before insertion, because the hash - * function reads the buffer tag from the slot. 
- */ - slot->buftag = hashkey.buftag; - slot->shard_no = get_shard_number(&tag); - slot->my_ring_index = ring_index; - slot->flags = 0; - - min_ring_index = Min(min_ring_index, ring_index); - - if (is_prefetch) - MyNeonCounters->getpage_prefetch_requests_total++; - else - MyNeonCounters->getpage_sync_requests_total++; - - prefetch_do_request(slot, lsns); - } - - MyNeonCounters->pageserver_open_requests = - MyPState->ring_unused - MyPState->ring_receive; - - Assert(any_hits); - - Assert(GetPrfSlot(min_ring_index)->status == PRFS_REQUESTED || - GetPrfSlot(min_ring_index)->status == PRFS_RECEIVED); - Assert(MyPState->ring_last <= min_ring_index && - min_ring_index < MyPState->ring_unused); - - if (flush_every_n_requests > 0 && - MyPState->ring_unused - MyPState->ring_flush >= flush_every_n_requests) - { - if (!prefetch_flush_requests()) - { - /* - * Prefetch set is reset in case of error, so we should try to - * register our request once again - */ - goto Retry; - } - MyPState->ring_flush = MyPState->ring_unused; - } - - return min_ring_index; -} - -static bool -equal_requests(NeonRequest* a, NeonRequest* b) -{ - return a->reqid == b->reqid && a->lsn == b->lsn && a->not_modified_since == b->not_modified_since; -} - - -/* - * Note: this function can get canceled and use a long jump to the next catch - * context. Take care. 
- */ -static NeonResponse * -page_server_request(void const *req) -{ - NeonResponse *resp; - BufferTag tag = {0}; - shardno_t shard_no; - - switch (messageTag(req)) - { - case T_NeonExistsRequest: - CopyNRelFileInfoToBufTag(tag, ((NeonExistsRequest *) req)->rinfo); - break; - case T_NeonNblocksRequest: - CopyNRelFileInfoToBufTag(tag, ((NeonNblocksRequest *) req)->rinfo); - break; - case T_NeonDbSizeRequest: - NInfoGetDbOid(BufTagGetNRelFileInfo(tag)) = ((NeonDbSizeRequest *) req)->dbNode; - break; - case T_NeonGetPageRequest: - CopyNRelFileInfoToBufTag(tag, ((NeonGetPageRequest *) req)->rinfo); - tag.blockNum = ((NeonGetPageRequest *) req)->blkno; - break; - default: - neon_log(ERROR, "Unexpected request tag: %d", messageTag(req)); - } - shard_no = get_shard_number(&tag); - - /* - * Current sharding model assumes that all metadata is present only at shard 0. - * We still need to call get_shard_no() to check if shard map is up-to-date. - */ - if (((NeonRequest *) req)->tag != T_NeonGetPageRequest) - { - shard_no = 0; - } - - do - { - PG_TRY(); - { - while (!page_server->send(shard_no, (NeonRequest *) req) - || !page_server->flush(shard_no)) - { - /* do nothing */ - } - MyNeonCounters->pageserver_open_requests++; - consume_prefetch_responses(); - resp = page_server->receive(shard_no); - MyNeonCounters->pageserver_open_requests--; - } - PG_CATCH(); - { - /* - * Cancellation in this code needs to be handled better at some - * point, but this currently seems fine for now. - */ - page_server->disconnect(shard_no); - MyNeonCounters->pageserver_open_requests = 0; - - /* - * We know for sure we're not working on any prefetch pages after - * this. 
- */ - END_PREFETCH_RECEIVE_WORK(); - - PG_RE_THROW(); - } - PG_END_TRY(); - - } while (resp == NULL); - - return resp; -} - - -StringInfoData -nm_pack_request(NeonRequest *msg) -{ - StringInfoData s; - - initStringInfo(&s); - - pq_sendbyte(&s, msg->tag); - if (neon_protocol_version >= 3) - { - pq_sendint64(&s, msg->reqid); - } - pq_sendint64(&s, msg->lsn); - pq_sendint64(&s, msg->not_modified_since); - - switch (messageTag(msg)) - { - /* pagestore_client -> pagestore */ - case T_NeonExistsRequest: - { - NeonExistsRequest *msg_req = (NeonExistsRequest *) msg; - - pq_sendint32(&s, NInfoGetSpcOid(msg_req->rinfo)); - pq_sendint32(&s, NInfoGetDbOid(msg_req->rinfo)); - pq_sendint32(&s, NInfoGetRelNumber(msg_req->rinfo)); - pq_sendbyte(&s, msg_req->forknum); - - break; - } - case T_NeonNblocksRequest: - { - NeonNblocksRequest *msg_req = (NeonNblocksRequest *) msg; - - pq_sendint32(&s, NInfoGetSpcOid(msg_req->rinfo)); - pq_sendint32(&s, NInfoGetDbOid(msg_req->rinfo)); - pq_sendint32(&s, NInfoGetRelNumber(msg_req->rinfo)); - pq_sendbyte(&s, msg_req->forknum); - - break; - } - case T_NeonDbSizeRequest: - { - NeonDbSizeRequest *msg_req = (NeonDbSizeRequest *) msg; - - pq_sendint32(&s, msg_req->dbNode); - - break; - } - case T_NeonGetPageRequest: - { - NeonGetPageRequest *msg_req = (NeonGetPageRequest *) msg; - - pq_sendint32(&s, NInfoGetSpcOid(msg_req->rinfo)); - pq_sendint32(&s, NInfoGetDbOid(msg_req->rinfo)); - pq_sendint32(&s, NInfoGetRelNumber(msg_req->rinfo)); - pq_sendbyte(&s, msg_req->forknum); - pq_sendint32(&s, msg_req->blkno); - - break; - } - - case T_NeonGetSlruSegmentRequest: - { - NeonGetSlruSegmentRequest *msg_req = (NeonGetSlruSegmentRequest *) msg; - - pq_sendbyte(&s, msg_req->kind); - pq_sendint32(&s, msg_req->segno); - - break; - } - - /* pagestore -> pagestore_client. We never need to create these. 
*/ - case T_NeonExistsResponse: - case T_NeonNblocksResponse: - case T_NeonGetPageResponse: - case T_NeonErrorResponse: - case T_NeonDbSizeResponse: - case T_NeonGetSlruSegmentResponse: - default: - neon_log(ERROR, "unexpected neon message tag 0x%02x", msg->tag); - break; - } - return s; -} - -NeonResponse * -nm_unpack_response(StringInfo s) -{ - NeonMessageTag tag = pq_getmsgbyte(s); - NeonResponse resp_hdr = {0}; /* make valgrind happy */ - NeonResponse *resp = NULL; - - resp_hdr.tag = tag; - if (neon_protocol_version >= 3) - { - resp_hdr.reqid = pq_getmsgint64(s); - resp_hdr.lsn = pq_getmsgint64(s); - resp_hdr.not_modified_since = pq_getmsgint64(s); - } - switch (tag) - { - /* pagestore -> pagestore_client */ - case T_NeonExistsResponse: - { - NeonExistsResponse *msg_resp = palloc0(sizeof(NeonExistsResponse)); - - if (neon_protocol_version >= 3) - { - NInfoGetSpcOid(msg_resp->req.rinfo) = pq_getmsgint(s, 4); - NInfoGetDbOid(msg_resp->req.rinfo) = pq_getmsgint(s, 4); - NInfoGetRelNumber(msg_resp->req.rinfo) = pq_getmsgint(s, 4); - msg_resp->req.forknum = pq_getmsgbyte(s); - } - msg_resp->req.hdr = resp_hdr; - msg_resp->exists = pq_getmsgbyte(s); - pq_getmsgend(s); - - resp = (NeonResponse *) msg_resp; - break; - } - - case T_NeonNblocksResponse: - { - NeonNblocksResponse *msg_resp = palloc0(sizeof(NeonNblocksResponse)); - - if (neon_protocol_version >= 3) - { - NInfoGetSpcOid(msg_resp->req.rinfo) = pq_getmsgint(s, 4); - NInfoGetDbOid(msg_resp->req.rinfo) = pq_getmsgint(s, 4); - NInfoGetRelNumber(msg_resp->req.rinfo) = pq_getmsgint(s, 4); - msg_resp->req.forknum = pq_getmsgbyte(s); - } - msg_resp->req.hdr = resp_hdr; - msg_resp->n_blocks = pq_getmsgint(s, 4); - pq_getmsgend(s); - - resp = (NeonResponse *) msg_resp; - break; - } - - case T_NeonGetPageResponse: - { - NeonGetPageResponse *msg_resp; - - msg_resp = MemoryContextAllocZero(MyPState->bufctx, PS_GETPAGERESPONSE_SIZE); - if (neon_protocol_version >= 3) - { - NInfoGetSpcOid(msg_resp->req.rinfo) = 
pq_getmsgint(s, 4); - NInfoGetDbOid(msg_resp->req.rinfo) = pq_getmsgint(s, 4); - NInfoGetRelNumber(msg_resp->req.rinfo) = pq_getmsgint(s, 4); - msg_resp->req.forknum = pq_getmsgbyte(s); - msg_resp->req.blkno = pq_getmsgint(s, 4); - } - msg_resp->req.hdr = resp_hdr; - /* XXX: should be varlena */ - memcpy(msg_resp->page, pq_getmsgbytes(s, BLCKSZ), BLCKSZ); - pq_getmsgend(s); - - Assert(msg_resp->req.hdr.tag == T_NeonGetPageResponse); - - resp = (NeonResponse *) msg_resp; - break; - } - - case T_NeonDbSizeResponse: - { - NeonDbSizeResponse *msg_resp = palloc0(sizeof(NeonDbSizeResponse)); - - if (neon_protocol_version >= 3) - { - msg_resp->req.dbNode = pq_getmsgint(s, 4); - } - msg_resp->req.hdr = resp_hdr; - msg_resp->db_size = pq_getmsgint64(s); - pq_getmsgend(s); - - resp = (NeonResponse *) msg_resp; - break; - } - - case T_NeonErrorResponse: - { - NeonErrorResponse *msg_resp; - size_t msglen; - const char *msgtext; - - msgtext = pq_getmsgrawstring(s); - msglen = strlen(msgtext); - - msg_resp = palloc0(sizeof(NeonErrorResponse) + msglen + 1); - msg_resp->req = resp_hdr; - memcpy(msg_resp->message, msgtext, msglen + 1); - pq_getmsgend(s); - - resp = (NeonResponse *) msg_resp; - break; - } - - case T_NeonGetSlruSegmentResponse: - { - NeonGetSlruSegmentResponse *msg_resp; - int n_blocks; - msg_resp = palloc0(sizeof(NeonGetSlruSegmentResponse)); - - if (neon_protocol_version >= 3) - { - msg_resp->req.kind = pq_getmsgbyte(s); - msg_resp->req.segno = pq_getmsgint(s, 4); - } - msg_resp->req.hdr = resp_hdr; - - n_blocks = pq_getmsgint(s, 4); - msg_resp->n_blocks = n_blocks; - memcpy(msg_resp->data, pq_getmsgbytes(s, n_blocks * BLCKSZ), n_blocks * BLCKSZ); - pq_getmsgend(s); - - resp = (NeonResponse *) msg_resp; - break; - } - - /* - * pagestore_client -> pagestore - * - * We create these ourselves, and don't need to decode them. 
- */ - case T_NeonExistsRequest: - case T_NeonNblocksRequest: - case T_NeonGetPageRequest: - case T_NeonDbSizeRequest: - case T_NeonGetSlruSegmentRequest: - default: - neon_log(ERROR, "unexpected neon message tag 0x%02x", tag); - break; - } - - return resp; -} - -/* dump to json for debugging / error reporting purposes */ -char * -nm_to_string(NeonMessage *msg) -{ - StringInfoData s; - - initStringInfo(&s); - - switch (messageTag(msg)) - { - /* pagestore_client -> pagestore */ - case T_NeonExistsRequest: - { - NeonExistsRequest *msg_req = (NeonExistsRequest *) msg; - - appendStringInfoString(&s, "{\"type\": \"NeonExistsRequest\""); - appendStringInfo(&s, ", \"rinfo\": \"%u/%u/%u\"", RelFileInfoFmt(msg_req->rinfo)); - appendStringInfo(&s, ", \"forknum\": %d", msg_req->forknum); - appendStringInfo(&s, ", \"lsn\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->hdr.lsn)); - appendStringInfo(&s, ", \"not_modified_since\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->hdr.not_modified_since)); - appendStringInfoChar(&s, '}'); - break; - } - - case T_NeonNblocksRequest: - { - NeonNblocksRequest *msg_req = (NeonNblocksRequest *) msg; - - appendStringInfoString(&s, "{\"type\": \"NeonNblocksRequest\""); - appendStringInfo(&s, ", \"rinfo\": \"%u/%u/%u\"", RelFileInfoFmt(msg_req->rinfo)); - appendStringInfo(&s, ", \"forknum\": %d", msg_req->forknum); - appendStringInfo(&s, ", \"lsn\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->hdr.lsn)); - appendStringInfo(&s, ", \"not_modified_since\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->hdr.not_modified_since)); - appendStringInfoChar(&s, '}'); - break; - } - - case T_NeonGetPageRequest: - { - NeonGetPageRequest *msg_req = (NeonGetPageRequest *) msg; - - appendStringInfoString(&s, "{\"type\": \"NeonGetPageRequest\""); - appendStringInfo(&s, ", \"rinfo\": \"%u/%u/%u\"", RelFileInfoFmt(msg_req->rinfo)); - appendStringInfo(&s, ", \"forknum\": %d", msg_req->forknum); - appendStringInfo(&s, ", \"blkno\": %u", msg_req->blkno); - appendStringInfo(&s, ", \"lsn\": 
\"%X/%X\"", LSN_FORMAT_ARGS(msg_req->hdr.lsn)); - appendStringInfo(&s, ", \"not_modified_since\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->hdr.not_modified_since)); - appendStringInfoChar(&s, '}'); - break; - } - case T_NeonDbSizeRequest: - { - NeonDbSizeRequest *msg_req = (NeonDbSizeRequest *) msg; - - appendStringInfoString(&s, "{\"type\": \"NeonDbSizeRequest\""); - appendStringInfo(&s, ", \"dbnode\": \"%u\"", msg_req->dbNode); - appendStringInfo(&s, ", \"lsn\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->hdr.lsn)); - appendStringInfo(&s, ", \"not_modified_since\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->hdr.not_modified_since)); - appendStringInfoChar(&s, '}'); - break; - } - case T_NeonGetSlruSegmentRequest: - { - NeonGetSlruSegmentRequest *msg_req = (NeonGetSlruSegmentRequest *) msg; - - appendStringInfoString(&s, "{\"type\": \"NeonGetSlruSegmentRequest\""); - appendStringInfo(&s, ", \"kind\": %u", msg_req->kind); - appendStringInfo(&s, ", \"segno\": %u", msg_req->segno); - appendStringInfo(&s, ", \"lsn\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->hdr.lsn)); - appendStringInfo(&s, ", \"not_modified_since\": \"%X/%X\"", LSN_FORMAT_ARGS(msg_req->hdr.not_modified_since)); - appendStringInfoChar(&s, '}'); - break; - } - /* pagestore -> pagestore_client */ - case T_NeonExistsResponse: - { - NeonExistsResponse *msg_resp = (NeonExistsResponse *) msg; - - appendStringInfoString(&s, "{\"type\": \"NeonExistsResponse\""); - appendStringInfo(&s, ", \"exists\": %d}", - msg_resp->exists); - appendStringInfoChar(&s, '}'); - - break; - } - case T_NeonNblocksResponse: - { - NeonNblocksResponse *msg_resp = (NeonNblocksResponse *) msg; - - appendStringInfoString(&s, "{\"type\": \"NeonNblocksResponse\""); - appendStringInfo(&s, ", \"n_blocks\": %u}", - msg_resp->n_blocks); - appendStringInfoChar(&s, '}'); - - break; - } - case T_NeonGetPageResponse: - { -#if 0 - NeonGetPageResponse *msg_resp = (NeonGetPageResponse *) msg; -#endif - - appendStringInfoString(&s, "{\"type\": \"NeonGetPageResponse\""); 
- appendStringInfo(&s, ", \"page\": \"XXX\"}"); - appendStringInfoChar(&s, '}'); - break; - } - case T_NeonErrorResponse: - { - NeonErrorResponse *msg_resp = (NeonErrorResponse *) msg; - - /* FIXME: escape double-quotes in the message */ - appendStringInfoString(&s, "{\"type\": \"NeonErrorResponse\""); - appendStringInfo(&s, ", \"message\": \"%s\"}", msg_resp->message); - appendStringInfoChar(&s, '}'); - break; - } - case T_NeonDbSizeResponse: - { - NeonDbSizeResponse *msg_resp = (NeonDbSizeResponse *) msg; - - appendStringInfoString(&s, "{\"type\": \"NeonDbSizeResponse\""); - appendStringInfo(&s, ", \"db_size\": %ld}", - msg_resp->db_size); - appendStringInfoChar(&s, '}'); - - break; - } - case T_NeonGetSlruSegmentResponse: - { - NeonGetSlruSegmentResponse *msg_resp = (NeonGetSlruSegmentResponse *) msg; +{ + UNLOGGED_BUILD_NOT_IN_PROGRESS = 0, + UNLOGGED_BUILD_PHASE_1, + UNLOGGED_BUILD_PHASE_2, + UNLOGGED_BUILD_NOT_PERMANENT +} UnloggedBuildPhase; - appendStringInfoString(&s, "{\"type\": \"NeonGetSlruSegmentResponse\""); - appendStringInfo(&s, ", \"n_blocks\": %u}", - msg_resp->n_blocks); - appendStringInfoChar(&s, '}'); +static SMgrRelation unlogged_build_rel = NULL; +static UnloggedBuildPhase unlogged_build_phase = UNLOGGED_BUILD_NOT_IN_PROGRESS; - break; - } +static bool neon_redo_read_buffer_filter(XLogReaderState *record, uint8 block_id); +static bool (*old_redo_read_buffer_filter) (XLogReaderState *record, uint8 block_id) = NULL; - default: - appendStringInfo(&s, "{\"type\": \"unknown 0x%02x\"", msg->tag); - } - return s.data; -} +static BlockNumber neon_nblocks(SMgrRelation reln, ForkNumber forknum); /* * Wrapper around log_newpage() that makes a temporary copy of the block and @@ -2148,11 +461,6 @@ neon_wallog_page(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, co static void neon_init(void) { - Size prfs_size; - - if (MyPState != NULL) - return; - /* * Sanity check that theperf counters array is sized correctly. 
We got * this wrong once, and the formula for max number of backends and aux @@ -2167,27 +475,6 @@ neon_init(void) elog(ERROR, "MyNeonCounters points past end of array"); #endif - prfs_size = offsetof(PrefetchState, prf_buffer) + - sizeof(PrefetchRequest) * readahead_buffer_size; - - MyPState = MemoryContextAllocZero(TopMemoryContext, prfs_size); - - MyPState->n_unused = readahead_buffer_size; - - MyPState->bufctx = SlabContextCreate(TopMemoryContext, - "NeonSMGR/prefetch", - SLAB_DEFAULT_BLOCK_SIZE * 17, - PS_GETPAGERESPONSE_SIZE); - MyPState->errctx = AllocSetContextCreate(TopMemoryContext, - "NeonSMGR/errors", - ALLOCSET_DEFAULT_SIZES); - MyPState->hashctx = AllocSetContextCreate(TopMemoryContext, - "NeonSMGR/prefetch", - ALLOCSET_DEFAULT_SIZES); - - MyPState->prf_hash = prfh_create(MyPState->hashctx, - readahead_buffer_size, NULL); - old_redo_read_buffer_filter = redo_read_buffer_filter; redo_read_buffer_filter = neon_redo_read_buffer_filter; @@ -2224,8 +511,10 @@ nm_adjust_lsn(XLogRecPtr lsn) /* * Return LSN for requesting pages and number of blocks from page server + * + * XXX: exposed so that prefetch_do_request() can call back here. */ -static void +void neon_get_request_lsns(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blkno, neon_request_lsns *output, BlockNumber nblocks) { @@ -2428,112 +717,12 @@ neon_get_request_lsns(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blkno, } } -/* - * neon_prefetch_response_usable -- Can a new request be satisfied by old one? - * - * This is used to check if the response to a prefetch request can be used to - * satisfy a page read now. 
- */ -static bool -neon_prefetch_response_usable(neon_request_lsns *request_lsns, - PrefetchRequest *slot) -{ - /* sanity check the LSN's on the old and the new request */ - Assert(request_lsns->request_lsn >= request_lsns->not_modified_since); - Assert(request_lsns->effective_request_lsn >= request_lsns->not_modified_since); - Assert(request_lsns->effective_request_lsn <= request_lsns->request_lsn); - Assert(slot->request_lsns.request_lsn >= slot->request_lsns.not_modified_since); - Assert(slot->request_lsns.effective_request_lsn >= slot->request_lsns.not_modified_since); - Assert(slot->request_lsns.effective_request_lsn <= slot->request_lsns.request_lsn); - Assert(slot->status != PRFS_UNUSED); - - /* - * The new request's LSN should never be older than the old one. This - * could be an Assert, except that for testing purposes, we do provide an - * interface in neon_test_utils to fetch pages at arbitary LSNs, which - * violates this. - * - * Similarly, the not_modified_since value calculated for a page should - * never move backwards. This assumption is a bit fragile; if we updated - * the last-written cache when we read in a page, for example, then it - * might. But as the code stands, it should not. - * - * (If two backends issue a request at the same time, they might race and - * calculate LSNs "out of order" with each other, but the prefetch queue - * is backend-private at the moment.) 
- */ - if (request_lsns->effective_request_lsn < slot->request_lsns.effective_request_lsn || - request_lsns->not_modified_since < slot->request_lsns.not_modified_since) - { - ereport(LOG, - (errcode(ERRCODE_IO_ERROR), - errmsg(NEON_TAG "request with unexpected LSN after prefetch"), - errdetail("Request %X/%X not_modified_since %X/%X, prefetch %X/%X not_modified_since %X/%X)", - LSN_FORMAT_ARGS(request_lsns->effective_request_lsn), - LSN_FORMAT_ARGS(request_lsns->not_modified_since), - LSN_FORMAT_ARGS(slot->request_lsns.effective_request_lsn), - LSN_FORMAT_ARGS(slot->request_lsns.not_modified_since)))); - return false; - } - - /*--- - * Each request to the pageserver has three LSN values associated with it: - * `not_modified_since`, `request_lsn`, and 'effective_request_lsn'. - * `not_modified_since` and `request_lsn` are sent to the pageserver, but - * in the primary node, we always use UINT64_MAX as the `request_lsn`, so - * we remember `effective_request_lsn` separately. In a primary, - * `effective_request_lsn` is the same as `not_modified_since`. - * See comments in neon_get_request_lsns why we can not use last flush WAL position here. - * - * To determine whether a response to a GetPage request issued earlier is - * still valid to satisfy a new page read, we look at the - * (not_modified_since, effective_request_lsn] range of the request. It is - * effectively a claim that the page has not been modified between those - * LSNs. If the range of the old request in the queue overlaps with the - * new request, we know that the page hasn't been modified in the union of - * the ranges. We can use the response to old request to satisfy the new - * request in that case. For example: - * - * 100 500 - * Old request: +--------+ - * - * 400 800 - * New request: +--------+ - * - * The old request claims that the page was not modified between LSNs 100 - * and 500, and the second claims that it was not modified between 400 and - * 800. 
Together they mean that the page was not modified between 100 and - * 800. Therefore the response to the old request is also valid for the - * new request. - * - * This logic also holds at the boundary case that the old request's LSN - * matches the new request's not_modified_since LSN exactly: - * - * 100 500 - * Old request: +--------+ - * - * 500 900 - * New request: +--------+ - * - * The response to the old request is the page as it was at LSN 500, and - * the page hasn't been changed in the range (500, 900], therefore the - * response is valid also for the new request. - */ - - /* this follows from the checks above */ - Assert(request_lsns->effective_request_lsn >= slot->request_lsns.not_modified_since); - - return request_lsns->not_modified_since <= slot->request_lsns.effective_request_lsn; -} - /* * neon_exists() -- Does the physical file exist? */ static bool neon_exists(SMgrRelation reln, ForkNumber forkNum) { - bool exists; - NeonResponse *resp; BlockNumber n_blocks; neon_request_lsns request_lsns; @@ -2592,67 +781,8 @@ neon_exists(SMgrRelation reln, ForkNumber forkNum) neon_get_request_lsns(InfoFromSMgrRel(reln), forkNum, REL_METADATA_PSEUDO_BLOCKNO, &request_lsns, 1); - { - NeonExistsRequest request = { - .hdr.tag = T_NeonExistsRequest, - .hdr.reqid = GENERATE_REQUEST_ID(), - .hdr.lsn = request_lsns.request_lsn, - .hdr.not_modified_since = request_lsns.not_modified_since, - .rinfo = InfoFromSMgrRel(reln), - .forknum = forkNum - }; - - resp = page_server_request(&request); - - switch (resp->tag) - { - case T_NeonExistsResponse: - { - NeonExistsResponse* exists_resp = (NeonExistsResponse *) resp; - if (neon_protocol_version >= 3) - { - if (!equal_requests(resp, &request.hdr) || - !RelFileInfoEquals(exists_resp->req.rinfo, request.rinfo) || - exists_resp->req.forknum != request.forknum) - { - NEON_PANIC_CONNECTION_STATE(-1, PANIC, - "Unexpect response {reqid=%lx,lsn=%X/%08X, since=%X/%08X, rel=%u/%u/%u.%u} to exits request {reqid=%lx,lsn=%X/%08X, 
since=%X/%08X, rel=%u/%u/%u.%u}", - resp->reqid, LSN_FORMAT_ARGS(resp->lsn), LSN_FORMAT_ARGS(resp->not_modified_since), RelFileInfoFmt(exists_resp->req.rinfo), exists_resp->req.forknum, - request.hdr.reqid, LSN_FORMAT_ARGS(request.hdr.lsn), LSN_FORMAT_ARGS(request.hdr.not_modified_since), RelFileInfoFmt(request.rinfo), request.forknum); - } - } - exists = exists_resp->exists; - break; - } - case T_NeonErrorResponse: - if (neon_protocol_version >= 3) - { - if (!equal_requests(resp, &request.hdr)) - { - elog(WARNING, NEON_TAG "Error message {reqid=%lx,lsn=%X/%08X, since=%X/%08X} doesn't match exists request {reqid=%lx,lsn=%X/%08X, since=%X/%08X}", - resp->reqid, LSN_FORMAT_ARGS(resp->lsn), LSN_FORMAT_ARGS(resp->not_modified_since), - request.hdr.reqid, LSN_FORMAT_ARGS(request.hdr.lsn), LSN_FORMAT_ARGS(request.hdr.not_modified_since)); - } - } - ereport(ERROR, - (errcode(ERRCODE_IO_ERROR), - errmsg(NEON_TAG "[reqid %lx] could not read relation existence of rel %u/%u/%u.%u from page server at lsn %X/%08X", - resp->reqid, - RelFileInfoFmt(InfoFromSMgrRel(reln)), - forkNum, - LSN_FORMAT_ARGS(request_lsns.effective_request_lsn)), - errdetail("page server returned error: %s", - ((NeonErrorResponse *) resp)->message))); - break; - - default: - NEON_PANIC_CONNECTION_STATE(-1, PANIC, - "Expected Exists (0x%02x) or Error (0x%02x) response to ExistsRequest, but got 0x%02x", - T_NeonExistsResponse, T_NeonErrorResponse, resp->tag); - } - pfree(resp); - } - return exists; + + return communicator_exists(InfoFromSMgrRel(reln), forkNum, &request_lsns); } /* @@ -3001,7 +1131,6 @@ static bool neon_prefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, int nblocks) { - uint64 ring_index PG_USED_FOR_ASSERTS_ONLY; BufferTag tag; switch (reln->smgr_relpersistence) @@ -3038,17 +1167,13 @@ neon_prefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, tag.blockNum = blocknum; - ring_index = prefetch_register_bufferv(tag, NULL, iterblocks, - lfc_present, true); + 
communicator_prefetch_register_bufferv(tag, NULL, iterblocks, lfc_present); nblocks -= iterblocks; blocknum += iterblocks; - - Assert(ring_index < MyPState->ring_unused && - MyPState->ring_last <= ring_index); } - prefetch_pump_state(false); + communicator_prefetch_pump_state(false); return false; } @@ -3061,7 +1186,6 @@ neon_prefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, static bool neon_prefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum) { - uint64 ring_index PG_USED_FOR_ASSERTS_ONLY; BufferTag tag; switch (reln->smgr_relpersistence) @@ -3086,12 +1210,9 @@ neon_prefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum) CopyNRelFileInfoToBufTag(tag, InfoFromSMgrRel(reln)); - ring_index = prefetch_register_bufferv(tag, NULL, 1, NULL, true); - - Assert(ring_index < MyPState->ring_unused && - MyPState->ring_last <= ring_index); + communicator_prefetch_register_bufferv(tag, NULL, 1, NULL); - prefetch_pump_state(false); + communicator_prefetch_pump_state(false); return false; } @@ -3135,7 +1256,7 @@ neon_writeback(SMgrRelation reln, ForkNumber forknum, */ neon_log(SmgrTrace, "writeback noop"); - prefetch_pump_state(false); + communicator_prefetch_pump_state(false); #ifdef DEBUG_COMPARE_LOCAL if (IS_LOCAL_REL(reln)) @@ -3143,208 +1264,6 @@ neon_writeback(SMgrRelation reln, ForkNumber forknum, #endif } -/* - * Read N pages at a specific LSN. - * - * *mask is set for pages read at a previous point in time, and which we - * should not touch, nor overwrite. - * New bits should be set in *mask for the pages we'successfully read. - * - * The offsets in request_lsns, buffers, and mask are linked. 
- */ -static void -neon_read_at_lsnv(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber base_blockno, neon_request_lsns *request_lsns, - void **buffers, BlockNumber nblocks, const bits8 *mask) -{ - NeonResponse *resp; - uint64 ring_index; - PrfHashEntry *entry; - PrefetchRequest *slot; - PrefetchRequest hashkey; - - Assert(PointerIsValid(request_lsns)); - Assert(nblocks >= 1); - - /* - * Use an intermediate PrefetchRequest struct as the hash key to ensure - * correct alignment and that the padding bytes are cleared. - */ - memset(&hashkey.buftag, 0, sizeof(BufferTag)); - CopyNRelFileInfoToBufTag(hashkey.buftag, rinfo); - hashkey.buftag.forkNum = forkNum; - hashkey.buftag.blockNum = base_blockno; - - /* - * The redo process does not lock pages that it needs to replay but are - * not in the shared buffers, so a concurrent process may request the page - * after redo has decided it won't redo that page and updated the LwLSN - * for that page. If we're in hot standby we need to take care that we - * don't return until after REDO has finished replaying up to that LwLSN, - * as the page should have been locked up to that point. - * - * See also the description on neon_redo_read_buffer_filter below. - * - * NOTE: It is possible that the WAL redo process will still do IO due to - * concurrent failed read IOs. Those IOs should never have a request_lsn - * that is as large as the WAL record we're currently replaying, if it - * weren't for the behaviour of the LwLsn cache that uses the highest - * value of the LwLsn cache when the entry is not found. 
- */ - prefetch_register_bufferv(hashkey.buftag, request_lsns, nblocks, mask, false); - - for (int i = 0; i < nblocks; i++) - { - void *buffer = buffers[i]; - BlockNumber blockno = base_blockno + i; - neon_request_lsns *reqlsns = &request_lsns[i]; - TimestampTz start_ts, end_ts; - - if (PointerIsValid(mask) && BITMAP_ISSET(mask, i)) - continue; - - start_ts = GetCurrentTimestamp(); - - if (RecoveryInProgress() && MyBackendType != B_STARTUP) - XLogWaitForReplayOf(reqlsns->request_lsn); - - /* - * Try to find prefetched page in the list of received pages. - */ -Retry: - hashkey.buftag.blockNum = blockno; - entry = prfh_lookup(MyPState->prf_hash, &hashkey); - - if (entry != NULL) - { - slot = entry->slot; - if (neon_prefetch_response_usable(reqlsns, slot)) - { - ring_index = slot->my_ring_index; - } - else - { - /* - * Cannot use this prefetch, discard it - * - * We can't drop cache for not-yet-received requested items. It is - * unlikely this happens, but it can happen if prefetch distance - * is large enough and a backend didn't consume all prefetch - * requests. - */ - if (slot->status == PRFS_REQUESTED) - { - if (!prefetch_wait_for(slot->my_ring_index)) - goto Retry; - } - /* drop caches */ - prefetch_set_unused(slot->my_ring_index); - pgBufferUsage.prefetch.expired += 1; - MyNeonCounters->getpage_prefetch_discards_total++; - /* make it look like a prefetch cache miss */ - entry = NULL; - } - } - - do - { - if (entry == NULL) - { - ring_index = prefetch_register_bufferv(hashkey.buftag, reqlsns, 1, NULL, false); - Assert(ring_index != UINT64_MAX); - slot = GetPrfSlot(ring_index); - } - else - { - /* - * Empty our reference to the prefetch buffer's hash entry. When - * we wait for prefetches, the entry reference is invalidated by - * potential updates to the hash, and when we reconnect to the - * pageserver the prefetch we're waiting for may be dropped, in - * which case we need to retry and take the branch above. 
- */ - entry = NULL; - } - - Assert(slot->my_ring_index == ring_index); - Assert(MyPState->ring_last <= ring_index && - MyPState->ring_unused > ring_index); - Assert(slot->status != PRFS_UNUSED); - Assert(GetPrfSlot(ring_index) == slot); - - } while (!prefetch_wait_for(ring_index)); - - Assert(slot->status == PRFS_RECEIVED); - Assert(memcmp(&hashkey.buftag, &slot->buftag, sizeof(BufferTag)) == 0); - Assert(hashkey.buftag.blockNum == base_blockno + i); - - resp = slot->response; - - switch (resp->tag) - { - case T_NeonGetPageResponse: - { - NeonGetPageResponse* getpage_resp = (NeonGetPageResponse *) resp; - if (neon_protocol_version >= 3) - { - if (resp->reqid != slot->reqid || - resp->lsn != slot->request_lsns.request_lsn || - resp->not_modified_since != slot->request_lsns.not_modified_since || - !RelFileInfoEquals(getpage_resp->req.rinfo, rinfo) || - getpage_resp->req.forknum != forkNum || - getpage_resp->req.blkno != base_blockno + i) - { - NEON_PANIC_CONNECTION_STATE(-1, PANIC, - "Unexpect response {reqid=%lx,lsn=%X/%08X, since=%X/%08X, rel=%u/%u/%u.%u, block=%u} to get page request {reqid=%lx,lsn=%X/%08X, since=%X/%08X, rel=%u/%u/%u.%u, block=%u}", - resp->reqid, LSN_FORMAT_ARGS(resp->lsn), LSN_FORMAT_ARGS(resp->not_modified_since), RelFileInfoFmt(getpage_resp->req.rinfo), getpage_resp->req.forknum, getpage_resp->req.blkno, - slot->reqid, LSN_FORMAT_ARGS(slot->request_lsns.request_lsn), LSN_FORMAT_ARGS(slot->request_lsns.not_modified_since), RelFileInfoFmt(rinfo), forkNum, base_blockno + i); - } - } - memcpy(buffer, getpage_resp->page, BLCKSZ); - - /* - * With lfc_store_prefetch_result=true prefetch result is stored in LFC in prefetch_pump_state when response is received - * from page server. But if lfc_store_prefetch_result=false then it is not yet stored in LFC and we have to do it here - * under buffer lock. 
- */ - if (!lfc_store_prefetch_result) - lfc_write(rinfo, forkNum, blockno, buffer); - break; - } - case T_NeonErrorResponse: - if (neon_protocol_version >= 3) - { - if (resp->reqid != slot->reqid || - resp->lsn != slot->request_lsns.request_lsn || - resp->not_modified_since != slot->request_lsns.not_modified_since) - { - elog(WARNING, NEON_TAG "Error message {reqid=%lx,lsn=%X/%08X, since=%X/%08X} doesn't match get relsize request {reqid=%lx,lsn=%X/%08X, since=%X/%08X}", - resp->reqid, LSN_FORMAT_ARGS(resp->lsn), LSN_FORMAT_ARGS(resp->not_modified_since), - slot->reqid, LSN_FORMAT_ARGS(slot->request_lsns.request_lsn), LSN_FORMAT_ARGS(slot->request_lsns.not_modified_since)); - } - } - ereport(ERROR, - (errcode(ERRCODE_IO_ERROR), - errmsg(NEON_TAG "[shard %d, reqid %lx] could not read block %u in rel %u/%u/%u.%u from page server at lsn %X/%08X", - slot->shard_no, resp->reqid, blockno, RelFileInfoFmt(rinfo), - forkNum, LSN_FORMAT_ARGS(reqlsns->effective_request_lsn)), - errdetail("page server returned error: %s", - ((NeonErrorResponse *) resp)->message))); - break; - default: - NEON_PANIC_CONNECTION_STATE(slot->shard_no, PANIC, - "Expected GetPage (0x%02x) or Error (0x%02x) response to GetPageRequest, but got 0x%02x", - T_NeonGetPageResponse, T_NeonErrorResponse, resp->tag); - } - - /* buffer was used, clean up for later reuse */ - prefetch_set_unused(ring_index); - prefetch_cleanup_trailing_unused(); - - end_ts = GetCurrentTimestamp(); - inc_getpage_wait(end_ts >= start_ts ? (end_ts - start_ts) : 0); - } -} - /* * While function is defined in the neon extension it's used within neon_test_utils directly. * To avoid breaking tests in the runtime please keep function signature in sync. 
@@ -3353,7 +1272,7 @@ void neon_read_at_lsn(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, neon_request_lsns request_lsns, void *buffer) { - neon_read_at_lsnv(rinfo, forkNum, blkno, &request_lsns, &buffer, 1, NULL); + communicator_read_at_lsnv(rinfo, forkNum, blkno, &request_lsns, &buffer, 1, NULL); } #if PG_MAJORVERSION_NUM < 17 @@ -3369,6 +1288,8 @@ neon_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, void *buffer #endif { neon_request_lsns request_lsns; + bits8 present; + void *bufferp; switch (reln->smgr_relpersistence) { @@ -3388,11 +1309,13 @@ neon_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, void *buffer } /* Try to read PS results if they are available */ - prefetch_pump_state(false); + communicator_prefetch_pump_state(false); neon_get_request_lsns(InfoFromSMgrRel(reln), forkNum, blkno, &request_lsns, 1); - if (prefetch_lookup(InfoFromSMgrRel(reln), forkNum, blkno, &request_lsns, buffer)) + present = 0; + bufferp = buffer; + if (communicator_prefetch_lookupv(InfoFromSMgrRel(reln), forkNum, blkno, &request_lsns, 1, &bufferp, &present)) { /* Prefetch hit */ return; @@ -3410,7 +1333,7 @@ neon_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, void *buffer /* * Try to receive prefetch results once again just to make sure we don't leave the smgr code while the OS might still have buffered bytes. 
*/ - prefetch_pump_state(false); + communicator_prefetch_pump_state(false); #ifdef DEBUG_COMPARE_LOCAL if (forkNum == MAIN_FORKNUM && IS_LOCAL_REL(reln)) @@ -3520,16 +1443,16 @@ neon_readv(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, nblocks, PG_IOV_MAX); /* Try to read PS results if they are available */ - prefetch_pump_state(false); + communicator_prefetch_pump_state(false); neon_get_request_lsns(InfoFromSMgrRel(reln), forknum, blocknum, request_lsns, nblocks); memset(read_pages, 0, sizeof(read_pages)); - prefetch_result = prefetch_lookupv(InfoFromSMgrRel(reln), forknum, - blocknum, request_lsns, nblocks, - buffers, read_pages); + prefetch_result = communicator_prefetch_lookupv(InfoFromSMgrRel(reln), forknum, + blocknum, request_lsns, nblocks, + buffers, read_pages); if (prefetch_result == nblocks) return; @@ -3545,13 +1468,13 @@ neon_readv(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, if (prefetch_result + lfc_result == nblocks) return; - neon_read_at_lsnv(InfoFromSMgrRel(reln), forknum, blocknum, request_lsns, - buffers, nblocks, read_pages); + communicator_read_at_lsnv(InfoFromSMgrRel(reln), forknum, blocknum, request_lsns, + buffers, nblocks, read_pages); /* * Try to receive prefetch results once again just to make sure we don't leave the smgr code while the OS might still have buffered bytes. */ - prefetch_pump_state(false); + communicator_prefetch_pump_state(false); #ifdef DEBUG_COMPARE_LOCAL if (forknum == MAIN_FORKNUM && IS_LOCAL_REL(reln)) @@ -3564,7 +1487,7 @@ neon_readv(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, for (int i = 0; i < nblocks; i++) { BlockNumber blkno = blocknum + i; - if (!BITMAP_ISSET(read, i)) + if (!BITMAP_ISSET(read_pages, i)) continue; #if PG_MAJORVERSION_NUM >= 17 @@ -3687,6 +1610,9 @@ neon_write(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, const vo #ifndef DEBUG_COMPARE_LOCAL /* This is a bit tricky. 
Check if the relation exists locally */ if (mdexists(reln, forknum)) +#else + if (mdexists(reln, INIT_FORKNUM)) +#endif { /* It exists locally. Guess it's unlogged then. */ #if PG_MAJORVERSION_NUM >= 17 @@ -3703,7 +1629,6 @@ neon_write(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, const vo */ return; } -#endif break; case RELPERSISTENCE_PERMANENT: @@ -3734,7 +1659,7 @@ neon_write(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, const vo lfc_write(InfoFromSMgrRel(reln), forknum, blocknum, buffer); - prefetch_pump_state(false); + communicator_prefetch_pump_state(false); #ifdef DEBUG_COMPARE_LOCAL if (IS_LOCAL_REL(reln)) @@ -3760,6 +1685,9 @@ neon_writev(SMgrRelation reln, ForkNumber forknum, BlockNumber blkno, #ifndef DEBUG_COMPARE_LOCAL /* This is a bit tricky. Check if the relation exists locally */ if (mdexists(reln, forknum)) +#else + if (mdexists(reln, INIT_FORKNUM)) +#endif { /* It exists locally. Guess it's unlogged then. */ mdwritev(reln, forknum, blkno, buffers, nblocks, skipFsync); @@ -3773,7 +1701,6 @@ neon_writev(SMgrRelation reln, ForkNumber forknum, BlockNumber blkno, */ return; } -#endif break; case RELPERSISTENCE_PERMANENT: @@ -3794,7 +1721,7 @@ neon_writev(SMgrRelation reln, ForkNumber forknum, BlockNumber blkno, lfc_writev(InfoFromSMgrRel(reln), forknum, blkno, buffers, nblocks); - prefetch_pump_state(false); + communicator_prefetch_pump_state(false); #ifdef DEBUG_COMPARE_LOCAL if (IS_LOCAL_REL(reln)) @@ -3810,7 +1737,6 @@ neon_writev(SMgrRelation reln, ForkNumber forknum, BlockNumber blkno, static BlockNumber neon_nblocks(SMgrRelation reln, ForkNumber forknum) { - NeonResponse *resp; BlockNumber n_blocks; neon_request_lsns request_lsns; @@ -3842,74 +1768,15 @@ neon_nblocks(SMgrRelation reln, ForkNumber forknum) neon_get_request_lsns(InfoFromSMgrRel(reln), forknum, REL_METADATA_PSEUDO_BLOCKNO, &request_lsns, 1); - { - NeonNblocksRequest request = { - .hdr.tag = T_NeonNblocksRequest, - .hdr.reqid = GENERATE_REQUEST_ID(), - 
.hdr.lsn = request_lsns.request_lsn, - .hdr.not_modified_since = request_lsns.not_modified_since, - .rinfo = InfoFromSMgrRel(reln), - .forknum = forknum, - }; - - resp = page_server_request(&request); - - switch (resp->tag) - { - case T_NeonNblocksResponse: - { - NeonNblocksResponse * relsize_resp = (NeonNblocksResponse *) resp; - if (neon_protocol_version >= 3) - { - if (!equal_requests(resp, &request.hdr) || - !RelFileInfoEquals(relsize_resp->req.rinfo, request.rinfo) || - relsize_resp->req.forknum != forknum) - { - NEON_PANIC_CONNECTION_STATE(-1, PANIC, - "Unexpect response {reqid=%lx,lsn=%X/%08X, since=%X/%08X, rel=%u/%u/%u.%u} to get relsize request {reqid=%lx,lsn=%X/%08X, since=%X/%08X, rel=%u/%u/%u.%u}", - resp->reqid, LSN_FORMAT_ARGS(resp->lsn), LSN_FORMAT_ARGS(resp->not_modified_since), RelFileInfoFmt(relsize_resp->req.rinfo), relsize_resp->req.forknum, - request.hdr.reqid, LSN_FORMAT_ARGS(request.hdr.lsn), LSN_FORMAT_ARGS(request.hdr.not_modified_since), RelFileInfoFmt(request.rinfo), forknum); - } - } - n_blocks = relsize_resp->n_blocks; - break; - } - case T_NeonErrorResponse: - if (neon_protocol_version >= 3) - { - if (!equal_requests(resp, &request.hdr)) - { - elog(WARNING, NEON_TAG "Error message {reqid=%lx,lsn=%X/%08X, since=%X/%08X} doesn't match get relsize request {reqid=%lx,lsn=%X/%08X, since=%X/%08X}", - resp->reqid, LSN_FORMAT_ARGS(resp->lsn), LSN_FORMAT_ARGS(resp->not_modified_since), - request.hdr.reqid, LSN_FORMAT_ARGS(request.hdr.lsn), LSN_FORMAT_ARGS(request.hdr.not_modified_since)); - } - } - ereport(ERROR, - (errcode(ERRCODE_IO_ERROR), - errmsg(NEON_TAG "[reqid %lx] could not read relation size of rel %u/%u/%u.%u from page server at lsn %X/%08X", - resp->reqid, - RelFileInfoFmt(InfoFromSMgrRel(reln)), - forknum, - LSN_FORMAT_ARGS(request_lsns.effective_request_lsn)), - errdetail("page server returned error: %s", - ((NeonErrorResponse *) resp)->message))); - break; - - default: - NEON_PANIC_CONNECTION_STATE(-1, PANIC, - "Expected Nblocks 
(0x%02x) or Error (0x%02x) response to NblocksRequest, but got 0x%02x", - T_NeonNblocksResponse, T_NeonErrorResponse, resp->tag); - } - update_cached_relsize(InfoFromSMgrRel(reln), forknum, n_blocks); + n_blocks = communicator_nblocks(InfoFromSMgrRel(reln), forknum, &request_lsns); + update_cached_relsize(InfoFromSMgrRel(reln), forknum, n_blocks); - neon_log(SmgrTrace, "neon_nblocks: rel %u/%u/%u fork %u (request LSN %X/%08X): %u blocks", - RelFileInfoFmt(InfoFromSMgrRel(reln)), - forknum, - LSN_FORMAT_ARGS(request_lsns.effective_request_lsn), - n_blocks); + neon_log(SmgrTrace, "neon_nblocks: rel %u/%u/%u fork %u (request LSN %X/%08X): %u blocks", + RelFileInfoFmt(InfoFromSMgrRel(reln)), + forknum, + LSN_FORMAT_ARGS(request_lsns.effective_request_lsn), + n_blocks); - pfree(resp); - } return n_blocks; } @@ -3919,7 +1786,6 @@ neon_nblocks(SMgrRelation reln, ForkNumber forknum) int64 neon_dbsize(Oid dbNode) { - NeonResponse *resp; int64 db_size; neon_request_lsns request_lsns; NRelFileInfo dummy_node = {0}; @@ -3927,66 +1793,11 @@ neon_dbsize(Oid dbNode) neon_get_request_lsns(dummy_node, MAIN_FORKNUM, REL_METADATA_PSEUDO_BLOCKNO, &request_lsns, 1); - { - NeonDbSizeRequest request = { - .hdr.tag = T_NeonDbSizeRequest, - .hdr.reqid = GENERATE_REQUEST_ID(), - .hdr.lsn = request_lsns.request_lsn, - .hdr.not_modified_since = request_lsns.not_modified_since, - .dbNode = dbNode, - }; - - resp = page_server_request(&request); - - switch (resp->tag) - { - case T_NeonDbSizeResponse: - { - NeonDbSizeResponse* dbsize_resp = (NeonDbSizeResponse *) resp; - if (neon_protocol_version >= 3) - { - if (!equal_requests(resp, &request.hdr) || - dbsize_resp->req.dbNode != dbNode) - { - NEON_PANIC_CONNECTION_STATE(-1, PANIC, - "Unexpect response {reqid=%lx,lsn=%X/%08X, since=%X/%08X, dbNode=%u} to get DB size request {reqid=%lx,lsn=%X/%08X, since=%X/%08X, dbNode=%u}", - resp->reqid, LSN_FORMAT_ARGS(resp->lsn), LSN_FORMAT_ARGS(resp->not_modified_since), dbsize_resp->req.dbNode, - 
request.hdr.reqid, LSN_FORMAT_ARGS(request.hdr.lsn), LSN_FORMAT_ARGS(request.hdr.not_modified_since), dbNode); - } - } - db_size = dbsize_resp->db_size; - break; - } - case T_NeonErrorResponse: - if (neon_protocol_version >= 3) - { - if (!equal_requests(resp, &request.hdr)) - { - elog(WARNING, NEON_TAG "Error message {reqid=%lx,lsn=%X/%08X, since=%X/%08X} doesn't match get DB size request {reqid=%lx,lsn=%X/%08X, since=%X/%08X}", - resp->reqid, LSN_FORMAT_ARGS(resp->lsn), LSN_FORMAT_ARGS(resp->not_modified_since), - request.hdr.reqid, LSN_FORMAT_ARGS(request.hdr.lsn), LSN_FORMAT_ARGS(request.hdr.not_modified_since)); - } - } - ereport(ERROR, - (errcode(ERRCODE_IO_ERROR), - errmsg(NEON_TAG "[reqid %lx] could not read db size of db %u from page server at lsn %X/%08X", - resp->reqid, - dbNode, LSN_FORMAT_ARGS(request_lsns.effective_request_lsn)), - errdetail("page server returned error: %s", - ((NeonErrorResponse *) resp)->message))); - break; - - default: - NEON_PANIC_CONNECTION_STATE(-1, PANIC, - "Expected DbSize (0x%02x) or Error (0x%02x) response to DbSizeRequest, but got 0x%02x", - T_NeonDbSizeResponse, T_NeonErrorResponse, resp->tag); - } + db_size = communicator_dbsize(dbNode, &request_lsns); - neon_log(SmgrTrace, "neon_dbsize: db %u (request LSN %X/%08X): %ld bytes", - dbNode, LSN_FORMAT_ARGS(request_lsns.effective_request_lsn), db_size); + neon_log(SmgrTrace, "neon_dbsize: db %u (request LSN %X/%08X): %ld bytes", + dbNode, LSN_FORMAT_ARGS(request_lsns.effective_request_lsn), db_size); - pfree(resp); - } return db_size; } @@ -4085,7 +1896,7 @@ neon_immedsync(SMgrRelation reln, ForkNumber forknum) neon_log(SmgrTrace, "[NEON_SMGR] immedsync noop"); - prefetch_pump_state(false); + communicator_prefetch_pump_state(false); #ifdef DEBUG_COMPARE_LOCAL if (IS_LOCAL_REL(reln)) @@ -4187,6 +1998,8 @@ neon_start_unlogged_build(SMgrRelation reln) #ifndef DEBUG_COMPARE_LOCAL if (!IsParallelWorker()) mdcreate(reln, MAIN_FORKNUM, false); +#else + mdcreate(reln, INIT_FORKNUM, 
false); #endif } @@ -4265,6 +2078,8 @@ neon_end_unlogged_build(SMgrRelation reln) #ifndef DEBUG_COMPARE_LOCAL /* use isRedo == true, so that we drop it immediately */ mdunlink(rinfob, forknum, true); +#else + mdunlink(rinfob, INIT_FORKNUM, true); #endif } } @@ -4282,9 +2097,7 @@ neon_read_slru_segment(SMgrRelation reln, const char* path, int segno, void* buf not_modified_since; SlruKind kind; int n_blocks; - shardno_t shard_no = 0; /* All SLRUs are at shard 0 */ - NeonResponse *resp; - NeonGetSlruSegmentRequest request; + neon_request_lsns request_lsns; /* * Compute a request LSN to use, similar to neon_get_request_lsns() but the @@ -4323,74 +2136,12 @@ neon_read_slru_segment(SMgrRelation reln, const char* path, int segno, void* buf else return -1; - request = (NeonGetSlruSegmentRequest) { - .hdr.tag = T_NeonGetSlruSegmentRequest, - .hdr.reqid = GENERATE_REQUEST_ID(), - .hdr.lsn = request_lsn, - .hdr.not_modified_since = not_modified_since, - .kind = kind, - .segno = segno - }; - - do - { - while (!page_server->send(shard_no, &request.hdr) || !page_server->flush(shard_no)); - - consume_prefetch_responses(); - - resp = page_server->receive(shard_no); - } while (resp == NULL); + request_lsns.request_lsn = request_lsn; + request_lsns.not_modified_since = not_modified_since; + request_lsns.effective_request_lsn = request_lsn; - switch (resp->tag) - { - case T_NeonGetSlruSegmentResponse: - { - NeonGetSlruSegmentResponse* slru_resp = (NeonGetSlruSegmentResponse *) resp; - if (neon_protocol_version >= 3) - { - if (!equal_requests(resp, &request.hdr) || - slru_resp->req.kind != kind || - slru_resp->req.segno != segno) - { - NEON_PANIC_CONNECTION_STATE(-1, PANIC, - "Unexpect response {reqid=%lx,lsn=%X/%08X, since=%X/%08X, kind=%u, segno=%u} to get SLRU segment request {reqid=%lx,lsn=%X/%08X, since=%X/%08X, kind=%u, segno=%u}", - resp->reqid, LSN_FORMAT_ARGS(resp->lsn), LSN_FORMAT_ARGS(resp->not_modified_since), slru_resp->req.kind, slru_resp->req.segno, - request.hdr.reqid, 
LSN_FORMAT_ARGS(request.hdr.lsn), LSN_FORMAT_ARGS(request.hdr.not_modified_since), kind, segno); - } - } - n_blocks = slru_resp->n_blocks; - memcpy(buffer, slru_resp->data, n_blocks*BLCKSZ); - break; - } - case T_NeonErrorResponse: - if (neon_protocol_version >= 3) - { - if (!equal_requests(resp, &request.hdr)) - { - elog(WARNING, NEON_TAG "Error message {reqid=%lx,lsn=%X/%08X, since=%X/%08X} doesn't match get SLRU segment request {reqid=%lx,lsn=%X/%08X, since=%X/%08X}", - resp->reqid, LSN_FORMAT_ARGS(resp->lsn), LSN_FORMAT_ARGS(resp->not_modified_since), - request.hdr.reqid, LSN_FORMAT_ARGS(request.hdr.lsn), LSN_FORMAT_ARGS(request.hdr.not_modified_since)); - } - } - ereport(ERROR, - (errcode(ERRCODE_IO_ERROR), - errmsg(NEON_TAG "[reqid %lx] could not read SLRU %d segment %d at lsn %X/%08X", - resp->reqid, - kind, - segno, - LSN_FORMAT_ARGS(request_lsn)), - errdetail("page server returned error: %s", - ((NeonErrorResponse *) resp)->message))); - break; - - default: - NEON_PANIC_CONNECTION_STATE(-1, PANIC, - "Expected GetSlruSegment (0x%02x) or Error (0x%02x) response to GetSlruSegmentRequest, but got 0x%02x", - T_NeonGetSlruSegmentResponse, T_NeonErrorResponse, resp->tag); - } - pfree(resp); + n_blocks = communicator_read_slru_segment(kind, segno, &request_lsns, buffer); - reconfigure_timeout_if_needed(); return n_blocks; } @@ -4426,7 +2177,7 @@ AtEOXact_neon(XactEvent event, void *arg) } break; } - reconfigure_timeout_if_needed(); + communicator_reconfigure_timeout_if_needed(); } static const struct f_smgr neon_smgr = @@ -4484,6 +2235,7 @@ smgr_init_neon(void) smgr_init_standard(); neon_init(); + communicator_init(); } @@ -4513,25 +2265,14 @@ neon_extend_rel_size(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blkno, * This length is later reused when we open the smgr to read the * block, which is fine and expected. 
*/ - NeonResponse *response; - NeonNblocksResponse *nbresponse; - NeonNblocksRequest request = { - .hdr = (NeonRequest) { - .tag = T_NeonNblocksRequest, - .reqid = GENERATE_REQUEST_ID(), - .lsn = end_recptr, - .not_modified_since = end_recptr, - }, - .rinfo = rinfo, - .forknum = forknum, - }; - - response = page_server_request(&request); - - Assert(response->tag == T_NeonNblocksResponse); - nbresponse = (NeonNblocksResponse *) response; - - relsize = Max(nbresponse->n_blocks, blkno + 1); + neon_request_lsns request_lsns; + + neon_get_request_lsns(rinfo, forknum, + REL_METADATA_PSEUDO_BLOCKNO, &request_lsns, 1); + + relsize = communicator_nblocks(rinfo, forknum, &request_lsns); + + relsize = Max(relsize, blkno + 1); set_cached_relsize(rinfo, forknum, relsize); neon_set_lwlsn_relation(end_recptr, rinfo, forknum); @@ -4683,94 +2424,3 @@ neon_redo_read_buffer_filter(XLogReaderState *record, uint8 block_id) } return no_redo_needed; } - -static void -reconfigure_timeout_if_needed(void) -{ - bool needs_set = MyPState->ring_receive != MyPState->ring_unused && - readahead_getpage_pull_timeout_ms > 0; - - if (needs_set != timeout_set) - { - /* The background writer doens't (shouldn't) read any pages */ - Assert(!AmBackgroundWriterProcess()); - /* The checkpointer doens't (shouldn't) read any pages */ - Assert(!AmCheckpointerProcess()); - - if (unlikely(PS_TIMEOUT_ID == 0)) - { - PS_TIMEOUT_ID = RegisterTimeout(USER_TIMEOUT, pagestore_timeout_handler); - } - - if (needs_set) - { -#if PG_MAJORVERSION_NUM <= 14 - enable_timeout_after(PS_TIMEOUT_ID, readahead_getpage_pull_timeout_ms); -#else - enable_timeout_every( - PS_TIMEOUT_ID, - TimestampTzPlusMilliseconds(GetCurrentTimestamp(), - readahead_getpage_pull_timeout_ms), - readahead_getpage_pull_timeout_ms - ); -#endif - timeout_set = true; - } - else - { - Assert(timeout_set); - disable_timeout(PS_TIMEOUT_ID, false); - timeout_set = false; - } - } -} - -static void -pagestore_timeout_handler(void) -{ -#if PG_MAJORVERSION_NUM <= 
14 - /* - * PG14: Setting a repeating timeout is not possible, so we signal here - * that the timeout has already been reset, and by telling the system - * that system will re-schedule it later if we need to. - */ - timeout_set = false; -#endif - timeout_signaled = true; - InterruptPending = true; -} - -static process_interrupts_callback_t prev_interrupt_cb; - -/* - * Process new data received in our active PageStream sockets. - * - * This relies on the invariant that all pipelined yet-to-be-received requests - * are getPage requests managed by MyPState. This is currently true, any - * modification will probably require some stuff to make it work again. - */ -static bool -pagestore_smgr_processinterrupts(void) -{ - if (timeout_signaled) - { - if (!readpage_reentrant_guard && readahead_getpage_pull_timeout_ms > 0) - prefetch_pump_state(true); - - timeout_signaled = false; - reconfigure_timeout_if_needed(); - } - - if (!prev_interrupt_cb) - return false; - - return prev_interrupt_cb(); -} - - -void -pagestore_smgr_init(void) -{ - prev_interrupt_cb = ProcessInterruptsCallback; - ProcessInterruptsCallback = pagestore_smgr_processinterrupts; -} diff --git a/pgxn/neon/walproposer.c b/pgxn/neon/walproposer.c index 0336d63e8d70..b95b1451e4ed 100644 --- a/pgxn/neon/walproposer.c +++ b/pgxn/neon/walproposer.c @@ -99,6 +99,9 @@ WalProposerCreate(WalProposerConfig *config, walproposer_api api) wp->config = config; wp->api = api; wp->state = WPS_COLLECTING_TERMS; + wp->mconf.generation = INVALID_GENERATION; + wp->mconf.members.len = 0; + wp->mconf.new_members.len = 0; wp_log(LOG, "neon.safekeepers=%s", wp->config->safekeepers_list); @@ -170,6 +173,8 @@ WalProposerCreate(WalProposerConfig *config, walproposer_api api) if (wp->config->proto_version != 2 && wp->config->proto_version != 3) wp_log(FATAL, "unsupported safekeeper protocol version %d", wp->config->proto_version); + if (wp->safekeepers_generation > INVALID_GENERATION && wp->config->proto_version < 3) + wp_log(FATAL, 
"enabling generations requires protocol version 3"); wp_log(LOG, "using safekeeper protocol version %d", wp->config->proto_version); /* Fill the greeting package */ @@ -214,7 +219,7 @@ WalProposerFree(WalProposer *wp) static bool WalProposerGenerationsEnabled(WalProposer *wp) { - return wp->safekeepers_generation != 0; + return wp->safekeepers_generation != INVALID_GENERATION; } /* @@ -723,13 +728,176 @@ SendProposerGreeting(Safekeeper *sk) BlockingWrite(sk, sk->outbuf.data, sk->outbuf.len, SS_HANDSHAKE_RECV); } +/* + * Assuming `sk` sent its node id, find such member(s) in wp->mconf and set ptr in + * members_safekeepers & new_members_safekeepers to sk. + */ +static void +UpdateMemberSafekeeperPtr(WalProposer *wp, Safekeeper *sk) +{ + /* members_safekeepers etc are fixed size, sanity check mconf size */ + if (wp->mconf.members.len > MAX_SAFEKEEPERS) + wp_log(FATAL, "too many members %d in mconf", wp->mconf.members.len); + if (wp->mconf.new_members.len > MAX_SAFEKEEPERS) + wp_log(FATAL, "too many new_members %d in mconf", wp->mconf.new_members.len); + + /* node id is not known until greeting is received */ + if (sk->state < SS_WAIT_VOTING) + return; + + /* 0 is assumed to be invalid node id, should never happen */ + if (sk->greetResponse.nodeId == 0) + { + wp_log(WARNING, "safekeeper %s:%s sent zero node id", sk->host, sk->port); + return; + } + + for (uint32 i = 0; i < wp->mconf.members.len; i++) + { + SafekeeperId *sk_id = &wp->mconf.members.m[i]; + + if (wp->mconf.members.m[i].node_id == sk->greetResponse.nodeId) + { + /* + * If mconf or list of safekeepers to connect to changed (the + * latter always currently goes through restart though), + * ResetMemberSafekeeperPtrs is expected to be called before + * UpdateMemberSafekeeperPtr. So, other value suggests that we are + * connected to the same sk under different host name, complain + * about that. 
+ */ + if (wp->members_safekeepers[i] != NULL && wp->members_safekeepers[i] != sk) + { + wp_log(WARNING, "safekeeper {id = %lu, ep = %s:%u } in members[%u] is already mapped to connection slot %lu", + sk_id->node_id, sk_id->host, sk_id->port, i, wp->members_safekeepers[i] - wp->safekeeper); + } + wp_log(LOG, "safekeeper {id = %lu, ep = %s:%u } in members[%u] mapped to connection slot %lu", + sk_id->node_id, sk_id->host, sk_id->port, i, sk - wp->safekeeper); + wp->members_safekeepers[i] = sk; + } + } + /* repeat for new_members */ + for (uint32 i = 0; i < wp->mconf.new_members.len; i++) + { + SafekeeperId *sk_id = &wp->mconf.new_members.m[i]; + + if (wp->mconf.new_members.m[i].node_id == sk->greetResponse.nodeId) + { + if (wp->new_members_safekeepers[i] != NULL && wp->new_members_safekeepers[i] != sk) + { + wp_log(WARNING, "safekeeper {id = %lu, ep = %s:%u } in new_members[%u] is already mapped to connection slot %lu", + sk_id->node_id, sk_id->host, sk_id->port, i, wp->new_members_safekeepers[i] - wp->safekeeper); + } + wp_log(LOG, "safekeeper {id = %lu, ep = %s:%u } in new_members[%u] mapped to connection slot %lu", + sk_id->node_id, sk_id->host, sk_id->port, i, sk - wp->safekeeper); + wp->new_members_safekeepers[i] = sk; + } + } +} + +/* + * Reset wp->members_safekeepers & new_members_safekeepers and refill them. + * Called after wp changes mconf. + */ +static void +ResetMemberSafekeeperPtrs(WalProposer *wp) +{ + memset(&wp->members_safekeepers, 0, sizeof(Safekeeper *) * MAX_SAFEKEEPERS); + memset(&wp->new_members_safekeepers, 0, sizeof(Safekeeper *) * MAX_SAFEKEEPERS); + for (int i = 0; i < wp->n_safekeepers; i++) + { + if (wp->safekeeper[i].state >= SS_WAIT_VOTING) + UpdateMemberSafekeeperPtr(wp, &wp->safekeeper[i]); + } +} + +static uint32 +MsetQuorum(MemberSet *mset) +{ + Assert(mset->len > 0); + return mset->len / 2 + 1; +} + +/* Does n forms quorum in mset? 
*/ +static bool +MsetHasQuorum(MemberSet *mset, uint32 n) +{ + return n >= MsetQuorum(mset); +} + +/* + * TermsCollected helper for a single member set `mset`. + * + * `msk` is the member -> safekeeper mapping for mset, i.e. members_safekeepers + * or new_members_safekeepers. + */ +static bool +TermsCollectedMset(WalProposer *wp, MemberSet *mset, Safekeeper **msk, StringInfo s) +{ + uint32 n_greeted = 0; + + for (uint32 i = 0; i < wp->mconf.members.len; i++) + { + Safekeeper *sk = msk[i]; + + if (sk != NULL && sk->state == SS_WAIT_VOTING) + { + if (n_greeted > 0) + appendStringInfoString(s, ", "); + appendStringInfo(s, "{id = %lu, ep = %s:%s}", sk->greetResponse.nodeId, sk->host, sk->port); + n_greeted++; + } + } + appendStringInfo(s, ", %u/%u total", n_greeted, mset->len); + return MsetHasQuorum(mset, n_greeted); +} + /* * Have we received greeting from enough (quorum) safekeepers to start voting? */ static bool TermsCollected(WalProposer *wp) { - return wp->n_connected >= wp->quorum; + StringInfoData s; /* str for logging */ + bool collected = false; + + /* legacy: generations disabled */ + if (!WalProposerGenerationsEnabled(wp) && wp->mconf.generation == INVALID_GENERATION) + { + collected = wp->n_connected >= wp->quorum; + if (collected) + { + wp->propTerm++; + wp_log(LOG, "walproposer connected to quorum (%d) safekeepers, propTerm=" INT64_FORMAT ", starting voting", wp->quorum, wp->propTerm); + } + return collected; + } + + /* + * With generations enabled, we start campaign only when 1) some mconf is + * actually received 2) we have greetings from majority of members as well + * as from majority of new_members if it exists. 
+ */ + if (wp->mconf.generation == INVALID_GENERATION) + return false; + + initStringInfo(&s); + appendStringInfoString(&s, "mset greeters: "); + if (!TermsCollectedMset(wp, &wp->mconf.members, wp->members_safekeepers, &s)) + goto res; + if (wp->mconf.new_members.len > 0) + { + appendStringInfoString(&s, ", new_mset greeters: "); + if (!TermsCollectedMset(wp, &wp->mconf.new_members, wp->new_members_safekeepers, &s)) + goto res; + } + wp->propTerm++; + wp_log(LOG, "walproposer connected to quorum of safekeepers: %s, propTerm=" INT64_FORMAT ", starting voting", s.data, wp->propTerm); + collected = true; + +res: + pfree(s.data); + return collected; } static void @@ -753,13 +921,41 @@ RecvAcceptorGreeting(Safekeeper *sk) pfree(mconf_toml); /* - * Adopt mconf of safekeepers if it is higher. TODO: mconf change should - * restart wp if it started voting. + * Adopt mconf of safekeepers if it is higher. */ if (sk->greetResponse.mconf.generation > wp->mconf.generation) { + /* sanity check before adopting, should never happen */ + if (sk->greetResponse.mconf.members.len == 0) + { + wp_log(FATAL, "mconf %u has zero members", sk->greetResponse.mconf.generation); + } + + /* + * If we at least started campaign, restart wp to get elected in the + * new mconf. Note: in principle once wp is already elected + * re-election is not required, but being conservative here is not + * bad. + * + * TODO: put mconf to shmem to immediately pick it up on start, + * otherwise if some safekeeper(s) misses latest mconf and gets + * connected the first, it may cause redundant restarts here. + * + * More generally, it would be nice to restart walproposer (wiping + * election state) without restarting the process. In particular, that + * would allow sync-safekeepers not to die here if it intersected with + * sk migration (as well as remove 1s delay). + * + * Note that assign_neon_safekeepers also currently restarts the + * process, so during normal migration walproposer may restart twice. 
+ */ + if (wp->state >= WPS_CAMPAIGN) + { + wp_log(FATAL, "restarting to adopt mconf generation %d", sk->greetResponse.mconf.generation); + } MembershipConfigurationFree(&wp->mconf); MembershipConfigurationCopy(&sk->greetResponse.mconf, &wp->mconf); + ResetMemberSafekeeperPtrs(wp); /* full conf was just logged above */ wp_log(LOG, "changed mconf to generation %u", wp->mconf.generation); } @@ -767,6 +963,9 @@ RecvAcceptorGreeting(Safekeeper *sk) /* Protocol is all good, move to voting. */ sk->state = SS_WAIT_VOTING; + /* In greeting safekeeper sent its id; update mappings accordingly. */ + UpdateMemberSafekeeperPtr(wp, sk); + /* * Note: it would be better to track the counter on per safekeeper basis, * but at worst walproposer would restart with 'term rejected', so leave @@ -778,12 +977,9 @@ RecvAcceptorGreeting(Safekeeper *sk) /* We're still collecting terms from the majority. */ wp->propTerm = Max(sk->greetResponse.term, wp->propTerm); - /* Quorum is acquried, prepare the vote request. */ + /* Quorum is acquired, prepare the vote request. 
*/ if (TermsCollected(wp)) { - wp->propTerm++; - wp_log(LOG, "proposer connected to quorum (%d) safekeepers, propTerm=" INT64_FORMAT, wp->quorum, wp->propTerm); - wp->state = WPS_CAMPAIGN; wp->voteRequest.pam.tag = 'v'; wp->voteRequest.generation = wp->mconf.generation; @@ -832,8 +1028,8 @@ SendVoteRequest(Safekeeper *sk) &sk->outbuf, wp->config->proto_version); /* We have quorum for voting, send our vote request */ - wp_log(LOG, "requesting vote from %s:%s for generation %u term " UINT64_FORMAT, sk->host, sk->port, - wp->voteRequest.generation, wp->voteRequest.term); + wp_log(LOG, "requesting vote from sk {id = %lu, ep = %s:%s} for generation %u term " UINT64_FORMAT, + sk->greetResponse.nodeId, sk->host, sk->port, wp->voteRequest.generation, wp->voteRequest.term); /* On failure, logging & resetting is handled */ BlockingWrite(sk, sk->outbuf.data, sk->outbuf.len, SS_WAIT_VERDICT); /* If successful, wait for read-ready with SS_WAIT_VERDICT */ @@ -851,8 +1047,8 @@ RecvVoteResponse(Safekeeper *sk) return; wp_log(LOG, - "got VoteResponse from acceptor %s:%s, generation=%u, term=%lu, voteGiven=%u, last_log_term=" UINT64_FORMAT ", flushLsn=%X/%X, truncateLsn=%X/%X", - sk->host, sk->port, sk->voteResponse.generation, sk->voteResponse.term, + "got VoteResponse from sk {id = %lu, ep = %s:%s}, generation=%u, term=%lu, voteGiven=%u, last_log_term=" UINT64_FORMAT ", flushLsn=%X/%X, truncateLsn=%X/%X", + sk->greetResponse.nodeId, sk->host, sk->port, sk->voteResponse.generation, sk->voteResponse.term, sk->voteResponse.voteGiven, GetHighestTerm(&sk->voteResponse.termHistory), LSN_FORMAT_ARGS(sk->voteResponse.flushLsn), @@ -899,6 +1095,53 @@ RecvVoteResponse(Safekeeper *sk) } } +/* + * VotesCollected helper for a single member set `mset`. + * + * `msk` is the member -> safekeeper mapping for mset, i.e. members_safekeepers + * or new_members_safekeepers. 
+ */ +static bool +VotesCollectedMset(WalProposer *wp, MemberSet *mset, Safekeeper **msk, StringInfo s) +{ + uint32 n_votes = 0; + + for (uint32 i = 0; i < wp->mconf.members.len; i++) + { + Safekeeper *sk = msk[i]; + + if (sk != NULL && sk->state == SS_WAIT_ELECTED) + { + Assert(sk->voteResponse.voteGiven); + + /* + * Find the highest vote. NULL check is for the legacy case where + * safekeeper might be not initialized with LSN at all and return + * 0 LSN in the vote response; we still want to set donor to + * something in this case. + */ + if (GetLastLogTerm(sk) > wp->donorLastLogTerm || + (GetLastLogTerm(sk) == wp->donorLastLogTerm && + sk->voteResponse.flushLsn > wp->propTermStartLsn) || + wp->donor == NULL) + { + wp->donorLastLogTerm = GetLastLogTerm(sk); + wp->propTermStartLsn = sk->voteResponse.flushLsn; + wp->donor = sk; + } + wp->truncateLsn = Max(wp->safekeeper[i].voteResponse.truncateLsn, wp->truncateLsn); + + if (n_votes > 0) + appendStringInfoString(s, ", "); + appendStringInfo(s, "{id = %lu, ep = %s:%s}", sk->greetResponse.nodeId, sk->host, sk->port); + n_votes++; + } + } + appendStringInfo(s, ", %u/%u total", n_votes, mset->len); + return MsetHasQuorum(mset, n_votes); +} + + /* * Checks if enough votes has been collected to get elected and if that's the * case finds the highest vote, setting donor, donorLastLogTerm, @@ -907,7 +1150,8 @@ RecvVoteResponse(Safekeeper *sk) static bool VotesCollected(WalProposer *wp) { - int n_ready = 0; + StringInfoData s; /* str for logging */ + bool collected = false; /* assumed to be called only when not elected yet */ Assert(wp->state == WPS_CAMPAIGN); @@ -916,25 +1160,62 @@ VotesCollected(WalProposer *wp) wp->donorLastLogTerm = 0; wp->truncateLsn = InvalidXLogRecPtr; - for (int i = 0; i < wp->n_safekeepers; i++) + /* legacy: generations disabled */ + if (!WalProposerGenerationsEnabled(wp) && wp->mconf.generation == INVALID_GENERATION) { - if (wp->safekeeper[i].state == SS_WAIT_ELECTED) - { - n_ready++; + int n_ready 
= 0; - if (GetLastLogTerm(&wp->safekeeper[i]) > wp->donorLastLogTerm || - (GetLastLogTerm(&wp->safekeeper[i]) == wp->donorLastLogTerm && - wp->safekeeper[i].voteResponse.flushLsn > wp->propTermStartLsn)) + for (int i = 0; i < wp->n_safekeepers; i++) + { + if (wp->safekeeper[i].state == SS_WAIT_ELECTED) { - wp->donorLastLogTerm = GetLastLogTerm(&wp->safekeeper[i]); - wp->propTermStartLsn = wp->safekeeper[i].voteResponse.flushLsn; - wp->donor = i; + n_ready++; + + if (GetLastLogTerm(&wp->safekeeper[i]) > wp->donorLastLogTerm || + (GetLastLogTerm(&wp->safekeeper[i]) == wp->donorLastLogTerm && + wp->safekeeper[i].voteResponse.flushLsn > wp->propTermStartLsn) || + wp->donor == NULL) + { + wp->donorLastLogTerm = GetLastLogTerm(&wp->safekeeper[i]); + wp->propTermStartLsn = wp->safekeeper[i].voteResponse.flushLsn; + wp->donor = &wp->safekeeper[i]; + } + wp->truncateLsn = Max(wp->safekeeper[i].voteResponse.truncateLsn, wp->truncateLsn); } - wp->truncateLsn = Max(wp->safekeeper[i].voteResponse.truncateLsn, wp->truncateLsn); } + collected = n_ready >= wp->quorum; + if (collected) + { + wp_log(LOG, "walproposer elected with %d/%d votes", n_ready, wp->n_safekeepers); + } + return collected; + } + + /* + * if generations are enabled we're expected to get to voting only when + * mconf is established. + */ + Assert(wp->mconf.generation != INVALID_GENERATION); + + /* + * We must get votes from both msets if both are present. 
+ */ + initStringInfo(&s); + appendStringInfoString(&s, "mset voters: "); + if (!VotesCollectedMset(wp, &wp->mconf.members, wp->members_safekeepers, &s)) + goto res; + if (wp->mconf.new_members.len > 0) + { + appendStringInfoString(&s, ", new_mset voters: "); + if (!VotesCollectedMset(wp, &wp->mconf.new_members, wp->new_members_safekeepers, &s)) + goto res; } + wp_log(LOG, "walproposer elected, %s", s.data); + collected = true; - return n_ready >= wp->quorum; +res: + pfree(s.data); + return collected; } /* @@ -955,7 +1236,7 @@ HandleElectedProposer(WalProposer *wp) * that only for logical replication (and switching logical walsenders to * neon_walreader is a todo.) */ - if (!wp->api.recovery_download(wp, &wp->safekeeper[wp->donor])) + if (!wp->api.recovery_download(wp, wp->donor)) { wp_log(FATAL, "failed to download WAL for logical replicaiton"); } @@ -1078,7 +1359,7 @@ ProcessPropStartPos(WalProposer *wp) /* * Proposer's term history is the donor's + its own entry. */ - dth = &wp->safekeeper[wp->donor].voteResponse.termHistory; + dth = &wp->donor->voteResponse.termHistory; wp->propTermHistory.n_entries = dth->n_entries + 1; wp->propTermHistory.entries = palloc(sizeof(TermSwitchEntry) * wp->propTermHistory.n_entries); if (dth->n_entries > 0) @@ -1086,11 +1367,10 @@ ProcessPropStartPos(WalProposer *wp) wp->propTermHistory.entries[wp->propTermHistory.n_entries - 1].term = wp->propTerm; wp->propTermHistory.entries[wp->propTermHistory.n_entries - 1].lsn = wp->propTermStartLsn; - wp_log(LOG, "got votes from majority (%d) of nodes, term " UINT64_FORMAT ", epochStartLsn %X/%X, donor %s:%s, truncate_lsn %X/%X", - wp->quorum, + wp_log(LOG, "walproposer elected in term " UINT64_FORMAT ", epochStartLsn %X/%X, donor %s:%s, truncate_lsn %X/%X", wp->propTerm, LSN_FORMAT_ARGS(wp->propTermStartLsn), - wp->safekeeper[wp->donor].host, wp->safekeeper[wp->donor].port, + wp->donor->host, wp->donor->port, LSN_FORMAT_ARGS(wp->truncateLsn)); /* @@ -1508,6 +1788,14 @@ 
RecvAppendResponses(Safekeeper *sk) readAnything = true; + /* should never happen: sk is expected to send ERROR instead */ + if (sk->appendResponse.generation != wp->mconf.generation) + { + wp_log(FATAL, "safekeeper {id = %lu, ep = %s:%s} sent response with generation %u, expected %u", + sk->greetResponse.nodeId, sk->host, sk->port, + sk->appendResponse.generation, wp->mconf.generation); + } + if (sk->appendResponse.term > wp->propTerm) { /* @@ -1624,30 +1912,101 @@ CalculateMinFlushLsn(WalProposer *wp) } /* - * Calculate WAL position acknowledged by quorum + * GetAcknowledgedByQuorumWALPosition for a single member set `mset`. + * + * `msk` is the member -> safekeeper mapping for mset, i.e. members_safekeepers + * or new_members_safekeepers. */ static XLogRecPtr -GetAcknowledgedByQuorumWALPosition(WalProposer *wp) +GetCommittedMset(WalProposer *wp, MemberSet *mset, Safekeeper **msk) { XLogRecPtr responses[MAX_SAFEKEEPERS]; /* - * Sort acknowledged LSNs + * Ascending sort acknowledged LSNs. */ - for (int i = 0; i < wp->n_safekeepers; i++) + Assert(mset->len <= MAX_SAFEKEEPERS); + for (uint32 i = 0; i < mset->len; i++) { + Safekeeper *sk = msk[i]; + /* * Like in Raft, we aren't allowed to commit entries from previous - * terms, so ignore reported LSN until it gets to epochStartLsn. + * terms, so ignore reported LSN until it gets to propTermStartLsn. + * + * Note: we ignore sk state, which is ok: before first ack flushLsn is + * 0, and later we just preserve value across reconnections. It would + * be ok to check for SS_ACTIVE as well. */ - responses[i] = wp->safekeeper[i].appendResponse.flushLsn >= wp->propTermStartLsn ? 
wp->safekeeper[i].appendResponse.flushLsn : 0; + if (sk != NULL && sk->appendResponse.flushLsn >= wp->propTermStartLsn) + { + responses[i] = sk->appendResponse.flushLsn; + } + else + { + responses[i] = 0; + } } - qsort(responses, wp->n_safekeepers, sizeof(XLogRecPtr), CompareLsn); + qsort(responses, mset->len, sizeof(XLogRecPtr), CompareLsn); /* - * Get the smallest LSN committed by quorum + * And get value committed by the quorum. A way to view this: to get the + * highest value committed on the quorum, in the ordered array we skip n - + * n_quorum elements to get to the first (lowest) value present on all sks + * of the highest quorum. */ - return responses[wp->n_safekeepers - wp->quorum]; + return responses[mset->len - MsetQuorum(mset)]; +} + +/* + * Calculate WAL position acknowledged by quorum, i.e. which may be regarded + * committed. + * + * Zero may be returned when there is no quorum of nodes recovered to term start + * lsn which sent feedback yet. + */ +static XLogRecPtr +GetAcknowledgedByQuorumWALPosition(WalProposer *wp) +{ + XLogRecPtr committed; + + /* legacy: generations disabled */ + if (!WalProposerGenerationsEnabled(wp) && wp->mconf.generation == INVALID_GENERATION) + { + XLogRecPtr responses[MAX_SAFEKEEPERS]; + + /* + * Sort acknowledged LSNs + */ + for (int i = 0; i < wp->n_safekeepers; i++) + { + /* + * Like in Raft, we aren't allowed to commit entries from previous + * terms, so ignore reported LSN until it gets to + * propTermStartLsn. + * + * Note: we ignore sk state, which is ok: before first ack + * flushLsn is 0, and later we just preserve value across + * reconnections. It would be ok to check for SS_ACTIVE as well. + */ + responses[i] = wp->safekeeper[i].appendResponse.flushLsn >= wp->propTermStartLsn ? 
wp->safekeeper[i].appendResponse.flushLsn : 0; + } + qsort(responses, wp->n_safekeepers, sizeof(XLogRecPtr), CompareLsn); + + /* + * Get the smallest LSN committed by quorum + */ + return responses[wp->n_safekeepers - wp->quorum]; + } + + committed = GetCommittedMset(wp, &wp->mconf.members, wp->members_safekeepers); + if (wp->mconf.new_members.len > 0) + { + XLogRecPtr new_mset_committed = GetCommittedMset(wp, &wp->mconf.new_members, wp->new_members_safekeepers); + + committed = Min(committed, new_mset_committed); + } + return committed; } /* @@ -1662,7 +2021,7 @@ UpdateDonorShmem(WalProposer *wp) int i; XLogRecPtr donor_lsn = InvalidXLogRecPtr; - if (wp->n_votes < wp->quorum) + if (wp->state < WPS_ELECTED) { wp_log(WARNING, "UpdateDonorShmem called before elections are won"); return; @@ -1673,9 +2032,9 @@ UpdateDonorShmem(WalProposer *wp) * about its position immediately after election before any feedbacks are * sent. */ - if (wp->safekeeper[wp->donor].state >= SS_WAIT_ELECTED) + if (wp->donor->state >= SS_WAIT_ELECTED) { - donor = &wp->safekeeper[wp->donor]; + donor = wp->donor; donor_lsn = wp->propTermStartLsn; } @@ -1746,22 +2105,19 @@ HandleSafekeeperResponse(WalProposer *wp, Safekeeper *fromsk) } /* - * Generally sync is done when majority switched the epoch so we committed - * epochStartLsn and made the majority aware of it, ensuring they are - * ready to give all WAL to pageserver. It would mean whichever majority - * is alive, there will be at least one safekeeper who is able to stream - * WAL to pageserver to make basebackup possible. However, since at the - * moment we don't have any good mechanism of defining the healthy and - * most advanced safekeeper who should push the wal into pageserver and + * Generally sync is done when majority reached propTermStartLsn so we + * committed it and made the majority aware of it, ensuring they are ready + * to give all WAL to pageserver. 
It would mean whichever majority is + * alive, there will be at least one safekeeper who is able to stream WAL + * to pageserver to make basebackup possible. However, since at the moment + * we don't have any good mechanism of defining the healthy and most + * advanced safekeeper who should push the wal into pageserver and * basically the random one gets connected, to prevent hanging basebackup * (due to pageserver connecting to not-synced-safekeeper) we currently * wait for all seemingly alive safekeepers to get synced. */ if (wp->config->syncSafekeepers) { - int n_synced; - - n_synced = 0; for (int i = 0; i < wp->n_safekeepers; i++) { Safekeeper *sk = &wp->safekeeper[i]; @@ -1770,11 +2126,9 @@ HandleSafekeeperResponse(WalProposer *wp, Safekeeper *fromsk) /* alive safekeeper which is not synced yet; wait for it */ if (sk->state != SS_OFFLINE && !synced) return; - if (synced) - n_synced++; } - if (n_synced >= wp->quorum) + if (newCommitLsn >= wp->propTermStartLsn) { /* A quorum of safekeepers has been synced! */ diff --git a/pgxn/neon/walproposer.h b/pgxn/neon/walproposer.h index d116bce80644..648b0015ad8f 100644 --- a/pgxn/neon/walproposer.h +++ b/pgxn/neon/walproposer.h @@ -145,6 +145,7 @@ typedef uint64 NNodeId; * This and following structs pair ones in membership.rs. */ typedef uint32 Generation; +#define INVALID_GENERATION 0 typedef struct SafekeeperId { @@ -771,7 +772,17 @@ typedef struct WalProposer /* Current walproposer membership configuration */ MembershipConfiguration mconf; - /* (n_safekeepers / 2) + 1 */ + /* + * Parallels mconf.members with pointers to the member's slot in + * safekeepers array of connections, or NULL if such member is not + * connected. Helps to avoid looking slot per id through all + * .safekeepers[] when doing quorum checks. + */ + Safekeeper *members_safekeepers[MAX_SAFEKEEPERS]; + /* As above, but for new_members. */ + Safekeeper *new_members_safekeepers[MAX_SAFEKEEPERS]; + + /* (n_safekeepers / 2) + 1. 
Used for static pre-generations quorum checks. */ int quorum; /* @@ -829,7 +840,7 @@ typedef struct WalProposer term_t donorLastLogTerm; /* Most advanced acceptor */ - int donor; + Safekeeper *donor; /* timeline globally starts at this LSN */ XLogRecPtr timelineStartLsn; diff --git a/proxy/src/binary/proxy.rs b/proxy/src/binary/proxy.rs index 62fdc18207f1..e03f2f33d972 100644 --- a/proxy/src/binary/proxy.rs +++ b/proxy/src/binary/proxy.rs @@ -509,7 +509,14 @@ pub async fn run() -> anyhow::Result<()> { if let Some(mut redis_kv_client) = redis_kv_client { maintenance_tasks.spawn(async move { redis_kv_client.try_connect().await?; - handle_cancel_messages(&mut redis_kv_client, rx_cancel).await + handle_cancel_messages(&mut redis_kv_client, rx_cancel).await?; + + drop(redis_kv_client); + + // `handle_cancel_messages` was terminated due to the tx_cancel + // being dropped. this is not worthy of an error, and this task can only return `Err`, + // so let's wait forever instead. + std::future::pending().await }); } diff --git a/proxy/src/cancellation.rs b/proxy/src/cancellation.rs index 8263e5aa2aa8..c5ba04eb8c8e 100644 --- a/proxy/src/cancellation.rs +++ b/proxy/src/cancellation.rs @@ -1,16 +1,17 @@ -use std::convert::Infallible; use std::net::{IpAddr, SocketAddr}; use std::sync::Arc; +use anyhow::{Context, anyhow}; use ipnet::{IpNet, Ipv4Net, Ipv6Net}; use postgres_client::CancelToken; use postgres_client::tls::MakeTlsConnect; use pq_proto::CancelKeyData; +use redis::{FromRedisValue, Pipeline, Value, pipe}; use serde::{Deserialize, Serialize}; use thiserror::Error; use tokio::net::TcpStream; use tokio::sync::{mpsc, oneshot}; -use tracing::{debug, info}; +use tracing::{debug, info, warn}; use crate::auth::backend::ComputeUserInfo; use crate::auth::{AuthError, check_peer_addr_is_in_list}; @@ -30,6 +31,7 @@ type IpSubnetKey = IpNet; const CANCEL_KEY_TTL: i64 = 1_209_600; // 2 weeks cancellation key expire time const REDIS_SEND_TIMEOUT: std::time::Duration = 
std::time::Duration::from_millis(10); +const BATCH_SIZE: usize = 8; // Message types for sending through mpsc channel pub enum CancelKeyOp { @@ -54,78 +56,168 @@ pub enum CancelKeyOp { }, } +impl CancelKeyOp { + fn register(self, pipe: &mut Pipeline) -> Option { + #[allow(clippy::used_underscore_binding)] + match self { + CancelKeyOp::StoreCancelKey { + key, + field, + value, + resp_tx, + _guard, + expire, + } => { + pipe.hset(&key, field, value); + pipe.expire(key, expire); + let resp_tx = resp_tx?; + Some(CancelReplyOp::StoreCancelKey { resp_tx, _guard }) + } + CancelKeyOp::GetCancelData { + key, + resp_tx, + _guard, + } => { + pipe.hgetall(key); + Some(CancelReplyOp::GetCancelData { resp_tx, _guard }) + } + CancelKeyOp::RemoveCancelKey { + key, + field, + resp_tx, + _guard, + } => { + pipe.hdel(key, field); + let resp_tx = resp_tx?; + Some(CancelReplyOp::RemoveCancelKey { resp_tx, _guard }) + } + } + } +} + +// Message types for sending through mpsc channel +pub enum CancelReplyOp { + StoreCancelKey { + resp_tx: oneshot::Sender>, + _guard: CancelChannelSizeGuard<'static>, + }, + GetCancelData { + resp_tx: oneshot::Sender>>, + _guard: CancelChannelSizeGuard<'static>, + }, + RemoveCancelKey { + resp_tx: oneshot::Sender>, + _guard: CancelChannelSizeGuard<'static>, + }, +} + +impl CancelReplyOp { + fn send_err(self, e: anyhow::Error) { + match self { + CancelReplyOp::StoreCancelKey { resp_tx, _guard } => { + resp_tx + .send(Err(e)) + .inspect_err(|_| tracing::debug!("could not send reply")) + .ok(); + } + CancelReplyOp::GetCancelData { resp_tx, _guard } => { + resp_tx + .send(Err(e)) + .inspect_err(|_| tracing::debug!("could not send reply")) + .ok(); + } + CancelReplyOp::RemoveCancelKey { resp_tx, _guard } => { + resp_tx + .send(Err(e)) + .inspect_err(|_| tracing::debug!("could not send reply")) + .ok(); + } + } + } + + fn send_value(self, v: redis::Value) { + match self { + CancelReplyOp::StoreCancelKey { resp_tx, _guard } => { + let send = + 
FromRedisValue::from_owned_redis_value(v).context("could not parse value"); + resp_tx + .send(send) + .inspect_err(|_| tracing::debug!("could not send reply")) + .ok(); + } + CancelReplyOp::GetCancelData { resp_tx, _guard } => { + let send = + FromRedisValue::from_owned_redis_value(v).context("could not parse value"); + resp_tx + .send(send) + .inspect_err(|_| tracing::debug!("could not send reply")) + .ok(); + } + CancelReplyOp::RemoveCancelKey { resp_tx, _guard } => { + let send = + FromRedisValue::from_owned_redis_value(v).context("could not parse value"); + resp_tx + .send(send) + .inspect_err(|_| tracing::debug!("could not send reply")) + .ok(); + } + } + } +} + // Running as a separate task to accept messages through the rx channel -// In case of problems with RTT: switch to recv_many() + redis pipeline pub async fn handle_cancel_messages( client: &mut RedisKVClient, mut rx: mpsc::Receiver, -) -> anyhow::Result { +) -> anyhow::Result<()> { + let mut batch = Vec::new(); + let mut replies = vec![]; + loop { - if let Some(msg) = rx.recv().await { - match msg { - CancelKeyOp::StoreCancelKey { - key, - field, - value, - resp_tx, - _guard, - expire, - } => { - let res = client.hset(&key, field, value).await; - if let Some(resp_tx) = resp_tx { - if res.is_ok() { - resp_tx - .send(client.expire(key, expire).await) - .inspect_err(|e| { - tracing::debug!( - "failed to send StoreCancelKey response: {:?}", - e - ); - }) - .ok(); - } else { - resp_tx - .send(res) - .inspect_err(|e| { - tracing::debug!( - "failed to send StoreCancelKey response: {:?}", - e - ); - }) - .ok(); - } - } else if res.is_ok() { - drop(client.expire(key, expire).await); - } else { - tracing::warn!("failed to store cancel key: {:?}", res); - } + if rx.recv_many(&mut batch, BATCH_SIZE).await == 0 { + warn!("shutting down cancellation queue"); + break Ok(()); + } + + let batch_size = batch.len(); + debug!(batch_size, "running cancellation jobs"); + + let mut pipe = pipe(); + for msg in 
batch.drain(..) { + if let Some(reply) = msg.register(&mut pipe) { + replies.push(reply); + } else { + pipe.ignore(); + } + } + + let responses = replies.len(); + + match client.query(pipe).await { + // for each reply, we expect that many values. + Ok(Value::Array(values)) if values.len() == responses => { + debug!( + batch_size, + responses, "successfully completed cancellation jobs", + ); + for (value, reply) in std::iter::zip(values, replies.drain(..)) { + reply.send_value(value); } - CancelKeyOp::GetCancelData { - key, - resp_tx, - _guard, - } => { - drop(resp_tx.send(client.hget_all(key).await)); + } + Ok(value) => { + debug!(?value, "unexpected redis return value"); + for reply in replies.drain(..) { + reply.send_err(anyhow!("incorrect response type from redis")); } - CancelKeyOp::RemoveCancelKey { - key, - field, - resp_tx, - _guard, - } => { - if let Some(resp_tx) = resp_tx { - resp_tx - .send(client.hdel(key, field).await) - .inspect_err(|e| { - tracing::debug!("failed to send StoreCancelKey response: {:?}", e); - }) - .ok(); - } else { - drop(client.hdel(key, field).await); - } + } + Err(err) => { + for reply in replies.drain(..) 
{ + reply.send_err(anyhow!("could not send cmd to redis: {err}")); } } } + + replies.clear(); } } @@ -425,12 +517,7 @@ impl CancelClosure { &mut mk_tls, &self.hostname, ) - .map_err(|e| { - CancelError::IO(std::io::Error::new( - std::io::ErrorKind::Other, - e.to_string(), - )) - })?; + .map_err(|e| CancelError::IO(std::io::Error::other(e.to_string())))?; self.cancel_token.cancel_query_raw(socket, tls).await?; debug!("query was cancelled"); diff --git a/proxy/src/proxy/tests/mod.rs b/proxy/src/proxy/tests/mod.rs index 2c3e70138d9e..2268e60d257d 100644 --- a/proxy/src/proxy/tests/mod.rs +++ b/proxy/src/proxy/tests/mod.rs @@ -568,7 +568,7 @@ fn helper_create_cached_node_info(cache: &'static NodeInfoCache) -> CachedNodeIn fn helper_create_connect_info( mechanism: &TestConnectMechanism, ) -> auth::Backend<'static, ComputeCredentials> { - let user_info = auth::Backend::ControlPlane( + auth::Backend::ControlPlane( MaybeOwned::Owned(ControlPlaneClient::Test(Box::new(mechanism.clone()))), ComputeCredentials { info: ComputeUserInfo { @@ -578,8 +578,7 @@ fn helper_create_connect_info( }, keys: ComputeCredentialKeys::Password("password".into()), }, - ); - user_info + ) } fn config() -> ComputeConfig { diff --git a/proxy/src/redis/kv_ops.rs b/proxy/src/redis/kv_ops.rs index 3689bf7ae29b..aa627b29a6f2 100644 --- a/proxy/src/redis/kv_ops.rs +++ b/proxy/src/redis/kv_ops.rs @@ -1,4 +1,5 @@ -use redis::{AsyncCommands, ToRedisArgs}; +use redis::aio::ConnectionLike; +use redis::{Cmd, FromRedisValue, Pipeline, RedisResult}; use super::connection_with_credentials_provider::ConnectionWithCredentialsProvider; use crate::rate_limiter::{GlobalRateLimiter, RateBucketInfo}; @@ -8,6 +9,23 @@ pub struct RedisKVClient { limiter: GlobalRateLimiter, } +#[allow(async_fn_in_trait)] +pub trait Queryable { + async fn query(&self, conn: &mut impl ConnectionLike) -> RedisResult; +} + +impl Queryable for Pipeline { + async fn query(&self, conn: &mut impl ConnectionLike) -> RedisResult { + 
self.query_async(conn).await + } +} + +impl Queryable for Cmd { + async fn query(&self, conn: &mut impl ConnectionLike) -> RedisResult { + self.query_async(conn).await + } +} + impl RedisKVClient { pub fn new(client: ConnectionWithCredentialsProvider, info: &'static [RateBucketInfo]) -> Self { Self { @@ -27,158 +45,24 @@ impl RedisKVClient { Ok(()) } - pub(crate) async fn hset(&mut self, key: K, field: F, value: V) -> anyhow::Result<()> - where - K: ToRedisArgs + Send + Sync, - F: ToRedisArgs + Send + Sync, - V: ToRedisArgs + Send + Sync, - { - if !self.limiter.check() { - tracing::info!("Rate limit exceeded. Skipping hset"); - return Err(anyhow::anyhow!("Rate limit exceeded")); - } - - match self.client.hset(&key, &field, &value).await { - Ok(()) => return Ok(()), - Err(e) => { - tracing::error!("failed to set a key-value pair: {e}"); - } - } - - tracing::info!("Redis client is disconnected. Reconnectiong..."); - self.try_connect().await?; - self.client - .hset(key, field, value) - .await - .map_err(anyhow::Error::new) - } - - #[allow(dead_code)] - pub(crate) async fn hset_multiple( + pub(crate) async fn query( &mut self, - key: &str, - items: &[(K, V)], - ) -> anyhow::Result<()> - where - K: ToRedisArgs + Send + Sync, - V: ToRedisArgs + Send + Sync, - { - if !self.limiter.check() { - tracing::info!("Rate limit exceeded. Skipping hset_multiple"); - return Err(anyhow::anyhow!("Rate limit exceeded")); - } - - match self.client.hset_multiple(key, items).await { - Ok(()) => return Ok(()), - Err(e) => { - tracing::error!("failed to set a key-value pair: {e}"); - } - } - - tracing::info!("Redis client is disconnected. Reconnectiong..."); - self.try_connect().await?; - self.client - .hset_multiple(key, items) - .await - .map_err(anyhow::Error::new) - } - - #[allow(dead_code)] - pub(crate) async fn expire(&mut self, key: K, seconds: i64) -> anyhow::Result<()> - where - K: ToRedisArgs + Send + Sync, - { - if !self.limiter.check() { - tracing::info!("Rate limit exceeded. 
Skipping expire"); - return Err(anyhow::anyhow!("Rate limit exceeded")); - } - - match self.client.expire(&key, seconds).await { - Ok(()) => return Ok(()), - Err(e) => { - tracing::error!("failed to set a key-value pair: {e}"); - } - } - - tracing::info!("Redis client is disconnected. Reconnectiong..."); - self.try_connect().await?; - self.client - .expire(key, seconds) - .await - .map_err(anyhow::Error::new) - } - - #[allow(dead_code)] - pub(crate) async fn hget(&mut self, key: K, field: F) -> anyhow::Result - where - K: ToRedisArgs + Send + Sync, - F: ToRedisArgs + Send + Sync, - V: redis::FromRedisValue, - { - if !self.limiter.check() { - tracing::info!("Rate limit exceeded. Skipping hget"); - return Err(anyhow::anyhow!("Rate limit exceeded")); - } - - match self.client.hget(&key, &field).await { - Ok(value) => return Ok(value), - Err(e) => { - tracing::error!("failed to get a value: {e}"); - } - } - - tracing::info!("Redis client is disconnected. Reconnectiong..."); - self.try_connect().await?; - self.client - .hget(key, field) - .await - .map_err(anyhow::Error::new) - } - - pub(crate) async fn hget_all(&mut self, key: K) -> anyhow::Result - where - K: ToRedisArgs + Send + Sync, - V: redis::FromRedisValue, - { - if !self.limiter.check() { - tracing::info!("Rate limit exceeded. Skipping hgetall"); - return Err(anyhow::anyhow!("Rate limit exceeded")); - } - - match self.client.hgetall(&key).await { - Ok(value) => return Ok(value), - Err(e) => { - tracing::error!("failed to get a value: {e}"); - } - } - - tracing::info!("Redis client is disconnected. Reconnectiong..."); - self.try_connect().await?; - self.client.hgetall(key).await.map_err(anyhow::Error::new) - } - - pub(crate) async fn hdel(&mut self, key: K, field: F) -> anyhow::Result<()> - where - K: ToRedisArgs + Send + Sync, - F: ToRedisArgs + Send + Sync, - { + q: impl Queryable, + ) -> anyhow::Result { if !self.limiter.check() { - tracing::info!("Rate limit exceeded. 
Skipping hdel"); + tracing::info!("Rate limit exceeded. Skipping query"); return Err(anyhow::anyhow!("Rate limit exceeded")); } - match self.client.hdel(&key, &field).await { - Ok(()) => return Ok(()), + match q.query(&mut self.client).await { + Ok(t) => return Ok(t), Err(e) => { - tracing::error!("failed to delete a key-value pair: {e}"); + tracing::error!("failed to run query: {e}"); } } - tracing::info!("Redis client is disconnected. Reconnectiong..."); + tracing::info!("Redis client is disconnected. Reconnecting..."); self.try_connect().await?; - self.client - .hdel(key, field) - .await - .map_err(anyhow::Error::new) + Ok(q.query(&mut self.client).await?) } } diff --git a/proxy/src/serverless/conn_pool_lib.rs b/proxy/src/serverless/conn_pool_lib.rs index 77b548cc43a7..42a3ea17a248 100644 --- a/proxy/src/serverless/conn_pool_lib.rs +++ b/proxy/src/serverless/conn_pool_lib.rs @@ -47,6 +47,7 @@ impl ConnInfo { } #[derive(Clone)] +#[allow(clippy::large_enum_variant, reason = "TODO")] pub(crate) enum ClientDataEnum { Remote(ClientDataRemote), Local(ClientDataLocal), diff --git a/safekeeper/src/http/mod.rs b/safekeeper/src/http/mod.rs index 003a75faa6a7..6e7c5d971df8 100644 --- a/safekeeper/src/http/mod.rs +++ b/safekeeper/src/http/mod.rs @@ -31,6 +31,7 @@ pub async fn task_main_https( global_timelines: Arc, ) -> anyhow::Result<()> { let cert_resolver = ReloadingCertificateResolver::new( + "main", &conf.ssl_key_file, &conf.ssl_cert_file, conf.ssl_cert_reload_period, diff --git a/safekeeper/src/timeline.rs b/safekeeper/src/timeline.rs index e6a7ade9f2e2..b7ba28f4356d 100644 --- a/safekeeper/src/timeline.rs +++ b/safekeeper/src/timeline.rs @@ -138,6 +138,7 @@ impl Drop for WriteGuardSharedState<'_> { /// Usually it holds SafeKeeper, but it also supports offloaded timeline state. In this /// case, SafeKeeper is not available (because WAL is not present on disk) and all /// operations can be done only with control file. 
+#[allow(clippy::large_enum_variant, reason = "TODO")] pub enum StateSK { Loaded(SafeKeeper), Offloaded(Box>), diff --git a/safekeeper/src/timeline_eviction.rs b/safekeeper/src/timeline_eviction.rs index 06ccb32d03a1..84c636daf627 100644 --- a/safekeeper/src/timeline_eviction.rs +++ b/safekeeper/src/timeline_eviction.rs @@ -35,7 +35,7 @@ impl Manager { next_event: &Option, state: &StateSnapshot, ) -> bool { - let ready = self.backup_task.is_none() + self.backup_task.is_none() && self.recovery_task.is_none() && self.wal_removal_task.is_none() && self.partial_backup_task.is_none() @@ -61,8 +61,7 @@ impl Manager { .unwrap() .flush_lsn .segment_number(self.wal_seg_size) - == self.last_removed_segno + 1; - ready + == self.last_removed_segno + 1 } /// Evict the timeline to remote storage. Returns whether the eviction was successful. diff --git a/storage_broker/src/bin/storage_broker.rs b/storage_broker/src/bin/storage_broker.rs index f1bd7ba708d9..a7e0c986e6da 100644 --- a/storage_broker/src/bin/storage_broker.rs +++ b/storage_broker/src/bin/storage_broker.rs @@ -96,6 +96,7 @@ enum Message { impl Message { /// Convert proto message to internal message. + #[allow(clippy::result_large_err, reason = "TODO")] pub fn from(proto_msg: TypedMessage) -> Result { match proto_msg.r#type() { MessageType::SafekeeperTimelineInfo => Ok(Message::SafekeeperTimelineInfo( @@ -127,6 +128,7 @@ impl Message { } /// Get the tenant_timeline_id from the message. + #[allow(clippy::result_large_err, reason = "TODO")] pub fn tenant_timeline_id(&self) -> Result, Status> { match self { Message::SafekeeperTimelineInfo(msg) => Ok(msg @@ -185,6 +187,7 @@ enum SubscriptionKey { impl SubscriptionKey { /// Parse protobuf subkey (protobuf doesn't have fixed size bytes, we get vectors). 
+ #[allow(clippy::result_large_err, reason = "TODO")] pub fn from_proto_subscription_key(key: ProtoSubscriptionKey) -> Result { match key { ProtoSubscriptionKey::All(_) => Ok(SubscriptionKey::All), @@ -195,6 +198,7 @@ impl SubscriptionKey { } /// Parse from FilterTenantTimelineId + #[allow(clippy::result_large_err, reason = "TODO")] pub fn from_proto_filter_tenant_timeline_id( opt: Option<&FilterTenantTimelineId>, ) -> Result { @@ -385,6 +389,7 @@ impl Registry { } /// Send msg to relevant subscribers. + #[allow(clippy::result_large_err, reason = "TODO")] pub fn send_msg(&self, msg: &Message) -> Result<(), Status> { PROCESSED_MESSAGES_TOTAL.inc(); @@ -436,6 +441,7 @@ struct Publisher { impl Publisher { /// Send msg to relevant subscribers. + #[allow(clippy::result_large_err, reason = "TODO")] pub fn send_msg(&mut self, msg: &Message) -> Result<(), Status> { self.registry.send_msg(msg) } diff --git a/storage_broker/src/lib.rs b/storage_broker/src/lib.rs index 55d411f607fb..7b36f5e9483c 100644 --- a/storage_broker/src/lib.rs +++ b/storage_broker/src/lib.rs @@ -79,6 +79,7 @@ impl BrokerClientChannel { } // parse variable length bytes from protobuf +#[allow(clippy::result_large_err, reason = "TODO")] pub fn parse_proto_ttid(proto_ttid: &ProtoTenantTimelineId) -> Result { let tenant_id = TenantId::from_slice(&proto_ttid.tenant_id) .map_err(|e| Status::new(Code::InvalidArgument, format!("malformed tenant_id: {}", e)))?; diff --git a/storage_controller/src/compute_hook.rs b/storage_controller/src/compute_hook.rs index 31ab443ccdb6..57709302e18b 100644 --- a/storage_controller/src/compute_hook.rs +++ b/storage_controller/src/compute_hook.rs @@ -629,15 +629,13 @@ impl ComputeHook { }; let result = if !self.config.use_local_compute_notifications { - let compute_hook_url = if let Some(control_plane_url) = &self.config.control_plane_url { - Some(if control_plane_url.ends_with('/') { - format!("{control_plane_url}notify-attach") - } else { - 
format!("{control_plane_url}/notify-attach") - }) - } else { - self.config.compute_hook_url.clone() - }; + let compute_hook_url = + self.config + .control_plane_url + .as_ref() + .map(|control_plane_url| { + format!("{}/notify-attach", control_plane_url.trim_end_matches('/')) + }); // We validate this at startup let notify_url = compute_hook_url.as_ref().unwrap(); @@ -800,7 +798,7 @@ impl ComputeHook { #[cfg(test)] pub(crate) mod tests { - use pageserver_api::shard::{ShardCount, ShardNumber}; + use pageserver_api::shard::{DEFAULT_STRIPE_SIZE, ShardCount, ShardNumber}; use utils::id::TenantId; use super::*; @@ -808,6 +806,7 @@ pub(crate) mod tests { #[test] fn tenant_updates() -> anyhow::Result<()> { let tenant_id = TenantId::generate(); + let stripe_size = DEFAULT_STRIPE_SIZE; let mut tenant_state = ComputeHookTenant::new( TenantShardId { tenant_id, @@ -848,7 +847,7 @@ pub(crate) mod tests { shard_count: ShardCount::new(2), shard_number: ShardNumber(1), }, - stripe_size: ShardStripeSize(32768), + stripe_size, preferred_az: None, node_id: NodeId(1), }); @@ -864,7 +863,7 @@ pub(crate) mod tests { shard_count: ShardCount::new(2), shard_number: ShardNumber(0), }, - stripe_size: ShardStripeSize(32768), + stripe_size, preferred_az: None, node_id: NodeId(1), }); @@ -874,7 +873,7 @@ pub(crate) mod tests { anyhow::bail!("Wrong send result"); }; assert_eq!(request.shards.len(), 2); - assert_eq!(request.stripe_size, Some(ShardStripeSize(32768))); + assert_eq!(request.stripe_size, Some(stripe_size)); // Simulate successful send *guard = Some(ComputeRemoteState { diff --git a/storage_controller/src/http.rs b/storage_controller/src/http.rs index 0d1dc8f8eec1..fb4530d0d219 100644 --- a/storage_controller/src/http.rs +++ b/storage_controller/src/http.rs @@ -22,6 +22,7 @@ use pageserver_api::controller_api::{ MetadataHealthListUnhealthyResponse, MetadataHealthUpdateRequest, MetadataHealthUpdateResponse, NodeAvailability, NodeConfigureRequest, NodeRegisterRequest, 
SafekeeperSchedulingPolicyRequest, ShardsPreferredAzsRequest, TenantCreateRequest, TenantPolicyRequest, TenantShardMigrateRequest, + TimelineImportRequest, }; use pageserver_api::models::{ DetachBehavior, LsnLeaseRequest, TenantConfigPatchRequest, TenantConfigRequest, @@ -1235,8 +1236,18 @@ async fn handle_step_down(req: Request) -> Result, ApiError ForwardOutcome::NotForwarded(req) => req, }; - let state = get_state(&req); - json_response(StatusCode::OK, state.service.step_down().await) + // Spawn a background task: once we start stepping down, we must finish: if the client drops + // their request we should avoid stopping in some part-stepped-down state. + let handle = tokio::spawn(async move { + let state = get_state(&req); + state.service.step_down().await + }); + + let result = handle + .await + .map_err(|e| ApiError::InternalServerError(e.into()))?; + + json_response(StatusCode::OK, result) } async fn handle_tenant_drop(req: Request) -> Result, ApiError> { @@ -1276,6 +1287,37 @@ async fn handle_tenant_import(req: Request) -> Result, ApiE ) } +async fn handle_timeline_import(req: Request) -> Result, ApiError> { + let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?; + let timeline_id: TimelineId = parse_request_param(&req, "timeline_id")?; + check_permissions(&req, Scope::PageServerApi)?; + maybe_rate_limit(&req, tenant_id).await; + + let mut req = match maybe_forward(req).await { + ForwardOutcome::Forwarded(res) => { + return res; + } + ForwardOutcome::NotForwarded(req) => req, + }; + + let import_req = json_request::(&mut req).await?; + + let state = get_state(&req); + + if import_req.tenant_id != tenant_id || import_req.timeline_id != timeline_id { + return Err(ApiError::BadRequest(anyhow::anyhow!( + "tenant id or timeline id mismatch: url={tenant_id}/{timeline_id}, body={}/{}", + import_req.tenant_id, + import_req.timeline_id + ))); + } + + json_response( + StatusCode::OK, + state.service.timeline_import(import_req).await?, + ) +} + async fn 
handle_tenants_dump(req: Request) -> Result, ApiError> { check_permissions(&req, Scope::Admin)?; @@ -1949,6 +1991,16 @@ pub fn make_router( RequestName("debug_v1_tenant_locate"), ) }) + .post( + "/debug/v1/tenant/:tenant_id/timeline/:timeline_id/import", + |r| { + named_request_span( + r, + handle_timeline_import, + RequestName("debug_v1_timeline_import"), + ) + }, + ) .get("/debug/v1/scheduler", |r| { named_request_span(r, handle_scheduler_dump, RequestName("debug_v1_scheduler")) }) diff --git a/storage_controller/src/main.rs b/storage_controller/src/main.rs index 1aa9ae10aea6..a924e5b6c558 100644 --- a/storage_controller/src/main.rs +++ b/storage_controller/src/main.rs @@ -86,10 +86,6 @@ struct Cli { #[arg(long)] peer_jwt_token: Option, - /// URL to control plane compute notification endpoint - #[arg(long)] - compute_hook_url: Option, - /// URL to control plane storage API prefix #[arg(long)] control_plane_url: Option, @@ -360,13 +356,11 @@ async fn async_main() -> anyhow::Result<()> { "Insecure config! One or more secrets is not set. This is only permitted in `--dev` mode" ); } - StrictMode::Strict - if args.compute_hook_url.is_none() && args.control_plane_url.is_none() => - { + StrictMode::Strict if args.control_plane_url.is_none() => { // Production systems should always have a control plane URL set, to prevent falling // back to trying to use neon_local. 
anyhow::bail!( - "neither `--compute-hook-url` nor `--control-plane-url` are set: this is only permitted in `--dev` mode" + "`--control-plane-url` is not set: this is only permitted in `--dev` mode" ); } StrictMode::Strict if args.use_local_compute_notifications => { @@ -394,7 +388,6 @@ async fn async_main() -> anyhow::Result<()> { safekeeper_jwt_token: secrets.safekeeper_jwt_token, control_plane_jwt_token: secrets.control_plane_jwt_token, peer_jwt_token: secrets.peer_jwt_token, - compute_hook_url: args.compute_hook_url, control_plane_url: args.control_plane_url, max_offline_interval: args .max_offline_interval @@ -472,6 +465,7 @@ async fn async_main() -> anyhow::Result<()> { let https_listener = tcp_listener::bind(https_addr)?; let resolver = ReloadingCertificateResolver::new( + "main", &args.ssl_key_file, &args.ssl_cert_file, *args.ssl_cert_reload_period, diff --git a/storage_controller/src/metrics.rs b/storage_controller/src/metrics.rs index ea390df726a6..5ce2fb65e4bb 100644 --- a/storage_controller/src/metrics.rs +++ b/storage_controller/src/metrics.rs @@ -44,6 +44,15 @@ pub(crate) struct StorageControllerMetricGroup { /// Size of the in-memory map of pageserver_nodes pub(crate) storage_controller_pageserver_nodes: measured::Gauge, + /// Count of how many pageserver nodes from in-memory map have https configured + pub(crate) storage_controller_https_pageserver_nodes: measured::Gauge, + + /// Size of the in-memory map of safekeeper_nodes + pub(crate) storage_controller_safekeeper_nodes: measured::Gauge, + + /// Count of how many safekeeper nodes from in-memory map have https configured + pub(crate) storage_controller_https_safekeeper_nodes: measured::Gauge, + /// Reconciler tasks completed, broken down by success/failure/cancelled pub(crate) storage_controller_reconcile_complete: measured::CounterVec, diff --git a/storage_controller/src/node.rs b/storage_controller/src/node.rs index f667514517b0..e180c49b4319 100644 --- a/storage_controller/src/node.rs +++ 
b/storage_controller/src/node.rs @@ -89,6 +89,10 @@ impl Node { self.scheduling = scheduling } + pub(crate) fn has_https_port(&self) -> bool { + self.listen_https_port.is_some() + } + /// Does this registration request match `self`? This is used when deciding whether a registration /// request should be allowed to update an existing record with the same node ID. pub(crate) fn registration_match(&self, register_req: &NodeRegisterRequest) -> bool { diff --git a/storage_controller/src/safekeeper.rs b/storage_controller/src/safekeeper.rs index 3b731acf7e11..5a13ef750e30 100644 --- a/storage_controller/src/safekeeper.rs +++ b/storage_controller/src/safekeeper.rs @@ -89,6 +89,9 @@ impl Safekeeper { pub(crate) fn availability(&self) -> SafekeeperState { self.availability.clone() } + pub(crate) fn has_https_port(&self) -> bool { + self.listen_https_port.is_some() + } /// Perform an operation (which is given a [`SafekeeperClient`]) with retries #[allow(clippy::too_many_arguments)] pub(crate) async fn with_client_retries( diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs index 50f642deaf7e..a02131347400 100644 --- a/storage_controller/src/service.rs +++ b/storage_controller/src/service.rs @@ -43,7 +43,7 @@ use pageserver_api::models::{ TimelineInfo, TopTenantShardItem, TopTenantShardsRequest, }; use pageserver_api::shard::{ - ShardCount, ShardIdentity, ShardNumber, ShardStripeSize, TenantShardId, + DEFAULT_STRIPE_SIZE, ShardCount, ShardIdentity, ShardNumber, ShardStripeSize, TenantShardId, }; use pageserver_api::upcall_api::{ ReAttachRequest, ReAttachResponse, ReAttachResponseTenant, ValidateRequest, ValidateResponse, @@ -61,7 +61,7 @@ use utils::completion::Barrier; use utils::generation::Generation; use utils::id::{NodeId, TenantId, TimelineId}; use utils::lsn::Lsn; -use utils::sync::gate::Gate; +use utils::sync::gate::{Gate, GateGuard}; use utils::{failpoint_support, pausable_failpoint}; use crate::background_node_operations::{ @@ -357,18 
+357,10 @@ pub struct Config { // This JWT token will be used to authenticate with other storage controller instances pub peer_jwt_token: Option, - /// Where the compute hook should send notifications of pageserver attachment locations - /// (this URL points to the control plane in prod). If this is None, the compute hook will - /// assume it is running in a test environment and try to update neon_local. - pub compute_hook_url: Option, - /// Prefix for storage API endpoints of the control plane. We use this prefix to compute /// URLs that we use to send pageserver and safekeeper attachment locations. /// If this is None, the compute hook will assume it is running in a test environment /// and try to invoke neon_local instead. - /// - /// For now, there is also `compute_hook_url` which allows configuration of the pageserver - /// specific endpoint, but it is in the process of being phased out. pub control_plane_url: Option, /// Grace period within which a pageserver does not respond to heartbeats, but is still @@ -594,6 +586,8 @@ struct TenantShardSplitAbort { new_stripe_size: Option, /// Until this abort op is complete, no other operations may be done on the tenant _tenant_lock: TracingExclusiveGuard, + /// The reconciler gate for the duration of the split operation, and any included abort. + _gate: GateGuard, } #[derive(thiserror::Error, Debug)] @@ -1460,7 +1454,7 @@ impl Service { // Retry until shutdown: we must keep this request object alive until it is properly // processed, as it holds a lock guard that prevents other operations trying to do things // to the tenant while it is in a weird part-split state. - while !self.cancel.is_cancelled() { + while !self.reconcilers_cancel.is_cancelled() { match self.abort_tenant_shard_split(&op).await { Ok(_) => break, Err(e) => { @@ -1473,9 +1467,12 @@ impl Service { // when we retry, so that the abort op will succeed. 
If the abort op is failing // for some other reason, we will keep retrying forever, or until a human notices // and does something about it (either fixing a pageserver or restarting the controller). - tokio::time::timeout(Duration::from_secs(5), self.cancel.cancelled()) - .await - .ok(); + tokio::time::timeout( + Duration::from_secs(5), + self.reconcilers_cancel.cancelled(), + ) + .await + .ok(); } } } @@ -1509,6 +1506,10 @@ impl Service { .metrics_group .storage_controller_pageserver_nodes .set(nodes.len() as i64); + metrics::METRICS_REGISTRY + .metrics_group + .storage_controller_https_pageserver_nodes + .set(nodes.values().filter(|n| n.has_https_port()).count() as i64); tracing::info!("Loading safekeepers from database..."); let safekeepers = persistence @@ -1526,6 +1527,14 @@ impl Service { let safekeepers: HashMap = safekeepers.into_iter().map(|n| (n.get_id(), n)).collect(); tracing::info!("Loaded {} safekeepers from database.", safekeepers.len()); + metrics::METRICS_REGISTRY + .metrics_group + .storage_controller_safekeeper_nodes + .set(safekeepers.len() as i64); + metrics::METRICS_REGISTRY + .metrics_group + .storage_controller_https_safekeeper_nodes + .set(safekeepers.values().filter(|s| s.has_https_port()).count() as i64); tracing::info!("Loading shards from database..."); let mut tenant_shard_persistence = persistence.load_active_tenant_shards().await?; @@ -1835,6 +1844,7 @@ impl Service { }; if insert { + let config = attach_req.config.clone().unwrap_or_default(); let tsp = TenantShardPersistence { tenant_id: attach_req.tenant_shard_id.tenant_id.to_string(), shard_number: attach_req.tenant_shard_id.shard_number.0 as i32, @@ -1843,7 +1853,7 @@ impl Service { generation: attach_req.generation_override.or(Some(0)), generation_pageserver: None, placement_policy: serde_json::to_string(&PlacementPolicy::Attached(0)).unwrap(), - config: serde_json::to_string(&TenantConfig::default()).unwrap(), + config: serde_json::to_string(&config).unwrap(), splitting: 
SplitState::default(), scheduling_policy: serde_json::to_string(&ShardSchedulingPolicy::default()) .unwrap(), @@ -1866,16 +1876,16 @@ impl Service { Ok(()) => { tracing::info!("Inserted shard {} in database", attach_req.tenant_shard_id); - let mut locked = self.inner.write().unwrap(); - locked.tenants.insert( + let mut shard = TenantShard::new( attach_req.tenant_shard_id, - TenantShard::new( - attach_req.tenant_shard_id, - ShardIdentity::unsharded(), - PlacementPolicy::Attached(0), - None, - ), + ShardIdentity::unsharded(), + PlacementPolicy::Attached(0), + None, ); + shard.config = config; + + let mut locked = self.inner.write().unwrap(); + locked.tenants.insert(attach_req.tenant_shard_id, shard); tracing::info!("Inserted shard {} in memory", attach_req.tenant_shard_id); } } @@ -1960,11 +1970,12 @@ impl Service { .set_attached(scheduler, attach_req.node_id); tracing::info!( - "attach_hook: tenant {} set generation {:?}, pageserver {}", + "attach_hook: tenant {} set generation {:?}, pageserver {}, config {:?}", attach_req.tenant_shard_id, tenant_shard.generation, // TODO: this is an odd number of 0xf's - attach_req.node_id.unwrap_or(utils::id::NodeId(0xfffffff)) + attach_req.node_id.unwrap_or(utils::id::NodeId(0xfffffff)), + attach_req.config, ); // Trick the reconciler into not doing anything for this tenant: this helps @@ -2742,7 +2753,7 @@ impl Service { count: tenant_shard_id.shard_count, // We only import un-sharded or single-sharded tenants, so stripe // size can be made up arbitrarily here. 
- stripe_size: ShardParameters::DEFAULT_STRIPE_SIZE, + stripe_size: DEFAULT_STRIPE_SIZE, }, placement_policy: Some(placement_policy), config: req.config.tenant_conf, @@ -4898,7 +4909,7 @@ impl Service { 1, 10, Duration::from_secs(5), - &self.cancel, + &self.reconcilers_cancel, ) .await { @@ -5149,6 +5160,11 @@ impl Service { ) .await; + let _gate = self + .reconcilers_gate + .enter() + .map_err(|_| ApiError::ShuttingDown)?; + let new_shard_count = ShardCount::new(split_req.new_shard_count); let new_stripe_size = split_req.new_stripe_size; @@ -5176,6 +5192,7 @@ impl Service { new_shard_count, new_stripe_size, _tenant_lock, + _gate, }) // Ignore error sending: that just means we're shutting down: aborts are ephemeral so it's fine to drop it. .ok(); @@ -5515,7 +5532,10 @@ impl Service { "failpoint".to_string() ))); - failpoint_support::sleep_millis_async!("shard-split-post-remote-sleep", &self.cancel); + failpoint_support::sleep_millis_async!( + "shard-split-post-remote-sleep", + &self.reconcilers_cancel + ); tracing::info!( "Split {} into {}", @@ -5573,7 +5593,7 @@ impl Service { stripe_size, preferred_az: preferred_az_id.as_ref().map(Cow::Borrowed), }, - &self.cancel, + &self.reconcilers_cancel, ) .await { @@ -6014,9 +6034,21 @@ impl Service { .max() .expect("We already validated >0 shards"); - // FIXME: we have no way to recover the shard stripe size from contents of remote storage: this will - // only work if they were using the default stripe size. - let stripe_size = ShardParameters::DEFAULT_STRIPE_SIZE; + // Find the tenant's stripe size. This wasn't always persisted in the tenant manifest, so + // fall back to the original default stripe size of 32768 (256 MB) if it's not specified. 
+ const ORIGINAL_STRIPE_SIZE: ShardStripeSize = ShardStripeSize(32768); + let stripe_size = scan_result + .shards + .iter() + .find(|s| s.tenant_shard_id.shard_count == shard_count && s.generation == generation) + .expect("we validated >0 shards above") + .stripe_size + .unwrap_or_else(|| { + if shard_count.count() > 1 { + warn!("unknown stripe size, assuming {ORIGINAL_STRIPE_SIZE}"); + } + ORIGINAL_STRIPE_SIZE + }); let (response, waiters) = self .do_tenant_create(TenantCreateRequest { @@ -6242,6 +6274,10 @@ impl Service { .metrics_group .storage_controller_pageserver_nodes .set(locked.nodes.len() as i64); + metrics::METRICS_REGISTRY + .metrics_group + .storage_controller_https_pageserver_nodes + .set(locked.nodes.values().filter(|n| n.has_https_port()).count() as i64); locked.scheduler.node_remove(node_id); @@ -6333,6 +6369,10 @@ impl Service { .metrics_group .storage_controller_pageserver_nodes .set(nodes.len() as i64); + metrics::METRICS_REGISTRY + .metrics_group + .storage_controller_https_pageserver_nodes + .set(nodes.values().filter(|n| n.has_https_port()).count() as i64); } } @@ -6557,6 +6597,10 @@ impl Service { .metrics_group .storage_controller_pageserver_nodes .set(locked.nodes.len() as i64); + metrics::METRICS_REGISTRY + .metrics_group + .storage_controller_https_pageserver_nodes + .set(locked.nodes.values().filter(|n| n.has_https_port()).count() as i64); match registration_status { RegistrationStatus::New => { @@ -7270,7 +7314,7 @@ impl Service { } // Eventual consistency: if an earlier reconcile job failed, and the shard is still - // dirty, spawn another rone + // dirty, spawn another one if self .maybe_reconcile_shard(shard, &pageservers, ReconcilerPriority::Normal) .is_some() @@ -7829,7 +7873,7 @@ impl Service { // old, persisted stripe size. let new_stripe_size = match candidate.id.shard_count.count() { 0 => panic!("invalid shard count 0"), - 1 => Some(ShardParameters::DEFAULT_STRIPE_SIZE), + 1 => Some(DEFAULT_STRIPE_SIZE), 2.. 
=> None, }; @@ -8634,9 +8678,24 @@ impl Service { failpoint_support::sleep_millis_async!("sleep-on-step-down-handling"); self.inner.write().unwrap().step_down(); - // TODO: would it make sense to have a time-out for this? - self.stop_reconciliations(StopReconciliationsReason::SteppingDown) - .await; + + // Wait for reconciliations to stop, or terminate this process if they + // fail to stop in time (this indicates a bug in shutdown) + tokio::select! { + _ = self.stop_reconciliations(StopReconciliationsReason::SteppingDown) => { + tracing::info!("Reconciliations stopped, proceeding with step down"); + } + _ = async { + failpoint_support::sleep_millis_async!("step-down-delay-timeout"); + tokio::time::sleep(Duration::from_secs(10)).await + } => { + tracing::warn!("Step down timed out while waiting for reconciliation gate, terminating process"); + + // The caller may proceed to act as leader when it sees this request fail: reduce the chance + // of a split-brain situation by terminating this controller instead of leaving it up in a partially-shut-down state. 
+ std::process::exit(1); + } + } let mut global_observed = GlobalObservedState::default(); let locked = self.inner.read().unwrap(); diff --git a/storage_controller/src/service/safekeeper_service.rs b/storage_controller/src/service/safekeeper_service.rs index 7f2c63b9afcc..a23b9a4a0260 100644 --- a/storage_controller/src/service/safekeeper_service.rs +++ b/storage_controller/src/service/safekeeper_service.rs @@ -5,19 +5,23 @@ use std::time::Duration; use super::safekeeper_reconciler::ScheduleRequest; use crate::heartbeater::SafekeeperState; +use crate::metrics; use crate::persistence::{ DatabaseError, SafekeeperTimelineOpKind, TimelinePendingOpPersistence, TimelinePersistence, }; use crate::safekeeper::Safekeeper; use anyhow::Context; use http_utils::error::ApiError; -use pageserver_api::controller_api::{SafekeeperDescribeResponse, SkSchedulingPolicy}; +use pageserver_api::controller_api::{ + SafekeeperDescribeResponse, SkSchedulingPolicy, TimelineImportRequest, +}; use pageserver_api::models::{self, SafekeeperInfo, SafekeepersInfo, TimelineInfo}; use safekeeper_api::membership::{MemberSet, SafekeeperId}; use tokio::task::JoinSet; use tokio_util::sync::CancellationToken; use utils::id::{NodeId, TenantId, TimelineId}; use utils::logging::SecretString; +use utils::lsn::Lsn; use super::Service; @@ -297,6 +301,31 @@ impl Service { timeline_id, }) } + + /// Directly insert the timeline into the database without reconciling it with safekeepers. + /// + /// Useful if the timeline already exists on the specified safekeepers, + /// but we want to make it storage controller managed. 
+ pub(crate) async fn timeline_import(&self, req: TimelineImportRequest) -> Result<(), ApiError> { + let persistence = TimelinePersistence { + tenant_id: req.tenant_id.to_string(), + timeline_id: req.timeline_id.to_string(), + start_lsn: Lsn::INVALID.into(), + generation: 1, + sk_set: req.sk_set.iter().map(|sk_id| sk_id.0 as i64).collect(), + new_sk_set: None, + cplane_notified_generation: 1, + deleted_at: None, + }; + let inserted = self.persistence.insert_timeline(persistence).await?; + if inserted { + tracing::info!("imported timeline into db"); + } else { + tracing::info!("didn't import timeline into db, as it is already present in db"); + } + Ok(()) + } + /// Perform timeline deletion on safekeepers. Will return success: we persist the deletion into the reconciler. pub(super) async fn tenant_timeline_delete_safekeepers( self: &Arc, @@ -590,6 +619,20 @@ impl Service { } } locked.safekeepers = Arc::new(safekeepers); + metrics::METRICS_REGISTRY + .metrics_group + .storage_controller_safekeeper_nodes + .set(locked.safekeepers.len() as i64); + metrics::METRICS_REGISTRY + .metrics_group + .storage_controller_https_safekeeper_nodes + .set( + locked + .safekeepers + .values() + .filter(|s| s.has_https_port()) + .count() as i64, + ); } Ok(()) } diff --git a/storage_controller/src/tenant_shard.rs b/storage_controller/src/tenant_shard.rs index 8424c65aba4c..3a75e96cb2b5 100644 --- a/storage_controller/src/tenant_shard.rs +++ b/storage_controller/src/tenant_shard.rs @@ -2000,7 +2000,7 @@ pub(crate) mod tests { use std::rc::Rc; use pageserver_api::controller_api::NodeAvailability; - use pageserver_api::shard::{ShardCount, ShardNumber}; + use pageserver_api::shard::{DEFAULT_STRIPE_SIZE, ShardCount, ShardNumber}; use rand::SeedableRng; use rand::rngs::StdRng; use utils::id::TenantId; @@ -2012,6 +2012,7 @@ pub(crate) mod tests { let tenant_id = TenantId::generate(); let shard_number = ShardNumber(0); let shard_count = ShardCount::new(1); + let stripe_size = 
DEFAULT_STRIPE_SIZE; let tenant_shard_id = TenantShardId { tenant_id, @@ -2020,12 +2021,7 @@ pub(crate) mod tests { }; TenantShard::new( tenant_shard_id, - ShardIdentity::new( - shard_number, - shard_count, - pageserver_api::shard::ShardStripeSize(32768), - ) - .unwrap(), + ShardIdentity::new(shard_number, shard_count, stripe_size).unwrap(), policy, None, ) @@ -2045,6 +2041,7 @@ pub(crate) mod tests { shard_count: ShardCount, preferred_az: Option, ) -> Vec { + let stripe_size = DEFAULT_STRIPE_SIZE; (0..shard_count.count()) .map(|i| { let shard_number = ShardNumber(i); @@ -2056,12 +2053,7 @@ pub(crate) mod tests { }; TenantShard::new( tenant_shard_id, - ShardIdentity::new( - shard_number, - shard_count, - pageserver_api::shard::ShardStripeSize(32768), - ) - .unwrap(), + ShardIdentity::new(shard_number, shard_count, stripe_size).unwrap(), policy.clone(), preferred_az.clone(), ) diff --git a/test_runner/fixtures/metrics.py b/test_runner/fixtures/metrics.py index df500544dc70..879808b7baae 100644 --- a/test_runner/fixtures/metrics.py +++ b/test_runner/fixtures/metrics.py @@ -194,6 +194,7 @@ def counter(name: str) -> str: counter("pageserver_wait_lsn_started_count"), counter("pageserver_wait_lsn_finished_count"), counter("pageserver_wait_ondemand_download_seconds_sum"), + counter("pageserver_page_service_batch_break_reason"), *histogram("pageserver_page_service_batch_size"), *histogram("pageserver_page_service_pagestream_batch_wait_time_seconds"), *PAGESERVER_PER_TENANT_REMOTE_TIMELINE_CLIENT_METRICS, diff --git a/test_runner/fixtures/neon_cli.py b/test_runner/fixtures/neon_cli.py index d555ee298915..5f5626fb98c0 100644 --- a/test_runner/fixtures/neon_cli.py +++ b/test_runner/fixtures/neon_cli.py @@ -417,6 +417,19 @@ def storage_controller_stop(self, immediate: bool, instance_id: int | None = Non cmd.append(f"--instance-id={instance_id}") return self.raw_cli(cmd) + def object_storage_start(self, timeout_in_seconds: int | None = None): + cmd = ["object-storage", "start"] 
+ if timeout_in_seconds is not None: + cmd.append(f"--start-timeout={timeout_in_seconds}s") + return self.raw_cli(cmd) + + def object_storage_stop(self, immediate: bool): + cmd = ["object-storage", "stop"] + if immediate: + cmd.extend(["-m", "immediate"]) + return self.raw_cli(cmd) + pass + def pageserver_start( self, id: int, diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 5694bf170e36..10bbb7020bf1 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -14,6 +14,7 @@ import time import uuid from collections import defaultdict +from collections.abc import Mapping from contextlib import closing, contextmanager from dataclasses import dataclass from datetime import datetime @@ -79,7 +80,12 @@ default_remote_storage, remote_storage_to_toml_dict, ) -from fixtures.safekeeper.http import SafekeeperHttpClient +from fixtures.safekeeper.http import ( + MembershipConfiguration, + SafekeeperHttpClient, + SafekeeperId, + TimelineCreateRequest, +) from fixtures.safekeeper.utils import wait_walreceivers_absent from fixtures.utils import ( ATTACHMENT_NAME_REGEX, @@ -941,6 +947,8 @@ def cleanup_local_storage(self): continue if SMALL_DB_FILE_NAME_REGEX.fullmatch(test_file.name): continue + if FINAL_METRICS_FILE_NAME == test_file.name: + continue log.debug(f"Removing large database {test_file} file") test_file.unlink() elif test_entry.is_dir(): @@ -1023,6 +1031,8 @@ def __exit__( self.env.broker.assert_no_errors() + self.env.object_storage.assert_no_errors() + try: self.overlay_cleanup_teardown() except Exception as e: @@ -1118,6 +1128,8 @@ def __init__(self, config: NeonEnvBuilder): pagectl_env_vars["RUST_LOG"] = self.rust_log_override self.pagectl = Pagectl(extra_env=pagectl_env_vars, binpath=self.neon_binpath) + self.object_storage = ObjectStorage(self) + # The URL for the pageserver to use as its control_plane_api config if config.storage_controller_port_override is not None: log.info( 
@@ -1173,6 +1185,7 @@ def __init__(self, config: NeonEnvBuilder): }, "safekeepers": [], "pageservers": [], + "object_storage": {"port": self.port_distributor.get_port()}, "generate_local_ssl_certs": self.generate_local_ssl_certs, } @@ -1244,6 +1257,7 @@ def __init__(self, config: NeonEnvBuilder): "mode": "pipelined", "execution": "concurrent-futures", "max_batch_size": 32, + "batching": "scattered-lsn", } get_vectored_concurrent_io = self.pageserver_get_vectored_concurrent_io @@ -1408,6 +1422,8 @@ def start(self, timeout_in_seconds: int | None = None): self.storage_controller.on_safekeeper_deploy(sk_id, body) self.storage_controller.safekeeper_scheduling_policy(sk_id, "Active") + self.object_storage.start(timeout_in_seconds=timeout_in_seconds) + def stop(self, immediate=False, ps_assert_metric_no_errors=False, fail_on_endpoint_errors=True): """ After this method returns, there should be no child processes running. @@ -1425,6 +1441,8 @@ def stop(self, immediate=False, ps_assert_metric_no_errors=False, fail_on_endpoi except Exception as e: raise_later = e + self.object_storage.stop(immediate=immediate) + # Stop storage controller before pageservers: we don't want it to spuriously # detect a pageserver "failure" during test teardown self.storage_controller.stop(immediate=immediate) @@ -1441,6 +1459,12 @@ def stop(self, immediate=False, ps_assert_metric_no_errors=False, fail_on_endpoi except Exception as e: metric_errors.append(e) log.error(f"metric validation failed on {pageserver.id}: {e}") + + try: + pageserver.snapshot_final_metrics() + except Exception as e: + log.error(f"metric snapshot failed on {pageserver.id}: {e}") + try: pageserver.stop(immediate=immediate) except RuntimeError: @@ -1971,10 +1995,13 @@ def attach_hook_issue( tenant_shard_id: TenantId | TenantShardId, pageserver_id: int, generation_override: int | None = None, + config: None | dict[str, Any] = None, ) -> int: body = {"tenant_shard_id": str(tenant_shard_id), "node_id": pageserver_id} if 
generation_override is not None: body["generation_override"] = generation_override + if config is not None: + body["config"] = config response = self.request( "POST", @@ -2635,6 +2662,26 @@ def __exit__( self.stop(immediate=True) +class ObjectStorage(LogUtils): + def __init__(self, env: NeonEnv): + service_dir = env.repo_dir / "object_storage" + super().__init__(logfile=service_dir / "object_storage.log") + self.conf_path = service_dir / "object_storage.json" + self.env = env + + def base_url(self): + return json.loads(self.conf_path.read_text())["listen"] + + def start(self, timeout_in_seconds: int | None = None): + self.env.neon_cli.object_storage_start(timeout_in_seconds) + + def stop(self, immediate: bool = False): + self.env.neon_cli.object_storage_stop(immediate) + + def assert_no_errors(self): + assert_no_errors(self.logfile, "object_storage", []) + + class NeonProxiedStorageController(NeonStorageController): def __init__(self, env: NeonEnv, proxy_port: int, auth_enabled: bool, use_https: bool): super().__init__(env, proxy_port, auth_enabled, use_https) @@ -2849,13 +2896,14 @@ def restart( self, immediate: bool = False, timeout_in_seconds: int | None = None, + extra_env_vars: dict[str, str] | None = None, ): """ High level wrapper for restart: restarts the process, and waits for tenant state to stabilize. """ self.stop(immediate=immediate) - self.start(timeout_in_seconds=timeout_in_seconds) + self.start(timeout_in_seconds=timeout_in_seconds, extra_env_vars=extra_env_vars) self.quiesce_tenants() def quiesce_tenants(self): @@ -2932,6 +2980,20 @@ def assert_no_metric_errors(self): value = self.http_client().get_metric_value(metric) assert value == 0, f"Nonzero {metric} == {value}" + def snapshot_final_metrics(self): + """ + Take a snapshot of this pageserver's metrics and stash in its work directory. 
+ """ + if not self.running: + log.info(f"Skipping metrics snapshot on pageserver {self.id}, it is not running") + return + + metrics = self.http_client().get_metrics_str() + metrics_snapshot_path = self.workdir / FINAL_METRICS_FILE_NAME + + with open(metrics_snapshot_path, "w") as f: + f.write(metrics) + def tenant_attach( self, tenant_id: TenantId, @@ -2944,11 +3006,12 @@ def tenant_attach( to call into the pageserver HTTP client. """ client = self.http_client() - if generation is None: - generation = self.env.storage_controller.attach_hook_issue(tenant_id, self.id) - elif override_storage_controller_generation: + if generation is None or override_storage_controller_generation: generation = self.env.storage_controller.attach_hook_issue( - tenant_id, self.id, generation + tenant_id, + self.id, + generation_override=generation if override_storage_controller_generation else None, + config=config, ) return client.tenant_attach( tenant_id, @@ -4263,31 +4326,32 @@ def respec(self, **kwargs: Any) -> None: def respec_deep(self, **kwargs: Any) -> None: """ Update the endpoint.json file taking into account nested keys. - It does one level deep update. Should enough for most cases. Distinct method from respec() to do not break existing functionality. - NOTE: This method also updates the spec.json file, not endpoint.json. - We need it because neon_local also writes to spec.json, so intended + NOTE: This method also updates the config.json file, not endpoint.json. + We need it because neon_local also writes to config.json, so intended use-case is i) start endpoint with some config, ii) respec_deep(), iii) call reconfigure() to apply the changes. 
""" - config_path = os.path.join(self.endpoint_path(), "spec.json") + + def update(curr, patch): + for k, v in patch.items(): + if isinstance(v, Mapping): + curr[k] = update(curr.get(k, {}), v) + else: + curr[k] = v + return curr + + config_path = os.path.join(self.endpoint_path(), "config.json") with open(config_path) as f: - data_dict: dict[str, Any] = json.load(f) + config: dict[str, Any] = json.load(f) - log.debug("Current compute spec: %s", json.dumps(data_dict, indent=4)) + log.debug("Current compute config: %s", json.dumps(config, indent=4)) - for key, value in kwargs.items(): - if isinstance(value, dict): - if key not in data_dict: - data_dict[key] = value - else: - data_dict[key] = {**data_dict[key], **value} - else: - data_dict[key] = value + update(config, kwargs) with open(config_path, "w") as file: - log.debug("Updating compute spec to: %s", json.dumps(data_dict, indent=4)) - json.dump(data_dict, file, indent=4) + log.debug("Updating compute config to: %s", json.dumps(config, indent=4)) + json.dump(config, file, indent=4) def wait_for_migrations(self, wait_for: int = NUM_COMPUTE_MIGRATIONS) -> None: """ @@ -4304,7 +4368,7 @@ def check_migrations_done(): wait_until(check_migrations_done) # Mock the extension part of spec passed from control plane for local testing - # endpooint.rs adds content of this file as a part of the spec.json + # endpooint.rs adds content of this file as a part of the config.json def create_remote_extension_spec(self, spec: dict[str, Any]): """Create a remote extension spec file for the endpoint.""" remote_extensions_spec_path = os.path.join( @@ -4810,6 +4874,50 @@ def paused(): wait_until(paused) + @staticmethod + def sks_to_safekeeper_ids(sks: list[Safekeeper]) -> list[SafekeeperId]: + return [SafekeeperId(sk.id, "localhost", sk.port.pg_tenant_only) for sk in sks] + + @staticmethod + def mconf_sks(env: NeonEnv, mconf: MembershipConfiguration) -> list[Safekeeper]: + """ + List of Safekeepers which are members in `mconf`. 
+ """ + members_ids = [m.id for m in mconf.members] + new_members_ids = [m.id for m in mconf.new_members] if mconf.new_members is not None else [] + return [sk for sk in env.safekeepers if sk.id in members_ids or sk.id in new_members_ids] + + @staticmethod + def create_timeline( + tenant_id: TenantId, + timeline_id: TimelineId, + ps: NeonPageserver, + mconf: MembershipConfiguration, + members_sks: list[Safekeeper], + ): + """ + Manually create timeline on safekeepers with given (presumably inital) + mconf: figure out LSN from pageserver, bake request and execute it on + given safekeepers. + + Normally done by storcon, but some tests want to do it manually so far. + """ + ps_http_cli = ps.http_client() + # figure out initial LSN. + ps_timeline_detail = ps_http_cli.timeline_detail(tenant_id, timeline_id) + init_lsn = ps_timeline_detail["last_record_lsn"] + log.info(f"initial LSN: {init_lsn}") + # sk timeline creation request expects minor version + pg_version = ps_timeline_detail["pg_version"] * 10000 + # create inital mconf + create_r = TimelineCreateRequest( + tenant_id, timeline_id, mconf, pg_version, Lsn(init_lsn), commit_lsn=None + ) + log.info(f"sending timeline create: {create_r.to_json()}") + + for sk in members_sks: + sk.http_client().timeline_create(create_r) + class NeonBroker(LogUtils): """An object managing storage_broker instance""" @@ -5048,6 +5156,8 @@ def pytest_addoption(parser: Parser): r"config-v1|heatmap-v1|tenant-manifest|metadata|.+\.(?:toml|pid|json|sql|conf)" ) +FINAL_METRICS_FILE_NAME: str = "final_metrics.txt" + SKIP_DIRS = frozenset( ( diff --git a/test_runner/fixtures/pageserver/common_types.py b/test_runner/fixtures/pageserver/common_types.py index 0e068db59307..0a92883e9646 100644 --- a/test_runner/fixtures/pageserver/common_types.py +++ b/test_runner/fixtures/pageserver/common_types.py @@ -105,7 +105,7 @@ def parse_layer_file_name(file_name: str) -> LayerName: except InvalidFileName: pass - raise InvalidFileName("neither image nor 
delta layer") + raise InvalidFileName(f"neither image nor delta layer: {file_name}") def is_future_layer(layer_file_name: LayerName, disk_consistent_lsn: Lsn): diff --git a/test_runner/fixtures/pageserver/many_tenants.py b/test_runner/fixtures/pageserver/many_tenants.py index eedb693e3d62..71c750b9eb3b 100644 --- a/test_runner/fixtures/pageserver/many_tenants.py +++ b/test_runner/fixtures/pageserver/many_tenants.py @@ -65,13 +65,11 @@ def single_timeline( assert ps_http.tenant_list() == [] def attach(tenant): - # NB: create the new tenant in the storage controller with the correct tenant config. This - # will pick up the existing tenant data from remote storage. If we just attach it to the - # Pageserver, the storage controller will reset the tenant config to the default. - env.create_tenant( - tenant_id=tenant, - timeline_id=template_timeline, - conf=template_config, + env.pageserver.tenant_attach( + tenant, + config=template_config, + generation=100, + override_storage_controller_generation=True, ) with concurrent.futures.ThreadPoolExecutor(max_workers=22) as executor: diff --git a/test_runner/fixtures/safekeeper/http.py b/test_runner/fixtures/safekeeper/http.py index e409151b7604..839e985419e8 100644 --- a/test_runner/fixtures/safekeeper/http.py +++ b/test_runner/fixtures/safekeeper/http.py @@ -25,7 +25,7 @@ class Walreceiver: @dataclass class SafekeeperTimelineStatus: - mconf: Configuration | None + mconf: MembershipConfiguration | None term: int last_log_term: int pg_version: int # Not exactly a PgVersion, safekeeper returns version as int, for example 150002 for 15.2 @@ -78,17 +78,17 @@ class SafekeeperId: @dataclass -class Configuration: +class MembershipConfiguration: generation: int members: list[SafekeeperId] new_members: list[SafekeeperId] | None @classmethod - def from_json(cls, d: dict[str, Any]) -> Configuration: + def from_json(cls, d: dict[str, Any]) -> MembershipConfiguration: generation = d["generation"] members = d["members"] new_members = 
d.get("new_members") - return Configuration(generation, members, new_members) + return MembershipConfiguration(generation, members, new_members) def to_json(self) -> str: return json.dumps(self, cls=EnhancedJSONEncoder) @@ -98,7 +98,7 @@ def to_json(self) -> str: class TimelineCreateRequest: tenant_id: TenantId timeline_id: TimelineId - mconf: Configuration + mconf: MembershipConfiguration # not exactly PgVersion, for example 150002 for 15.2 pg_version: int start_lsn: Lsn @@ -110,13 +110,13 @@ def to_json(self) -> str: @dataclass class TimelineMembershipSwitchResponse: - previous_conf: Configuration - current_conf: Configuration + previous_conf: MembershipConfiguration + current_conf: MembershipConfiguration @classmethod def from_json(cls, d: dict[str, Any]) -> TimelineMembershipSwitchResponse: - previous_conf = Configuration.from_json(d["previous_conf"]) - current_conf = Configuration.from_json(d["current_conf"]) + previous_conf = MembershipConfiguration.from_json(d["previous_conf"]) + current_conf = MembershipConfiguration.from_json(d["current_conf"]) return TimelineMembershipSwitchResponse(previous_conf, current_conf) @@ -194,7 +194,7 @@ def timeline_status( resj = res.json() walreceivers = [Walreceiver(wr["conn_id"], wr["status"]) for wr in resj["walreceivers"]] # It is always normally not None, it is allowed only to make forward compat tests happy. - mconf = Configuration.from_json(resj["mconf"]) if "mconf" in resj else None + mconf = MembershipConfiguration.from_json(resj["mconf"]) if "mconf" in resj else None return SafekeeperTimelineStatus( mconf=mconf, term=resj["acceptor_state"]["term"], @@ -223,7 +223,9 @@ def get_commit_lsn(self, tenant_id: TenantId, timeline_id: TimelineId) -> Lsn: return self.timeline_status(tenant_id, timeline_id).commit_lsn # Get timeline membership configuration. 
- def get_membership(self, tenant_id: TenantId, timeline_id: TimelineId) -> Configuration: + def get_membership( + self, tenant_id: TenantId, timeline_id: TimelineId + ) -> MembershipConfiguration: # make mypy happy return self.timeline_status(tenant_id, timeline_id).mconf # type: ignore @@ -275,7 +277,7 @@ def pull_timeline(self, body: dict[str, Any]) -> dict[str, Any]: return res_json def timeline_exclude( - self, tenant_id: TenantId, timeline_id: TimelineId, to: Configuration + self, tenant_id: TenantId, timeline_id: TimelineId, to: MembershipConfiguration ) -> dict[str, Any]: res = self.put( f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/exclude", @@ -287,7 +289,7 @@ def timeline_exclude( return res_json def membership_switch( - self, tenant_id: TenantId, timeline_id: TimelineId, to: Configuration + self, tenant_id: TenantId, timeline_id: TimelineId, to: MembershipConfiguration ) -> TimelineMembershipSwitchResponse: res = self.put( f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/membership", diff --git a/test_runner/performance/pageserver/pagebench/test_large_slru_basebackup.py b/test_runner/performance/pageserver/pagebench/test_large_slru_basebackup.py index efd423104d23..8af52dcbd05f 100644 --- a/test_runner/performance/pageserver/pagebench/test_large_slru_basebackup.py +++ b/test_runner/performance/pageserver/pagebench/test_large_slru_basebackup.py @@ -66,11 +66,11 @@ def record(metric, **kwargs): n_txns = 500000 - def setup_wrapper(env: NeonEnv): - return setup_tenant_template(env, n_txns) - env = setup_pageserver_with_tenants( - neon_env_builder, f"large_slru_count-{n_tenants}-{n_txns}", n_tenants, setup_wrapper + neon_env_builder, + f"large_slru_count-{n_tenants}-{n_txns}", + n_tenants, + lambda env: setup_tenant_template(env, n_txns), ) run_benchmark(env, pg_bin, record, duration) @@ -80,10 +80,6 @@ def setup_tenant_template(env: NeonEnv, n_txns: int): "gc_period": "0s", # disable periodic gc 
"checkpoint_timeout": "10 years", "compaction_period": "0s", # disable periodic compaction - "compaction_threshold": 10, - "compaction_target_size": 134217728, - "checkpoint_distance": 268435456, - "image_creation_threshold": 3, } template_tenant, template_timeline = env.create_tenant(set_default=True) diff --git a/test_runner/performance/pageserver/test_page_service_batching.py b/test_runner/performance/pageserver/test_page_service_batching.py index 2c27368001b3..b17ca772c9c7 100644 --- a/test_runner/performance/pageserver/test_page_service_batching.py +++ b/test_runner/performance/pageserver/test_page_service_batching.py @@ -1,5 +1,7 @@ +import concurrent.futures import dataclasses import json +import threading import time from dataclasses import dataclass from pathlib import Path @@ -28,38 +30,33 @@ class PageServicePipeliningConfigSerial(PageServicePipeliningConfig): class PageServicePipeliningConfigPipelined(PageServicePipeliningConfig): max_batch_size: int execution: str + batching: str mode: str = "pipelined" -EXECUTION = ["concurrent-futures", "tasks"] +EXECUTION = ["concurrent-futures"] +BATCHING = ["uniform-lsn", "scattered-lsn"] NON_BATCHABLE: list[PageServicePipeliningConfig] = [PageServicePipeliningConfigSerial()] for max_batch_size in [1, 32]: for execution in EXECUTION: - NON_BATCHABLE.append(PageServicePipeliningConfigPipelined(max_batch_size, execution)) + for batching in BATCHING: + NON_BATCHABLE.append( + PageServicePipeliningConfigPipelined(max_batch_size, execution, batching) + ) -BATCHABLE: list[PageServicePipeliningConfig] = [PageServicePipeliningConfigSerial()] -for max_batch_size in [1, 2, 4, 8, 16, 32]: +BATCHABLE: list[PageServicePipeliningConfig] = [] +for max_batch_size in [32]: for execution in EXECUTION: - BATCHABLE.append(PageServicePipeliningConfigPipelined(max_batch_size, execution)) + for batching in BATCHING: + BATCHABLE.append( + PageServicePipeliningConfigPipelined(max_batch_size, execution, batching) + ) 
@pytest.mark.parametrize( "tablesize_mib, pipelining_config, target_runtime, effective_io_concurrency, readhead_buffer_size, name", [ - # non-batchable workloads - # (A separate benchmark will consider latency). - *[ - ( - 50, - config, - TARGET_RUNTIME, - 1, - 128, - f"not batchable {dataclasses.asdict(config)}", - ) - for config in NON_BATCHABLE - ], # batchable workloads should show throughput and CPU efficiency improvements *[ ( @@ -137,7 +134,14 @@ def test_throughput( env = neon_env_builder.init_start() ps_http = env.pageserver.http_client() - endpoint = env.endpoints.create_start("main") + endpoint = env.endpoints.create_start( + "main", + config_lines=[ + # minimal lfc & small shared buffers to force requests to pageserver + "neon.max_file_cache_size=1MB", + "shared_buffers=10MB", + ], + ) conn = endpoint.connect() cur = conn.cursor() @@ -155,7 +159,6 @@ def test_throughput( tablesize = tablesize_mib * 1024 * 1024 npages = tablesize // (8 * 1024) cur.execute("INSERT INTO t SELECT generate_series(1, %s)", (npages,)) - # TODO: can we force postgres to do sequential scans? # # Run the workload, collect `Metrics` before and after, calculate difference, normalize. 
@@ -166,6 +169,7 @@ class Metrics: time: float pageserver_batch_size_histo_sum: float pageserver_batch_size_histo_count: float + pageserver_batch_breaks_reason_count: dict[str, int] compute_getpage_count: float pageserver_cpu_seconds_total: float @@ -179,6 +183,10 @@ def __sub__(self, other: "Metrics") -> "Metrics": compute_getpage_count=self.compute_getpage_count - other.compute_getpage_count, pageserver_cpu_seconds_total=self.pageserver_cpu_seconds_total - other.pageserver_cpu_seconds_total, + pageserver_batch_breaks_reason_count={ + reason: count - other.pageserver_batch_breaks_reason_count.get(reason, 0) + for reason, count in self.pageserver_batch_breaks_reason_count.items() + }, ) def normalize(self, by) -> "Metrics": @@ -188,6 +196,10 @@ def normalize(self, by) -> "Metrics": pageserver_batch_size_histo_count=self.pageserver_batch_size_histo_count / by, compute_getpage_count=self.compute_getpage_count / by, pageserver_cpu_seconds_total=self.pageserver_cpu_seconds_total / by, + pageserver_batch_breaks_reason_count={ + reason: count / by + for reason, count in self.pageserver_batch_breaks_reason_count.items() + }, ) def get_metrics() -> Metrics: @@ -197,6 +209,20 @@ def get_metrics() -> Metrics: ) compute_getpage_count = cur.fetchall()[0][0] pageserver_metrics = ps_http.get_metrics() + for name, samples in pageserver_metrics.metrics.items(): + for sample in samples: + log.info(f"{name=} labels={sample.labels} {sample.value}") + + raw_batch_break_reason_count = pageserver_metrics.query_all( + "pageserver_page_service_batch_break_reason_total", + filter={"timeline_id": str(env.initial_timeline)}, + ) + + batch_break_reason_count = { + sample.labels["reason"]: int(sample.value) + for sample in raw_batch_break_reason_count + } + return Metrics( time=time.time(), pageserver_batch_size_histo_sum=pageserver_metrics.query_one( @@ -205,34 +231,58 @@ def get_metrics() -> Metrics: pageserver_batch_size_histo_count=pageserver_metrics.query_one( 
"pageserver_page_service_batch_size_count" ).value, + pageserver_batch_breaks_reason_count=batch_break_reason_count, compute_getpage_count=compute_getpage_count, pageserver_cpu_seconds_total=pageserver_metrics.query_one( "libmetrics_process_cpu_seconds_highres" ).value, ) - def workload() -> Metrics: + def workload(disruptor_started: threading.Event) -> Metrics: + disruptor_started.wait() start = time.time() iters = 0 while time.time() - start < target_runtime or iters < 2: - log.info("Seqscan %d", iters) if iters == 1: # round zero for warming up before = get_metrics() - cur.execute( - "select clear_buffer_cache()" - ) # TODO: what about LFC? doesn't matter right now because LFC isn't enabled by default in tests cur.execute("select sum(data::bigint) from t") assert cur.fetchall()[0][0] == npages * (npages + 1) // 2 iters += 1 after = get_metrics() return (after - before).normalize(iters - 1) + def disruptor(disruptor_started: threading.Event, stop_disruptor: threading.Event): + conn = endpoint.connect() + cur = conn.cursor() + iters = 0 + while True: + cur.execute("SELECT pg_logical_emit_message(true, 'test', 'advancelsn')") + if stop_disruptor.is_set(): + break + disruptor_started.set() + iters += 1 + time.sleep(0.001) + return iters + env.pageserver.patch_config_toml_nonrecursive( {"page_service_pipelining": dataclasses.asdict(pipelining_config)} ) - env.pageserver.restart() - metrics = workload() + + # set trace for log analysis below + env.pageserver.restart(extra_env_vars={"RUST_LOG": "info,pageserver::page_service=trace"}) + + log.info("Starting workload") + + with concurrent.futures.ThreadPoolExecutor() as executor: + disruptor_started = threading.Event() + stop_disruptor = threading.Event() + disruptor_fut = executor.submit(disruptor, disruptor_started, stop_disruptor) + workload_fut = executor.submit(workload, disruptor_started) + metrics = workload_fut.result() + stop_disruptor.set() + ndisruptions = disruptor_fut.result() + log.info("Disruptor issued %d 
disrupting requests", ndisruptions) log.info("Results: %s", metrics) @@ -249,7 +299,16 @@ def workload() -> Metrics: # for metric, value in dataclasses.asdict(metrics).items(): - zenbenchmark.record(f"counters.{metric}", value, unit="", report=MetricReport.TEST_PARAM) + if metric == "pageserver_batch_breaks_reason_count": + assert isinstance(value, dict) + for reason, count in value.items(): + zenbenchmark.record( + f"counters.{metric}_{reason}", count, unit="", report=MetricReport.TEST_PARAM + ) + else: + zenbenchmark.record( + f"counters.{metric}", value, unit="", report=MetricReport.TEST_PARAM + ) zenbenchmark.record( "perfmetric.batching_factor", @@ -262,7 +321,10 @@ def workload() -> Metrics: PRECISION_CONFIGS: list[PageServicePipeliningConfig] = [PageServicePipeliningConfigSerial()] for max_batch_size in [1, 32]: for execution in EXECUTION: - PRECISION_CONFIGS.append(PageServicePipeliningConfigPipelined(max_batch_size, execution)) + for batching in BATCHING: + PRECISION_CONFIGS.append( + PageServicePipeliningConfigPipelined(max_batch_size, execution, batching) + ) @pytest.mark.parametrize( diff --git a/test_runner/performance/pageserver/util.py b/test_runner/performance/pageserver/util.py index 7a6d88f79c77..b50659defc4f 100644 --- a/test_runner/performance/pageserver/util.py +++ b/test_runner/performance/pageserver/util.py @@ -40,6 +40,8 @@ def ensure_pageserver_ready_for_benchmarking(env: NeonEnv, n_tenants: int): for layer in info.historic_layers: assert not layer.remote + env.storage_controller.reconcile_until_idle(timeout_secs=60) + log.info("ready") diff --git a/test_runner/performance/test_perf_oltp_large_tenant.py b/test_runner/performance/test_perf_oltp_large_tenant.py index 957a4ec796ac..b45394d6271b 100644 --- a/test_runner/performance/test_perf_oltp_large_tenant.py +++ b/test_runner/performance/test_perf_oltp_large_tenant.py @@ -145,11 +145,14 @@ def run_database_maintenance(env: PgCompare): END $$; """ ) - - log.info("start REINDEX TABLE 
CONCURRENTLY transaction.transaction") - with env.zenbenchmark.record_duration("reindex concurrently"): - cur.execute("REINDEX TABLE CONCURRENTLY transaction.transaction;") - log.info("finished REINDEX TABLE CONCURRENTLY transaction.transaction") + # in production a customer would likely use reindex concurrently + # but for our test we don't care about the downtime + # and it would just about double the time we report in the test + # because we need one more table scan for each index + log.info("start REINDEX TABLE transaction.transaction") + with env.zenbenchmark.record_duration("reindex"): + cur.execute("REINDEX TABLE transaction.transaction;") + log.info("finished REINDEX TABLE transaction.transaction") @pytest.mark.parametrize("custom_scripts", get_custom_scripts()) diff --git a/test_runner/performance/test_physical_replication.py b/test_runner/performance/test_physical_replication.py index df5419f29268..16cdab155a60 100644 --- a/test_runner/performance/test_physical_replication.py +++ b/test_runner/performance/test_physical_replication.py @@ -64,8 +64,8 @@ def test_ro_replica_lag( project = neon_api.create_project(pg_version) project_id = project["project"]["id"] - log.info("Project ID: {}", project_id) - log.info("Primary endpoint ID: {}", project["project"]["endpoints"][0]["id"]) + log.info("Project ID: %s", project_id) + log.info("Primary endpoint ID: %s", project["project"]["endpoints"][0]["id"]) neon_api.wait_for_operation_to_finish(project_id) error_occurred = False try: @@ -81,7 +81,7 @@ def test_ro_replica_lag( endpoint_type="read_only", settings={"pg_settings": {"hot_standby_feedback": "on"}}, ) - log.info("Replica endpoint ID: {}", replica["endpoint"]["id"]) + log.info("Replica endpoint ID: %s", replica["endpoint"]["id"]) replica_env = master_env.copy() replica_env["PGHOST"] = replica["endpoint"]["host"] neon_api.wait_for_operation_to_finish(project_id) @@ -197,8 +197,8 @@ def test_replication_start_stop( project = neon_api.create_project(pg_version) 
project_id = project["project"]["id"] - log.info("Project ID: {}", project_id) - log.info("Primary endpoint ID: {}", project["project"]["endpoints"][0]["id"]) + log.info("Project ID: %s", project_id) + log.info("Primary endpoint ID: %s", project["project"]["endpoints"][0]["id"]) neon_api.wait_for_operation_to_finish(project_id) try: branch_id = project["branch"]["id"] @@ -215,7 +215,7 @@ def test_replication_start_stop( endpoint_type="read_only", settings={"pg_settings": {"hot_standby_feedback": "on"}}, ) - log.info("Replica {} endpoint ID: {}", i + 1, replica["endpoint"]["id"]) + log.info("Replica %d endpoint ID: %s", i + 1, replica["endpoint"]["id"]) replicas.append(replica) neon_api.wait_for_operation_to_finish(project_id) diff --git a/test_runner/pg_clients/rust/tokio-postgres/Cargo.lock b/test_runner/pg_clients/rust/tokio-postgres/Cargo.lock index 027be03707b7..22c0e461b5c0 100644 --- a/test_runner/pg_clients/rust/tokio-postgres/Cargo.lock +++ b/test_runner/pg_clients/rust/tokio-postgres/Cargo.lock @@ -808,9 +808,9 @@ checksum = "1f3ccbac311fea05f86f61904b462b55fb3df8837a366dfc601a0161d0532f20" [[package]] name = "tokio" -version = "1.38.0" +version = "1.38.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ba4f4a02a7a80d6f274636f0aa95c7e383b912d41fe721a31f29e29698585a4a" +checksum = "68722da18b0fc4a05fdc1120b302b82051265792a1e1b399086e9b204b10ad3d" dependencies = [ "backtrace", "bytes", diff --git a/test_runner/regress/test_attach_tenant_config.py b/test_runner/regress/test_attach_tenant_config.py index 5021cc4b1790..9b6930695c79 100644 --- a/test_runner/regress/test_attach_tenant_config.py +++ b/test_runner/regress/test_attach_tenant_config.py @@ -187,6 +187,7 @@ def test_fully_custom_config(positive_env: NeonEnv): }, "rel_size_v2_enabled": False, # test suite enables it by default as of https://github.com/neondatabase/neon/issues/11081, so, custom config means disabling it "gc_compaction_enabled": True, + 
"gc_compaction_verification": False, "gc_compaction_initial_threshold_kb": 1024000, "gc_compaction_ratio_percent": 200, "image_creation_preempt_threshold": 5, diff --git a/test_runner/regress/test_compaction.py b/test_runner/regress/test_compaction.py index 6789939e0c32..84d37de9f14b 100644 --- a/test_runner/regress/test_compaction.py +++ b/test_runner/regress/test_compaction.py @@ -38,12 +38,34 @@ "compaction_target_size": 1024**2, "image_creation_threshold": 1, "image_creation_preempt_threshold": 1, - # compact more frequently + # Compact more frequently "compaction_threshold": 3, "compaction_upper_limit": 6, "lsn_lease_length": "0s", } +PREEMPT_GC_COMPACTION_TENANT_CONF = { + "gc_period": "5s", + "compaction_period": "5s", + # Small checkpoint distance to create many layers + "checkpoint_distance": 1024**2, + # Compact small layers + "compaction_target_size": 1024**2, + "image_creation_threshold": 10000, # Do not create image layers at all + "image_creation_preempt_threshold": 10000, + # Compact more frequently + "compaction_threshold": 3, + "compaction_upper_limit": 6, + "lsn_lease_length": "0s", + # Enable gc-compaction + "gc_compaction_enabled": "true", + "gc_compaction_initial_threshold_kb": 1024, # At a small threshold + "gc_compaction_ratio_percent": 1, + # No PiTR interval and small GC horizon + "pitr_interval": "0s", + "gc_horizon": f"{1024**2}", +} + @skip_in_debug_build("only run with release build") @pytest.mark.parametrize( @@ -140,6 +162,8 @@ def test_pageserver_compaction_preempt( conf = PREEMPT_COMPACTION_TENANT_CONF.copy() env = neon_env_builder.init_start(initial_tenant_conf=conf) + env.pageserver.allowed_errors.append(".*The timeline or pageserver is shutting down.*") + tenant_id = env.initial_tenant timeline_id = env.initial_timeline @@ -165,6 +189,41 @@ def test_pageserver_compaction_preempt( env.pageserver.assert_log_contains("resuming image layer creation") +@skip_in_debug_build("only run with release build") +def 
test_pageserver_gc_compaction_preempt( + neon_env_builder: NeonEnvBuilder, +): + # Ideally we should be able to do unit tests for this, but we need real Postgres + # WALs in order to do unit testing... + + conf = PREEMPT_GC_COMPACTION_TENANT_CONF.copy() + env = neon_env_builder.init_start(initial_tenant_conf=conf) + + tenant_id = env.initial_tenant + timeline_id = env.initial_timeline + + row_count = 200000 + churn_rounds = 10 + + ps_http = env.pageserver.http_client() + + workload = Workload(env, tenant_id, timeline_id) + workload.init(env.pageserver.id) + + log.info("Writing initial data ...") + workload.write_rows(row_count, env.pageserver.id) + + for i in range(1, churn_rounds + 1): + log.info(f"Running churn round {i}/{churn_rounds} ...") + workload.churn_rows(row_count, env.pageserver.id, upload=False) + workload.validate(env.pageserver.id) + ps_http.timeline_compact(tenant_id, timeline_id, wait_until_uploaded=True) + log.info("Validating at workload end ...") + workload.validate(env.pageserver.id) + # ensure gc_compaction gets preempted and then resumed + env.pageserver.assert_log_contains("preempt gc-compaction") + + @skip_in_debug_build("only run with release build") @pytest.mark.timeout(900) # This test is slow with sanitizers enabled, especially on ARM @pytest.mark.parametrize( diff --git a/test_runner/regress/test_compatibility.py b/test_runner/regress/test_compatibility.py index ee96daca3328..e23b1e0bca98 100644 --- a/test_runner/regress/test_compatibility.py +++ b/test_runner/regress/test_compatibility.py @@ -148,9 +148,9 @@ def test_create_snapshot( env = neon_env_builder.init_start( initial_tenant_conf={ # Miniature layers to enable generating non-trivial layer map without writing lots of data. 
- "checkpoint_distance": f"{128 * 1024}", - "compaction_threshold": "1", - "compaction_target_size": f"{128 * 1024}", + "checkpoint_distance": f"{256 * 1024}", + "compaction_threshold": "5", + "compaction_target_size": f"{256 * 1024}", } ) endpoint = env.endpoints.create_start("main") @@ -492,6 +492,13 @@ def __str__(self): PgVersion.V17, "https://neon-github-public-dev.s3.eu-central-1.amazonaws.com/compatibility-data-snapshots/2025-02-07-pgv17-nogenerations.tar.zst", ), + # Tenant manifest v1. + HistoricDataSet( + "2025-04-08-tenant-manifest-v1", + TenantId("c547c28588abf1d7b7139ff1f1158345"), + PgVersion.V17, + "https://neon-github-public-dev.s3.eu-central-1.amazonaws.com/compatibility-data-snapshots/2025-04-08-pgv17-tenant-manifest-v1.tar.zst", + ), ] diff --git a/test_runner/regress/test_compute_catalog.py b/test_runner/regress/test_compute_catalog.py index c1f05830b742..37208c9fff3d 100644 --- a/test_runner/regress/test_compute_catalog.py +++ b/test_runner/regress/test_compute_catalog.py @@ -90,10 +90,12 @@ def test_compute_catalog(neon_simple_env: NeonEnv): # and reconfigure the endpoint to create some test databases. endpoint.respec_deep( **{ - "skip_pg_catalog_updates": False, - "cluster": { - "roles": TEST_ROLE_NAMES, - "databases": TEST_DB_NAMES, + "spec": { + "skip_pg_catalog_updates": False, + "cluster": { + "roles": TEST_ROLE_NAMES, + "databases": TEST_DB_NAMES, + }, }, } ) @@ -155,10 +157,12 @@ def test_compute_create_drop_dbs_and_roles(neon_simple_env: NeonEnv): # and reconfigure the endpoint to apply the changes. 
endpoint.respec_deep( **{ - "skip_pg_catalog_updates": False, - "cluster": { - "roles": TEST_ROLE_NAMES, - "databases": TEST_DB_NAMES, + "spec": { + "skip_pg_catalog_updates": False, + "cluster": { + "roles": TEST_ROLE_NAMES, + "databases": TEST_DB_NAMES, + }, }, } ) @@ -196,12 +200,14 @@ def test_compute_create_drop_dbs_and_roles(neon_simple_env: NeonEnv): endpoint.respec_deep( **{ - "skip_pg_catalog_updates": False, - "cluster": { - "roles": [], - "databases": [], + "spec": { + "skip_pg_catalog_updates": False, + "cluster": { + "roles": [], + "databases": [], + }, + "delta_operations": delta_operations, }, - "delta_operations": delta_operations, } ) endpoint.reconfigure() @@ -250,9 +256,11 @@ def test_dropdb_with_subscription(neon_simple_env: NeonEnv): # and reconfigure the endpoint to apply the changes. endpoint.respec_deep( **{ - "skip_pg_catalog_updates": False, - "cluster": { - "databases": TEST_DB_NAMES, + "spec": { + "skip_pg_catalog_updates": False, + "cluster": { + "databases": TEST_DB_NAMES, + }, }, } ) @@ -306,17 +314,19 @@ def test_dropdb_with_subscription(neon_simple_env: NeonEnv): # and reconfigure the endpoint to apply the changes. 
endpoint.respec_deep( **{ - "skip_pg_catalog_updates": False, - "cluster": { - "databases": TEST_DB_NAMES_NEW, + "spec": { + "skip_pg_catalog_updates": False, + "cluster": { + "databases": TEST_DB_NAMES_NEW, + }, + "delta_operations": [ + {"action": "delete_db", "name": SUB_DB_NAME}, + # also test the case when we try to delete a non-existent database + # shouldn't happen in normal operation, + # but can occur when failed operations are retried + {"action": "delete_db", "name": "nonexistent_db"}, + ], }, - "delta_operations": [ - {"action": "delete_db", "name": SUB_DB_NAME}, - # also test the case when we try to delete a non-existent database - # shouldn't happen in normal operation, - # but can occur when failed operations are retried - {"action": "delete_db", "name": "nonexistent_db"}, - ], } ) @@ -354,25 +364,27 @@ def test_drop_role_with_table_privileges_from_neon_superuser(neon_simple_env: Ne endpoint.respec_deep( **{ - "skip_pg_catalog_updates": False, - "cluster": { - "roles": [ - { - # We need to create role via compute_ctl, because in this case it will receive - # additional grants equivalent to our real environment, so we can repro some - # issues. - "name": "neon", - # Some autocomplete-suggested hash, no specific meaning. - "encrypted_password": "SCRAM-SHA-256$4096:hBT22QjqpydQWqEulorfXA==$miBogcoj68JWYdsNB5PW1X6PjSLBEcNuctuhtGkb4PY=:hxk2gxkwxGo6P7GCtfpMlhA9zwHvPMsCz+NQf2HfvWk=", - "options": [], - }, - ], - "databases": [ - { - "name": TEST_DB_NAME, - "owner": "neon", - }, - ], + "spec": { + "skip_pg_catalog_updates": False, + "cluster": { + "roles": [ + { + # We need to create role via compute_ctl, because in this case it will receive + # additional grants equivalent to our real environment, so we can repro some + # issues. + "name": "neon", + # Some autocomplete-suggested hash, no specific meaning. 
+ "encrypted_password": "SCRAM-SHA-256$4096:hBT22QjqpydQWqEulorfXA==$miBogcoj68JWYdsNB5PW1X6PjSLBEcNuctuhtGkb4PY=:hxk2gxkwxGo6P7GCtfpMlhA9zwHvPMsCz+NQf2HfvWk=", + "options": [], + }, + ], + "databases": [ + { + "name": TEST_DB_NAME, + "owner": "neon", + }, + ], + }, }, } ) @@ -415,13 +427,15 @@ def test_drop_role_with_table_privileges_from_neon_superuser(neon_simple_env: Ne # Drop role via compute_ctl endpoint.respec_deep( **{ - "skip_pg_catalog_updates": False, - "delta_operations": [ - { - "action": "delete_role", - "name": TEST_GRANTEE, - }, - ], + "spec": { + "skip_pg_catalog_updates": False, + "delta_operations": [ + { + "action": "delete_role", + "name": TEST_GRANTEE, + }, + ], + }, } ) endpoint.reconfigure() @@ -444,13 +458,15 @@ def test_drop_role_with_table_privileges_from_neon_superuser(neon_simple_env: Ne endpoint.respec_deep( **{ - "skip_pg_catalog_updates": False, - "delta_operations": [ - { - "action": "delete_role", - "name": "readonly2", - }, - ], + "spec": { + "skip_pg_catalog_updates": False, + "delta_operations": [ + { + "action": "delete_role", + "name": "readonly2", + }, + ], + }, } ) endpoint.reconfigure() @@ -475,25 +491,27 @@ def test_drop_role_with_table_privileges_from_non_neon_superuser(neon_simple_env endpoint = env.endpoints.create_start("main") endpoint.respec_deep( **{ - "skip_pg_catalog_updates": False, - "cluster": { - "roles": [ - { - # We need to create role via compute_ctl, because in this case it will receive - # additional grants equivalent to our real environment, so we can repro some - # issues. - "name": TEST_GRANTOR, - # Some autocomplete-suggested hash, no specific meaning. 
- "encrypted_password": "SCRAM-SHA-256$4096:hBT22QjqpydQWqEulorfXA==$miBogcoj68JWYdsNB5PW1X6PjSLBEcNuctuhtGkb4PY=:hxk2gxkwxGo6P7GCtfpMlhA9zwHvPMsCz+NQf2HfvWk=", - "options": [], - }, - ], - "databases": [ - { - "name": TEST_DB_NAME, - "owner": TEST_GRANTOR, - }, - ], + "spec": { + "skip_pg_catalog_updates": False, + "cluster": { + "roles": [ + { + # We need to create role via compute_ctl, because in this case it will receive + # additional grants equivalent to our real environment, so we can repro some + # issues. + "name": TEST_GRANTOR, + # Some autocomplete-suggested hash, no specific meaning. + "encrypted_password": "SCRAM-SHA-256$4096:hBT22QjqpydQWqEulorfXA==$miBogcoj68JWYdsNB5PW1X6PjSLBEcNuctuhtGkb4PY=:hxk2gxkwxGo6P7GCtfpMlhA9zwHvPMsCz+NQf2HfvWk=", + "options": [], + }, + ], + "databases": [ + { + "name": TEST_DB_NAME, + "owner": TEST_GRANTOR, + }, + ], + }, }, } ) @@ -507,13 +525,15 @@ def test_drop_role_with_table_privileges_from_non_neon_superuser(neon_simple_env endpoint.respec_deep( **{ - "skip_pg_catalog_updates": False, - "delta_operations": [ - { - "action": "delete_role", - "name": TEST_GRANTEE, - }, - ], + "spec": { + "skip_pg_catalog_updates": False, + "delta_operations": [ + { + "action": "delete_role", + "name": TEST_GRANTEE, + }, + ], + }, } ) endpoint.reconfigure() diff --git a/test_runner/regress/test_compute_reconfigure.py b/test_runner/regress/test_compute_reconfigure.py index 6396ba67a10f..b533d45b1eeb 100644 --- a/test_runner/regress/test_compute_reconfigure.py +++ b/test_runner/regress/test_compute_reconfigure.py @@ -31,15 +31,17 @@ def test_compute_reconfigure(neon_simple_env: NeonEnv): endpoint.respec_deep( **{ - "skip_pg_catalog_updates": True, - "cluster": { - "settings": [ - { - "name": "log_line_prefix", - "vartype": "string", - "value": TEST_LOG_LINE_PREFIX, - } - ] + "spec": { + "skip_pg_catalog_updates": True, + "cluster": { + "settings": [ + { + "name": "log_line_prefix", + "vartype": "string", + "value": TEST_LOG_LINE_PREFIX, + 
} + ] + }, }, } ) diff --git a/test_runner/regress/test_lsn_mapping.py b/test_runner/regress/test_lsn_mapping.py index 7280a91a1249..c5a1bf0d16d5 100644 --- a/test_runner/regress/test_lsn_mapping.py +++ b/test_runner/regress/test_lsn_mapping.py @@ -276,3 +276,34 @@ def test_ts_of_lsn_api(neon_env_builder: NeonEnvBuilder): if i > 1: before_timestamp = tbl[i - step_size][1] assert timestamp >= before_timestamp, "before_timestamp before timestamp" + + +def test_timestamp_of_lsn_empty_branch(neon_env_builder: NeonEnvBuilder): + """ + Test that getting the timestamp of the head LSN of a newly created branch works. + This verifies that we don't get a 404 error when trying to get the timestamp + of the head LSN of a branch that was just created. + We now return a special status code 412 to indicate if there is no timestamp found for lsn. + + Reproducer for https://github.com/neondatabase/neon/issues/11439 + """ + env = neon_env_builder.init_start() + + # Create a new branch + new_timeline_id = env.create_branch("test_timestamp_of_lsn_empty_branch") + + # Retrieve the commit LSN of the empty branch, which we have never run postgres on + detail = env.pageserver.http_client().timeline_detail( + tenant_id=env.initial_tenant, timeline_id=new_timeline_id + ) + head_lsn = detail["last_record_lsn"] + + # Verify that we get 412 status code + with env.pageserver.http_client() as client: + with pytest.raises(PageserverApiException) as err: + client.timeline_get_timestamp_of_lsn( + env.initial_tenant, + new_timeline_id, + head_lsn, + ) + assert err.value.status_code == 412 diff --git a/test_runner/regress/test_neon_cli.py b/test_runner/regress/test_neon_cli.py index 8bd0662ef8bd..e6bcdf8e67a8 100644 --- a/test_runner/regress/test_neon_cli.py +++ b/test_runner/regress/test_neon_cli.py @@ -134,10 +134,11 @@ def test_cli_start_stop(neon_env_builder: NeonEnvBuilder): """ env = neon_env_builder.init_start() - # Stop default ps/sk + # Stop default services 
env.neon_cli.pageserver_stop(env.pageserver.id) env.neon_cli.safekeeper_stop() env.neon_cli.storage_controller_stop(False) + env.neon_cli.object_storage_stop(False) env.neon_cli.storage_broker_stop() # Keep NeonEnv state up to date, it usually owns starting/stopping services @@ -179,11 +180,13 @@ def test_cli_start_stop_multi(neon_env_builder: NeonEnvBuilder): # Using the single-pageserver shortcut property throws when there are multiple pageservers with pytest.raises(AssertionError): - _drop = env.pageserver + _ = env.pageserver env.neon_cli.safekeeper_stop(neon_env_builder.safekeepers_id_start + 1) env.neon_cli.safekeeper_stop(neon_env_builder.safekeepers_id_start + 2) + env.neon_cli.object_storage_stop(False) + # Stop this to get out of the way of the following `start` env.neon_cli.storage_controller_stop(False) env.neon_cli.storage_broker_stop() diff --git a/test_runner/regress/test_object_storage.py b/test_runner/regress/test_object_storage.py new file mode 100644 index 000000000000..0b1cfa344fe6 --- /dev/null +++ b/test_runner/regress/test_object_storage.py @@ -0,0 +1,56 @@ +from time import time + +import pytest +from aiohttp import ClientSession +from fixtures.log_helper import log +from fixtures.neon_fixtures import NeonEnv +from jwcrypto import jwk, jwt + + +@pytest.mark.asyncio +async def test_object_storage_insert_retrieve_delete(neon_simple_env: NeonEnv): + """ + Inserts, retrieves, and deletes test file using a JWT token + """ + env = neon_simple_env + ep = env.endpoints.create_start(branch_name="main") + tenant_id = str(ep.tenant_id) + timeline_id = str(ep.show_timeline_id()) + endpoint_id = ep.endpoint_id + + key_path = env.repo_dir / "auth_private_key.pem" + key = jwk.JWK.from_pem(key_path.read_bytes()) + claims = { + "tenant_id": tenant_id, + "timeline_id": timeline_id, + "endpoint_id": endpoint_id, + "exp": round(time()) + 99, + } + log.info(f"key path {key_path}\nclaims {claims}") + token = jwt.JWT(header={"alg": "EdDSA"}, claims=claims) + 
token.make_signed_token(key) + token = token.serialize() + + base_url = env.object_storage.base_url() + key = f"http://{base_url}/{tenant_id}/{timeline_id}/{endpoint_id}/key" + headers = {"Authorization": f"Bearer {token}"} + log.info(f"cache key url {key}") + log.info(f"token {token}") + + async with ClientSession(headers=headers) as session: + async with session.get(key) as res: + assert res.status == 404, f"Non-existing file is present: {res}" + + data = b"cheburash" + async with session.put(key, data=data) as res: + assert res.status == 200, f"Error writing file: {res}" + + async with session.get(key) as res: + read_data = await res.read() + assert data == read_data + + async with session.delete(key) as res: + assert res.status == 200, f"Error removing file {res}" + + async with session.get(key) as res: + assert res.status == 404, f"File was not deleted: {res}" diff --git a/test_runner/regress/test_page_service_batching_regressions.py b/test_runner/regress/test_page_service_batching_regressions.py index fa85e1210b30..50303a498622 100644 --- a/test_runner/regress/test_page_service_batching_regressions.py +++ b/test_runner/regress/test_page_service_batching_regressions.py @@ -16,6 +16,7 @@ def patch_pageserver_toml(config): "mode": "pipelined", "max_batch_size": 32, "execution": "concurrent-futures", + "batching": "uniform-lsn", } neon_env_builder.pageserver_config_override = patch_pageserver_toml diff --git a/test_runner/regress/test_pageserver_getpage_throttle.py b/test_runner/regress/test_pageserver_getpage_throttle.py index 3d7204d88388..5ef63e2fe92f 100644 --- a/test_runner/regress/test_pageserver_getpage_throttle.py +++ b/test_runner/regress/test_pageserver_getpage_throttle.py @@ -1,6 +1,5 @@ from __future__ import annotations -import copy import json import uuid from typing import TYPE_CHECKING @@ -16,7 +15,6 @@ from fixtures.neon_fixtures import NeonEnvBuilder, PgBin -@pytest.mark.skip("See https://github.com/neondatabase/neon/issues/11395") def 
test_pageserver_getpage_throttle(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin): env = neon_env_builder.init_start() @@ -44,7 +42,6 @@ def test_pageserver_getpage_throttle(neon_env_builder: NeonEnvBuilder, pg_bin: P "refill_interval": "100ms", "refill_amount": int(rate_limit_rps / 10), "max": int(rate_limit_rps / 10), - "fair": True, }, }, ) @@ -98,17 +95,12 @@ def run_pagebench_at_max_speed_and_get_total_requests_completed(duration_secs: i _, marker_offset = wait_until(lambda: env.pageserver.assert_log_contains(marker, offset=None)) log.info("run pagebench") - duration_secs = 10 + duration_secs = 20 actual_ncompleted = run_pagebench_at_max_speed_and_get_total_requests_completed(duration_secs) log.info("validate the client is capped at the configured rps limit") expect_ncompleted = duration_secs * rate_limit_rps - delta_abs = abs(expect_ncompleted - actual_ncompleted) - threshold = 0.05 * expect_ncompleted - assert threshold / rate_limit_rps < 0.1 * duration_secs, ( - "test self-test: unrealistic expecations regarding precision in this test" - ) - assert delta_abs < 0.05 * expect_ncompleted, ( + assert pytest.approx(expect_ncompleted, 0.05) == actual_ncompleted, ( "the throttling deviates more than 5percent from the expectation" ) @@ -122,6 +114,7 @@ def run_pagebench_at_max_speed_and_get_total_requests_completed(duration_secs: i timeout=compaction_period, ) + log.info("validate the metrics") smgr_query_seconds_post = ps_http.get_metric_value(smgr_metric_name, smgr_metrics_query) assert smgr_query_seconds_post is not None throttled_usecs_post = ps_http.get_metric_value(throttle_metric_name, throttle_metrics_query) @@ -130,72 +123,13 @@ def run_pagebench_at_max_speed_and_get_total_requests_completed(duration_secs: i actual_throttled_usecs = throttled_usecs_post - throttled_usecs_pre actual_throttled_secs = actual_throttled_usecs / 1_000_000 - log.info("validate that the metric doesn't include throttle wait time") - assert duration_secs >= 10 * 
actual_smgr_query_seconds, ( - "smgr metrics should not include throttle wait time" - ) - - log.info("validate that the throttling wait time metrics is correct") assert pytest.approx(actual_throttled_secs + actual_smgr_query_seconds, 0.1) == duration_secs, ( - "most of the time in this test is spent throttled because the rate-limit's contribution to latency dominates" - ) - - -throttle_config_with_field_fair_set = { - "task_kinds": ["PageRequestHandler"], - "fair": True, - "initial": 27, - "refill_interval": "43s", - "refill_amount": 23, - "max": 42, -} - - -def assert_throttle_config_with_field_fair_set(conf): - """ - Field `fair` is ignored, so, responses don't contain it - """ - without_fair = copy.deepcopy(throttle_config_with_field_fair_set) - without_fair.pop("fair") - - assert conf == without_fair - - -def test_throttle_fair_config_is_settable_but_ignored_in_mgmt_api(neon_env_builder: NeonEnvBuilder): - """ - To be removed after https://github.com/neondatabase/neon/pull/8539 is rolled out. - """ - env = neon_env_builder.init_start() - vps_http = env.storage_controller.pageserver_api() - # with_fair config should still be settable - vps_http.set_tenant_config( - env.initial_tenant, - {"timeline_get_throttle": throttle_config_with_field_fair_set}, - ) - conf = vps_http.tenant_config(env.initial_tenant) - assert_throttle_config_with_field_fair_set(conf.effective_config["timeline_get_throttle"]) - assert_throttle_config_with_field_fair_set( - conf.tenant_specific_overrides["timeline_get_throttle"] + "throttling and processing latency = total request time; this assert validates thi holds on average" ) - -def test_throttle_fair_config_is_settable_but_ignored_in_config_toml( - neon_env_builder: NeonEnvBuilder, -): - """ - To be removed after https://github.com/neondatabase/neon/pull/8539 is rolled out. 
- """ - - def set_tenant_config(ps_cfg): - tenant_config = ps_cfg.setdefault("tenant_config", {}) - tenant_config["timeline_get_throttle"] = throttle_config_with_field_fair_set - - neon_env_builder.pageserver_config_override = set_tenant_config - env = neon_env_builder.init_start() - ps_http = env.pageserver.http_client() - conf = ps_http.tenant_config(env.initial_tenant) - assert_throttle_config_with_field_fair_set(conf.effective_config["timeline_get_throttle"]) - - env.pageserver.allowed_errors.append( - r'.*ignoring unknown configuration item path="tenant_config\.timeline_get_throttle\.fair"*' + # without this assertion, the test would pass even if the throttling was completely broken + # but the request processing is so slow that it makes up for the latency that a correct throttling + # implementation would add + assert actual_smgr_query_seconds < 0.66 * duration_secs, ( + "test self-test: request processing is consuming most of the wall clock time; this risks that we're not actually testing throttling" ) diff --git a/test_runner/regress/test_pageserver_secondary.py b/test_runner/regress/test_pageserver_secondary.py index c73a592d98d6..d48e73139403 100644 --- a/test_runner/regress/test_pageserver_secondary.py +++ b/test_runner/regress/test_pageserver_secondary.py @@ -61,7 +61,7 @@ def evict_random_layers( ) client = pageserver.http_client() for layer in initial_local_layers: - if "ephemeral" in layer.name or "temp_download" in layer.name: + if "ephemeral" in layer.name or "temp_download" in layer.name or ".___temp" in layer.name: continue layer_name = parse_layer_file_name(layer.name) @@ -242,7 +242,13 @@ def ignore_notify(request: Request): pageserver.tenant_location_configure(tenant_id, location_conf) last_state[pageserver.id] = (mode, generation) - if mode.startswith("Attached"): + # It's only valid to connect to the last generation. Newer generations may yank layer + # files used in older generations. 
+ last_generation = max( + [s[1] for s in last_state.values() if s[1] is not None], default=None + ) + + if mode.startswith("Attached") and generation == last_generation: # This is a basic test: we are validating that he endpoint works properly _between_ # configuration changes. A stronger test would be to validate that clients see # no errors while we are making the changes. diff --git a/test_runner/regress/test_pg_regress.py b/test_runner/regress/test_pg_regress.py index a3fae9732741..0fea70688801 100644 --- a/test_runner/regress/test_pg_regress.py +++ b/test_runner/regress/test_pg_regress.py @@ -239,6 +239,8 @@ def test_isolation( "neon.regress_test_mode = true", # Stack size should be increased for tests to pass with asan. "max_stack_depth = 4MB", + # Neon extensiosn starts 2 BGW so decreasing number of parallel workers which can affect deadlock-parallel test if it hits max_worker_processes. + "max_worker_processes = 16", ], ) endpoint.safe_psql(f"CREATE DATABASE {DBNAME}") diff --git a/test_runner/regress/test_ssl.py b/test_runner/regress/test_ssl.py index 9a7204ca17cc..39c94c05a9d7 100644 --- a/test_runner/regress/test_ssl.py +++ b/test_runner/regress/test_ssl.py @@ -1,5 +1,6 @@ import os import ssl +from datetime import datetime, timedelta import pytest import requests @@ -151,3 +152,63 @@ def cert_reloaded(): requests.get(addr, verify=str(env.ssl_ca_file)).raise_for_status() cur_cert = ssl.get_server_certificate(("localhost", port)) assert cur_cert == sk_cert + + +def test_server_and_cert_metrics(neon_env_builder: NeonEnvBuilder): + """ + Test metrics exported from http/https server and tls cert reloader. + """ + neon_env_builder.use_https_pageserver_api = True + neon_env_builder.pageserver_config_override = "ssl_cert_reload_period='100 ms'" + env = neon_env_builder.init_start() + + env.pageserver.allowed_errors.append(".*Error reloading certificate.*") + + ps_client = env.pageserver.http_client() + + # 1. Test connection started metric. 
+ filter_https = {"scheme": "https"} + old_https_conn_count = ( + ps_client.get_metric_value("http_server_connection_started_total", filter_https) or 0 + ) + + addr = f"https://localhost:{env.pageserver.service_port.https}/v1/status" + requests.get(addr, verify=str(env.ssl_ca_file)).raise_for_status() + + new_https_conn_count = ( + ps_client.get_metric_value("http_server_connection_started_total", filter_https) or 0 + ) + # The counter should increase after the request, + # but it may increase by more than one because of storcon requests. + assert new_https_conn_count > old_https_conn_count + + # 2. Test tls connection error. + # Request without specified CA cert file should fail. + with pytest.raises(requests.exceptions.SSLError): + requests.get(addr) + + tls_error_cnt = ( + ps_client.get_metric_value("http_server_connection_errors_total", {"type": "tls"}) or 0 + ) + assert tls_error_cnt == 1 + + # 3. Test expiration time metric. + expiration_time = datetime.fromtimestamp( + ps_client.get_metric_value("tls_certs_expiration_time_seconds") or 0 + ) + now = datetime.now() + # neon_local generates certs valid for 100 years. + # Compare with +-1 year to not care about leap years. + assert now + timedelta(days=365 * 99) < expiration_time < now + timedelta(days=365 * 101) + + # 4. Test cert reload failed metric. 
+ reload_error_cnt = ps_client.get_metric_value("tls_certs_reload_failed_total") + assert reload_error_cnt == 0 + + os.remove(env.pageserver.workdir / "server.crt") + + def reload_failed(): + reload_error_cnt = ps_client.get_metric_value("tls_certs_reload_failed_total") or 0 + assert reload_error_cnt > 0 + + wait_until(reload_failed) diff --git a/test_runner/regress/test_storage_controller.py b/test_runner/regress/test_storage_controller.py index 702f4eeccfe2..b2c8415e9a09 100644 --- a/test_runner/regress/test_storage_controller.py +++ b/test_runner/regress/test_storage_controller.py @@ -95,6 +95,7 @@ def test_storage_controller_smoke( env.pageservers[1].start() for sk in env.safekeepers: sk.start() + env.object_storage.start() # The pageservers we started should have registered with the sharding service on startup nodes = env.storage_controller.node_list() @@ -346,6 +347,7 @@ def prepare_onboarding_env( env = neon_env_builder.init_configs() env.broker.start() env.storage_controller.start() + env.object_storage.start() # This is the pageserver where we'll initially create the tenant. Run it in emergency # mode so that it doesn't talk to storage controller, and do not register it. @@ -675,7 +677,7 @@ def received_restart_notification(): env.storage_controller.tenant_shard_split(env.initial_tenant, shard_count=2) expect = { "tenant_id": str(env.initial_tenant), - "stripe_size": 32768, + "stripe_size": 2048, "shards": [ {"node_id": int(env.pageservers[1].id), "shard_number": 0}, {"node_id": int(env.pageservers[1].id), "shard_number": 1}, @@ -2890,10 +2892,12 @@ def new_becomes_leader(): ) +@pytest.mark.parametrize("step_down_times_out", [False, True]) def test_storage_controller_leadership_transfer_during_split( neon_env_builder: NeonEnvBuilder, storage_controller_proxy: StorageControllerProxy, port_distributor: PortDistributor, + step_down_times_out: bool, ): """ Exercise a race between shard splitting and graceful leadership transfer. 
This is @@ -2934,6 +2938,18 @@ def test_storage_controller_leadership_transfer_during_split( ) env.storage_controller.reconcile_until_idle() + # We are testing scenarios where the step down API does not complete: either because it is stuck + # doing a shard split, or because it totally times out on some other failpoint. + env.storage_controller.allowed_errors.extend( + [ + ".*step_down.*request was dropped before completing.*", + ".*step_down.*operation timed out.*", + ".*Send step down request failed, will retry.*", + ".*Send step down request still failed after.*retries.*", + ".*Leader .+ did not respond to step-down request.*", + ] + ) + with concurrent.futures.ThreadPoolExecutor(max_workers=2) as executor: # Start a shard split env.storage_controller.allowed_errors.extend( @@ -2941,6 +2957,14 @@ def test_storage_controller_leadership_transfer_during_split( ) pause_failpoint = "shard-split-pre-complete" env.storage_controller.configure_failpoints((pause_failpoint, "pause")) + + if not step_down_times_out: + # Prevent the timeout self-terminate code from executing: we will block step down on the + # shard split itself + env.storage_controller.configure_failpoints( + ("step-down-delay-timeout", "return(3600000)") + ) + split_fut = executor.submit( env.storage_controller.tenant_shard_split, list(tenants)[0], shard_count * 2 ) @@ -2959,12 +2983,20 @@ def hit_failpoint(): timeout_in_seconds=30, instance_id=2, base_port=storage_controller_2_port ) + if step_down_times_out: + # Step down will time out, original controller will terminate itself + env.storage_controller.allowed_errors.extend([".*terminating process.*"]) + else: + # Step down does not time out: original controller hits its shard split completion + # code path and realises that it must not purge the parent shards from the database. 
+ env.storage_controller.allowed_errors.extend([".*Enqueuing background abort.*"]) + def passed_split_abort(): try: log.info("Checking log for pattern...") - assert env.storage_controller.log_contains( - ".*Using observed state received from leader.*" - ) + # This log is indicative of entering startup_reconcile, which happens + # after the point we would abort shard splits + assert env.storage_controller.log_contains(".*Populating tenant shards.*") except Exception: log.exception("Failed to find pattern in log") raise @@ -2973,34 +3005,42 @@ def passed_split_abort(): wait_until(passed_split_abort, interval=0.1, status_interval=1.0) assert env.storage_controller.log_contains(".*Aborting shard split.*") - # Proxy is still talking to original controller here: disable its pause failpoint so - # that its shard split can run to completion. - log.info("Disabling failpoint") - # Bypass the proxy: the python test HTTPServer is single threaded and still blocked - # on handling the shard split request. - env.storage_controller.request( - "PUT", - f"http://127.0.0.1:{storage_controller_1_port}/debug/v1/failpoints", - json=[{"name": "shard-split-pre-complete", "actions": "off"}], - headers=env.storage_controller.headers(TokenScope.ADMIN), - ) + if step_down_times_out: + # We will let the old controller hit a timeout path where it terminates itself, rather than + # completing step_down and trying to complete a shard split + def old_controller_terminated(): + assert env.storage_controller.log_contains(".*terminating process.*") - def previous_stepped_down(): - assert ( - env.storage_controller.get_leadership_status() - == StorageControllerLeadershipStatus.STEPPED_DOWN + wait_until(old_controller_terminated) + else: + # Proxy is still talking to original controller here: disable its pause failpoint so + # that its shard split can run to completion. 
+ log.info("Disabling failpoint") + # Bypass the proxy: the python test HTTPServer is single threaded and still blocked + # on handling the shard split request. + env.storage_controller.request( + "PUT", + f"http://127.0.0.1:{storage_controller_1_port}/debug/v1/failpoints", + json=[{"name": "shard-split-pre-complete", "actions": "off"}], + headers=env.storage_controller.headers(TokenScope.ADMIN), ) - log.info("Awaiting step down") - wait_until(previous_stepped_down) + def previous_stepped_down(): + assert ( + env.storage_controller.get_leadership_status() + == StorageControllerLeadershipStatus.STEPPED_DOWN + ) - # Let the shard split complete: this may happen _after_ the replacement has come up - # and tried to clean up the databases - log.info("Unblocking & awaiting shard split") - with pytest.raises(Exception, match="Unexpected child shard count"): - # This split fails when it tries to persist results, because it encounters - # changes already made by the new controller's abort-on-startup - split_fut.result() + log.info("Awaiting step down") + wait_until(previous_stepped_down) + + # Let the shard split complete: this may happen _after_ the replacement has come up + # and tried to clean up the databases + log.info("Unblocking & awaiting shard split") + with pytest.raises(Exception, match="Unexpected child shard count"): + # This split fails when it tries to persist results, because it encounters + # changes already made by the new controller's abort-on-startup + split_fut.result() log.info("Routing to new leader") storage_controller_proxy.route_to(f"http://127.0.0.1:{storage_controller_2_port}") @@ -3018,13 +3058,14 @@ def new_becomes_leader(): env.storage_controller.wait_until_ready() env.storage_controller.consistency_check() - # Check that the stepped down instance forwards requests - # to the new leader while it's still running. 
- storage_controller_proxy.route_to(f"http://127.0.0.1:{storage_controller_1_port}") - env.storage_controller.tenant_shard_dump() - env.storage_controller.node_configure(env.pageservers[0].id, {"scheduling": "Pause"}) - status = env.storage_controller.node_status(env.pageservers[0].id) - assert status["scheduling"] == "Pause" + if not step_down_times_out: + # Check that the stepped down instance forwards requests + # to the new leader while it's still running. + storage_controller_proxy.route_to(f"http://127.0.0.1:{storage_controller_1_port}") + env.storage_controller.tenant_shard_dump() + env.storage_controller.node_configure(env.pageservers[0].id, {"scheduling": "Pause"}) + status = env.storage_controller.node_status(env.pageservers[0].id) + assert status["scheduling"] == "Pause" def test_storage_controller_ps_restarted_during_drain(neon_env_builder: NeonEnvBuilder): diff --git a/test_runner/regress/test_storage_scrubber.py b/test_runner/regress/test_storage_scrubber.py index 70af299de355..03cd133ccbb8 100644 --- a/test_runner/regress/test_storage_scrubber.py +++ b/test_runner/regress/test_storage_scrubber.py @@ -75,7 +75,7 @@ def test_scrubber_tenant_snapshot(neon_env_builder: NeonEnvBuilder, shard_count: tenant_shard_ids = [TenantShardId(tenant_id, 0, 0)] # Let shards finish rescheduling to other pageservers: this makes the rest of the test more stable - # is it won't overlap with migrations + # as it won't overlap with migrations env.storage_controller.reconcile_until_idle(max_interval=0.1, timeout_secs=120) output_path = neon_env_builder.test_output_dir / "snapshot" @@ -87,6 +87,13 @@ def test_scrubber_tenant_snapshot(neon_env_builder: NeonEnvBuilder, shard_count: workload.stop() + # Disable scheduling, so the storage controller doesn't migrate shards around + # while we are stopping pageservers + env.storage_controller.tenant_policy_update(tenant_id, {"scheduling": "Stop"}) + env.storage_controller.allowed_errors.extend( + [".*Scheduling is disabled by 
policy Stop.*", ".*Skipping reconcile for policy Stop.*"] + ) + # Stop pageservers for pageserver in env.pageservers: pageserver.stop() @@ -127,9 +134,16 @@ def test_scrubber_tenant_snapshot(neon_env_builder: NeonEnvBuilder, shard_count: for pageserver in env.pageservers: pageserver.start() + # Turn scheduling back on. + # We don't care about optimizations, so enable only essential scheduling + env.storage_controller.tenant_policy_update(tenant_id, {"scheduling": "Essential"}) + # Check we can read everything workload.validate() + # Reconcile to avoid a race between test shutdown and background reconciliation (#11278) + env.storage_controller.reconcile_until_idle() + def drop_local_state(env: NeonEnv, tenant_id: TenantId): env.storage_controller.tenant_policy_update(tenant_id, {"placement": "Detached"}) diff --git a/test_runner/regress/test_subscriber_branching.py b/test_runner/regress/test_subscriber_branching.py index 6175643389a4..83bebc19becb 100644 --- a/test_runner/regress/test_subscriber_branching.py +++ b/test_runner/regress/test_subscriber_branching.py @@ -251,7 +251,7 @@ def test_multiple_subscription_branching(neon_simple_env: NeonEnv): NUMBER_OF_DBS = 5 # Create and start endpoint so that neon_local put all the generated - # stuff into the spec.json file. + # stuff into the config.json file. endpoint = env.endpoints.create_start( "main", config_lines=[ @@ -280,13 +280,15 @@ def test_multiple_subscription_branching(neon_simple_env: NeonEnv): } ) - # Update the spec.json file to create the databases + # Update the config.json file to create the databases # and reconfigure the endpoint to apply the changes. 
endpoint.respec_deep( **{ - "skip_pg_catalog_updates": False, - "cluster": { - "databases": TEST_DB_NAMES, + "spec": { + "skip_pg_catalog_updates": False, + "cluster": { + "databases": TEST_DB_NAMES, + }, }, } ) diff --git a/test_runner/regress/test_tenants.py b/test_runner/regress/test_tenants.py index c613a79374f1..c00f8f4ca5a2 100644 --- a/test_runner/regress/test_tenants.py +++ b/test_runner/regress/test_tenants.py @@ -390,6 +390,7 @@ def test_create_churn_during_restart(neon_env_builder: NeonEnvBuilder): # Tenant creation requests which arrive out of order will generate complaints about # generation nubmers out of order. env.pageserver.allowed_errors.append(".*Generation .+ is less than existing .+") + env.pageserver.allowed_errors.append(".*due to stale generation.+") # Timeline::flush_and_shutdown cannot tell if it is hitting a failure because of # an incomplete attach, or some other problem. In the field this should be rare, diff --git a/test_runner/regress/test_wal_acceptor.py b/test_runner/regress/test_wal_acceptor.py index e3d39f9315ee..a9a6699e5cb5 100644 --- a/test_runner/regress/test_wal_acceptor.py +++ b/test_runner/regress/test_wal_acceptor.py @@ -45,7 +45,7 @@ s3_storage, ) from fixtures.safekeeper.http import ( - Configuration, + MembershipConfiguration, SafekeeperHttpClient, SafekeeperId, TimelineCreateRequest, @@ -589,7 +589,7 @@ def test_s3_wal_replay(neon_env_builder: NeonEnvBuilder, remote_storage_kind: Re for sk in env.safekeepers: sk.start() cli = sk.http_client() - mconf = Configuration(generation=0, members=[], new_members=None) + mconf = MembershipConfiguration(generation=0, members=[], new_members=None) # set start_lsn to the beginning of the first segment to allow reading # WAL from there (could you intidb LSN as well). 
r = TimelineCreateRequest( @@ -1948,7 +1948,7 @@ def test_membership_api(neon_env_builder: NeonEnvBuilder): sk_id_2 = SafekeeperId(11, "localhost", 5434) # just a mock # Request to switch before timeline creation should fail. - init_conf = Configuration(generation=1, members=[sk_id_1], new_members=None) + init_conf = MembershipConfiguration(generation=1, members=[sk_id_1], new_members=None) with pytest.raises(requests.exceptions.HTTPError): http_cli.membership_switch(tenant_id, timeline_id, init_conf) @@ -1960,7 +1960,7 @@ def test_membership_api(neon_env_builder: NeonEnvBuilder): http_cli.timeline_create(create_r) # Switch into some conf. - joint_conf = Configuration(generation=4, members=[sk_id_1], new_members=[sk_id_2]) + joint_conf = MembershipConfiguration(generation=4, members=[sk_id_1], new_members=[sk_id_2]) resp = http_cli.membership_switch(tenant_id, timeline_id, joint_conf) log.info(f"joint switch resp: {resp}") assert resp.previous_conf.generation == 1 @@ -1973,24 +1973,26 @@ def test_membership_api(neon_env_builder: NeonEnvBuilder): assert after_restart.generation == 4 # Switch into non joint conf of which sk is not a member, must fail. - non_joint_not_member = Configuration(generation=5, members=[sk_id_2], new_members=None) + non_joint_not_member = MembershipConfiguration( + generation=5, members=[sk_id_2], new_members=None + ) with pytest.raises(requests.exceptions.HTTPError): resp = http_cli.membership_switch(tenant_id, timeline_id, non_joint_not_member) # Switch into good non joint conf. - non_joint = Configuration(generation=6, members=[sk_id_1], new_members=None) + non_joint = MembershipConfiguration(generation=6, members=[sk_id_1], new_members=None) resp = http_cli.membership_switch(tenant_id, timeline_id, non_joint) log.info(f"non joint switch resp: {resp}") assert resp.previous_conf.generation == 4 assert resp.current_conf.generation == 6 # Switch request to lower conf should be rejected. 
- lower_conf = Configuration(generation=3, members=[sk_id_1], new_members=None) + lower_conf = MembershipConfiguration(generation=3, members=[sk_id_1], new_members=None) with pytest.raises(requests.exceptions.HTTPError): http_cli.membership_switch(tenant_id, timeline_id, lower_conf) # Now, exclude sk from the membership, timeline should be deleted. - excluded_conf = Configuration(generation=7, members=[sk_id_2], new_members=None) + excluded_conf = MembershipConfiguration(generation=7, members=[sk_id_2], new_members=None) http_cli.timeline_exclude(tenant_id, timeline_id, excluded_conf) with pytest.raises(requests.exceptions.HTTPError): http_cli.timeline_status(tenant_id, timeline_id) @@ -2010,11 +2012,6 @@ def test_explicit_timeline_creation(neon_env_builder: NeonEnvBuilder): tenant_id = env.initial_tenant timeline_id = env.initial_timeline - ps = env.pageservers[0] - ps_http_cli = ps.http_client() - - http_clis = [sk.http_client() for sk in env.safekeepers] - config_lines = [ "neon.safekeeper_proto_version = 3", ] @@ -2023,22 +2020,11 @@ def test_explicit_timeline_creation(neon_env_builder: NeonEnvBuilder): # expected to fail because timeline is not created on safekeepers with pytest.raises(Exception, match=r".*timed out.*"): ep.start(safekeeper_generation=1, safekeepers=[1, 2, 3], timeout="2s") - # figure out initial LSN. 
- ps_timeline_detail = ps_http_cli.timeline_detail(tenant_id, timeline_id) - init_lsn = ps_timeline_detail["last_record_lsn"] - log.info(f"initial LSN: {init_lsn}") - # sk timeline creation request expects minor version - pg_version = ps_timeline_detail["pg_version"] * 10000 # create inital mconf - sk_ids = [SafekeeperId(sk.id, "localhost", sk.port.pg_tenant_only) for sk in env.safekeepers] - mconf = Configuration(generation=1, members=sk_ids, new_members=None) - create_r = TimelineCreateRequest( - tenant_id, timeline_id, mconf, pg_version, Lsn(init_lsn), commit_lsn=None + mconf = MembershipConfiguration( + generation=1, members=Safekeeper.sks_to_safekeeper_ids(env.safekeepers), new_members=None ) - log.info(f"sending timeline create: {create_r.to_json()}") - - for sk_http_cli in http_clis: - sk_http_cli.timeline_create(create_r) + Safekeeper.create_timeline(tenant_id, timeline_id, env.pageservers[0], mconf, env.safekeepers) # Once timeline created endpoint should start. ep.start(safekeeper_generation=1, safekeepers=[1, 2, 3]) ep.safe_psql("CREATE TABLE IF NOT EXISTS t(key int, value text)") diff --git a/test_runner/regress/test_wal_acceptor_async.py b/test_runner/regress/test_wal_acceptor_async.py index b7c7478e7816..c5dd34f64ff2 100644 --- a/test_runner/regress/test_wal_acceptor_async.py +++ b/test_runner/regress/test_wal_acceptor_async.py @@ -18,6 +18,7 @@ Safekeeper, ) from fixtures.remote_storage import RemoteStorageKind +from fixtures.safekeeper.http import MembershipConfiguration from fixtures.utils import skip_in_debug_build if TYPE_CHECKING: @@ -452,20 +453,24 @@ def test_concurrent_computes(neon_env_builder: NeonEnvBuilder): asyncio.run(run_concurrent_computes(env)) +async def assert_query_hangs(endpoint: Endpoint, query: str): + """ + Start on endpoint query which is expected to hang and check that it does. 
+ """ + conn = await endpoint.connect_async() + bg_query = asyncio.create_task(conn.execute(query)) + await asyncio.sleep(2) + assert not bg_query.done() + return bg_query + + # Stop safekeeper and check that query cannot be executed while safekeeper is down. # Query will insert a single row into a table. -async def check_unavailability( - sk: Safekeeper, conn: asyncpg.Connection, key: int, start_delay_sec: int = 2 -): +async def check_unavailability(sk: Safekeeper, ep: Endpoint, key: int, start_delay_sec: int = 2): # shutdown one of two acceptors, that is, majority sk.stop() - bg_query = asyncio.create_task(conn.execute(f"INSERT INTO t values ({key}, 'payload')")) - - await asyncio.sleep(start_delay_sec) - # ensure that the query has not been executed yet - assert not bg_query.done() - + bg_query = await assert_query_hangs(ep, f"INSERT INTO t values ({key}, 'payload')") # start safekeeper and await the query sk.start() await bg_query @@ -480,10 +485,10 @@ async def run_unavailability(env: NeonEnv, endpoint: Endpoint): await conn.execute("INSERT INTO t values (1, 'payload')") # stop safekeeper and check that query cannot be executed while safekeeper is down - await check_unavailability(env.safekeepers[0], conn, 2) + await check_unavailability(env.safekeepers[0], endpoint, 2) # for the world's balance, do the same with second safekeeper - await check_unavailability(env.safekeepers[1], conn, 3) + await check_unavailability(env.safekeepers[1], endpoint, 3) # check that we can execute queries after restart await conn.execute("INSERT INTO t values (4, 'payload')") @@ -514,15 +519,7 @@ async def run_recovery_uncommitted(env: NeonEnv): # insert with only one safekeeper up to create tail of flushed but not committed WAL sk1.stop() sk2.stop() - conn = await ep.connect_async() - # query should hang, so execute in separate task - bg_query = asyncio.create_task( - conn.execute("insert into t select generate_series(1, 2000), 'payload'") - ) - sleep_sec = 2 - await 
asyncio.sleep(sleep_sec) - # it must still be not finished - assert not bg_query.done() + await assert_query_hangs(ep, "insert into t select generate_series(1, 2000), 'payload'") # note: destoy will kill compute_ctl, preventing it waiting for hanging sync-safekeepers. ep.stop_and_destroy() @@ -559,15 +556,7 @@ async def run_wal_truncation(env: NeonEnv, safekeeper_proto_version: int): # insert with only one sk3 up to create tail of flushed but not committed WAL on it sk1.stop() sk2.stop() - conn = await ep.connect_async() - # query should hang, so execute in separate task - bg_query = asyncio.create_task( - conn.execute("insert into t select generate_series(1, 180000), 'Papaya'") - ) - sleep_sec = 2 - await asyncio.sleep(sleep_sec) - # it must still be not finished - assert not bg_query.done() + await assert_query_hangs(ep, "insert into t select generate_series(1, 180000), 'Papaya'") # note: destoy will kill compute_ctl, preventing it waiting for hanging sync-safekeepers. ep.stop_and_destroy() @@ -607,6 +596,132 @@ def test_wal_truncation(neon_env_builder: NeonEnvBuilder, safekeeper_proto_versi asyncio.run(run_wal_truncation(env, safekeeper_proto_version)) +async def quorum_sanity_single( + env: NeonEnv, + compute_sks_ids: list[int], + members_sks_ids: list[int], + new_members_sks_ids: list[int] | None, + sks_to_stop_ids: list[int], + should_work_when_stopped: bool, +): + """ + *_ids params contain safekeeper node ids; it is assumed they are issued + from 1 and sequentially assigned to env.safekeepers. 
+ """ + members_sks = [env.safekeepers[i - 1] for i in members_sks_ids] + new_members_sks = ( + [env.safekeepers[i - 1] for i in new_members_sks_ids] if new_members_sks_ids else None + ) + sks_to_stop = [env.safekeepers[i - 1] for i in sks_to_stop_ids] + + mconf = MembershipConfiguration( + generation=1, + members=Safekeeper.sks_to_safekeeper_ids(members_sks), + new_members=Safekeeper.sks_to_safekeeper_ids(new_members_sks) if new_members_sks else None, + ) + members_sks = Safekeeper.mconf_sks(env, mconf) + + tenant_id = env.initial_tenant + compute_sks_ids_str = "-".join([str(sk_id) for sk_id in compute_sks_ids]) + members_sks_ids_str = "-".join([str(sk.id) for sk in mconf.members]) + new_members_sks_ids_str = "-".join( + [str(sk.id) for sk in mconf.new_members] if mconf.new_members is not None else [] + ) + sks_to_stop_ids_str = "-".join([str(sk.id) for sk in sks_to_stop]) + log.info( + f"running quorum_sanity_single with compute_sks={compute_sks_ids_str}, members_sks={members_sks_ids_str}, new_members_sks={new_members_sks_ids_str}, sks_to_stop={sks_to_stop_ids_str}, should_work_when_stopped={should_work_when_stopped}" + ) + branch_name = f"test_quorum_single_c{compute_sks_ids_str}_m{members_sks_ids_str}_{new_members_sks_ids_str}_s{sks_to_stop_ids_str}" + timeline_id = env.create_branch(branch_name) + + # create timeline on `members_sks` + Safekeeper.create_timeline(tenant_id, timeline_id, env.pageservers[0], mconf, members_sks) + + config_lines = [ + "neon.safekeeper_proto_version = 3", + ] + ep = env.endpoints.create(branch_name, config_lines=config_lines) + ep.start(safekeeper_generation=1, safekeepers=compute_sks_ids) + ep.safe_psql("create table t(key int, value text)") + + # stop specified sks and check whether writes work + for sk in sks_to_stop: + sk.stop() + if should_work_when_stopped: + log.info("checking that writes still work") + ep.safe_psql("insert into t select generate_series(1, 100), 'Papaya'") + # restarting ep should also be fine + ep.stop() + 
ep.start() + ep.safe_psql("insert into t select generate_series(1, 100), 'plum'") + bg_query = None + else: + log.info("checking that writes hang") + bg_query = await assert_query_hangs( + ep, "insert into t select generate_series(1, 100), 'Papaya'" + ) + # start again; now they should work + for sk in sks_to_stop: + sk.start() + if bg_query: + log.info("awaiting query") + await bg_query + + +# It's a bit tempting to iterate over all possible combinations, but let's stick +# with this for now. +async def run_quorum_sanity(env: NeonEnv): + # 3 members, all up, should work + await quorum_sanity_single(env, [1, 2, 3], [1, 2, 3], None, [], True) + # 3 members, 2/3 up, should work + await quorum_sanity_single(env, [1, 2, 3], [1, 2, 3], None, [3], True) + # 3 members, 1/3 up, should not work + await quorum_sanity_single(env, [1, 2, 3], [1, 2, 3], None, [2, 3], False) + + # 3 members, all up, should work; wp redundantly talks to 4th. + await quorum_sanity_single(env, [1, 2, 3, 4], [1, 2, 3], None, [], True) + # 3 members, all up, should work with wp talking to 2 of these 3 + plus one redundant + await quorum_sanity_single(env, [2, 3, 4], [1, 2, 3], None, [], True) + # 3 members, 2/3 up, could work but wp talks to different 3s, so it shouldn't + await quorum_sanity_single(env, [2, 3, 4], [1, 2, 3], None, [3], False) + + # joint conf of 1-2-3 and 4, all up, should work + await quorum_sanity_single(env, [1, 2, 3, 4], [1, 2, 3], [4], [], True) + # joint conf of 1-2-3 and 4, 4 down, shouldn't work + await quorum_sanity_single(env, [1, 2, 3, 4], [1, 2, 3], [4], [4], False) + + # joint conf of 1-2-3 and 2-3-4, all up, should work + await quorum_sanity_single(env, [1, 2, 3, 4], [1, 2, 3], [2, 3, 4], [], True) + # joint conf of 1-2-3 and 2-3-4, 1 and 4 down, should work + await quorum_sanity_single(env, [1, 2, 3, 4], [1, 2, 3], [2, 3, 4], [1, 4], True) + # joint conf of 1-2-3 and 2-3-4, 2 down, should work + await quorum_sanity_single(env, [1, 2, 3, 4], [1, 2, 3], [2, 3, 4], [2], 
True) + # joint conf of 1-2-3 and 2-3-4, 3 down, should work + await quorum_sanity_single(env, [1, 2, 3, 4], [1, 2, 3], [2, 3, 4], [3], True) + # joint conf of 1-2-3 and 2-3-4, 1 and 2 down, shouldn't work + await quorum_sanity_single(env, [1, 2, 3, 4], [1, 2, 3], [2, 3, 4], [1, 2], False) + # joint conf of 1-2-3 and 2-3-4, 2 and 4 down, shouldn't work + await quorum_sanity_single(env, [1, 2, 3, 4], [1, 2, 3], [2, 3, 4], [2, 4], False) + + # joint conf of 1-2-3 and 2-3-4 with wp talking to 2-3-4 only. + await quorum_sanity_single(env, [2, 3, 4], [1, 2, 3], [2, 3, 4], [], True) + # with 1 down should still be ok + await quorum_sanity_single(env, [2, 3, 4], [1, 2, 3], [2, 3, 4], [1], True) + # but with 2 down not ok + await quorum_sanity_single(env, [2, 3, 4], [1, 2, 3], [2, 3, 4], [2], False) + + +# Test various combinations of membership configurations / neon.safekeepers +# (list of safekeepers endpoint connects to) values / up & down safekeepers and +# check that endpont can start and write data when we have quorum and can't when +# we don't. +def test_quorum_sanity(neon_env_builder: NeonEnvBuilder): + neon_env_builder.num_safekeepers = 4 + env = neon_env_builder.init_start() + + asyncio.run(run_quorum_sanity(env)) + + async def run_segment_init_failure(env: NeonEnv): env.create_branch("test_segment_init_failure") ep = env.endpoints.create_start("test_segment_init_failure")