diff --git a/.gitignore b/.gitignore index 65d7fa78..28e560cc 100644 --- a/.gitignore +++ b/.gitignore @@ -12,6 +12,8 @@ templates/ .env.registry docs/guides/miden-dashboard/operators.json docs/guides/miden-dashboard/guardian-dashboard/ +docs/guides/horizontal-scaling/operators.json +docs/guides/horizontal-scaling/docker-compose.override.yml .cursor/ .claude/ diff --git a/Cargo.lock b/Cargo.lock index 052f46eb..b9ea5554 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3263,6 +3263,7 @@ dependencies = [ "tokio", "tokio-postgres", "tokio-postgres-rustls", + "tokio-util", "tonic", "tonic-prost", "tonic-prost-build", @@ -8053,6 +8054,7 @@ dependencies = [ "bytes", "futures-core", "futures-sink", + "futures-util", "pin-project-lite", "tokio", ] diff --git a/crates/server/Cargo.toml b/crates/server/Cargo.toml index d2c108a9..fd28ea33 100644 --- a/crates/server/Cargo.toml +++ b/crates/server/Cargo.toml @@ -41,6 +41,7 @@ url = "2.5" zeroize = { version = "1.7", features = ["derive"] } serde_json = { workspace = true } tokio = { workspace = true, features = ["full"] } +tokio-util = { version = "0.7", features = ["rt"] } tonic = { workspace = true } tonic-prost = { workspace = true } tonic-reflection = "0.14" diff --git a/crates/server/migrations/2026-06-23-000001_auth_sessions/down.sql b/crates/server/migrations/2026-06-23-000001_auth_sessions/down.sql new file mode 100644 index 00000000..874930ac --- /dev/null +++ b/crates/server/migrations/2026-06-23-000001_auth_sessions/down.sql @@ -0,0 +1,3 @@ +DROP INDEX IF EXISTS auth_sessions_realm_expires_idx; +DROP INDEX IF EXISTS auth_sessions_expires_idx; +DROP TABLE IF EXISTS auth_sessions; diff --git a/crates/server/migrations/2026-06-23-000001_auth_sessions/up.sql b/crates/server/migrations/2026-06-23-000001_auth_sessions/up.sql new file mode 100644 index 00000000..0b26a593 --- /dev/null +++ b/crates/server/migrations/2026-06-23-000001_auth_sessions/up.sql @@ -0,0 +1,21 @@ +-- Shared operator/EVM session store for horizontal 
scaling (issue #242). +-- Sessions move out of per-process memory so a session issued on one replica +-- is honored on every replica. Keyed by the SHA-256 digest of the session +-- token (the plaintext token is never stored). The primary key is composite on +-- (realm, token_digest) so operator and EVM sessions share one table with the +-- realm boundary enforced by the database, not merely by token randomness. + +CREATE TABLE auth_sessions ( + realm TEXT NOT NULL, + token_digest BYTEA NOT NULL, + subject JSONB NOT NULL, + issued_at TIMESTAMPTZ NOT NULL, + expires_at TIMESTAMPTZ NOT NULL, + -- Set on logout; the row is kept until natural expiry so the revocation + -- is honored fleet-wide for as long as the token would have been valid. + revoked_at TIMESTAMPTZ NULL, + PRIMARY KEY (realm, token_digest) +); + +CREATE INDEX auth_sessions_expires_idx ON auth_sessions (expires_at); +CREATE INDEX auth_sessions_realm_expires_idx ON auth_sessions (realm, expires_at); diff --git a/crates/server/migrations/2026-06-23-000002_auth_challenges/down.sql b/crates/server/migrations/2026-06-23-000002_auth_challenges/down.sql new file mode 100644 index 00000000..16f2effa --- /dev/null +++ b/crates/server/migrations/2026-06-23-000002_auth_challenges/down.sql @@ -0,0 +1,3 @@ +DROP INDEX IF EXISTS auth_challenges_expires_idx; +DROP INDEX IF EXISTS auth_challenges_realm_principal_idx; +DROP TABLE IF EXISTS auth_challenges; diff --git a/crates/server/migrations/2026-06-23-000002_auth_challenges/up.sql b/crates/server/migrations/2026-06-23-000002_auth_challenges/up.sql new file mode 100644 index 00000000..be2494ed --- /dev/null +++ b/crates/server/migrations/2026-06-23-000002_auth_challenges/up.sql @@ -0,0 +1,21 @@ +-- Shared operator/EVM login-challenge store for horizontal scaling (issue #242). +-- A challenge issued on one replica must be verifiable on another. 
Realm-aware +-- so the two verification models coexist: `challenge_key` is the operator +-- signing-digest hex or the EVM nonce, and `payload` carries the realm-specific +-- fields needed to match/recover at verify time. Matching runs in Rust (Falcon +-- verify / ECDSA recover); the store provides the candidates and the single-use +-- claim. + +CREATE TABLE auth_challenges ( + realm TEXT NOT NULL, + challenge_key TEXT NOT NULL, + principal TEXT NOT NULL, + payload JSONB NOT NULL, + issued_at TIMESTAMPTZ NOT NULL, + expires_at TIMESTAMPTZ NOT NULL, + consumed_at TIMESTAMPTZ NULL, + PRIMARY KEY (realm, challenge_key) +); + +CREATE INDEX auth_challenges_realm_principal_idx ON auth_challenges (realm, principal); +CREATE INDEX auth_challenges_expires_idx ON auth_challenges (expires_at); diff --git a/crates/server/migrations/2026-06-24-000001_worker_leases/down.sql b/crates/server/migrations/2026-06-24-000001_worker_leases/down.sql new file mode 100644 index 00000000..f6742949 --- /dev/null +++ b/crates/server/migrations/2026-06-24-000001_worker_leases/down.sql @@ -0,0 +1 @@ +DROP TABLE IF EXISTS worker_leases; diff --git a/crates/server/migrations/2026-06-24-000001_worker_leases/up.sql b/crates/server/migrations/2026-06-24-000001_worker_leases/up.sql new file mode 100644 index 00000000..c225178e --- /dev/null +++ b/crates/server/migrations/2026-06-24-000001_worker_leases/up.sql @@ -0,0 +1,14 @@ +-- Single-owner coordination for background workers under horizontal scaling +-- (issue #242, subsumes #190). At most one replica holds a named lease at a +-- time; the holder renews on a heartbeat and a stale lease can be reclaimed by +-- another replica once it expires. `fence_token` increments only on a change of +-- holder (steal), so a superseded holder can be detected at its write boundary. 
+ +CREATE TABLE worker_leases ( + lease_name TEXT PRIMARY KEY, + holder_id TEXT NOT NULL, + acquired_at TIMESTAMPTZ NOT NULL, + renewed_at TIMESTAMPTZ NOT NULL, + expires_at TIMESTAMPTZ NOT NULL, + fence_token BIGINT NOT NULL DEFAULT 0 +); diff --git a/crates/server/src/ack/mod.rs b/crates/server/src/ack/mod.rs index 4fecb69f..caf2a511 100644 --- a/crates/server/src/ack/mod.rs +++ b/crates/server/src/ack/mod.rs @@ -11,7 +11,7 @@ pub mod miden_falcon_rpo; mod secrets_manager; use crate::delta_object::DeltaObject; -use crate::error::{GuardianError, Result}; +use crate::error::Result; use guardian_shared::SignatureScheme; use miden_protocol::crypto::dsa::ecdsa_k256_keccak::SigningKey as EcdsaSecretKey; use std::path::{Path, PathBuf}; @@ -25,9 +25,6 @@ pub(crate) use miden_ecdsa::{ }; pub use miden_falcon_rpo::MidenFalconRpoSigner; -const ENV_GUARDIAN_ENV: &str = "GUARDIAN_ENV"; -const PROD_ENV: &str = "prod"; - /// The ECDSA signer is abstracted over [`EcdsaSignerBackend`] so its key can live /// in a hosted backend (e.g. AWS KMS); Falcon stays concrete because hosted /// backends only support the secp256k1 ECDSA scheme. @@ -40,7 +37,7 @@ pub struct AckRegistry { impl AckRegistry { pub async fn new(keystore_path: PathBuf) -> Result { let ecdsa_backend = EcdsaBackendKind::from_env()?; - if is_prod_environment()? { + if crate::config::stage::is_prod()? 
{ let provider = AwsSecretsManagerProvider::from_env().await?; Self::from_provider(keystore_path, ecdsa_backend, Some(&provider)).await } else { @@ -135,19 +132,10 @@ async fn build_ecdsa_signer( Ok(MidenEcdsaSigner::new(backend)) } -fn is_prod_environment() -> Result { - match std::env::var(ENV_GUARDIAN_ENV) { - Ok(value) => Ok(value.eq_ignore_ascii_case(PROD_ENV)), - Err(std::env::VarError::NotPresent) => Ok(false), - Err(std::env::VarError::NotUnicode(_)) => Err(GuardianError::ConfigurationError(format!( - "{ENV_GUARDIAN_ENV} must contain valid UTF-8" - ))), - } -} - #[cfg(all(test, not(any(feature = "integration", feature = "e2e"))))] mod tests { use super::*; + use crate::error::GuardianError; use async_trait::async_trait; use miden_keystore::{EcdsaKeyStore, FilesystemEcdsaKeyStore, FilesystemKeyStore, KeyStore}; use miden_protocol::crypto::dsa::falcon512_poseidon2::SecretKey as FalconSecretKey; diff --git a/crates/server/src/api/dashboard.rs b/crates/server/src/api/dashboard.rs index 556a5d7c..5c8a5532 100644 --- a/crates/server/src/api/dashboard.rs +++ b/crates/server/src/api/dashboard.rs @@ -173,23 +173,31 @@ pub async fn verify_operator_login( security(("operator_session" = [])), responses( (status = 200, description = "Session invalidated", body = LogoutOperatorResponse), + (status = 500, description = "Session revocation failed", body = crate::openapi::ApiErrorResponse), ) )] pub async fn logout_operator( State(state): State, headers: HeaderMap, -) -> impl IntoResponse { +) -> Result<( + StatusCode, + [(header::HeaderName, String); 1], + Json, +)> { let token = extract_cookie(&headers, state.dashboard.cookie_name()); + // Fail closed: if the shared session store cannot revoke (e.g. Postgres is + // unavailable), surface the error so the caller can retry instead of being + // told a logout succeeded that did not take effect fleet-wide. 
state .dashboard .logout(token.as_deref(), state.clock.now()) - .await; + .await?; - ( + Ok(( StatusCode::OK, [(header::SET_COOKIE, state.dashboard.clear_cookie_header())], Json(LogoutOperatorResponse { success: true }), - ) + )) } /// Paginated list of accounts visible to the operator. Requires the diff --git a/crates/server/src/api/evm.rs b/crates/server/src/api/evm.rs index 87b4e233..d0e032e2 100644 --- a/crates/server/src/api/evm.rs +++ b/crates/server/src/api/evm.rs @@ -173,6 +173,7 @@ pub async fn verify_evm_session( security(("evm_session" = [])), responses( (status = 200, description = "Session invalidated", body = LogoutResponse), + (status = 500, description = "Session revocation failed", body = crate::openapi::ApiErrorResponse), ) )] pub async fn logout_evm_session( @@ -180,11 +181,13 @@ pub async fn logout_evm_session( headers: HeaderMap, ) -> Result<([(header::HeaderName, String); 1], Json)> { let token = extract_cookie(&headers, state.evm.sessions.cookie_name()); + // Fail closed: a revoke failure (e.g. shared store outage) is surfaced so the + // caller can retry rather than believing the session was invalidated. state .evm .sessions .logout(token.as_deref(), state.clock.now()) - .await; + .await?; Ok(( [(header::SET_COOKIE, state.evm.sessions.clear_cookie_header())], Json(LogoutResponse { success: true }), diff --git a/crates/server/src/builder/handle.rs b/crates/server/src/builder/handle.rs index 1ad15636..70c7c654 100644 --- a/crates/server/src/builder/handle.rs +++ b/crates/server/src/builder/handle.rs @@ -42,6 +42,7 @@ use crate::state::AppState; /// Provides methods to run the server with the configured settings. 
pub struct ServerHandle { pub(crate) app_state: AppState, + pub(crate) leader: std::sync::Arc, pub(crate) startup_info: StartupInfo, pub(crate) cors_layer: Option, pub(crate) rate_limit_config: Option, @@ -127,13 +128,15 @@ impl ServerHandle { // Start background jobs based on canonicalization config if self.app_state.canonicalization.is_some() { tracing::info!("Starting canonicalization worker"); - start_canonicalization_worker(self.app_state.clone()); + start_canonicalization_worker(self.app_state.clone(), self.leader.clone()); } else { tracing::info!( "Running in optimistic mode - deltas accepted without on-chain verification" ); } + start_session_sweep_worker(self.app_state.clone()); + // Start HTTP server if enabled if self.http_enabled { let state = self.app_state.clone(); @@ -368,3 +371,33 @@ impl ServerHandle { } } } + +const SESSION_SWEEP_INTERVAL_SECS: u64 = 60; + +/// Periodically reclaim expired operator sessions/challenges from the +/// coordination store. Expiry is enforced on read regardless; this only frees +/// rows (Postgres) or memory (in-memory). 
+fn start_session_sweep_worker(state: AppState) { + tokio::spawn(async move { + let mut ticker = + tokio::time::interval(std::time::Duration::from_secs(SESSION_SWEEP_INTERVAL_SECS)); + loop { + ticker.tick().await; + if let Err(error) = state.dashboard.sweep_expired(state.clock.now()).await { + tracing::warn!( + target: "dashboard.session_sweep", + %error, + "operator session/challenge sweep failed", + ); + } + #[cfg(feature = "evm")] + if let Err(error) = state.evm.sessions.sweep_expired(state.clock.now()).await { + tracing::warn!( + target: "evm.session_sweep", + %error, + "EVM session/challenge sweep failed", + ); + } + } + }); +} diff --git a/crates/server/src/builder/mod.rs b/crates/server/src/builder/mod.rs index f71d3030..9ccc659c 100644 --- a/crates/server/src/builder/mod.rs +++ b/crates/server/src/builder/mod.rs @@ -41,6 +41,7 @@ pub struct ServerBuilder { ack: Option, canonicalization: Option, dashboard: Option>, + coordination: Option, logging_config: Option, cors_layer: Option, rate_limit_config: Option, @@ -63,6 +64,7 @@ impl ServerBuilder { ack: None, canonicalization: Some(CanonicalizationConfig::default()), dashboard: None, + coordination: None, logging_config: None, cors_layer: None, rate_limit_config: None, @@ -181,6 +183,14 @@ impl ServerBuilder { self } + /// Coordination store handles selected by the storage backend (Postgres => + /// shared, filesystem => in-memory). Injected into the realm-scoped consumers + /// when their state is built from the environment. + pub fn coordination(mut self, handles: crate::coordination::CoordinationHandles) -> Self { + self.coordination = Some(handles); + self + } + /// Configure canonicalization mode /// /// # Arguments @@ -442,12 +452,59 @@ impl ServerBuilder { .ok_or("Auditor not set. Use .auditor(...) — typically populated by StorageMetadataBuilder::build()")?; let ack = self.ack.ok_or("AckRegistry not set. 
Use .ack(...)")?; + let coordination = self.coordination; + // Fail closed before anything else: the Postgres backend must never fall + // back to per-process coordination (AlwaysLeader + in-memory sessions), + // which would let every replica run canonicalization and split auth + // state. Checking here (not only on the dashboard==None path) catches a + // manual builder that supplies a custom dashboard but skips coordination. + if coordination.is_none() && storage.kind() == crate::storage::StorageType::Postgres { + return Err("Postgres storage requires coordination handles for shared \ + sessions/challenges and canonicalization leadership; call \ + .coordination(...) (populated by StorageMetadataBuilder::build())" + .to_string()); + } + let coordination_mode = coordination + .as_ref() + .map(|handles| handles.mode) + .unwrap_or(crate::coordination::CoordinationMode::SingleProcess); + let leader: Arc = coordination + .as_ref() + .map(|handles| handles.leader.clone()) + .unwrap_or_else(|| { + Arc::new(crate::coordination::AlwaysLeader::new( + crate::coordination::CANONICALIZATION_LEASE, + "single-process", + )) + }); let dashboard = match self.dashboard { Some(dashboard) => dashboard, - None => Arc::new(DashboardState::from_env_for_network(network_type).await?), + None => match coordination.as_ref() { + Some(handles) => Arc::new( + DashboardState::from_env_for_network_with_stores( + network_type, + handles.operator_sessions.clone(), + handles.operator_challenges.clone(), + ) + .await?, + ), + // The Postgres-without-coordination case already failed closed + // above, so reaching here with no handles means a non-Postgres + // (filesystem/dev) backend using per-process dashboard state. 
+ None => Arc::new(DashboardState::from_env_for_network(network_type).await?), + }, }; #[cfg(feature = "evm")] - let evm = Arc::new(EvmAppState::from_env().await?); + let evm = { + let sessions = match coordination.as_ref() { + Some(handles) => crate::evm::EvmSessionState::new( + handles.evm_sessions.clone(), + handles.evm_challenges.clone(), + ), + None => crate::evm::EvmSessionState::default(), + }; + Arc::new(EvmAppState::from_env_with_sessions(sessions).await?) + }; let network_client = MidenNetworkClient::from_network(network_type) .await @@ -456,6 +513,7 @@ impl ServerBuilder { let startup_info = startup::StartupInfo::new( network_type, storage.kind(), + coordination_mode.as_str(), ack.ecdsa_backend_id(), ack.commitment(&SignatureScheme::Falcon), ack.commitment(&SignatureScheme::Ecdsa), @@ -467,6 +525,29 @@ impl ServerBuilder { metrics_config.enabled.then_some(metrics_config.bind_addr), ); + // Prod fail-fast: an enabled rate limit that partitions to 0 per replica + // (global limit below GUARDIAN_MAX_REPLICAS) silently throttles all + // traffic on every replica. Mirror the filesystem-backend prod guard and + // refuse to start rather than serve a fleet that denies every request. + // (A missing cursor secret only warns and boots — it is not a prod guard.) + // Non-prod keeps the warning emitted by + // RateLimitConfig::from_env. + let rate_limit_config = self + .rate_limit_config + .unwrap_or_else(RateLimitConfig::from_env); + if crate::config::stage::is_prod().map_err(|error| error.to_string())? + && rate_limit_config.enabled + && (rate_limit_config.burst_per_sec == 0 || rate_limit_config.per_min == 0) + { + return Err( + "rate limiting partitions to 0 requests per replica in the prod stage \ + (GUARDIAN_ENV=prod): a global GUARDIAN_RATE_BURST_PER_SEC/GUARDIAN_RATE_PER_MIN \ + below GUARDIAN_MAX_REPLICAS makes every replica throttle all traffic. Raise the \ + global rate limit or lower GUARDIAN_MAX_REPLICAS." 
+ .to_string(), + ); + } + let app_state = AppState { storage, metadata, @@ -482,9 +563,10 @@ impl ServerBuilder { Ok(ServerHandle { app_state, + leader, startup_info, cors_layer: self.cors_layer, - rate_limit_config: self.rate_limit_config, + rate_limit_config: Some(rate_limit_config), body_limit_config: self.body_limit_config, metrics_config, http_enabled: self.http_enabled, diff --git a/crates/server/src/builder/startup.rs b/crates/server/src/builder/startup.rs index 52c01b37..cd29e0d8 100644 --- a/crates/server/src/builder/startup.rs +++ b/crates/server/src/builder/startup.rs @@ -19,6 +19,7 @@ use std::net::SocketAddr; pub(crate) struct StartupInfo { network: NetworkType, storage: StorageType, + coordination_mode: &'static str, ecdsa_backend: &'static str, falcon_commitment: String, ecdsa_commitment: String, @@ -35,6 +36,7 @@ impl StartupInfo { pub(crate) fn new( network: NetworkType, storage: StorageType, + coordination_mode: &'static str, ecdsa_backend: &'static str, falcon_commitment: String, ecdsa_commitment: String, @@ -48,6 +50,7 @@ impl StartupInfo { Self { network, storage, + coordination_mode, ecdsa_backend, falcon_commitment, ecdsa_commitment, @@ -74,6 +77,22 @@ impl StartupInfo { "network" ); tracing::info!(storage = %self.storage, "storage backend"); + tracing::info!( + mode = self.coordination_mode, + backend = backend_label(&self.storage), + stage = if crate::config::stage::is_prod().unwrap_or(false) { + "prod" + } else { + "non-prod" + }, + max_replicas = %std::env::var("GUARDIAN_MAX_REPLICAS").unwrap_or_else(|_| "1".to_string()), + cursor_secret = if self.cursor_secret_configured { + "configured" + } else { + "ephemeral" + }, + "coordination", + ); tracing::info!( falcon = "enabled", falcon_commitment = %self.falcon_commitment, @@ -115,6 +134,13 @@ impl StartupInfo { } } +fn backend_label(storage: &StorageType) -> &'static str { + match storage { + StorageType::Postgres => "postgres", + StorageType::Filesystem => "filesystem", + } +} + fn 
port_label(port: Option) -> String { match port { Some(port) => port.to_string(), @@ -142,6 +168,7 @@ mod tests { let info = StartupInfo::new( NetworkType::MidenDevnet, StorageType::Postgres, + "shared", "aws-kms", "0xfalcon".to_string(), "0xecdsa".to_string(), @@ -159,6 +186,7 @@ mod tests { assert_eq!(info.network, NetworkType::MidenDevnet); assert_eq!(info.storage, StorageType::Postgres); + assert_eq!(info.coordination_mode, "shared"); assert_eq!(info.ecdsa_backend, "aws-kms"); assert_eq!(info.falcon_commitment, "0xfalcon"); assert_eq!(info.ecdsa_commitment, "0xecdsa"); @@ -178,6 +206,7 @@ mod tests { let info = StartupInfo::new( NetworkType::MidenLocal, StorageType::Filesystem, + "single-process", "in-memory", "0xfalcon".to_string(), "0xecdsa".to_string(), @@ -198,6 +227,33 @@ mod tests { assert_eq!(info.grpc_port, None); } + #[test] + fn backend_label_maps_storage_type() { + assert_eq!(backend_label(&StorageType::Postgres), "postgres"); + assert_eq!(backend_label(&StorageType::Filesystem), "filesystem"); + } + + #[test] + fn coordination_mode_label_is_logged_as_resolved() { + let info = StartupInfo::new( + NetworkType::MidenDevnet, + StorageType::Postgres, + "single-process", + "in-memory", + "0xfalcon".to_string(), + "0xecdsa".to_string(), + None, + 0, + false, + None, + None, + None, + ); + // Mode reflects the resolved coordination backing passed in, not the + // storage type — so it cannot claim "shared" while actually in-memory. 
+ assert_eq!(info.coordination_mode, "single-process"); + } + #[test] fn port_label_renders_number_or_disabled() { assert_eq!(port_label(Some(3000)), "3000"); diff --git a/crates/server/src/builder/storage.rs b/crates/server/src/builder/storage.rs index 50a7c57e..73a65fb5 100644 --- a/crates/server/src/builder/storage.rs +++ b/crates/server/src/builder/storage.rs @@ -100,6 +100,7 @@ impl StorageMetadataBuilder { Arc, Arc, SharedAuditor, + crate::coordination::CoordinationHandles, ), String, > { @@ -128,11 +129,20 @@ impl StorageMetadataBuilder { let auditor: SharedAuditor = Arc::new(PostgresAuditor::new(metadata.pool_handle())); let storage = wrap_with_encryption(storage).await?; - Ok((storage, Arc::new(metadata), auditor)) + let holder_id = format!("{}-{:016x}", std::process::id(), rand::random::()); + let coordination = crate::coordination::CoordinationHandles::postgres( + metadata.pool_handle(), + holder_id, + ); + + Ok((storage, Arc::new(metadata), auditor, coordination)) } #[cfg(not(feature = "postgres"))] { + reject_filesystem_in_prod( + crate::config::stage::is_prod().map_err(|error| error.to_string())?, + )?; let storage_path = self .storage_path .ok_or_else(|| "GUARDIAN_STORAGE_PATH is required".to_string())?; @@ -153,11 +163,29 @@ impl StorageMetadataBuilder { let auditor: SharedAuditor = Arc::new(LogAuditor::new()); let storage = wrap_with_encryption(storage).await?; - Ok((storage, Arc::new(metadata), auditor)) + let coordination = crate::coordination::CoordinationHandles::in_memory(); + + Ok((storage, Arc::new(metadata), auditor, coordination)) } } } +/// The filesystem backend is local to one task and cannot be shared across +/// replicas, so it is refused in the prod stage. It remains the default for +/// local development and tests. 
+#[cfg(not(feature = "postgres"))] +fn reject_filesystem_in_prod(is_prod: bool) -> Result<(), String> { + if is_prod { + return Err( + "the filesystem storage backend is not supported in the prod stage \ + (GUARDIAN_ENV=prod): it is single-instance only and cannot be shared across \ + replicas. Use the Postgres image and set DATABASE_URL." + .to_string(), + ); + } + Ok(()) +} + async fn wrap_with_encryption(storage: S) -> Result, String> where S: StorageBackend + MarkerStore + 'static, @@ -433,6 +461,19 @@ mod tests { } } + #[cfg(not(feature = "postgres"))] + #[test] + fn filesystem_rejected_in_prod_stage() { + assert!( + reject_filesystem_in_prod(true).is_err(), + "prod stage must refuse the filesystem backend" + ); + assert!( + reject_filesystem_in_prod(false).is_ok(), + "non-prod tolerates the filesystem backend" + ); + } + #[cfg(not(feature = "postgres"))] #[tokio::test] async fn test_build_without_storage_path_fails() { diff --git a/crates/server/src/config/mod.rs b/crates/server/src/config/mod.rs new file mode 100644 index 00000000..12ef045f --- /dev/null +++ b/crates/server/src/config/mod.rs @@ -0,0 +1 @@ +pub mod stage; diff --git a/crates/server/src/config/stage.rs b/crates/server/src/config/stage.rs new file mode 100644 index 00000000..c6b2ef12 --- /dev/null +++ b/crates/server/src/config/stage.rs @@ -0,0 +1,16 @@ +use crate::error::{GuardianError, Result}; + +const ENV_GUARDIAN_ENV: &str = "GUARDIAN_ENV"; +const PROD_ENV: &str = "prod"; + +/// True when the deployment stage is production (`GUARDIAN_ENV=prod`, +/// case-insensitive). Gates production-only startup guards. 
+pub fn is_prod() -> Result { + match std::env::var(ENV_GUARDIAN_ENV) { + Ok(value) => Ok(value.trim().eq_ignore_ascii_case(PROD_ENV)), + Err(std::env::VarError::NotPresent) => Ok(false), + Err(std::env::VarError::NotUnicode(_)) => Err(GuardianError::ConfigurationError(format!( + "{ENV_GUARDIAN_ENV} must contain valid UTF-8" + ))), + } +} diff --git a/crates/server/src/coordination/challenge_store.rs b/crates/server/src/coordination/challenge_store.rs new file mode 100644 index 00000000..b4571315 --- /dev/null +++ b/crates/server/src/coordination/challenge_store.rs @@ -0,0 +1,297 @@ +use std::collections::HashMap; +use std::sync::Arc; + +use async_trait::async_trait; +use chrono::{DateTime, Utc}; +use guardian_shared::hex::{FromHex, IntoHex}; +use miden_protocol::Word; +use tokio::sync::Mutex; + +use crate::error::{GuardianError, Result}; + +/// Realm-specific data needed to match a submitted credential against a pending +/// challenge at verify time. Operator verification re-runs a Falcon signature +/// check over the signing digest; EVM verification recovers the signer from the +/// full original challenge fields. +#[derive(Clone, Debug)] +pub enum ChallengePayload { + OperatorDigest(Word), + EvmChallenge { + address: String, + nonce: String, + issued_at: DateTime, + expires_at: DateTime, + }, +} + +impl ChallengePayload { + /// JSONB representation persisted in `auth_challenges.payload`. `Word` is not + /// directly serializable, so the operator digest is stored as canonical hex. 
+ pub fn to_value(&self) -> serde_json::Value { + match self { + ChallengePayload::OperatorDigest(word) => serde_json::json!({ + "kind": "operator_digest", + "signing_digest": (*word).into_hex(), + }), + ChallengePayload::EvmChallenge { + address, + nonce, + issued_at, + expires_at, + } => serde_json::json!({ + "kind": "evm_challenge", + "address": address, + "nonce": nonce, + "issued_at": issued_at.to_rfc3339(), + "expires_at": expires_at.to_rfc3339(), + }), + } + } + + pub fn from_value(value: &serde_json::Value) -> Result { + let kind = value + .get("kind") + .and_then(serde_json::Value::as_str) + .ok_or_else(|| { + GuardianError::StorageError("challenge payload missing kind".to_string()) + })?; + match kind { + "operator_digest" => { + let hex = string_field(value, "signing_digest")?; + let word = Word::from_hex(&hex).map_err(GuardianError::StorageError)?; + Ok(ChallengePayload::OperatorDigest(word)) + } + "evm_challenge" => Ok(ChallengePayload::EvmChallenge { + address: string_field(value, "address")?, + nonce: string_field(value, "nonce")?, + issued_at: time_field(value, "issued_at")?, + expires_at: time_field(value, "expires_at")?, + }), + other => Err(GuardianError::StorageError(format!( + "unknown challenge payload kind: {other}" + ))), + } + } +} + +fn string_field(value: &serde_json::Value, key: &str) -> Result { + value + .get(key) + .and_then(serde_json::Value::as_str) + .map(str::to_string) + .ok_or_else(|| GuardianError::StorageError(format!("challenge payload missing {key}"))) +} + +fn time_field(value: &serde_json::Value, key: &str) -> Result> { + let raw = string_field(value, key)?; + DateTime::parse_from_rfc3339(&raw) + .map(|parsed| parsed.with_timezone(&Utc)) + .map_err(|error| { + GuardianError::StorageError(format!("challenge payload {key} invalid: {error}")) + }) +} + +#[derive(Clone, Debug)] +pub struct StoredChallenge { + pub key: String, + pub payload: ChallengePayload, + pub issued_at: DateTime, + pub expires_at: DateTime, +} + +/// A 
realm-scoped store of pending login challenges grouped by principal +/// (operator commitment or EVM address). Verification matches a returned +/// credential against the active challenges in Rust, then claims the matched one +/// via [`ChallengeStore::consume`], which is single-use across replicas. +#[async_trait] +pub trait ChallengeStore: Send + Sync { + async fn issue( + &self, + principal: &str, + challenge: StoredChallenge, + max_outstanding: usize, + now: DateTime, + ) -> Result<()>; + async fn active_for(&self, principal: &str, now: DateTime) + -> Result>; + async fn consume(&self, principal: &str, key: &str, now: DateTime) -> Result; + async fn sweep_expired(&self, now: DateTime) -> Result; +} + +#[derive(Clone, Default)] +pub struct InMemoryChallengeStore { + challenges: Arc>>>, +} + +impl InMemoryChallengeStore { + pub fn new() -> Self { + Self::default() + } +} + +#[async_trait] +impl ChallengeStore for InMemoryChallengeStore { + async fn issue( + &self, + principal: &str, + challenge: StoredChallenge, + max_outstanding: usize, + now: DateTime, + ) -> Result<()> { + let mut challenges = self.challenges.lock().await; + let pending = challenges.entry(principal.to_string()).or_default(); + pending.retain(|challenge| challenge.expires_at > now); + pending.push(challenge); + if pending.len() > max_outstanding { + pending.sort_by_key(|challenge| challenge.issued_at); + let drain_len = pending.len() - max_outstanding; + pending.drain(0..drain_len); + } + Ok(()) + } + + async fn active_for( + &self, + principal: &str, + now: DateTime, + ) -> Result> { + let challenges = self.challenges.lock().await; + Ok(challenges + .get(principal) + .map(|pending| { + pending + .iter() + .filter(|challenge| challenge.expires_at > now) + .cloned() + .collect() + }) + .unwrap_or_default()) + } + + async fn consume(&self, principal: &str, key: &str, now: DateTime) -> Result { + let mut challenges = self.challenges.lock().await; + let Some(pending) = challenges.get_mut(principal) 
else { + return Ok(false); + }; + let matched = pending + .iter() + .position(|challenge| challenge.key == key && challenge.expires_at > now); + let Some(index) = matched else { + return Ok(false); + }; + pending.remove(index); + if pending.is_empty() { + challenges.remove(principal); + } + Ok(true) + } + + async fn sweep_expired(&self, now: DateTime) -> Result { + let mut challenges = self.challenges.lock().await; + let before: usize = challenges.values().map(Vec::len).sum(); + for pending in challenges.values_mut() { + pending.retain(|challenge| challenge.expires_at > now); + } + challenges.retain(|_, pending| !pending.is_empty()); + let after: usize = challenges.values().map(Vec::len).sum(); + Ok((before - after) as u64) + } +} + +#[cfg(all(test, not(any(feature = "integration", feature = "e2e"))))] +mod tests { + use super::*; + use chrono::Duration; + + fn challenge(key: &str, now: DateTime, ttl_secs: i64) -> StoredChallenge { + StoredChallenge { + key: key.to_string(), + payload: ChallengePayload::EvmChallenge { + address: "0x1".to_string(), + nonce: key.to_string(), + issued_at: now, + expires_at: now + Duration::seconds(ttl_secs), + }, + issued_at: now, + expires_at: now + Duration::seconds(ttl_secs), + } + } + + #[tokio::test] + async fn consume_is_single_use() { + let store = InMemoryChallengeStore::new(); + let now = Utc::now(); + store + .issue("0xp", challenge("k1", now, 60), 8, now) + .await + .unwrap(); + + assert!(store.consume("0xp", "k1", now).await.unwrap()); + assert!(!store.consume("0xp", "k1", now).await.unwrap()); + } + + #[tokio::test] + async fn active_for_hides_expired() { + let store = InMemoryChallengeStore::new(); + let now = Utc::now(); + store + .issue("0xp", challenge("k1", now, 10), 8, now) + .await + .unwrap(); + + assert_eq!(store.active_for("0xp", now).await.unwrap().len(), 1); + assert!( + store + .active_for("0xp", now + Duration::seconds(11)) + .await + .unwrap() + .is_empty() + ); + } + + #[tokio::test] + async fn 
issue_caps_outstanding_dropping_oldest() { + let store = InMemoryChallengeStore::new(); + let now = Utc::now(); + for i in 0..5 { + let issued = now + Duration::seconds(i); + let mut c = challenge(&format!("k{i}"), now, 600); + c.issued_at = issued; + store.issue("0xp", c, 3, now).await.unwrap(); + } + let active = store.active_for("0xp", now).await.unwrap(); + assert_eq!(active.len(), 3); + assert!(active.iter().all(|c| c.key != "k0" && c.key != "k1")); + } + + #[tokio::test] + async fn consume_unknown_principal_is_false() { + let store = InMemoryChallengeStore::new(); + assert!(!store.consume("0xnope", "k1", Utc::now()).await.unwrap()); + } + + #[test] + fn evm_payload_round_trips_through_json() { + let payload = ChallengePayload::EvmChallenge { + address: "0xabc".to_string(), + nonce: "0xdead".to_string(), + issued_at: "2026-06-23T00:00:00+00:00".parse().unwrap(), + expires_at: "2026-06-23T00:05:00+00:00".parse().unwrap(), + }; + let value = payload.to_value(); + let restored = ChallengePayload::from_value(&value).unwrap(); + match restored { + ChallengePayload::EvmChallenge { address, nonce, .. } => { + assert_eq!(address, "0xabc"); + assert_eq!(nonce, "0xdead"); + } + _ => panic!("expected evm challenge"), + } + } + + #[test] + fn unknown_payload_kind_is_rejected() { + let value = serde_json::json!({ "kind": "nope" }); + assert!(ChallengePayload::from_value(&value).is_err()); + } +} diff --git a/crates/server/src/coordination/leader.rs b/crates/server/src/coordination/leader.rs new file mode 100644 index 00000000..1301cd09 --- /dev/null +++ b/crates/server/src/coordination/leader.rs @@ -0,0 +1,94 @@ +use async_trait::async_trait; +use chrono::{DateTime, Utc}; +use std::time::Duration; + +use crate::error::Result; + +/// A held leadership lease. `fence_token` strictly increases on every +/// (re)acquisition so a superseded holder can be detected at the write boundary. 
+#[derive(Clone, Debug, PartialEq, Eq)] +pub struct Lease { + pub name: String, + pub holder_id: String, + pub fence_token: i64, + pub expires_at: DateTime, +} + +/// Coordinates single-owner background work across replicas. `renew` runs on its +/// own timer concurrent with the protected work; a `false` return means the lease +/// was lost. `verify_held` is the mandatory fence check the holder runs +/// immediately before any state-mutating write. +#[async_trait] +pub trait LeaderElector: Send + Sync { + async fn try_acquire(&self, ttl: Duration) -> Result>; + async fn renew(&self, lease: &Lease, ttl: Duration) -> Result; + async fn verify_held(&self, lease: &Lease) -> Result; + async fn release(&self, lease: Lease) -> Result<()>; +} + +/// Single-process elector: the only replica is always the leader. Used on the +/// filesystem backend, where no shared coordination store exists. +pub struct AlwaysLeader { + name: String, + holder_id: String, +} + +impl AlwaysLeader { + pub fn new(name: impl Into, holder_id: impl Into) -> Self { + Self { + name: name.into(), + holder_id: holder_id.into(), + } + } + + fn lease(&self) -> Lease { + Lease { + name: self.name.clone(), + holder_id: self.holder_id.clone(), + fence_token: 0, + expires_at: DateTime::::MAX_UTC, + } + } +} + +#[async_trait] +impl LeaderElector for AlwaysLeader { + async fn try_acquire(&self, _ttl: Duration) -> Result> { + Ok(Some(self.lease())) + } + + async fn renew(&self, _lease: &Lease, _ttl: Duration) -> Result { + Ok(true) + } + + async fn verify_held(&self, _lease: &Lease) -> Result { + Ok(true) + } + + async fn release(&self, _lease: Lease) -> Result<()> { + Ok(()) + } +} + +#[cfg(all(test, not(any(feature = "integration", feature = "e2e"))))] +mod tests { + use super::*; + + #[tokio::test] + async fn always_leader_acquires_renews_and_verifies() { + let elector = AlwaysLeader::new("canonicalization", "single-process"); + let lease = elector + .try_acquire(Duration::from_secs(30)) + .await + .unwrap() 
+ .expect("always leader acquires"); + assert_eq!(lease.holder_id, "single-process"); + assert!( + elector + .renew(&lease, Duration::from_secs(30)) + .await + .unwrap() + ); + assert!(elector.verify_held(&lease).await.unwrap()); + } +} diff --git a/crates/server/src/coordination/mod.rs b/crates/server/src/coordination/mod.rs new file mode 100644 index 00000000..8a69d797 --- /dev/null +++ b/crates/server/src/coordination/mod.rs @@ -0,0 +1,103 @@ +pub mod challenge_store; +pub mod leader; +#[cfg(feature = "postgres")] +pub mod postgres; +pub mod session_store; + +pub use challenge_store::{ + ChallengePayload, ChallengeStore, InMemoryChallengeStore, StoredChallenge, +}; +pub use leader::{AlwaysLeader, LeaderElector, Lease}; +pub use session_store::{ + InMemorySessionStore, SessionKey, SessionStore, SessionSubject, StoredSession, +}; + +use std::sync::Arc; + +/// Whether coordination is backed by the shared external store (replica-safe) or +/// is single-process in-memory. Carried on the handles so the startup log and +/// guards reflect the **actual** resolved backing, not an inference. +#[derive(Clone, Copy, Debug, PartialEq, Eq)] +pub enum CoordinationMode { + Shared, + SingleProcess, +} + +impl CoordinationMode { + pub fn as_str(self) -> &'static str { + match self { + CoordinationMode::Shared => "shared", + CoordinationMode::SingleProcess => "single-process", + } + } +} + +/// Lease name for the single-owner canonicalization worker. +pub const CANONICALIZATION_LEASE: &str = "canonicalization"; + +/// Coordination store handles selected by the storage backend, threaded from the +/// storage builder (where the Postgres pool is available) into the realm-scoped +/// consumers. 
+#[derive(Clone)] +pub struct CoordinationHandles { + pub mode: CoordinationMode, + pub operator_sessions: Arc, + pub operator_challenges: Arc, + pub leader: Arc, + #[cfg(feature = "evm")] + pub evm_sessions: Arc, + #[cfg(feature = "evm")] + pub evm_challenges: Arc, +} + +impl CoordinationHandles { + pub fn in_memory() -> Self { + Self { + mode: CoordinationMode::SingleProcess, + operator_sessions: Arc::new(InMemorySessionStore::new()), + operator_challenges: Arc::new(InMemoryChallengeStore::new()), + leader: Arc::new(AlwaysLeader::new(CANONICALIZATION_LEASE, "single-process")), + #[cfg(feature = "evm")] + evm_sessions: Arc::new(InMemorySessionStore::new()), + #[cfg(feature = "evm")] + evm_challenges: Arc::new(InMemoryChallengeStore::new()), + } + } + + #[cfg(feature = "postgres")] + pub fn postgres( + pool: diesel_async::pooled_connection::deadpool::Pool, + holder_id: String, + ) -> Self { + use postgres::{PgChallengeStore, PgLeaseElector, PgSessionStore}; + Self { + mode: CoordinationMode::Shared, + operator_sessions: Arc::new(PgSessionStore::new(pool.clone(), Realm::Operator)), + operator_challenges: Arc::new(PgChallengeStore::new(pool.clone(), Realm::Operator)), + leader: Arc::new(PgLeaseElector::new( + pool.clone(), + CANONICALIZATION_LEASE, + holder_id, + )), + #[cfg(feature = "evm")] + evm_sessions: Arc::new(PgSessionStore::new(pool.clone(), Realm::Evm)), + #[cfg(feature = "evm")] + evm_challenges: Arc::new(PgChallengeStore::new(pool, Realm::Evm)), + } + } +} + +#[derive(Clone, Copy, Debug, PartialEq, Eq)] +pub enum Realm { + Operator, + Evm, +} + +impl Realm { + pub fn as_str(self) -> &'static str { + match self { + Realm::Operator => "operator", + Realm::Evm => "evm", + } + } +} diff --git a/crates/server/src/coordination/postgres/challenge_store.rs b/crates/server/src/coordination/postgres/challenge_store.rs new file mode 100644 index 00000000..2f24dbd7 --- /dev/null +++ b/crates/server/src/coordination/postgres/challenge_store.rs @@ -0,0 +1,279 @@ +use 
async_trait::async_trait; +use chrono::{DateTime, Utc}; +use diesel::prelude::*; +use diesel::sql_types::{BigInt, Double, Jsonb, Text}; +use diesel_async::pooled_connection::deadpool::Pool; +use diesel_async::scoped_futures::ScopedFutureExt; +use diesel_async::{AsyncConnection, AsyncPgConnection, RunQueryDsl}; + +use crate::coordination::Realm; +use crate::coordination::challenge_store::{ChallengePayload, ChallengeStore, StoredChallenge}; +use crate::error::{GuardianError, Result}; +use crate::schema::auth_challenges; + +#[derive(Queryable, Selectable)] +#[diesel(table_name = auth_challenges)] +#[diesel(check_for_backend(diesel::pg::Pg))] +#[allow(dead_code)] +struct AuthChallengeRow { + realm: String, + challenge_key: String, + principal: String, + payload: serde_json::Value, + issued_at: DateTime, + expires_at: DateTime, + consumed_at: Option>, +} + +impl AuthChallengeRow { + fn into_stored(self) -> Result { + Ok(StoredChallenge { + key: self.challenge_key, + payload: ChallengePayload::from_value(&self.payload)?, + issued_at: self.issued_at, + expires_at: self.expires_at, + }) + } +} + +/// Postgres-backed [`ChallengeStore`] bound to one realm. Verification matches +/// in Rust over [`ChallengeStore::active_for`]; [`ChallengeStore::consume`] is an +/// atomic single-use claim keyed by `(realm, challenge_key)`. 
+pub struct PgChallengeStore { + pool: Pool, + realm: Realm, +} + +impl PgChallengeStore { + pub fn new(pool: Pool, realm: Realm) -> Self { + Self { pool, realm } + } +} + +#[async_trait] +impl ChallengeStore for PgChallengeStore { + async fn issue( + &self, + principal: &str, + challenge: StoredChallenge, + max_outstanding: usize, + _now: DateTime, + ) -> Result<()> { + let mut conn = super::checkout(&self.pool, "challenge").await?; + + let realm = self.realm.as_str().to_string(); + let principal = principal.to_string(); + let challenge_key = challenge.key; + let payload = challenge.payload.to_value(); + // The duration is clock-independent (both ends app-computed); anchoring on + // the DB clock for the stored row keeps expiry/capping consistent across + // replicas regardless of per-process clock skew. + let ttl_secs = (challenge.expires_at - challenge.issued_at) + .num_seconds() + .max(0) as f64; + let max = max_outstanding as i64; + let lock_key = format!("{realm}|{principal}"); + + conn.transaction::<(), diesel::result::Error, _>(|conn| { + async move { + // Serialize concurrent issuance for this (realm, principal) so the + // insert + cap-trim below sees a consistent row set; without it two + // racing issues can each trim to `max` independently and leave + // `max + 1` outstanding. The xact lock auto-releases at commit and + // only contends per principal, not across the table. + diesel::sql_query("SELECT pg_advisory_xact_lock(hashtextextended($1, 0))") + .bind::(&lock_key) + .execute(conn) + .await?; + + // ON CONFLICT refreshes a re-issued challenge (latest wins, + // re-arming consumed_at) rather than aborting the transaction on + // a duplicate `(realm, challenge_key)`. Keys are random nonces / + // unique digests so a collision is practically a re-issue; this + // matches InMemoryChallengeStore, which tolerates re-issue. 
+ diesel::sql_query( + "INSERT INTO auth_challenges \ + (realm, challenge_key, principal, payload, issued_at, expires_at) \ + VALUES ($1, $2, $3, $4, now(), now() + make_interval(secs => $5)) \ + ON CONFLICT (realm, challenge_key) DO UPDATE SET \ + principal = EXCLUDED.principal, \ + payload = EXCLUDED.payload, \ + issued_at = EXCLUDED.issued_at, \ + expires_at = EXCLUDED.expires_at, \ + consumed_at = NULL", + ) + .bind::(&realm) + .bind::(&challenge_key) + .bind::(&principal) + .bind::(&payload) + .bind::(ttl_secs) + .execute(conn) + .await?; + + diesel::sql_query( + "DELETE FROM auth_challenges \ + WHERE realm = $1 AND principal = $2 AND expires_at < now()", + ) + .bind::(&realm) + .bind::(&principal) + .execute(conn) + .await?; + + diesel::sql_query( + "DELETE FROM auth_challenges WHERE ctid IN (\ + SELECT ctid FROM auth_challenges \ + WHERE realm = $1 AND principal = $2 \ + ORDER BY issued_at DESC OFFSET $3)", + ) + .bind::(&realm) + .bind::(&principal) + .bind::(max) + .execute(conn) + .await?; + + Ok(()) + } + .scope_boxed() + }) + .await + .map_err(|error| GuardianError::StorageError(format!("challenge issue: {error}")))?; + + Ok(()) + } + + async fn active_for( + &self, + principal: &str, + _now: DateTime, + ) -> Result> { + let mut conn = super::checkout(&self.pool, "challenge").await?; + let rows = auth_challenges::table + .filter(auth_challenges::realm.eq(self.realm.as_str())) + .filter(auth_challenges::principal.eq(principal)) + .filter(auth_challenges::consumed_at.is_null()) + .filter(auth_challenges::expires_at.gt(diesel::dsl::now)) + .select(AuthChallengeRow::as_select()) + .load(&mut conn) + .await + .map_err(|error| GuardianError::StorageError(format!("challenge load: {error}")))?; + rows.into_iter() + .map(AuthChallengeRow::into_stored) + .collect() + } + + async fn consume(&self, principal: &str, key: &str, _now: DateTime) -> Result { + let mut conn = super::checkout(&self.pool, "challenge").await?; + // `principal` is part of the predicate (not 
just `(realm, key)`) so the + // Postgres and in-memory impls agree that a wrong-principal consume fails. + let affected = diesel::update(auth_challenges::table) + .filter(auth_challenges::realm.eq(self.realm.as_str())) + .filter(auth_challenges::principal.eq(principal)) + .filter(auth_challenges::challenge_key.eq(key)) + .filter(auth_challenges::consumed_at.is_null()) + .filter(auth_challenges::expires_at.gt(diesel::dsl::now)) + .set(auth_challenges::consumed_at.eq(diesel::dsl::now)) + .execute(&mut conn) + .await + .map_err(|error| GuardianError::StorageError(format!("challenge consume: {error}")))?; + Ok(affected == 1) + } + + async fn sweep_expired(&self, _now: DateTime) -> Result { + let mut conn = super::checkout(&self.pool, "challenge").await?; + let deleted = diesel::delete(auth_challenges::table) + .filter(auth_challenges::realm.eq(self.realm.as_str())) + .filter(auth_challenges::expires_at.lt(diesel::dsl::now)) + .execute(&mut conn) + .await + .map_err(|error| GuardianError::StorageError(format!("challenge sweep: {error}")))?; + Ok(deleted as u64) + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::storage::postgres::{build_postgres_pool_lazy, run_migrations}; + use chrono::Duration; + + fn database_url() -> Option { + std::env::var("DATABASE_URL") + .ok() + .filter(|url| !url.trim().is_empty()) + } + + #[tokio::test] + async fn active_for_fails_closed_when_store_unreachable() { + let pool = build_postgres_pool_lazy("postgresql://127.0.0.1:1/__guardian_coord_fault__", 1) + .expect("lazy pool builds even with an unreachable address"); + let store = PgChallengeStore::new(pool, Realm::Operator); + assert!( + store.active_for("0xprincipal", Utc::now()).await.is_err(), + "challenge lookup must fail closed when the store is unreachable", + ); + } + + #[tokio::test] + #[ignore = "requires DATABASE_URL with migrations applied"] + async fn challenge_is_single_use_across_replicas() { + let url = database_url().expect("DATABASE_URL must be set for this 
#[ignore] test"); + run_migrations(&url).await.expect("migrations apply"); + let replica_a = PgChallengeStore::new( + build_postgres_pool_lazy(&url, 2).expect("pool a"), + Realm::Evm, + ); + let replica_b = PgChallengeStore::new( + build_postgres_pool_lazy(&url, 2).expect("pool b"), + Realm::Evm, + ); + let now = Utc::now(); + let stamp = now.timestamp_micros(); + let principal = format!("0xprincipal-{stamp}"); + let key = format!("nonce-{stamp}"); + + replica_a + .issue( + &principal, + StoredChallenge { + key: key.clone(), + payload: ChallengePayload::EvmChallenge { + address: principal.clone(), + nonce: key.clone(), + issued_at: now, + expires_at: now + Duration::minutes(5), + }, + issued_at: now, + expires_at: now + Duration::minutes(5), + }, + 8, + now, + ) + .await + .expect("issue on replica A"); + + assert!( + replica_b + .active_for(&principal, now) + .await + .expect("active_for on B") + .iter() + .any(|challenge| challenge.key == key), + "a challenge issued on A must be visible on B", + ); + + assert!( + replica_b + .consume(&principal, &key, now) + .await + .expect("consume on B"), + "first consume wins on replica B", + ); + assert!( + !replica_a + .consume(&principal, &key, now) + .await + .expect("replay consume on A"), + "single-use: a replay on replica A must lose", + ); + } +} diff --git a/crates/server/src/coordination/postgres/lease.rs b/crates/server/src/coordination/postgres/lease.rs new file mode 100644 index 00000000..a86f7840 --- /dev/null +++ b/crates/server/src/coordination/postgres/lease.rs @@ -0,0 +1,213 @@ +use std::time::Duration; + +use async_trait::async_trait; +use chrono::{DateTime, Utc}; +use diesel::OptionalExtension; +use diesel::sql_types::{BigInt, Double, Integer, Text, Timestamptz}; +use diesel_async::pooled_connection::deadpool::Pool; +use diesel_async::{AsyncPgConnection, RunQueryDsl}; + +use crate::coordination::leader::{LeaderElector, Lease}; +use crate::error::{GuardianError, Result}; + 
+#[derive(diesel::QueryableByName)] +struct AcquireRow { + #[diesel(sql_type = BigInt)] + fence_token: i64, + #[diesel(sql_type = Timestamptz)] + expires_at: DateTime, +} + +#[derive(diesel::QueryableByName)] +struct HeldRow { + #[diesel(sql_type = Integer)] + #[allow(dead_code)] + held: i32, +} + +/// Postgres lease elector backed by one `worker_leases` row. All timing uses the +/// database clock so replicas agree. `fence_token` only advances when ownership +/// changes (a steal), so a holder can detect supersession at its write boundary. +pub struct PgLeaseElector { + pool: Pool, + lease_name: String, + holder_id: String, +} + +impl PgLeaseElector { + pub fn new( + pool: Pool, + lease_name: impl Into, + holder_id: impl Into, + ) -> Self { + Self { + pool, + lease_name: lease_name.into(), + holder_id: holder_id.into(), + } + } +} + +#[async_trait] +impl LeaderElector for PgLeaseElector { + async fn try_acquire(&self, ttl: Duration) -> Result> { + let mut conn = super::checkout(&self.pool, "lease").await?; + let row = diesel::sql_query( + "INSERT INTO worker_leases \ + (lease_name, holder_id, acquired_at, renewed_at, expires_at, fence_token) \ + VALUES ($1, $2, now(), now(), now() + make_interval(secs => $3), 0) \ + ON CONFLICT (lease_name) DO UPDATE SET \ + holder_id = EXCLUDED.holder_id, \ + acquired_at = CASE WHEN worker_leases.holder_id = EXCLUDED.holder_id \ + THEN worker_leases.acquired_at ELSE now() END, \ + renewed_at = now(), \ + expires_at = now() + make_interval(secs => $3), \ + fence_token = CASE WHEN worker_leases.holder_id = EXCLUDED.holder_id \ + THEN worker_leases.fence_token ELSE worker_leases.fence_token + 1 END \ + WHERE worker_leases.expires_at < now() \ + OR worker_leases.holder_id = EXCLUDED.holder_id \ + RETURNING fence_token, expires_at", + ) + .bind::(&self.lease_name) + .bind::(&self.holder_id) + .bind::(ttl.as_secs_f64()) + .get_result::(&mut conn) + .await + .optional() + .map_err(|error| GuardianError::StorageError(format!("lease 
acquire: {error}")))?; + + Ok(row.map(|row| Lease { + name: self.lease_name.clone(), + holder_id: self.holder_id.clone(), + fence_token: row.fence_token, + expires_at: row.expires_at, + })) + } + + async fn renew(&self, lease: &Lease, ttl: Duration) -> Result { + let mut conn = super::checkout(&self.pool, "lease").await?; + let affected = diesel::sql_query( + "UPDATE worker_leases SET renewed_at = now(), expires_at = now() + make_interval(secs => $1) \ + WHERE lease_name = $2 AND holder_id = $3 AND fence_token = $4 AND now() < expires_at", + ) + .bind::(ttl.as_secs_f64()) + .bind::(&lease.name) + .bind::(&lease.holder_id) + .bind::(lease.fence_token) + .execute(&mut conn) + .await + .map_err(|error| GuardianError::StorageError(format!("lease renew: {error}")))?; + Ok(affected == 1) + } + + async fn verify_held(&self, lease: &Lease) -> Result { + let mut conn = super::checkout(&self.pool, "lease").await?; + let row = diesel::sql_query( + "SELECT 1 AS held FROM worker_leases \ + WHERE lease_name = $1 AND holder_id = $2 AND fence_token = $3 AND now() < expires_at", + ) + .bind::(&lease.name) + .bind::(&lease.holder_id) + .bind::(lease.fence_token) + .get_result::(&mut conn) + .await + .optional() + .map_err(|error| GuardianError::StorageError(format!("lease verify: {error}")))?; + Ok(row.is_some()) + } + + async fn release(&self, lease: Lease) -> Result<()> { + let mut conn = super::checkout(&self.pool, "lease").await?; + // Expire the lease in place instead of deleting the row, so `fence_token` + // survives and keeps advancing monotonically on the next steal. A DELETE + // would let a fresh acquire re-INSERT `fence_token = 0`, after which a + // stale `Lease { fence_token: 0 }` from a long-gone holder could pass + // `verify_held` again. 
+ diesel::sql_query( + "UPDATE worker_leases SET expires_at = now() \ + WHERE lease_name = $1 AND holder_id = $2 AND fence_token = $3", + ) + .bind::(&lease.name) + .bind::(&lease.holder_id) + .bind::(lease.fence_token) + .execute(&mut conn) + .await + .map_err(|error| GuardianError::StorageError(format!("lease release: {error}")))?; + Ok(()) + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::storage::postgres::{build_postgres_pool_lazy, run_migrations}; + + fn database_url() -> Option { + std::env::var("DATABASE_URL") + .ok() + .filter(|url| !url.trim().is_empty()) + } + + #[tokio::test] + async fn try_acquire_fails_closed_when_unreachable() { + let pool = build_postgres_pool_lazy("postgresql://127.0.0.1:1/__guardian_lease_fault__", 1) + .expect("lazy pool builds even with an unreachable address"); + let elector = PgLeaseElector::new(pool, "canonicalization", "replica-a"); + assert!( + elector.try_acquire(Duration::from_secs(30)).await.is_err(), + "lease acquire must surface an error (not a false None) when unreachable", + ); + } + + #[tokio::test] + #[ignore = "requires DATABASE_URL with migrations applied"] + async fn single_owner_failover_fences_the_old_holder() { + let url = database_url().expect("DATABASE_URL must be set for this #[ignore] test"); + run_migrations(&url).await.expect("migrations apply"); + let name = format!("canon-test-{}", Utc::now().timestamp_micros()); + let short_ttl = Duration::from_secs(1); + let ttl = Duration::from_secs(60); + let a = PgLeaseElector::new( + build_postgres_pool_lazy(&url, 2).unwrap(), + &name, + "replica-a", + ); + let b = PgLeaseElector::new( + build_postgres_pool_lazy(&url, 2).unwrap(), + &name, + "replica-b", + ); + + let lease_a = a + .try_acquire(short_ttl) + .await + .expect("acquire A") + .expect("A becomes the single owner"); + // While A holds an unexpired lease, B cannot acquire. 
+ assert!( + b.try_acquire(ttl).await.expect("B attempt").is_none(), + "only one replica may hold the lease", + ); + + // A crashes (stops renewing); after the TTL elapses B steals the expired + // lease, which advances the fence token (change of holder). + tokio::time::sleep(Duration::from_millis(1200)).await; + let lease_b = b + .try_acquire(ttl) + .await + .expect("acquire B") + .expect("B takes over the expired lease"); + assert!( + lease_b.fence_token > lease_a.fence_token, + "a steal must advance the fence token", + ); + + // The superseded holder A can neither renew nor pass its fence check; + // only the current holder B is verified. + assert!(!a.renew(&lease_a, ttl).await.expect("A stale renew")); + assert!(!a.verify_held(&lease_a).await.expect("A stale verify")); + assert!(b.verify_held(&lease_b).await.expect("B verify")); + + b.release(lease_b).await.expect("cleanup"); + } +} diff --git a/crates/server/src/coordination/postgres/mod.rs b/crates/server/src/coordination/postgres/mod.rs new file mode 100644 index 00000000..abc007d9 --- /dev/null +++ b/crates/server/src/coordination/postgres/mod.rs @@ -0,0 +1,23 @@ +pub mod challenge_store; +pub mod lease; +pub mod session_store; + +pub use challenge_store::PgChallengeStore; +pub use lease::PgLeaseElector; +pub use session_store::PgSessionStore; + +use diesel_async::AsyncPgConnection; +use diesel_async::pooled_connection::deadpool::{Object, Pool}; + +use crate::error::{GuardianError, Result}; + +/// Check out a pooled connection, mapping checkout failure to the fail-closed +/// `StorageError` surface. `context` labels the call site in the error message. 
+async fn checkout( + pool: &Pool, + context: &str, +) -> Result> { + pool.get() + .await + .map_err(|error| GuardianError::StorageError(format!("{context} pool: {error}"))) +} diff --git a/crates/server/src/coordination/postgres/session_store.rs b/crates/server/src/coordination/postgres/session_store.rs new file mode 100644 index 00000000..c7b0dac9 --- /dev/null +++ b/crates/server/src/coordination/postgres/session_store.rs @@ -0,0 +1,216 @@ +use async_trait::async_trait; +use chrono::{DateTime, Utc}; +use diesel::prelude::*; +use diesel_async::pooled_connection::deadpool::Pool; +use diesel_async::{AsyncPgConnection, RunQueryDsl}; + +use crate::coordination::Realm; +use crate::coordination::session_store::{SessionKey, SessionStore, SessionSubject, StoredSession}; +use crate::error::{GuardianError, Result}; +use crate::schema::auth_sessions; + +#[derive(Insertable)] +#[diesel(table_name = auth_sessions)] +struct NewAuthSession { + token_digest: Vec, + realm: String, + subject: serde_json::Value, + issued_at: DateTime, + expires_at: DateTime, +} + +#[derive(Queryable, Selectable)] +#[diesel(table_name = auth_sessions)] +#[diesel(check_for_backend(diesel::pg::Pg))] +#[allow(dead_code)] +struct AuthSessionRow { + token_digest: Vec, + realm: String, + subject: serde_json::Value, + issued_at: DateTime, + expires_at: DateTime, + revoked_at: Option>, +} + +impl AuthSessionRow { + fn into_stored(self) -> Result { + let subject: SessionSubject = serde_json::from_value(self.subject).map_err(|error| { + GuardianError::StorageError(format!("session subject decode: {error}")) + })?; + Ok(StoredSession { + subject, + issued_at: self.issued_at, + expires_at: self.expires_at, + }) + } +} + +/// Postgres-backed [`SessionStore`] bound to one realm. Expiry and revocation +/// use the database clock so every replica agrees. Any DB error surfaces as a +/// `StorageError`, which the auth path treats as fail-closed. 
+pub struct PgSessionStore { + pool: Pool, + realm: Realm, +} + +impl PgSessionStore { + pub fn new(pool: Pool, realm: Realm) -> Self { + Self { pool, realm } + } +} + +#[async_trait] +impl SessionStore for PgSessionStore { + async fn insert(&self, key: SessionKey, session: StoredSession) -> Result<()> { + let mut conn = super::checkout(&self.pool, "session").await?; + let subject = serde_json::to_value(&session.subject).map_err(|error| { + GuardianError::StorageError(format!("session subject encode: {error}")) + })?; + let row = NewAuthSession { + token_digest: key.to_vec(), + realm: self.realm.as_str().to_string(), + subject, + issued_at: session.issued_at, + expires_at: session.expires_at, + }; + // Upsert: a digest collision (astronomically unlikely) or a re-insert + // over an unswept revoked row replaces it with the fresh, unrevoked + // session rather than erroring. + diesel::insert_into(auth_sessions::table) + .values(&row) + .on_conflict((auth_sessions::realm, auth_sessions::token_digest)) + .do_update() + .set(( + auth_sessions::realm.eq(self.realm.as_str()), + auth_sessions::subject.eq(&row.subject), + auth_sessions::issued_at.eq(session.issued_at), + auth_sessions::expires_at.eq(session.expires_at), + auth_sessions::revoked_at.eq(None::>), + )) + .execute(&mut conn) + .await + .map_err(|error| GuardianError::StorageError(format!("session insert: {error}")))?; + Ok(()) + } + + async fn get(&self, key: &SessionKey, _now: DateTime) -> Result> { + let mut conn = super::checkout(&self.pool, "session").await?; + let row = auth_sessions::table + .filter(auth_sessions::token_digest.eq(key.to_vec())) + .filter(auth_sessions::realm.eq(self.realm.as_str())) + .filter(auth_sessions::revoked_at.is_null()) + .filter(auth_sessions::expires_at.gt(diesel::dsl::now)) + .select(AuthSessionRow::as_select()) + .first(&mut conn) + .await + .optional() + .map_err(|error| GuardianError::StorageError(format!("session lookup: {error}")))?; + 
row.map(AuthSessionRow::into_stored).transpose() + } + + async fn revoke(&self, key: &SessionKey) -> Result> { + let mut conn = super::checkout(&self.pool, "session").await?; + let row = diesel::update(auth_sessions::table) + .filter(auth_sessions::token_digest.eq(key.to_vec())) + .filter(auth_sessions::realm.eq(self.realm.as_str())) + .filter(auth_sessions::revoked_at.is_null()) + .set(auth_sessions::revoked_at.eq(diesel::dsl::now)) + .returning(AuthSessionRow::as_returning()) + .get_result(&mut conn) + .await + .optional() + .map_err(|error| GuardianError::StorageError(format!("session revoke: {error}")))?; + row.map(AuthSessionRow::into_stored).transpose() + } + + async fn sweep_expired(&self, _now: DateTime) -> Result { + let mut conn = super::checkout(&self.pool, "session").await?; + let deleted = diesel::delete(auth_sessions::table) + .filter(auth_sessions::realm.eq(self.realm.as_str())) + .filter(auth_sessions::expires_at.lt(diesel::dsl::now)) + .execute(&mut conn) + .await + .map_err(|error| GuardianError::StorageError(format!("session sweep: {error}")))?; + Ok(deleted as u64) + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::storage::postgres::{build_postgres_pool_lazy, run_migrations}; + use chrono::Duration; + + fn database_url() -> Option { + std::env::var("DATABASE_URL") + .ok() + .filter(|url| !url.trim().is_empty()) + } + + fn unique_key(now: DateTime) -> SessionKey { + let mut key = [0u8; 32]; + key[..16].copy_from_slice(&now.timestamp_micros().to_le_bytes().repeat(2)[..16]); + key + } + + #[tokio::test] + async fn get_fails_closed_when_store_unreachable() { + let pool = build_postgres_pool_lazy("postgresql://127.0.0.1:1/__guardian_coord_fault__", 1) + .expect("lazy pool builds even with an unreachable address"); + let store = PgSessionStore::new(pool, Realm::Operator); + assert!( + store.get(&[7u8; 32], Utc::now()).await.is_err(), + "session lookup must fail closed when the store is unreachable", + ); + } + + #[tokio::test] + 
#[ignore = "requires DATABASE_URL with migrations applied"] + async fn session_visible_across_replicas_and_revoke_propagates() { + let url = database_url().expect("DATABASE_URL must be set for this #[ignore] test"); + run_migrations(&url).await.expect("migrations apply"); + let replica_a = PgSessionStore::new( + build_postgres_pool_lazy(&url, 2).expect("pool a"), + Realm::Operator, + ); + let replica_b = PgSessionStore::new( + build_postgres_pool_lazy(&url, 2).expect("pool b"), + Realm::Operator, + ); + let now = Utc::now(); + let key = unique_key(now); + + replica_a + .insert( + key, + StoredSession { + subject: SessionSubject::Operator { + operator_id: "op-x".to_string(), + commitment: "0xc".to_string(), + }, + issued_at: now, + expires_at: now + Duration::hours(1), + }, + ) + .await + .expect("insert on replica A"); + + assert!( + replica_b.get(&key, now).await.expect("get on B").is_some(), + "a session written by replica A must be visible on replica B", + ); + + assert!( + replica_a.revoke(&key).await.expect("revoke on A").is_some(), + "revoke returns the prior session", + ); + assert!( + replica_b + .get(&key, now) + .await + .expect("get on B after revoke") + .is_none(), + "revocation on A must be honored on B", + ); + } +} diff --git a/crates/server/src/coordination/session_store.rs b/crates/server/src/coordination/session_store.rs new file mode 100644 index 00000000..d517c6e4 --- /dev/null +++ b/crates/server/src/coordination/session_store.rs @@ -0,0 +1,167 @@ +use std::collections::HashMap; +use std::sync::Arc; + +use async_trait::async_trait; +use chrono::{DateTime, Utc}; +use serde::{Deserialize, Serialize}; +use tokio::sync::Mutex; + +use crate::error::Result; + +pub type SessionKey = [u8; 32]; + +/// Realm-specific authenticated identity persisted with a session. Operator +/// permissions are intentionally absent: they are re-resolved from the live +/// allowlist on each request, so only the stable identity is stored. 
+#[derive(Clone, Debug, PartialEq, Eq, Serialize, Deserialize)] +#[serde(tag = "realm", rename_all = "snake_case")] +pub enum SessionSubject { + Operator { + operator_id: String, + commitment: String, + }, + Evm { + address: String, + }, +} + +#[derive(Clone, Debug)] +pub struct StoredSession { + pub subject: SessionSubject, + pub issued_at: DateTime, + pub expires_at: DateTime, +} + +/// A store of authenticated sessions keyed by the SHA-256 digest of the session +/// token. Each instance is bound to a single realm at construction (the Postgres +/// implementation scopes its rows by that realm; the in-memory implementation is +/// instance-scoped). Implementations expose only unexpired, unrevoked sessions; +/// reclamation of expired rows is the job of [`SessionStore::sweep_expired`]. +#[async_trait] +pub trait SessionStore: Send + Sync { + async fn insert(&self, key: SessionKey, session: StoredSession) -> Result<()>; + async fn get(&self, key: &SessionKey, now: DateTime) -> Result>; + /// Revoke a session (logout), returning the prior session if present for + /// logout-side logging. The cross-replica contract: once revoked, `get` MUST + /// reject it on every replica until its natural expiry. The Postgres + /// implementation marks `revoked_at` and keeps the row until expiry; the + /// in-memory implementation removes it. 
+ async fn revoke(&self, key: &SessionKey) -> Result>; + async fn sweep_expired(&self, now: DateTime) -> Result; +} + +#[derive(Clone, Default)] +pub struct InMemorySessionStore { + sessions: Arc>>, +} + +impl InMemorySessionStore { + pub fn new() -> Self { + Self::default() + } +} + +#[async_trait] +impl SessionStore for InMemorySessionStore { + async fn insert(&self, key: SessionKey, session: StoredSession) -> Result<()> { + self.sessions.lock().await.insert(key, session); + Ok(()) + } + + async fn get(&self, key: &SessionKey, now: DateTime) -> Result> { + Ok(self + .sessions + .lock() + .await + .get(key) + .filter(|session| session.expires_at > now) + .cloned()) + } + + async fn revoke(&self, key: &SessionKey) -> Result> { + Ok(self.sessions.lock().await.remove(key)) + } + + async fn sweep_expired(&self, now: DateTime) -> Result { + let mut sessions = self.sessions.lock().await; + let before = sessions.len(); + sessions.retain(|_, session| session.expires_at > now); + Ok((before - sessions.len()) as u64) + } +} + +#[cfg(all(test, not(any(feature = "integration", feature = "e2e"))))] +mod tests { + use super::*; + use chrono::Duration; + + fn operator_session(now: DateTime, ttl_secs: i64) -> StoredSession { + StoredSession { + subject: SessionSubject::Operator { + operator_id: "op-1".to_string(), + commitment: "0xabc".to_string(), + }, + issued_at: now, + expires_at: now + Duration::seconds(ttl_secs), + } + } + + #[tokio::test] + async fn get_returns_unexpired_and_hides_expired() { + let store = InMemorySessionStore::new(); + let now = Utc::now(); + store + .insert([1u8; 32], operator_session(now, 60)) + .await + .unwrap(); + + assert!(store.get(&[1u8; 32], now).await.unwrap().is_some()); + assert!( + store + .get(&[1u8; 32], now + Duration::seconds(61)) + .await + .unwrap() + .is_none() + ); + } + + #[tokio::test] + async fn revoke_returns_record_then_absent() { + let store = InMemorySessionStore::new(); + let now = Utc::now(); + store + .insert([2u8; 32], 
operator_session(now, 60)) + .await + .unwrap(); + + assert!(store.revoke(&[2u8; 32]).await.unwrap().is_some()); + assert!(store.get(&[2u8; 32], now).await.unwrap().is_none()); + } + + #[tokio::test] + async fn sweep_reclaims_only_expired() { + let store = InMemorySessionStore::new(); + let now = Utc::now(); + store + .insert([3u8; 32], operator_session(now, 10)) + .await + .unwrap(); + store + .insert([4u8; 32], operator_session(now, 600)) + .await + .unwrap(); + + let swept = store + .sweep_expired(now + Duration::seconds(60)) + .await + .unwrap(); + assert_eq!(swept, 1); + assert!( + store + .get(&[4u8; 32], now + Duration::seconds(60)) + .await + .unwrap() + .is_some() + ); + } +} diff --git a/crates/server/src/dashboard/state.rs b/crates/server/src/dashboard/state.rs index a49ab312..578c6e54 100644 --- a/crates/server/src/dashboard/state.rs +++ b/crates/server/src/dashboard/state.rs @@ -1,10 +1,9 @@ -use std::collections::HashMap; use std::sync::Arc; use chrono::{DateTime, Utc}; use guardian_shared::hex::{FromHex, IntoHex}; use miden_protocol::crypto::dsa::falcon512_poseidon2::Signature; -use tokio::sync::{Mutex, RwLock}; +use tokio::sync::RwLock; use super::allowlist::{ AllowlistSource, OperatorAllowlist, OperatorAllowlistEntryInput, normalize_commitment, @@ -13,35 +12,71 @@ use super::config::DashboardConfig; use super::cursor::CursorSecret; use super::types::{ AuthenticatedOperator, IssuedOperatorSession, OperatorChallenge, OperatorChallengePayload, - OperatorSessionRecord, PendingChallenge, }; use super::util::{cookie_date, correlation_id, random_hex, rate_limit_error}; +use crate::coordination::{ + ChallengePayload, ChallengeStore, InMemoryChallengeStore, InMemorySessionStore, SessionStore, + SessionSubject, StoredChallenge, StoredSession, +}; use crate::error::{GuardianError, Result}; use crate::middleware::rate_limit::RateLimitStore; use crate::network::NetworkType; use crate::secret::session_digest; -#[derive(Clone, Debug)] +#[derive(Clone)] pub struct 
DashboardState { config: DashboardConfig, allowlist_source: AllowlistSource, allowlist: Arc>, - challenges: Arc>>>, - sessions: Arc>>, + session_store: Arc, + challenge_store: Arc, commitment_rate_limits: RateLimitStore, cursor_secret: CursorSecret, cursor_secret_configured: bool, started_at: DateTime, } +impl std::fmt::Debug for DashboardState { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("DashboardState") + .field("config", &self.config) + .field("cursor_secret_configured", &self.cursor_secret_configured) + .field("started_at", &self.started_at) + .finish_non_exhaustive() + } +} + impl DashboardState { pub async fn from_env_for_network( network_type: NetworkType, + ) -> std::result::Result { + Self::from_env_for_network_with_stores( + network_type, + Arc::new(InMemorySessionStore::new()), + Arc::new(InMemoryChallengeStore::new()), + ) + .await + } + + /// Same as [`DashboardState::from_env_for_network`] but with explicit, + /// realm-bound coordination stores. The server builder passes shared + /// (Postgres) stores here on the Postgres backend; the default path uses + /// in-memory stores (single-process / dev). 
+ pub async fn from_env_for_network_with_stores( + network_type: NetworkType, + session_store: Arc, + challenge_store: Arc, ) -> std::result::Result { let config = DashboardConfig::from_env_for_network(network_type)?; let allowlist_source = AllowlistSource::from_env().await?; let allowlist = allowlist_source.load().await?; - Self::from_allowlist_source(allowlist_source, allowlist, config) + Self::from_allowlist_source( + allowlist_source, + allowlist, + config, + session_store, + challenge_store, + ) } pub fn for_tests(entries: Vec<(String, String)>) -> Self { @@ -58,6 +93,8 @@ impl DashboardState { AllowlistSource::Static, allowlist, DashboardConfig::for_tests(), + Arc::new(InMemorySessionStore::new()), + Arc::new(InMemoryChallengeStore::new()), ) .expect("dashboard test configuration should be valid") } @@ -76,6 +113,8 @@ impl DashboardState { AllowlistSource::Static, allowlist, DashboardConfig::for_tests(), + Arc::new(InMemorySessionStore::new()), + Arc::new(InMemoryChallengeStore::new()), ) .expect("dashboard test configuration should be valid") } @@ -132,19 +171,20 @@ impl DashboardState { .is_some() { let expires_at = now + self.config.nonce_ttl; - let mut challenges = self.challenges.lock().await; - let pending = challenges.entry(normalized_commitment.clone()).or_default(); - pending.retain(|challenge| challenge.expires_at > now); - pending.push(PendingChallenge { - signing_digest, + let challenge = StoredChallenge { + key: signing_digest.into_hex(), + payload: ChallengePayload::OperatorDigest(signing_digest), issued_at: now, expires_at, - }); - if pending.len() > self.config.max_outstanding_challenges { - pending.sort_by_key(|challenge| challenge.issued_at); - let drain_len = pending.len() - self.config.max_outstanding_challenges; - pending.drain(0..drain_len); - } + }; + self.challenge_store + .issue( + &normalized_commitment, + challenge, + self.config.max_outstanding_challenges, + now, + ) + .await?; tracing::info!( auth_event = "challenge_issued", @@ 
-221,18 +261,16 @@ impl DashboardState { )); } - let mut challenges = self.challenges.lock().await; - let pending = challenges.entry(normalized_commitment.clone()).or_default(); - pending.retain(|challenge| challenge.expires_at > now); - - let matched_index = pending - .iter() - .position(|challenge| public_key.verify(challenge.signing_digest, &signature)); + let active = self + .challenge_store + .active_for(&normalized_commitment, now) + .await?; + let matched = active.iter().find(|challenge| match &challenge.payload { + ChallengePayload::OperatorDigest(digest) => public_key.verify(*digest, &signature), + _ => false, + }); - let Some(matched_index) = matched_index else { - if pending.is_empty() { - challenges.remove(&normalized_commitment); - } + let Some(matched) = matched else { tracing::warn!( auth_event = "verify_failed", correlation_id = %correlation_id, @@ -244,34 +282,42 @@ impl DashboardState { )); }; - pending.remove(matched_index); - if pending.is_empty() { - challenges.remove(&normalized_commitment); + if !self + .challenge_store + .consume(&normalized_commitment, &matched.key, now) + .await? + { + tracing::warn!( + auth_event = "verify_failed", + correlation_id = %correlation_id, + operator_id = %operator.operator_id, + "Operator verify rejected because the matched challenge was already consumed" + ); + return Err(GuardianError::AuthenticationFailed( + "Invalid operator credentials".to_string(), + )); } - drop(challenges); let issued_at = now; let expires_at = now + self.config.session_ttl; - // Stash the freshly-resolved principal (identity + current - // permissions) into the session record. `authenticate_session` - // re-resolves permissions per request from the live allowlist - // anyway, so the copy held here is just a fallback used for - // logout-side logging. 
let operator_identity = operator.clone(); let token = random_hex::<32>(); let cookie_header = self.session_cookie_header(&token, issued_at, expires_at); let session_key = session_digest(&token); - let mut sessions = self.sessions.lock().await; - sessions.retain(|_, session| session.expires_at > now); - sessions.insert( - session_key, - OperatorSessionRecord { - operator: operator_identity.clone(), - issued_at, - expires_at, - }, - ); + self.session_store + .insert( + session_key, + StoredSession { + subject: SessionSubject::Operator { + operator_id: operator_identity.operator_id.clone(), + commitment: normalized_commitment.clone(), + }, + issued_at, + expires_at, + }, + ) + .await?; tracing::info!( auth_event = "verify_success", @@ -293,33 +339,39 @@ impl DashboardState { now: DateTime, ) -> Result { self.refresh_allowlist().await?; - let mut sessions = self.sessions.lock().await; - sessions.retain(|_, session| session.expires_at > now); let session_key = session_digest(token); - let session = sessions.get(&session_key).cloned().ok_or_else(|| { - tracing::warn!( - auth_event = "session_rejected", - reason = "missing_or_expired", - "Operator session rejected" - ); - GuardianError::AuthenticationFailed("Invalid operator session".to_string()) - })?; + let session = self + .session_store + .get(&session_key, now) + .await? + .ok_or_else(|| { + tracing::warn!( + auth_event = "session_rejected", + reason = "missing_or_expired", + "Operator session rejected" + ); + GuardianError::AuthenticationFailed("Invalid operator session".to_string()) + })?; - // Re-resolve the principal from the **live** allowlist snapshot - // rather than returning the (potentially stale) copy carried in - // the session record. This is the load-bearing wiring for - // feature 006-operator-authz FR-008 / SC-004: a permission - // grant or revocation written to the allowlist source takes - // effect on the next authenticated request without re-login. 
- let Some(live_operator) = self - .lookup_allowlisted_operator(&session.operator.commitment) - .await + let SessionSubject::Operator { + operator_id, + commitment, + } = &session.subject else { - sessions.remove(&session_key); + return Err(GuardianError::AuthenticationFailed( + "Invalid operator session".to_string(), + )); + }; + + // Re-resolve the principal from the **live** allowlist snapshot rather + // than the identity carried in the session record, so a permission grant + // or revocation takes effect on the next request without re-login. + let Some(live_operator) = self.lookup_allowlisted_operator(commitment).await else { + self.session_store.revoke(&session_key).await?; tracing::warn!( auth_event = "session_rejected", - operator_id = %session.operator.operator_id, + operator_id = %operator_id, reason = "revoked", "Operator session rejected because the operator is no longer allowlisted" ); @@ -331,25 +383,27 @@ impl DashboardState { Ok(live_operator) } - pub async fn logout(&self, token: Option<&str>, now: DateTime) { - let mut sessions = self.sessions.lock().await; - sessions.retain(|_, session| session.expires_at > now); + pub async fn logout(&self, token: Option<&str>, _now: DateTime) -> Result<()> { if let Some(token) = token - && let Some(session) = sessions.remove(&session_digest(token)) + && let Some(session) = self.session_store.revoke(&session_digest(token)).await? + && let SessionSubject::Operator { operator_id, .. 
} = &session.subject { tracing::info!( auth_event = "logout", - operator_id = %session.operator.operator_id, + operator_id = %operator_id, issued_at = %session.issued_at.to_rfc3339(), "Operator session cleared" ); } + Ok(()) } fn from_allowlist_source( allowlist_source: AllowlistSource, allowlist: OperatorAllowlist, mut config: DashboardConfig, + session_store: Arc, + challenge_store: Arc, ) -> std::result::Result { tracing::info!( auth_event = "allowlist_loaded", @@ -358,29 +412,42 @@ impl DashboardState { ); let configured_cursor_secret = config.take_cursor_secret(); let cursor_secret_configured = configured_cursor_secret.is_some(); - let cursor_secret = configured_cursor_secret.unwrap_or_else(|| { - if !cfg!(test) { - tracing::warn!( - "dashboard cursor secret not configured; generating ephemeral per-process \ - secret. Multi-replica deployments must set \ - GUARDIAN_DASHBOARD_CURSOR_SECRET to a stable shared 32-byte hex value." - ); + let cursor_secret = match configured_cursor_secret { + Some(secret) => secret, + None => { + if !cfg!(test) { + tracing::warn!( + "dashboard cursor secret not configured; generating ephemeral per-process \ + secret. This degrades only dashboard pagination: a multi-replica \ + deployment must set GUARDIAN_DASHBOARD_CURSOR_SECRET to a stable shared \ + 64-hex (32-byte) value, or a cursor minted on one replica fails on another." + ); + } + CursorSecret::generate() } - CursorSecret::generate() - }); + }; Ok(Self { commitment_rate_limits: RateLimitStore::new(config.commitment_rate_limit.clone()), config, allowlist_source, allowlist: Arc::new(RwLock::new(allowlist)), - challenges: Arc::new(Mutex::new(HashMap::new())), - sessions: Arc::new(Mutex::new(HashMap::new())), + session_store, + challenge_store, cursor_secret, cursor_secret_configured, started_at: Utc::now(), }) } + /// Reclaim expired operator sessions and challenges. 
Run periodically by the + /// server's background sweep; expiry is also enforced on read, so this is + /// housekeeping (and a no-op for the in-memory backend beyond freeing memory). + pub async fn sweep_expired(&self, now: DateTime) -> Result<()> { + self.session_store.sweep_expired(now).await?; + self.challenge_store.sweep_expired(now).await?; + Ok(()) + } + /// Server-side signing secret for opaque pagination cursors. See /// `crate::dashboard::cursor`. Generated once per server startup. pub fn cursor_secret(&self) -> &CursorSecret { @@ -794,7 +861,7 @@ mod tests { } #[tokio::test] - async fn session_map_never_holds_plaintext_token() { + async fn session_is_keyed_by_digest_not_plaintext_token() { let operator = TestSigner::new(); let state = DashboardState::for_tests(vec![( "operator-1".to_string(), @@ -813,12 +880,49 @@ mod tests { .expect("verify"); let token = parse_token_from_cookie(&session.cookie_header); - let sessions = state.sessions.lock().await; - let only_key = sessions.keys().next().expect("session exists"); + // The store only ever receives `session_digest(token)`, never the + // plaintext token: the digest differs from the token bytes, and + // authentication round-trips on the real token (proving the mapping). 
assert_ne!( - only_key.as_slice(), + crate::secret::session_digest(&token).as_slice(), token.as_bytes(), - "map key must be a digest, not the plaintext token" + "session key must be a digest, not the plaintext token" + ); + state + .authenticate_session(&token, now) + .await + .expect("session lookup round-trips on the real token"); + } + + #[tokio::test] + async fn unset_cursor_secret_boots_in_every_stage() { + let _env_lock = ENV_LOCK.lock().await; + let _no_secret = EnvVarGuard::remove("GUARDIAN_DASHBOARD_CURSOR_SECRET"); + + let prod = EnvVarGuard::set("GUARDIAN_ENV", "prod"); + let prod_result = DashboardState::from_allowlist_source( + super::AllowlistSource::Static, + super::OperatorAllowlist::from_entries(Vec::new()).expect("empty allowlist is valid"), + DashboardConfig::for_tests(), + std::sync::Arc::new(crate::coordination::InMemorySessionStore::new()), + std::sync::Arc::new(crate::coordination::InMemoryChallengeStore::new()), + ); + assert!( + prod_result.is_ok(), + "prod stage tolerates an unset cursor secret (warns, ephemeral fallback)" + ); + drop(prod); + + let non_prod_result = DashboardState::from_allowlist_source( + super::AllowlistSource::Static, + super::OperatorAllowlist::from_entries(Vec::new()).expect("empty allowlist is valid"), + DashboardConfig::for_tests(), + std::sync::Arc::new(crate::coordination::InMemorySessionStore::new()), + std::sync::Arc::new(crate::coordination::InMemoryChallengeStore::new()), + ); + assert!( + non_prod_result.is_ok(), + "non-prod tolerates an unset cursor secret (ephemeral fallback)" ); } } diff --git a/crates/server/src/dashboard/types.rs b/crates/server/src/dashboard/types.rs index dc66265f..847229ad 100644 --- a/crates/server/src/dashboard/types.rs +++ b/crates/server/src/dashboard/types.rs @@ -1,7 +1,6 @@ use std::collections::BTreeSet; use std::sync::Arc; -use chrono::{DateTime, Utc}; use guardian_shared::auth_request_payload::AuthRequestPayload; use miden_protocol::Word; use serde::{Deserialize, Serialize}; 
@@ -57,17 +56,3 @@ pub struct IssuedOperatorSession { pub expires_at: String, pub cookie_header: String, } - -#[derive(Clone, Debug)] -pub(crate) struct PendingChallenge { - pub(crate) signing_digest: Word, - pub(crate) issued_at: DateTime, - pub(crate) expires_at: DateTime, -} - -#[derive(Clone, Debug)] -pub(crate) struct OperatorSessionRecord { - pub(crate) operator: AuthenticatedOperator, - pub(crate) issued_at: DateTime, - pub(crate) expires_at: DateTime, -} diff --git a/crates/server/src/evm/mod.rs b/crates/server/src/evm/mod.rs index 332f5111..396e4a1d 100644 --- a/crates/server/src/evm/mod.rs +++ b/crates/server/src/evm/mod.rs @@ -21,10 +21,17 @@ pub struct EvmAppState { impl EvmAppState { pub async fn from_env() -> Result { - let chains = Arc::new(EvmChainRegistry::from_env()?); - let sessions = Arc::new(EvmSessionState::default()); + Self::from_env_with_sessions(EvmSessionState::default()).await + } - Ok(Self { chains, sessions }) + /// Build EVM state with explicit (evm-realm) session state. The server + /// builder passes shared (Postgres) stores on the Postgres backend. 
+ pub async fn from_env_with_sessions(sessions: EvmSessionState) -> Result { + let chains = Arc::new(EvmChainRegistry::from_env()?); + Ok(Self { + chains, + sessions: Arc::new(sessions), + }) } pub fn for_tests() -> Self { diff --git a/crates/server/src/evm/session.rs b/crates/server/src/evm/session.rs index fb4d2ce4..749017c7 100644 --- a/crates/server/src/evm/session.rs +++ b/crates/server/src/evm/session.rs @@ -1,10 +1,12 @@ -use std::collections::HashMap; use std::sync::Arc; use chrono::{DateTime, Duration, Utc}; use rand::RngCore; -use tokio::sync::Mutex; +use crate::coordination::{ + ChallengePayload, ChallengeStore, InMemoryChallengeStore, InMemorySessionStore, SessionStore, + SessionSubject, StoredChallenge, StoredSession, +}; use crate::error::{GuardianError, Result}; use crate::metadata::network::normalize_evm_address; use crate::secret::session_digest; @@ -16,8 +18,8 @@ const MAX_OUTSTANDING_CHALLENGES: usize = 8; #[derive(Clone)] pub struct EvmSessionState { - challenges: Arc>>>, - sessions: Arc>>, + session_store: Arc, + challenge_store: Arc, } #[derive(Clone, Debug, PartialEq, Eq)] @@ -40,27 +42,29 @@ pub struct AuthenticatedEvmSession { pub address: String, } -#[derive(Clone)] -struct PendingEvmChallenge { - challenge: EvmChallenge, -} - -#[derive(Clone)] -struct EvmSessionRecord { - address: String, - expires_at: DateTime, -} - impl Default for EvmSessionState { fn default() -> Self { - Self { - challenges: Arc::new(Mutex::new(HashMap::new())), - sessions: Arc::new(Mutex::new(HashMap::new())), - } + Self::new( + Arc::new(InMemorySessionStore::new()), + Arc::new(InMemoryChallengeStore::new()), + ) } } impl EvmSessionState { + /// Build EVM session state over explicit, evm-realm coordination stores. The + /// server builder passes shared (Postgres) stores on the Postgres backend; + /// the default uses in-memory stores (single-process / dev). 
+ pub fn new( + session_store: Arc, + challenge_store: Arc, + ) -> Self { + Self { + session_store, + challenge_store, + } + } + pub fn cookie_name(&self) -> &'static str { COOKIE_NAME } @@ -82,16 +86,20 @@ impl EvmSessionState { expires_at: now + Duration::seconds(CHALLENGE_TTL_SECS), }; - let mut challenges = self.challenges.lock().await; - let pending = challenges.entry(address).or_default(); - pending.retain(|challenge| challenge.challenge.expires_at > now); - pending.push(PendingEvmChallenge { - challenge: challenge.clone(), - }); - if pending.len() > MAX_OUTSTANDING_CHALLENGES { - let drain_len = pending.len() - MAX_OUTSTANDING_CHALLENGES; - pending.drain(0..drain_len); - } + let stored = StoredChallenge { + key: challenge.nonce.clone(), + payload: ChallengePayload::EvmChallenge { + address: challenge.address.clone(), + nonce: challenge.nonce.clone(), + issued_at: challenge.issued_at, + expires_at: challenge.expires_at, + }, + issued_at: challenge.issued_at, + expires_at: challenge.expires_at, + }; + self.challenge_store + .issue(&address, stored, MAX_OUTSTANDING_CHALLENGES, now) + .await?; Ok(challenge) } @@ -105,20 +113,32 @@ impl EvmSessionState { ) -> Result { let address = normalize_evm_address(address).map_err(GuardianError::InvalidInput)?; let signature = crate::evm::proposal::normalize_signature(signature)?; - let mut challenges = self.challenges.lock().await; - let pending = challenges.entry(address.clone()).or_default(); - pending.retain(|challenge| challenge.challenge.expires_at > now); - - let Some(index) = pending - .iter() - .position(|pending| pending.challenge.nonce.eq_ignore_ascii_case(nonce)) - else { + + let active = self.challenge_store.active_for(&address, now).await?; + let matched = active.iter().find_map(|stored| match &stored.payload { + ChallengePayload::EvmChallenge { + address: challenge_address, + nonce: challenge_nonce, + issued_at, + expires_at, + } if challenge_nonce.eq_ignore_ascii_case(nonce) => Some(( + stored.key.clone(), + 
EvmChallenge { + address: challenge_address.clone(), + nonce: challenge_nonce.clone(), + issued_at: *issued_at, + expires_at: *expires_at, + }, + )), + _ => None, + }); + + let Some((key, challenge)) = matched else { return Err(GuardianError::AuthenticationFailed( "No active EVM challenge matched the nonce".to_string(), )); }; - let challenge = pending[index].challenge.clone(); let recovered = crate::evm::contracts::recover_session_address(&challenge, &signature)?; if recovered != address { return Err(GuardianError::AuthenticationFailed( @@ -126,25 +146,28 @@ impl EvmSessionState { )); } - pending.remove(index); - if pending.is_empty() { - challenges.remove(&address); + if !self.challenge_store.consume(&address, &key, now).await? { + return Err(GuardianError::AuthenticationFailed( + "No active EVM challenge matched the nonce".to_string(), + )); } - drop(challenges); let token = random_hex_32(); let expires_at = now + Duration::seconds(SESSION_TTL_SECS); let cookie_header = self.session_cookie_header(&token, expires_at); let session_key = session_digest(&token); - let mut sessions = self.sessions.lock().await; - sessions.retain(|_, session| session.expires_at > now); - sessions.insert( - session_key, - EvmSessionRecord { - address: address.clone(), - expires_at, - }, - ); + self.session_store + .insert( + session_key, + StoredSession { + subject: SessionSubject::Evm { + address: address.clone(), + }, + issued_at: now, + expires_at, + }, + ) + .await?; Ok(VerifiedEvmSession { address, @@ -158,25 +181,34 @@ impl EvmSessionState { token: &str, now: DateTime, ) -> Result { - let mut sessions = self.sessions.lock().await; - sessions.retain(|_, session| session.expires_at > now); - let session = sessions - .get(&session_digest(token)) - .cloned() + let session = self + .session_store + .get(&session_digest(token), now) + .await? 
.ok_or_else(|| { GuardianError::AuthenticationFailed("Invalid EVM session".to_string()) })?; - Ok(AuthenticatedEvmSession { - address: session.address, - }) + let SessionSubject::Evm { address } = session.subject else { + return Err(GuardianError::AuthenticationFailed( + "Invalid EVM session".to_string(), + )); + }; + Ok(AuthenticatedEvmSession { address }) } - pub async fn logout(&self, token: Option<&str>, now: DateTime) { - let mut sessions = self.sessions.lock().await; - sessions.retain(|_, session| session.expires_at > now); + pub async fn logout(&self, token: Option<&str>, _now: DateTime) -> Result<()> { if let Some(token) = token { - sessions.remove(&session_digest(token)); + self.session_store.revoke(&session_digest(token)).await?; } + Ok(()) + } + + /// Reclaim expired EVM sessions and challenges (housekeeping; expiry is also + /// enforced on read). + pub async fn sweep_expired(&self, now: DateTime) -> Result<()> { + self.session_store.sweep_expired(now).await?; + self.challenge_store.sweep_expired(now).await?; + Ok(()) } fn session_cookie_header(&self, token: &str, expires_at: DateTime) -> String { @@ -202,7 +234,7 @@ mod tests { use super::*; #[tokio::test] - async fn challenge_is_single_use_after_manual_removal() { + async fn challenge_is_single_use_via_consume() { let state = EvmSessionState::default(); let now = Utc::now(); let challenge = state @@ -210,13 +242,27 @@ mod tests { .await .expect("challenge"); - let mut challenges = state.challenges.lock().await; - let pending = challenges - .get_mut(&challenge.address) - .expect("pending challenge"); - assert_eq!(pending.len(), 1); - pending.remove(0); - assert!(pending.is_empty()); + let active = state + .challenge_store + .active_for(&challenge.address, now) + .await + .expect("active challenges"); + assert_eq!(active.len(), 1); + + assert!( + state + .challenge_store + .consume(&challenge.address, &challenge.nonce, now) + .await + .expect("consume") + ); + assert!( + !state + .challenge_store + 
.consume(&challenge.address, &challenge.nonce, now) + .await + .expect("replay consume") + ); } #[test] diff --git a/crates/server/src/jobs/canonicalization/processor.rs b/crates/server/src/jobs/canonicalization/processor.rs index d906312e..a21480e7 100644 --- a/crates/server/src/jobs/canonicalization/processor.rs +++ b/crates/server/src/jobs/canonicalization/processor.rs @@ -1,10 +1,39 @@ +use std::sync::Arc; + use crate::canonicalization::CanonicalizationConfig; +use crate::coordination::{AlwaysLeader, CANONICALIZATION_LEASE, LeaderElector, Lease}; use crate::delta_object::{DeltaObject, DeltaStatus}; use crate::error::{GuardianError, Result}; use crate::state::AppState; use crate::state_object::StateObject; use async_trait::async_trait; use chrono::{DateTime, Utc}; +use tokio_util::sync::CancellationToken; + +/// A leader handle for a single canonicalization pass: who we are, the fence we +/// hold, and a cancellation signal tripped when the lease is lost mid-pass. +struct PassLease { + leader: Arc, + lease: Lease, + cancel: CancellationToken, +} + +impl PassLease { + /// Single-process default (filesystem / tests): always the leader, never + /// cancelled. 
+ fn single_process() -> Self { + Self { + leader: Arc::new(AlwaysLeader::new(CANONICALIZATION_LEASE, "single-process")), + lease: Lease { + name: CANONICALIZATION_LEASE.to_string(), + holder_id: "single-process".to_string(), + fence_token: 0, + expires_at: DateTime::::MAX_UTC, + }, + cancel: CancellationToken::new(), + } + } +} #[async_trait] pub trait Processor: Send + Sync { @@ -36,11 +65,35 @@ fn get_candidates(deltas: &[DeltaObject]) -> Vec { struct DeltasProcessorBase { state: AppState, + pass: PassLease, max_retries: u32, submission_grace_period_seconds: u64, } impl DeltasProcessorBase { + /// Mandatory fence check before every custody-state write: the canonical + /// `submit_state` / `submit_delta`, the `update_auth` cosigner-key sync, the + /// discard `delete_delta`, and the retry `update_delta_status`. If this + /// replica no longer holds the lease (superseded mid-pass), refuse the write + /// so a stale leader can never commit a custody transition. Best-effort + /// cleanup that trails a fenced write — clearing `has_pending_candidate` and + /// deleting a finalized proposal — is intentionally left unfenced: both are + /// idempotent and non-custodial, so a brief two-leader overlap can at most + /// repeat them harmlessly. + async fn ensure_lease_held(&self, delta: &DeltaObject) -> Result<()> { + if self.pass.leader.verify_held(&self.pass.lease).await? { + return Ok(()); + } + tracing::warn!( + account_id = %delta.account_id, + nonce = delta.nonce, + "Canonicalization lease lost; refusing canonical write" + ); + Err(GuardianError::StorageError( + "canonicalization lease lost; aborting write".to_string(), + )) + } + fn candidate_age_seconds(&self, delta: &DeltaObject, now: DateTime) -> Option { let DeltaStatus::Candidate { timestamp, .. 
} = &delta.status else { return None; @@ -65,6 +118,12 @@ impl DeltasProcessorBase { ); for account_id in account_ids { + if self.pass.cancel.is_cancelled() { + tracing::warn!( + "Canonicalization pass cancelled (lease lost); stopping before next account" + ); + break; + } if let Err(e) = self.process_account(&account_id).await { tracing::error!( account_id = %account_id, @@ -109,6 +168,13 @@ impl DeltasProcessorBase { ); for delta in candidates { + if self.pass.cancel.is_cancelled() { + tracing::warn!( + account_id = %account_id, + "Canonicalization pass cancelled (lease lost); stopping before next candidate" + ); + break; + } let nonce = delta.nonce; if let Err(e) = self.process_candidate(delta).await { tracing::error!( @@ -203,6 +269,7 @@ impl DeltasProcessorBase { "Delta verification failed after max retries, discarding" ); + self.ensure_lease_held(&delta).await?; storage_backend .delete_delta(&delta.account_id, delta.nonce) .await @@ -282,6 +349,7 @@ impl DeltasProcessorBase { let new_status = delta.status.with_incremented_retry(now); + self.ensure_lease_held(&delta).await?; storage_backend .update_delta_status(&delta.account_id, delta.nonce, new_status) .await @@ -340,6 +408,7 @@ impl DeltasProcessorBase { auth_scheme: String::new(), }; + self.ensure_lease_held(&delta).await?; storage_backend .submit_state(&updated_state) .await @@ -363,6 +432,7 @@ impl DeltasProcessorBase { "Syncing cosigner public keys from on-chain storage" ); + self.ensure_lease_held(&delta).await?; self.state .metadata .update_auth(&delta.account_id, new_auth, &now) @@ -380,6 +450,7 @@ impl DeltasProcessorBase { let mut canonical_delta = delta.clone(); canonical_delta.status = DeltaStatus::canonical(now.clone()); + self.ensure_lease_held(&delta).await?; storage_backend .submit_delta(&canonical_delta) .await @@ -452,10 +523,31 @@ pub struct DeltasProcessor { } impl DeltasProcessor { + /// Single-process processor (filesystem / tests): always the leader, never + /// fenced out. 
Behavior is identical to the pre-lease worker. + #[allow(dead_code)] pub fn new(state: AppState, config: CanonicalizationConfig) -> Self { + let pass = PassLease::single_process(); + Self::with_lease(state, config, pass.leader, pass.lease, pass.cancel) + } + + /// Lease-bound processor used by the multi-replica worker: writes are fenced + /// by `leader`/`lease` and the pass aborts when `cancel` is tripped. + pub fn with_lease( + state: AppState, + config: CanonicalizationConfig, + leader: Arc, + lease: Lease, + cancel: CancellationToken, + ) -> Self { Self { base: DeltasProcessorBase { state, + pass: PassLease { + leader, + lease, + cancel, + }, max_retries: config.max_retries, submission_grace_period_seconds: config.submission_grace_period_seconds, }, @@ -483,6 +575,7 @@ impl TestDeltasProcessor { Self { base: DeltasProcessorBase { state, + pass: PassLease::single_process(), max_retries: u32::MAX, // Test processor doesn't discard on retries submission_grace_period_seconds: 0, }, diff --git a/crates/server/src/jobs/canonicalization/worker.rs b/crates/server/src/jobs/canonicalization/worker.rs index 2346d902..a79052aa 100644 --- a/crates/server/src/jobs/canonicalization/worker.rs +++ b/crates/server/src/jobs/canonicalization/worker.rs @@ -1,16 +1,22 @@ +use std::sync::Arc; +use std::time::Duration; + +use tokio::time::interval; +use tokio_util::sync::CancellationToken; + +use crate::coordination::{LeaderElector, Lease}; use crate::error::Result; use crate::state::AppState; -use tokio::time::interval; use super::processor::{DeltasProcessor, Processor, TestDeltasProcessor}; -pub fn start_worker(state: AppState) { +pub fn start_worker(state: AppState, leader: Arc) { tokio::spawn(async move { - run_worker(state).await; + run_worker(state, leader).await; }); } -async fn run_worker(state: AppState) { +async fn run_worker(state: AppState, leader: Arc) { let config = match &state.canonicalization { Some(config) => config.clone(), None => { @@ -21,12 +27,43 @@ async fn 
run_worker(state: AppState) { } }; - let processor = DeltasProcessor::new(state.clone(), config.clone()); - let mut interval_timer = interval(config.check_interval()); + let check_interval = config.check_interval(); + // TTL outlives several renew cycles so a healthy holder never loses the lease + // mid-pass and the lease survives the idle gap between ticks; failover after a + // crash happens within one TTL. + let lease_ttl = check_interval * 3; + let renew_interval = check_interval; + let mut interval_timer = interval(check_interval); loop { interval_timer.tick().await; + let lease = match leader.try_acquire(lease_ttl).await { + Ok(Some(lease)) => lease, + Ok(None) => continue, + Err(error) => { + tracing::warn!(error = %error, "Failed to acquire canonicalization lease"); + continue; + } + }; + + let cancel = CancellationToken::new(); + let renewal = spawn_renewal( + leader.clone(), + lease.clone(), + lease_ttl, + renew_interval, + cancel.clone(), + ); + + let processor = DeltasProcessor::with_lease( + state.clone(), + config.clone(), + leader.clone(), + lease, + cancel.clone(), + ); + let started = std::time::Instant::now(); let result = processor.process_all_accounts().await; metrics::histogram!(crate::metrics::names::CANONICALIZATION_RUN_DURATION_SECONDS) @@ -38,12 +75,49 @@ async fn run_worker(state: AppState) { ) .increment(1); + cancel.cancel(); + let _ = renewal.await; + if let Err(e) = result { tracing::error!(error = %e, "Canonicalization worker error"); } } } +/// Renew the lease on its own timer, concurrent with the pass. On a lost lease +/// (stolen, expired, or store error) it trips `cancel` so the pass aborts at its +/// next checkpoint; the fence check still guards any in-flight write. 
+fn spawn_renewal( + leader: Arc, + lease: Lease, + ttl: Duration, + renew_interval: Duration, + cancel: CancellationToken, +) -> tokio::task::JoinHandle<()> { + tokio::spawn(async move { + let mut ticker = interval(renew_interval); + ticker.tick().await; + loop { + tokio::select! { + _ = cancel.cancelled() => break, + _ = ticker.tick() => match leader.renew(&lease, ttl).await { + Ok(true) => {} + Ok(false) => { + tracing::warn!("Canonicalization lease lost during pass; cancelling"); + cancel.cancel(); + break; + } + Err(error) => { + tracing::warn!(error = %error, "Canonicalization lease renew failed; cancelling pass"); + cancel.cancel(); + break; + } + }, + } + } + }) +} + pub async fn process_all_accounts_now(state: &AppState) -> Result<()> { let processor = TestDeltasProcessor::new(state.clone()); processor.process_all_accounts().await diff --git a/crates/server/src/lib.rs b/crates/server/src/lib.rs index fee40fdb..688d41e1 100644 --- a/crates/server/src/lib.rs +++ b/crates/server/src/lib.rs @@ -5,6 +5,8 @@ pub mod api; pub mod audit; pub mod build_info; pub mod builder; +pub mod config; +pub mod coordination; pub mod dashboard; pub mod middleware; diff --git a/crates/server/src/main.rs b/crates/server/src/main.rs index 1af43301..06b5b6a3 100644 --- a/crates/server/src/main.rs +++ b/crates/server/src/main.rs @@ -17,7 +17,7 @@ async fn main() { .unwrap_or_else(|_| "/var/guardian/keystore".to_string()) .into(); - let (storage_backend, metadata, auditor) = StorageMetadataBuilder::from_env() + let (storage_backend, metadata, auditor, coordination) = StorageMetadataBuilder::from_env() .build() .await .expect("Failed to initialize storage backends"); @@ -44,6 +44,7 @@ async fn main() { .storage(storage_backend) .metadata(metadata) .auditor(auditor) + .coordination(coordination) .ack(ack) .http(true, 3000) .grpc(true, 50051) diff --git a/crates/server/src/middleware/rate_limit.rs b/crates/server/src/middleware/rate_limit.rs index 3b87548a..22174c85 100644 --- 
a/crates/server/src/middleware/rate_limit.rs +++ b/crates/server/src/middleware/rate_limit.rs @@ -27,6 +27,10 @@ const DEFAULT_BURST_PER_SEC: u32 = 10; const DEFAULT_PER_MIN: u32 = 60; /// Environment variable for enabling or disabling rate limiting const ENV_RATE_LIMIT_ENABLED: &str = "GUARDIAN_RATE_LIMIT_ENABLED"; +/// Deployment's maximum replica capacity; the configured global limits are +/// divided by it so per-process enforcement keeps the fleet aggregate at or +/// below the global limit (issue #242). Drives rate limiting only. +const ENV_MAX_REPLICAS: &str = "GUARDIAN_MAX_REPLICAS"; /// Cleanup interval for stale entries const CLEANUP_INTERVAL_SECS: u64 = 60; @@ -45,15 +49,35 @@ impl RateLimitConfig { /// Load configuration from environment variables pub fn from_env() -> Self { let enabled = env_flag(ENV_RATE_LIMIT_ENABLED, true); - let burst_per_sec = env::var("GUARDIAN_RATE_BURST_PER_SEC") + let max_replicas = env::var(ENV_MAX_REPLICAS) .ok() - .and_then(|v| v.parse().ok()) - .unwrap_or(DEFAULT_BURST_PER_SEC); + .and_then(|v| v.parse::().ok()) + .unwrap_or(1); + let burst_per_sec = partition_limit( + env::var("GUARDIAN_RATE_BURST_PER_SEC") + .ok() + .and_then(|v| v.parse().ok()) + .unwrap_or(DEFAULT_BURST_PER_SEC), + max_replicas, + ); + let per_min = partition_limit( + env::var("GUARDIAN_RATE_PER_MIN") + .ok() + .and_then(|v| v.parse().ok()) + .unwrap_or(DEFAULT_PER_MIN), + max_replicas, + ); - let per_min = env::var("GUARDIAN_RATE_PER_MIN") - .ok() - .and_then(|v| v.parse().ok()) - .unwrap_or(DEFAULT_PER_MIN); + if enabled && (burst_per_sec == 0 || per_min == 0) { + tracing::warn!( + max_replicas, + burst_per_sec, + per_min, + "rate limit partitions to 0 per replica (global limit is below GUARDIAN_MAX_REPLICAS); \ + this replica will throttle all traffic. Raise the global rate limit or lower \ + GUARDIAN_MAX_REPLICAS." 
+ ); + } Self { enabled, @@ -82,6 +106,16 @@ impl Default for RateLimitConfig { } } +/// Per-replica share of a global limit: `global / max_replicas` (floor), with +/// `max_replicas` clamped to ≥ 1. The floor — not a round-up or a ≥1 clamp — +/// guarantees the fleet aggregate (`max_replicas × share`) never exceeds the +/// global limit (FR-009). A share of `0` means this replica denies all requests; +/// that only happens when the global limit is below the replica count (an +/// extreme misconfiguration), and it still never exceeds the global limit. +fn partition_limit(global_limit: u32, max_replicas: u32) -> u32 { + global_limit / max_replicas.max(1) +} + /// Parse a boolean env flag: unset → `default_value`; `0`/`false`/ /// `no`/`off` (case-insensitive) → false; anything else → true. /// Shared by env-driven configs (rate limiting, metrics). @@ -426,6 +460,10 @@ mod tests { use axum::http::header::HeaderValue; use std::net::{IpAddr, SocketAddr}; + /// Serializes the env-mutating `from_env` tests so they don't race the + /// shared process environment under the multi-threaded test runner. + static ENV_LOCK: std::sync::Mutex<()> = std::sync::Mutex::new(()); + fn request_with_peer_ip(peer_ip: IpAddr) -> Request { let mut req = Request::builder().uri("/test").body(Body::empty()).unwrap(); req.extensions_mut() @@ -433,6 +471,18 @@ mod tests { req } + #[test] + fn partition_divides_global_limit_by_max_replicas() { + assert_eq!(partition_limit(600, 6), 100); + assert_eq!(partition_limit(600, 1), 600); + assert_eq!(partition_limit(600, 0), 600, "zero replicas treated as one"); + // global < max_replicas: floor is 0 (deny) so the fleet aggregate + // (6 x 0 = 0) never exceeds the global limit (FR-009). + assert_eq!(partition_limit(5, 6), 0); + // 6 x 100 = 600 == global; never exceeds. 
+ assert!(partition_limit(600, 6) * 6 <= 600); + } + #[test] fn test_rate_limit_config_default() { let config = RateLimitConfig::default(); @@ -451,12 +501,13 @@ mod tests { #[test] fn test_rate_limit_config_from_env_defaults() { - // Clear any existing env vars - // SAFETY: This test runs single-threaded and these env vars are test-specific + let _guard = ENV_LOCK.lock().unwrap_or_else(|poison| poison.into_inner()); + // SAFETY: serialized by ENV_LOCK; vars are test-specific. unsafe { env::remove_var(ENV_RATE_LIMIT_ENABLED); env::remove_var("GUARDIAN_RATE_BURST_PER_SEC"); env::remove_var("GUARDIAN_RATE_PER_MIN"); + env::remove_var(ENV_MAX_REPLICAS); } let config = RateLimitConfig::from_env(); @@ -467,7 +518,8 @@ mod tests { #[test] fn test_rate_limit_config_from_env_disabled() { - // SAFETY: This test runs single-threaded and these env vars are test-specific + let _guard = ENV_LOCK.lock().unwrap_or_else(|poison| poison.into_inner()); + // SAFETY: serialized by ENV_LOCK; vars are test-specific. unsafe { env::set_var(ENV_RATE_LIMIT_ENABLED, "false"); } @@ -475,12 +527,35 @@ mod tests { let config = RateLimitConfig::from_env(); assert!(!config.enabled); - // SAFETY: This test runs single-threaded and these env vars are test-specific + // SAFETY: serialized by ENV_LOCK; vars are test-specific. unsafe { env::remove_var(ENV_RATE_LIMIT_ENABLED); } } + #[test] + fn from_env_partitions_limits_by_max_replicas() { + let _guard = ENV_LOCK.lock().unwrap_or_else(|poison| poison.into_inner()); + // SAFETY: serialized by ENV_LOCK; vars are test-specific. + unsafe { + env::set_var("GUARDIAN_RATE_BURST_PER_SEC", "600"); + env::set_var("GUARDIAN_RATE_PER_MIN", "6000"); + env::set_var(ENV_MAX_REPLICAS, "6"); + } + + let config = RateLimitConfig::from_env(); + + // SAFETY: serialized by ENV_LOCK; vars are test-specific. 
+ unsafe { + env::remove_var("GUARDIAN_RATE_BURST_PER_SEC"); + env::remove_var("GUARDIAN_RATE_PER_MIN"); + env::remove_var(ENV_MAX_REPLICAS); + } + + assert_eq!(config.burst_per_sec, 100); + assert_eq!(config.per_min, 1000); + } + #[test] fn test_rate_limit_store_allows_under_limit() { let config = RateLimitConfig::new(5, 10); @@ -623,7 +698,8 @@ mod tests { #[test] fn test_rate_limit_layer_from_env() { - // SAFETY: This test runs single-threaded and these env vars are test-specific + let _guard = ENV_LOCK.lock().unwrap_or_else(|poison| poison.into_inner()); + // SAFETY: serialized by ENV_LOCK; vars are test-specific. unsafe { env::remove_var(ENV_RATE_LIMIT_ENABLED); env::remove_var("GUARDIAN_RATE_BURST_PER_SEC"); diff --git a/crates/server/src/schema.rs b/crates/server/src/schema.rs index 0edce19b..07461052 100644 --- a/crates/server/src/schema.rs +++ b/crates/server/src/schema.rs @@ -108,6 +108,51 @@ diesel::table! { } } +diesel::table! { + /// Representation of the `auth_sessions` table. + /// + /// Shared operator/EVM session store for horizontal scaling (issue #242). + auth_sessions (realm, token_digest) { + realm -> Text, + token_digest -> Bytea, + subject -> Jsonb, + issued_at -> Timestamptz, + expires_at -> Timestamptz, + revoked_at -> Nullable, + } +} + +diesel::table! { + /// Representation of the `auth_challenges` table. + /// + /// Shared operator/EVM login-challenge store for horizontal scaling + /// (issue #242). Composite key `(realm, challenge_key)`. + auth_challenges (realm, challenge_key) { + realm -> Text, + challenge_key -> Text, + principal -> Text, + payload -> Jsonb, + issued_at -> Timestamptz, + expires_at -> Timestamptz, + consumed_at -> Nullable, + } +} + +diesel::table! { + /// Representation of the `worker_leases` table. + /// + /// Single-owner background-worker coordination for horizontal scaling + /// (issue #242). 
+ worker_leases (lease_name) { + lease_name -> Text, + holder_id -> Text, + acquired_at -> Timestamptz, + renewed_at -> Timestamptz, + expires_at -> Timestamptz, + fence_token -> Int8, + } +} + diesel::table! { /// Single-row store-level encryption marker. Its presence indicates the /// store is encrypted. diff --git a/crates/server/src/storage/postgres.rs b/crates/server/src/storage/postgres.rs index 7e1a28c6..f425278a 100644 --- a/crates/server/src/storage/postgres.rs +++ b/crates/server/src/storage/postgres.rs @@ -29,17 +29,63 @@ use url::Url; pub const MIGRATIONS: EmbeddedMigrations = embed_migrations!("migrations"); +const MIGRATION_ADVISORY_LOCK_KEY: i64 = 0x4755_4152_4449_414E; +const MIGRATION_LOCK_TIMEOUT: std::time::Duration = std::time::Duration::from_secs(60); +const MIGRATION_LOCK_POLL: std::time::Duration = std::time::Duration::from_millis(500); + +#[derive(diesel::QueryableByName)] +struct AdvisoryLockAcquired { + #[diesel(sql_type = diesel::sql_types::Bool)] + acquired: bool, +} + /// Run database migrations. Call once at application startup. +/// +/// Migrations run under a session advisory lock so that replicas booting +/// simultaneously serialize: the first holder migrates and the rest block, +/// then find nothing pending. The lock is released explicitly and, as a +/// backstop, on connection drop. 
pub async fn run_migrations(database_url: &str) -> Result<(), String> { let url = database_url.to_string(); tokio::task::spawn_blocking(move || { let mut conn = PgConnection::establish(&url) .map_err(|e| format!("Failed to connect for migrations: {e}"))?; - conn.run_pending_migrations(MIGRATIONS) - .map_err(|e| format!("Failed to run migrations: {e}"))?; + let deadline = std::time::Instant::now() + MIGRATION_LOCK_TIMEOUT; + loop { + let attempt = diesel::RunQueryDsl::get_result::( + diesel::sql_query(format!( + "SELECT pg_try_advisory_lock({MIGRATION_ADVISORY_LOCK_KEY}) AS acquired" + )), + &mut conn, + ) + .map_err(|e| format!("Failed to attempt migration advisory lock: {e}"))?; + if attempt.acquired { + break; + } + if std::time::Instant::now() >= deadline { + return Err(format!( + "Timed out after {}s waiting for the migration advisory lock; \ + another replica may be stuck mid-migration", + MIGRATION_LOCK_TIMEOUT.as_secs() + )); + } + std::thread::sleep(MIGRATION_LOCK_POLL); + } + + let result = conn + .run_pending_migrations(MIGRATIONS) + .map(|_| ()) + .map_err(|e| format!("Failed to run migrations: {e}")); + + let _ = diesel::RunQueryDsl::execute( + diesel::sql_query(format!( + "SELECT pg_advisory_unlock({MIGRATION_ADVISORY_LOCK_KEY})" + )), + &mut conn, + ); - Ok::<(), String>(()) + result }) .await .map_err(|e| format!("Migration task failed: {e}"))??; diff --git a/docs/CONFIGURATION.md b/docs/CONFIGURATION.md index 8acfcd26..6d2cd4cc 100644 --- a/docs/CONFIGURATION.md +++ b/docs/CONFIGURATION.md @@ -176,6 +176,7 @@ multi-stack deployments get scoped IDs. | `GUARDIAN_RATE_LIMIT_ENABLED` | `true` | Master kill-switch for HTTP rate limiting. Set `false` only in test environments. | | `GUARDIAN_RATE_BURST_PER_SEC` | `10` (code default); `200` set by the prod Terraform profile | Token-bucket burst. | | `GUARDIAN_RATE_PER_MIN` | `60` (code default); `5000` set by the prod Terraform profile | Sustained rate. 
| +| `GUARDIAN_MAX_REPLICAS` | `1` (code default); autoscaling **max** capacity set by the prod Terraform profile | Per-replica rate-limit divisor: each replica enforces `global / GUARDIAN_MAX_REPLICAS` so the fleet aggregate stays at or below the global limit. Use the autoscaling **max**, not the running count. Drives rate-limiting only — coordination mode is backend-derived. Running below max over-throttles (accepted); an override is clamped up to the autoscaling max by Terraform. See [`runbooks/horizontal-scaling.md`](./runbooks/horizontal-scaling.md). | | `GUARDIAN_MAX_REQUEST_BYTES` | `1048576` (1 MB) | Reject request bodies larger than this. | | `GUARDIAN_MAX_PENDING_PROPOSALS_PER_ACCOUNT` | `20` | Account-level cap; hitting it returns `pending_proposals_limit`. | | `GUARDIAN_CORS_ALLOWED_ORIGINS` | _unset_ | Comma-separated explicit origins. **Unset → permissive `Any` origin / `Any` methods / `Any` headers, credentials disabled** (suitable for local dev). **Set → strict allowlist with `allow_credentials(true)`** (required for production browser clients). | @@ -206,11 +207,29 @@ one-command Grafana dashboard stack. |---|---|---| | `GUARDIAN_OPERATOR_PUBLIC_KEYS_SECRET_ID` | _unset_ | AWS Secrets Manager secret name/ARN holding the operator allowlist JSON. Hot-reloaded on every challenge and authenticated `/dashboard/*` request. | | `GUARDIAN_OPERATOR_PUBLIC_KEYS_FILE` | _unset_ | Local JSON path for the same payload. Local dev only. | -| `GUARDIAN_DASHBOARD_CURSOR_SECRET` | random per process | 32-byte hex HMAC key for dashboard pagination cursors. Pin a shared value when running ≥2 ECS tasks so cursors validate across replicas. | +| `GUARDIAN_DASHBOARD_CURSOR_SECRET` | random per process if unset | 32-byte hex HMAC key for dashboard pagination cursors. Pin a shared value across replicas so cursors validate everywhere. 
If unset the server **warns** and generates an ephemeral per-process key and still boots (in every stage); an ephemeral key only breaks dashboard pagination across replicas — nothing else, so it is not a startup guard. | `GET /dashboard/info.environment` is derived from `GUARDIAN_NETWORK_TYPE` (`testnet`, `devnet`, or `local`) rather than configured separately. +### Prod-stage startup guards & HA behavior + +When `GUARDIAN_ENV=prod`, the server fails fast on misconfigurations that are +silently broken across replicas: + +- the **filesystem** storage backend is refused (single-instance only — use the + Postgres image with `DATABASE_URL`); +- a rate limit that partitions to **0 requests per replica** is refused — i.e. + the global `GUARDIAN_RATE_BURST_PER_SEC`/`GUARDIAN_RATE_PER_MIN` is below + `GUARDIAN_MAX_REPLICAS`, which would make every replica throttle all traffic. + Raise the global limit or lower `GUARDIAN_MAX_REPLICAS`. (Non-prod only warns.) + +On the Postgres backend, operator/EVM sessions, login challenges, and the +canonicalization lease are shared across replicas (backend-derived — no tunable +disables this). If the database is briefly unavailable, authentication **fails +closed** (rejected, never bypassed) and recovers automatically. See the +[horizontal-scaling runbook](./runbooks/horizontal-scaling.md). + Allowlist payload shapes and enrollment flow: [`docs/DASHBOARD.md`](./DASHBOARD.md). 
@@ -305,5 +324,5 @@ this saves you from grepping: | EVM support locally | `GUARDIAN_EVM_RPC_URLS` (allowed chain set derives from its keys) + build with `--features evm` | | Use Secrets Manager for ACK keys | `GUARDIAN_ENV=prod` + `AWS_REGION=` + secrets pre-created | | Run the dashboard locally | `GUARDIAN_OPERATOR_PUBLIC_KEYS_FILE=/path/to/allowlist.json` | -| Multi-replica dashboard | `GUARDIAN_DASHBOARD_CURSOR_SECRET=<32-byte hex>` pinned across tasks | +| Multi-replica (HA) | Postgres backend + `GUARDIAN_DASHBOARD_CURSOR_SECRET=<64 hex>` pinned across tasks + `GUARDIAN_MAX_REPLICAS=<autoscaling max>` (the backend and `GUARDIAN_MAX_REPLICAS` are set by the prod Terraform profile; the cursor secret is not wired by Terraform today and must be set separately) | | Higher throughput in prod | `GUARDIAN_RATE_BURST_PER_SEC`, `GUARDIAN_RATE_PER_MIN`, `GUARDIAN_DB_POOL_MAX_SIZE` | diff --git a/docs/SERVER_AWS_DEPLOY.md b/docs/SERVER_AWS_DEPLOY.md index 19289227..cc3f280c 100644 --- a/docs/SERVER_AWS_DEPLOY.md +++ b/docs/SERVER_AWS_DEPLOY.md @@ -547,6 +547,22 @@ aws ecr delete-repository --repository-name guardian-server --force --region us- - RDS Proxy between ECS and RDS - higher Guardian runtime rate-limit and DB-pool defaults for benchmark traffic +#### Horizontal scaling (multiple replicas) + +The prod profile runs 2–6 tasks behind the ALB. Because it sets `GUARDIAN_ENV=prod` +and the Postgres backend, the server runs **shared coordination** (sessions, +login challenges, and the canonicalization lease live in Postgres) — so any +request lands on any replica and canonicalization runs on exactly one replica at +a time. Terraform also sets `GUARDIAN_MAX_REPLICAS` from +`effective_guardian_max_replicas` (derived from the autoscaling max capacity, +prod `max(desired, 6)`) so rate limits are partitioned across the fleet. Set +`GUARDIAN_DASHBOARD_CURSOR_SECRET` to a stable shared value so dashboard +pagination works across replicas — leaving it unset only degrades pagination (a +startup warning, not a failure) and it is not wired by Terraform today.
+Watch the per-replica `GUARDIAN_DB_POOL_MAX_SIZE` against Postgres +`max_connections` (RDS Proxy absorbs most of this). Full operator guidance: +[`runbooks/horizontal-scaling.md`](./runbooks/horizontal-scaling.md). + ## HTTPS And gRPC HTTPS is enabled when `acm_certificate_arn` is set. DNS can be managed through Cloudflare, Route 53, or both depending on which variables are provided. diff --git a/docs/guides/README.md b/docs/guides/README.md index 4968bd02..b66d100f 100644 --- a/docs/guides/README.md +++ b/docs/guides/README.md @@ -24,6 +24,7 @@ storage, or network), not after Compose. | [AWS-managed ACK signers](./aws-signers/README.md) | Self-hosted Compose: Postgres + Secrets Manager (Falcon) + KMS (ECDSA) | | [Miden Dashboard UI](./miden-dashboard/README.md) | Self-hosted Compose: Postgres + Guardian server + the Miden Dashboard operator UI | | [Observability](./observability/README.md) | Local Compose: server + Prometheus + pre-provisioned Grafana dashboard | +| [Horizontal scaling](./horizontal-scaling/README.md) | Local Compose: two replicas + round-robin proxy + shared Postgres (sessions, lease failover, fail-closed auth) | ## Adding a guide diff --git a/docs/guides/horizontal-scaling/.env.example b/docs/guides/horizontal-scaling/.env.example new file mode 100644 index 00000000..0139d38f --- /dev/null +++ b/docs/guides/horizontal-scaling/.env.example @@ -0,0 +1,19 @@ +# Copy to .env and fill in. See ../../CONFIGURATION.md for the authoritative +# meaning of each variable. + +# Published image tag to run (e.g. a release tag, or `latest`). +GUARDIAN_VERSION=latest + +# Postgres password for the shared state/coordination database. +POSTGRES_PASSWORD=guardian + +# 64-hex (32-byte) key signing dashboard pagination cursors. MUST be identical +# on every replica. Generate with: openssl rand -hex 32 +GUARDIAN_DASHBOARD_CURSOR_SECRET= + +# Replica capacity the rate limiter divides global limits by. 
Match the number +# of `server-*` services in docker-compose.yml (2 here). +GUARDIAN_MAX_REPLICAS=2 + +# Miden network the replicas target. +GUARDIAN_NETWORK_TYPE=MidenDevnet diff --git a/docs/guides/horizontal-scaling/Caddyfile b/docs/guides/horizontal-scaling/Caddyfile new file mode 100644 index 00000000..c367a151 --- /dev/null +++ b/docs/guides/horizontal-scaling/Caddyfile @@ -0,0 +1,33 @@ +# Round-robin reverse proxy across the two Guardian replicas — the stand-in for +# the ALB in front of the prod ECS tasks. Caddy's reverse_proxy automatically +# sets X-Forwarded-For, which the Guardian server reads to key rate limiting by +# the real client IP rather than the proxy's address. +:8080 { + reverse_proxy server-a:3000 server-b:3000 { + lb_policy round_robin + + # Like an ALB target group: actively poll each replica and pull an + # unhealthy one out of rotation, re-adding it when it recovers. Without + # this, round-robin keeps sending half the traffic to a dead replica + # (connection refused -> 502). + # + # Health probes hit `/`, which passes through the server's per-IP rate + # limiter keyed on this proxy's address. Keep the interval comfortably + # slower than the per-replica refill (global / GUARDIAN_MAX_REPLICAS, so + # ~0.5 req/s here) so a probe is never itself rate-limited (429) and + # flaps a healthy replica down. + health_uri / + health_interval 5s + health_timeout 2s + + # Belt-and-suspenders: if a request races a replica that died between + # health polls, retry the other replica within this window instead of + # returning 502 to the client. + lb_try_duration 5s + lb_try_interval 250ms + + # Passively mark an upstream down on a failed dial too. 
+ fail_duration 10s + max_fails 1 + } +} diff --git a/docs/guides/horizontal-scaling/README.md b/docs/guides/horizontal-scaling/README.md new file mode 100644 index 00000000..f09c6d35 --- /dev/null +++ b/docs/guides/horizontal-scaling/README.md @@ -0,0 +1,249 @@ +# Horizontal scaling: two replicas behind a proxy + +Run two Guardian replicas behind a round-robin proxy, sharing one Postgres, and +watch the coordination layer (issue #242) work end to end on your laptop. This +mirrors the prod topology — 2–6 ECS tasks behind a load balancer — in miniature. + +```text + ┌─────────────┐ + client ──▶ :8080 ──▶ │ proxy/Caddy │ ──round-robin──┬──▶ server-a :3000 + └─────────────┘ └──▶ server-b :3010 + │ │ + └────┬────┘ + ▼ + postgres :5432 + (sessions · challenges · worker lease) +``` + +Everything that must be shared for a correct multi-replica deployment lives in +Postgres, so a session minted on one replica is honored on the other, and only +one replica ever canonicalizes. The variable meanings live in +[`../../CONFIGURATION.md`](../../CONFIGURATION.md); the operational contract is in +[`../../runbooks/horizontal-scaling.md`](../../runbooks/horizontal-scaling.md). + +## Prerequisites + +- Docker with Compose v2 (`docker compose`). +- That's it — no AWS. ACK signing keys are generated locally per replica, which + is fine here because this guide exercises operator/EVM auth and coordination, + not the multisig co-signing flow (which is the only thing that needs one + shared ACK key — see [From this demo to production](#from-this-demo-to-production)). 
+ +## Configure and run + +```sh +cp .env.example .env +# Set a real cursor secret — it MUST be identical on every replica: +# openssl rand -hex 32 → paste into GUARDIAN_DASHBOARD_CURSOR_SECRET +cp operators.example.json operators.json # empty allowlist `[]`; add a key for the login walkthrough (step 7) +``` + +> **Building an unreleased version?** The published `latest` image does not yet +> contain these coordination changes (issue #242). Drop in the local-build +> override so the replicas build from the repo-root `Dockerfile` instead of +> pulling from the registry — Compose auto-merges it, so no extra flags are +> needed on any command: +> +> ```sh +> cp docker-compose.override.yml.example docker-compose.override.yml +> ``` +> +> Once the change ships in a published image, delete +> `docker-compose.override.yml` to go back to the registry image. + +```sh +docker compose up -d --build +``` + +The proxy is at `http://localhost:8080`. Each replica is also exposed directly — +`server-a` on `:3000`, `server-b` on `:3010` — so you can target a specific +replica during the walkthrough. Postgres is on `:5432`. + +## What is shared (and why) + +| Shared in Postgres | Table | Effect across replicas | +|---|---|---| +| Operator/EVM sessions | `auth_sessions` | Log in on A, your cookie works on B; logout is honored fleet-wide. | +| Login challenges | `auth_challenges` | A challenge is single-use even if issued on A and verified on B. | +| Canonicalization lease | `worker_leases` | Exactly one replica promotes candidates; the others stand by. | + +Coordination is **backend-derived**: it is on because the backend is Postgres. +No environment variable enables or disables it. + +## Validation walkthrough + +### 1. Both replicas report shared coordination + +```sh +docker compose logs server-a server-b | grep -i "coordination mode" +``` + +Each replica prints one line; both must read `mode=shared backend=postgres`.
If +you ever see `mode=single-process backend=filesystem`, that replica is **not** +safe to run alongside others. + +### 2. Exactly one canonicalization lease holder + +```sh +docker compose exec postgres \ + psql -U guardian -d guardian \ + -c "select lease_name, holder_id, fence_token from worker_leases;" +``` + +You get a single `canonicalization` row with one `holder_id` (formatted +`{pid}-{random}`) — never two. Both replicas run the worker loop, but only the +lease holder does work; the other keeps trying to acquire and backs off. + +### 3. Lease failover with a fencing-token bump + +Stop the current holder and watch a different replica take over within the lease +TTL (~30s, i.e. 3× the 10s canonicalization interval): + +```sh +docker compose stop server-a # if A wasn't the holder, stop server-b instead +watch -n2 'docker compose exec -T postgres \ + psql -U guardian -d guardian \ + -c "select holder_id, fence_token, expires_at from worker_leases;"' +``` + +`holder_id` changes to the surviving replica and `fence_token` **increments** — +the increment is the steal signal a superseded holder uses to fence itself off +at its next write. Bring the replica back with `docker compose start server-a`; +the lease does not bounce back (the current holder keeps renewing). + +### 4. Proxy request failover + +The lease failover above is server-side; the proxy also has to stop routing +*client* requests to a dead replica. That is what the `health_uri` / `lb_*` +directives in the [`Caddyfile`](./Caddyfile) do — a bare `round_robin` (no health +checks) keeps sending half the traffic to the dead replica and returns `502`. +Kill a replica and hit the proxy: + +```sh +docker compose stop server-b +for i in $(seq 1 4); do curl -s -o /dev/null -w "%{http_code} " \ + http://localhost:8080/pubkey; done; echo +``` + +Every response stays `200` — Caddy health-checks each replica and routes only to +the survivor. 
Bring it back with `docker compose start server-b`; Caddy re-adds +it within one health interval (~5s). (Strip the health directives from the +`Caddyfile` and the same loop returns alternating `502`s.) + +### 5. Auth fails closed when the shared store is down + +Pause Postgres and watch the holder step down rather than barrel ahead: + +```sh +docker compose pause postgres +docker compose logs -f server-a server-b # Ctrl-C after a few seconds +``` + +You will see lease renew/acquire failures and storage errors — the worker +**cancels its pass** instead of canonicalizing blind. If you have completed the +login walkthrough below, an authenticated request fails rather than silently +succeeding: authentication is **fail-closed**. + +> `docker compose pause` freezes Postgres mid-connection (SIGSTOP), so an +> in-flight request *hangs until it times out* rather than getting a prompt +> `5xx`. Either way it never succeeds. To see a fast `5xx` instead (socket +> closed → connection refused), use `docker compose stop postgres` and +> `docker compose start postgres` to recover. + +Recover with: + +```sh +docker compose unpause postgres +``` + +Coordination resumes automatically; no manual intervention. + +### 6. Rate-limit partitioning and `X-Forwarded-For` + +Each replica enforces `global / GUARDIAN_MAX_REPLICAS`. With the default global +burst of 10 and `GUARDIAN_MAX_REPLICAS=2`, a single replica caps at ~5 req/s. +Hammer one replica directly (the challenge endpoint is unauthenticated and +rate-limited): + +```sh +for i in $(seq 1 12); do + curl -s -o /dev/null -w "%{http_code} " \ + "http://localhost:3000/auth/challenge?commitment=0xdemo" +done; echo +``` + +After the per-replica burst is spent you see `429`s. Through the proxy +(`:8080`), Caddy sets `X-Forwarded-For`, so the server keys the limit on your +real client IP rather than the proxy address — confirm by repeating the loop +against `http://localhost:8080/...` and seeing the same per-IP behavior. + +### 7. 
(End-to-end) An operator session survives losing its replica + +This is the headline, and it needs a real operator key to sign the challenge. +Use the [`examples/operator-smoke-web`](../../../examples/operator-smoke-web) +harness (or the operator client) pointed at the **proxy** URL +`http://localhost:8080`: + +1. Generate a Falcon operator key with the harness and add its public key to + `operators.json` (replacing the empty `[]`); the allowlist hot-reloads, so no + restart is needed: + + ```json + [{ "public_key": "0x<hex-encoded operator public key>", "permissions": ["dashboard:read"] }] + ``` +2. Complete the login (`GET /auth/challenge` → sign → `POST + /auth/verify`). The proxy round-robins, so this may land on either + replica; the session row is written to `auth_sessions`. +3. Make an authenticated request (e.g. `GET /dashboard/accounts`) a few times — + each may be served by a different replica, and all succeed: the cookie is + validated against the shared store, not per-process memory. +4. Now `docker compose stop` the replica that handled your login and repeat — + **your session still works** on the survivor. Then `POST + /auth/logout`; the revocation is honored on every replica. + +## Cleanup + +```sh +docker compose down -v # -v also drops the Postgres + keystore volumes +``` + +## From this demo to production + +This guide stays AWS-free to be runnable; a real prod deployment differs in two +ways that do not change the coordination behavior shown above: + +- **`GUARDIAN_ENV=prod`** activates the prod-stage startup guards — a filesystem + storage backend and a rate limit that partitions to 0 req/replica are each + refused at startup. (An unset `GUARDIAN_DASHBOARD_CURSOR_SECRET` only *warns* — + it degrades cross-replica dashboard pagination, not custody, so a + single-replica prod server still boots.)
Note these guards live behind the ACK + registry init, which in prod requires AWS first: set `GUARDIAN_ENV=prod` + without `AWS_REGION` and the server refuses to start with `AWS_REGION is + required when GUARDIAN_ENV=prod` before it ever reaches the storage or + rate-limit checks — so observing those two specifically needs AWS configured. +- **One shared ACK signing key.** Each replica here auto-generates its own + guardian ACK key into its local keystore — and in non-prod it does so on + *every* startup, so the identity is not even stable across a single replica's + restart. That is fine for auth + coordination, which is all this guide + exercises. It is **not** enough for the multisig co-signing flow: every replica + must present the *same* guardian identity, because each account pins the + guardian's `/pubkey` commitment into its `openzeppelin::guardian::public_key` + slot at configure time. Route a multisig flow through the round-robin proxy and + the replica that did not configure the account rejects it with + `invalid GUARDIAN public key binding`. So prod pins one ACK key via AWS Secrets + Manager — see the [aws-signers guide](../aws-signers/README.md). Per-account + state already lives in Postgres and needs nothing extra. + +> **Smoke-testing multisig against this demo?** Until a stable non-AWS identity +> lands ([issue #289](https://github.com/OpenZeppelin/guardian/issues/289) — a +> local file/env signer key, so every replica can share one identity without +> AWS), point your client at a **single replica directly** +> (`http://localhost:3000`), never the proxy (`:8080`). Also make the client's +> Miden RPC network match the server's `GUARDIAN_NETWORK_TYPE` (e.g. devnet RPC +> ↔ `MidenDevnet`), or canonicalization will loop on an `on_chain=0x00…0` +> commitment because the account was deployed to a different network than the +> guardian verifies against. 
+ +The managed path (published Postgres image + the prod Terraform profile) sets +all of this for you; see [`../../SERVER_AWS_DEPLOY.md`](../../SERVER_AWS_DEPLOY.md) +and the [horizontal-scaling runbook](../../runbooks/horizontal-scaling.md). diff --git a/docs/guides/horizontal-scaling/docker-compose.override.yml.example b/docs/guides/horizontal-scaling/docker-compose.override.yml.example new file mode 100644 index 00000000..ce405ae3 --- /dev/null +++ b/docs/guides/horizontal-scaling/docker-compose.override.yml.example @@ -0,0 +1,28 @@ +# Local-build override for testing UNRELEASED changes — e.g. a feature branch the +# published `ghcr.io/openzeppelin/guardian` image does not yet contain. Builds +# both replicas from the repo-root Dockerfile (the `postgres` feature, like the +# published image) instead of pulling from the registry. +# +# Copy it into place; Compose then auto-merges it onto docker-compose.yml with no +# -f flags to remember (so every command in README.md works unchanged): +# cp docker-compose.override.yml.example docker-compose.override.yml +# docker compose up -d --build +# +# Once the changes ship in a published image, delete docker-compose.override.yml +# and `docker compose up -d` uses the registry image again. + +x-local-build: &local-build + image: guardian-server:local + pull_policy: build + build: + context: ../../.. + dockerfile: Dockerfile + target: server-runner + args: + GUARDIAN_SERVER_FEATURES: postgres + +services: + server-a: + <<: *local-build + server-b: + <<: *local-build diff --git a/docs/guides/horizontal-scaling/docker-compose.yml b/docs/guides/horizontal-scaling/docker-compose.yml new file mode 100644 index 00000000..b08e32fc --- /dev/null +++ b/docs/guides/horizontal-scaling/docker-compose.yml @@ -0,0 +1,89 @@ +# Horizontal scaling: two Guardian replicas behind a round-robin proxy, sharing +# one Postgres. 
Demonstrates the coordination layer (issue #242) end to end and +# locally: shared operator/EVM sessions and login challenges, the single-owner +# canonicalization lease with failover, per-replica rate-limit partitioning, and +# fail-closed auth when the shared store is unavailable. +# +# Run from this directory: +# cp .env.example .env # then fill in the values +# cp operators.example.json operators.json # for the operator-login walkthrough +# docker compose up +# +# Compose auto-loads .env for both ${VAR} interpolation and each server's runtime +# config. Walkthrough: ./README.md Full config reference: ../../CONFIGURATION.md +# +# Coordination is backend-derived: because both replicas use the Postgres backend +# they share sessions/challenges/lease automatically — no flag turns it on. ACK +# signing keys are auto-generated per replica here, which is fine for this demo +# (it exercises auth + coordination, not multisig co-signing); a real prod +# deployment pins one shared ACK key via Secrets Manager (GUARDIAN_ENV=prod, see +# the aws-signers guide). + +x-guardian-server: &guardian-server + image: ghcr.io/openzeppelin/guardian:${GUARDIAN_VERSION:-latest} + pull_policy: always + depends_on: + postgres: + condition: service_healthy + environment: + RUST_LOG: info + GUARDIAN_NETWORK_TYPE: ${GUARDIAN_NETWORK_TYPE:-MidenDevnet} + DATABASE_URL: postgres://guardian:${POSTGRES_PASSWORD:?set POSTGRES_PASSWORD in .env}@postgres:5432/guardian + GUARDIAN_KEYSTORE_PATH: /var/guardian/keystore + GUARDIAN_OPERATOR_PUBLIC_KEYS_FILE: /etc/guardian/operators.json + # Pinned and identical on every replica: pagination cursors are signed with + # this key, so a cursor minted on one replica must validate on another. 
+ GUARDIAN_DASHBOARD_CURSOR_SECRET: ${GUARDIAN_DASHBOARD_CURSOR_SECRET:?set GUARDIAN_DASHBOARD_CURSOR_SECRET in .env (openssl rand -hex 32)} + # Rate-limit partition divisor: each replica enforces global/GUARDIAN_MAX_REPLICAS + # so the fleet aggregate stays at or below the global limit. + GUARDIAN_MAX_REPLICAS: ${GUARDIAN_MAX_REPLICAS:-2} + +services: + postgres: + image: postgres:16-alpine + volumes: + - guardian-postgres:/var/lib/postgresql/data + environment: + POSTGRES_USER: guardian + POSTGRES_PASSWORD: ${POSTGRES_PASSWORD:?set POSTGRES_PASSWORD in .env} + POSTGRES_DB: guardian + ports: + - "5432:5432" # exposed so you can inspect worker_leases / auth_sessions with psql + healthcheck: + test: ["CMD-SHELL", "pg_isready -U guardian -d guardian"] + interval: 5s + timeout: 5s + retries: 5 + + server-a: + <<: *guardian-server + hostname: server-a + volumes: + - guardian-keystore-a:/var/guardian/keystore + - ./operators.json:/etc/guardian/operators.json:ro + ports: + - "3000:3000" # target replica A directly (bypassing the proxy) + + server-b: + <<: *guardian-server + hostname: server-b + volumes: + - guardian-keystore-b:/var/guardian/keystore + - ./operators.json:/etc/guardian/operators.json:ro + ports: + - "3010:3000" # target replica B directly (bypassing the proxy) + + proxy: + image: caddy:2-alpine + depends_on: + - server-a + - server-b + volumes: + - ./Caddyfile:/etc/caddy/Caddyfile:ro + ports: + - "8080:8080" # round-robin entrypoint; this is what a client/LB would hit + +volumes: + guardian-postgres: + guardian-keystore-a: + guardian-keystore-b: diff --git a/docs/guides/horizontal-scaling/operators.example.json b/docs/guides/horizontal-scaling/operators.example.json new file mode 100644 index 00000000..fe51488c --- /dev/null +++ b/docs/guides/horizontal-scaling/operators.example.json @@ -0,0 +1 @@ +[] diff --git a/docs/openapi-dashboard.json b/docs/openapi-dashboard.json index ca4d5bf0..0a96804d 100644 --- a/docs/openapi-dashboard.json +++ 
b/docs/openapi-dashboard.json @@ -108,6 +108,16 @@ } } } + }, + "500": { + "description": "Session revocation failed", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/ApiErrorResponse" + } + } + } } }, "security": [ diff --git a/docs/openapi-evm.json b/docs/openapi-evm.json index 98ff4d3e..fdb377dd 100644 --- a/docs/openapi-evm.json +++ b/docs/openapi-evm.json @@ -194,6 +194,16 @@ } } } + }, + "500": { + "description": "Session revocation failed", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/ApiErrorResponse" + } + } + } } }, "security": [ diff --git a/docs/openapi.json b/docs/openapi.json index b5b1b04d..b3874347 100644 --- a/docs/openapi.json +++ b/docs/openapi.json @@ -108,6 +108,16 @@ } } } + }, + "500": { + "description": "Session revocation failed", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/ApiErrorResponse" + } + } + } } }, "security": [ @@ -2190,6 +2200,16 @@ } } } + }, + "500": { + "description": "Session revocation failed", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/ApiErrorResponse" + } + } + } } }, "security": [ diff --git a/docs/runbooks/horizontal-scaling.md b/docs/runbooks/horizontal-scaling.md new file mode 100644 index 00000000..8d793c61 --- /dev/null +++ b/docs/runbooks/horizontal-scaling.md @@ -0,0 +1,101 @@ +# Runbook: Horizontal Scaling (multiple Guardian replicas) + +Guardian runs as 2–6 ECS tasks behind a round-robin load balancer in the prod +profile. This runbook covers what an operator must configure for a correct +high-availability (HA) deployment and how the server behaves across replicas. +Tracking: issue #242. + +## TL;DR — required for a correct multi-replica deployment + +| Setting | Why it matters across replicas | +|---|---| +| **Postgres backend** (`DATABASE_URL`) | Sessions, login challenges, and the canonicalization lease live in Postgres so they are shared. 
The filesystem backend is **dev-only** and is refused at startup in the prod stage. | +| **`GUARDIAN_DASHBOARD_CURSOR_SECRET`** (64 hex chars) | Pagination cursors are signed with this key. If it differs per replica, a cursor minted on one replica fails on another. Unset → the server **warns** and generates an ephemeral per-process secret (boots fine in every stage); pin a shared value so multi-replica dashboard pagination works. Degrades only pagination, not custody. | +| **`GUARDIAN_ENV=prod`** | Activates the prod-stage startup guards (filesystem-backend refusal, 0-req/replica rate-limit refusal). Set by Terraform from `var.deployment_stage`. | +| **`GUARDIAN_MAX_REPLICAS`** | Rate-limit partitioning divisor (see below). Defaults from the autoscaling max capacity via Terraform. | + +With the published Postgres image + the prod Terraform profile, all of these are +set for you. The rest of this doc is for understanding and for non-default +deployments. + +## Coordination is backend-derived (not a tunable) + +The coordination mode is determined by the **storage backend alone**: + +- **Postgres backend → shared coordination** (replica-safe). Always. No tunable + can turn this off — a missing or wrong env var can never silently revert a + Postgres deployment to per-process state. +- **Filesystem backend → in-memory, single-process** coordination (dev only). + +The startup log emits one line reflecting the resolved state, e.g.: + +```text +coordination mode=shared backend=postgres stage=prod max_replicas=6 cursor_secret=configured +``` + +If you ever see `mode=single-process backend=filesystem` on a deployment you +believe is multi-replica, that deployment is **not** safe to run with more than +one task. + +If the server is auto-built with the Postgres backend but coordination handles +were not wired (only possible via a manual/embedded builder), it **fails to +start** rather than falling back to per-process state. 
+ +## Behavior across replicas + +- **Operator & EVM login**: a challenge issued on one replica verifies on any + other; an established session is honored everywhere; logout and expiry are + effective fleet-wide. +- **Canonicalization** runs on exactly one replica at a time via a Postgres + lease (`worker_leases`). Leadership transfers automatically to another replica + within one lease TTL (≈ 3× the canonicalization check interval) if the holder + crashes. A superseded holder cannot commit a canonical write (a fencing token + is checked before every state/delta write). +- **Rate limiting** is per-process but partitioned (see below). + +## Failure modes (by design) + +- **Shared store (Postgres) briefly unavailable → auth fails closed.** Login and + authenticated requests are rejected (never bypassed) until Postgres returns. + The canonicalization leader steps down rather than risk double-processing, and + resumes automatically. This is a deliberate change from the old + always-available in-memory behavior. +- **DB connection budget**: each replica opens up to `GUARDIAN_DB_POOL_MAX_SIZE` + (default 32 in prod) connections, plus the metadata pool, plus per-request + session lookups. With N replicas the total can approach + `N × (pools × size)`; keep it under Postgres `max_connections`. Prod routes + through RDS Proxy by default, which pools server-side and absorbs much of this. + +## `GUARDIAN_MAX_REPLICAS` and rate limiting + +The configured global limits (`GUARDIAN_RATE_BURST_PER_SEC`, +`GUARDIAN_RATE_PER_MIN`) are divided by `GUARDIAN_MAX_REPLICAS` so each replica +enforces `global / GUARDIAN_MAX_REPLICAS`. With round-robin distribution the +fleet aggregate stays at or below the global limit. + +- Default = the deployment's **autoscaling max capacity** (Terraform). It must be + the *max*, not the count you happen to run now — partitioning by max is + conservative. +- **Drives rate-limiting only.** It has no effect on coordination mode. 
+- **Tolerance band**: when fewer than max replicas are running, the fleet + over-throttles (stricter than the global limit) — accepted. HTTP keep-alive can + also pin a client to one replica, throttling it at `global / max` (e.g. 1/6) — + also accepted; it is fail-closed (never too loose). +- **Override** (`var.guardian_max_replicas`): an explicit value is clamped **up** + to the autoscaling max, so it can never drop below real capacity (which would + let the aggregate exceed the global limit). Setting it higher only + over-throttles. + +## Validate the coordination behavior locally + +To see this contract in action before deploying — shared sessions, single-owner +lease with failover, fail-closed auth, rate-limit partitioning — run the +[horizontal-scaling guide](../guides/horizontal-scaling/README.md): two replicas +behind a round-robin proxy sharing one Postgres, all on Docker Compose. + +## Filesystem backend is dev-only + +The filesystem backend keeps state local to one task (and does not persist audit +events). In the prod stage the server **refuses to start** on the filesystem +backend with an actionable error. Use it only for local development / single +process. diff --git a/examples/operator-smoke-web/package-lock.json b/examples/operator-smoke-web/package-lock.json index 9065d832..40d3bb21 100644 --- a/examples/operator-smoke-web/package-lock.json +++ b/examples/operator-smoke-web/package-lock.json @@ -27,7 +27,7 @@ }, "../../packages/guardian-operator-client": { "name": "@openzeppelin/guardian-operator-client", - "version": "0.14.7", + "version": "0.15.0", "license": "MIT", "devDependencies": { "typescript": "^5.7.2", diff --git a/infra/data.tf b/infra/data.tf index bb7963e2..aa00dc45 100644 --- a/infra/data.tf +++ b/infra/data.tf @@ -78,56 +78,60 @@ locals { for subnet_id in local.effective_rds_proxy_subnet_ids : data.aws_subnet.rds_proxy_candidate[subnet_id].availability_zone_id ]) - cluster_name = var.cluster_name != "" ? 
var.cluster_name : "${var.stack_name}-cluster" - server_service_name = var.server_service_name != "" ? var.server_service_name : "${var.stack_name}-server" - alb_name = var.alb_name != "" ? var.alb_name : "${var.stack_name}-alb" - target_group_name = var.target_group_name != "" ? var.target_group_name : "${var.stack_name}-server-tg" - grpc_target_group_name = "${var.stack_name}-grpc-tg" - alb_security_group_name = var.alb_security_group_name != "" ? var.alb_security_group_name : "${var.stack_name}-alb-sg" - server_security_group_name = var.server_security_group_name != "" ? var.server_security_group_name : "${var.stack_name}-server-sg" - postgres_security_group_name = var.postgres_security_group_name != "" ? var.postgres_security_group_name : "${var.stack_name}-postgres-sg" - task_execution_role_name = var.task_execution_role_name != "" ? var.task_execution_role_name : "${var.stack_name}-ecs-task-execution" - task_role_name = var.task_role_name != "" ? var.task_role_name : "${var.stack_name}-ecs-task" - server_task_family = var.server_task_family != "" ? var.server_task_family : "${var.stack_name}-server" - server_container_name = var.server_container_name != "" ? var.server_container_name : "${var.stack_name}-server" - server_log_group_name = var.server_log_group_name != "" ? var.server_log_group_name : "/ecs/${local.server_service_name}" - cluster_log_group_name = "/aws/ecs/${local.cluster_name}/cluster" - postgres_identifier_seed = lower(replace(var.stack_name, "/[^0-9A-Za-z]/", "")) - postgres_identifier_base = local.postgres_identifier_seed != "" ? local.postgres_identifier_seed : "guardian" - postgres_identifier_default = substr(can(regex("^[a-z]", local.postgres_identifier_base)) ? local.postgres_identifier_base : "g${local.postgres_identifier_base}", 0, 63) - postgres_db = var.postgres_db != "" ? var.postgres_db : local.postgres_identifier_default - postgres_user = var.postgres_user != "" ? 
var.postgres_user : local.postgres_identifier_default - postgres_password = var.postgres_password != "" ? var.postgres_password : "${var.stack_name}_dev_password" - postgres_port = 5432 - rds_instance_identifier = "${var.stack_name}-postgres" - rds_subnet_group_name = "${var.stack_name}-postgres-subnets" - database_secret_name = "${var.stack_name}/server/database-url" - database_credentials_secret_name = "${var.stack_name}/server/database-credentials" - operator_public_keys_secret_name = "${var.stack_name}/server/operator-public-keys" - evm_allowed_chain_ids_secret_name = "${var.stack_name}/server/evm-allowed-chain-ids" - evm_rpc_urls_secret_name = "${var.stack_name}/server/evm-rpc-urls" - ack_falcon_secret_name = var.guardian_ack_falcon_secret_name != "" ? var.guardian_ack_falcon_secret_name : "${var.stack_name}/server/ack-falcon-secret-key" - ack_ecdsa_secret_name = var.guardian_ack_ecdsa_secret_name != "" ? var.guardian_ack_ecdsa_secret_name : "${var.stack_name}/server/ack-ecdsa-secret-key" - managed_storage_encryption_enabled = local.is_prod && var.guardian_storage_encryption_secret_name != "" - storage_encryption_secret_name = local.managed_storage_encryption_enabled ? var.guardian_storage_encryption_secret_name : "" - rds_proxy_name = "${var.stack_name}-postgres-proxy" - rds_proxy_role_name = "${var.stack_name}-rds-proxy" - rds_proxy_security_group_name = "${var.stack_name}-rds-proxy-sg" - rds_master_password = var.postgres_password != "" ? var.postgres_password : random_password.postgres[0].result - effective_rds_instance_class = var.rds_instance_class != "" ? var.rds_instance_class : (local.is_prod ? "db.r6g.large" : "db.t3.micro") - effective_rds_allocated_storage = var.rds_allocated_storage != null ? var.rds_allocated_storage : (local.is_prod ? 50 : 20) - effective_server_desired_count = var.server_desired_count != null ? var.server_desired_count : (local.is_prod ? 2 : 1) - effective_server_autoscaling_enabled = var.server_autoscaling_enabled != null ? 
var.server_autoscaling_enabled : local.is_prod - effective_server_autoscaling_min_capacity = var.server_autoscaling_min_capacity != null ? var.server_autoscaling_min_capacity : local.effective_server_desired_count - effective_server_autoscaling_max_capacity = var.server_autoscaling_max_capacity != null ? var.server_autoscaling_max_capacity : (local.is_prod ? max(local.effective_server_desired_count, 6) : local.effective_server_desired_count) - effective_server_autoscaling_cpu_target = var.server_autoscaling_cpu_target != null ? var.server_autoscaling_cpu_target : 65 - effective_server_autoscaling_memory_target = var.server_autoscaling_memory_target != null ? var.server_autoscaling_memory_target : 75 - effective_rds_proxy_enabled = var.rds_proxy_enabled != null ? var.rds_proxy_enabled : local.is_prod - effective_rds_proxy_route_database_url = local.effective_rds_proxy_enabled && (var.rds_proxy_route_database_url != null ? var.rds_proxy_route_database_url : true) - effective_rds_max_allocated_storage = var.rds_max_allocated_storage != null ? var.rds_max_allocated_storage : (local.is_prod ? max(local.effective_rds_allocated_storage, 200) : null) - effective_guardian_rate_limit_enabled = var.guardian_rate_limit_enabled != null ? var.guardian_rate_limit_enabled : true - effective_guardian_rate_burst_per_sec = var.guardian_rate_burst_per_sec != null ? var.guardian_rate_burst_per_sec : (local.is_prod ? 200 : 10) - effective_guardian_rate_per_min = var.guardian_rate_per_min != null ? var.guardian_rate_per_min : (local.is_prod ? 5000 : 60) + cluster_name = var.cluster_name != "" ? var.cluster_name : "${var.stack_name}-cluster" + server_service_name = var.server_service_name != "" ? var.server_service_name : "${var.stack_name}-server" + alb_name = var.alb_name != "" ? var.alb_name : "${var.stack_name}-alb" + target_group_name = var.target_group_name != "" ? 
var.target_group_name : "${var.stack_name}-server-tg" + grpc_target_group_name = "${var.stack_name}-grpc-tg" + alb_security_group_name = var.alb_security_group_name != "" ? var.alb_security_group_name : "${var.stack_name}-alb-sg" + server_security_group_name = var.server_security_group_name != "" ? var.server_security_group_name : "${var.stack_name}-server-sg" + postgres_security_group_name = var.postgres_security_group_name != "" ? var.postgres_security_group_name : "${var.stack_name}-postgres-sg" + task_execution_role_name = var.task_execution_role_name != "" ? var.task_execution_role_name : "${var.stack_name}-ecs-task-execution" + task_role_name = var.task_role_name != "" ? var.task_role_name : "${var.stack_name}-ecs-task" + server_task_family = var.server_task_family != "" ? var.server_task_family : "${var.stack_name}-server" + server_container_name = var.server_container_name != "" ? var.server_container_name : "${var.stack_name}-server" + server_log_group_name = var.server_log_group_name != "" ? var.server_log_group_name : "/ecs/${local.server_service_name}" + cluster_log_group_name = "/aws/ecs/${local.cluster_name}/cluster" + postgres_identifier_seed = lower(replace(var.stack_name, "/[^0-9A-Za-z]/", "")) + postgres_identifier_base = local.postgres_identifier_seed != "" ? local.postgres_identifier_seed : "guardian" + postgres_identifier_default = substr(can(regex("^[a-z]", local.postgres_identifier_base)) ? local.postgres_identifier_base : "g${local.postgres_identifier_base}", 0, 63) + postgres_db = var.postgres_db != "" ? var.postgres_db : local.postgres_identifier_default + postgres_user = var.postgres_user != "" ? var.postgres_user : local.postgres_identifier_default + postgres_password = var.postgres_password != "" ? 
var.postgres_password : "${var.stack_name}_dev_password" + postgres_port = 5432 + rds_instance_identifier = "${var.stack_name}-postgres" + rds_subnet_group_name = "${var.stack_name}-postgres-subnets" + database_secret_name = "${var.stack_name}/server/database-url" + database_credentials_secret_name = "${var.stack_name}/server/database-credentials" + operator_public_keys_secret_name = "${var.stack_name}/server/operator-public-keys" + evm_allowed_chain_ids_secret_name = "${var.stack_name}/server/evm-allowed-chain-ids" + evm_rpc_urls_secret_name = "${var.stack_name}/server/evm-rpc-urls" + ack_falcon_secret_name = var.guardian_ack_falcon_secret_name != "" ? var.guardian_ack_falcon_secret_name : "${var.stack_name}/server/ack-falcon-secret-key" + ack_ecdsa_secret_name = var.guardian_ack_ecdsa_secret_name != "" ? var.guardian_ack_ecdsa_secret_name : "${var.stack_name}/server/ack-ecdsa-secret-key" + managed_storage_encryption_enabled = local.is_prod && var.guardian_storage_encryption_secret_name != "" + storage_encryption_secret_name = local.managed_storage_encryption_enabled ? var.guardian_storage_encryption_secret_name : "" + rds_proxy_name = "${var.stack_name}-postgres-proxy" + rds_proxy_role_name = "${var.stack_name}-rds-proxy" + rds_proxy_security_group_name = "${var.stack_name}-rds-proxy-sg" + rds_master_password = var.postgres_password != "" ? var.postgres_password : random_password.postgres[0].result + effective_rds_instance_class = var.rds_instance_class != "" ? var.rds_instance_class : (local.is_prod ? "db.r6g.large" : "db.t3.micro") + effective_rds_allocated_storage = var.rds_allocated_storage != null ? var.rds_allocated_storage : (local.is_prod ? 50 : 20) + effective_server_desired_count = var.server_desired_count != null ? var.server_desired_count : (local.is_prod ? 2 : 1) + effective_server_autoscaling_enabled = var.server_autoscaling_enabled != null ? 
var.server_autoscaling_enabled : local.is_prod + effective_server_autoscaling_min_capacity = var.server_autoscaling_min_capacity != null ? var.server_autoscaling_min_capacity : local.effective_server_desired_count + effective_server_autoscaling_max_capacity = var.server_autoscaling_max_capacity != null ? var.server_autoscaling_max_capacity : (local.is_prod ? max(local.effective_server_desired_count, 6) : local.effective_server_desired_count) + effective_server_autoscaling_cpu_target = var.server_autoscaling_cpu_target != null ? var.server_autoscaling_cpu_target : 65 + effective_server_autoscaling_memory_target = var.server_autoscaling_memory_target != null ? var.server_autoscaling_memory_target : 75 + effective_rds_proxy_enabled = var.rds_proxy_enabled != null ? var.rds_proxy_enabled : local.is_prod + effective_rds_proxy_route_database_url = local.effective_rds_proxy_enabled && (var.rds_proxy_route_database_url != null ? var.rds_proxy_route_database_url : true) + effective_rds_max_allocated_storage = var.rds_max_allocated_storage != null ? var.rds_max_allocated_storage : (local.is_prod ? max(local.effective_rds_allocated_storage, 200) : null) + effective_guardian_rate_limit_enabled = var.guardian_rate_limit_enabled != null ? var.guardian_rate_limit_enabled : true + effective_guardian_rate_burst_per_sec = var.guardian_rate_burst_per_sec != null ? var.guardian_rate_burst_per_sec : (local.is_prod ? 200 : 10) + effective_guardian_rate_per_min = var.guardian_rate_per_min != null ? var.guardian_rate_per_min : (local.is_prod ? 5000 : 60) + # GUARDIAN_MAX_REPLICAS defaults to the autoscaling max capacity. An explicit + # override is clamped UP to that max so it can never drop below real capacity + # (which would let the fleet aggregate exceed the global rate limit). + effective_guardian_max_replicas = var.guardian_max_replicas != null ? 
max(var.guardian_max_replicas, local.effective_server_autoscaling_max_capacity) : local.effective_server_autoscaling_max_capacity effective_guardian_db_pool_max_size = var.guardian_db_pool_max_size != null ? var.guardian_db_pool_max_size : (local.is_prod ? 32 : 16) effective_guardian_metadata_db_pool_max_size = var.guardian_metadata_db_pool_max_size != null ? var.guardian_metadata_db_pool_max_size : local.effective_guardian_db_pool_max_size managed_evm_allowed_chain_ids_secret_enabled = var.guardian_evm_allowed_chain_ids_secret_arn == "" && var.guardian_evm_allowed_chain_ids != "" diff --git a/infra/ecs.tf b/infra/ecs.tf index a6987c7f..d0b6daee 100644 --- a/infra/ecs.tf +++ b/infra/ecs.tf @@ -144,6 +144,10 @@ resource "aws_ecs_task_definition" "server" { name = "GUARDIAN_RATE_PER_MIN" value = tostring(local.effective_guardian_rate_per_min) }, + { + name = "GUARDIAN_MAX_REPLICAS" + value = tostring(local.effective_guardian_max_replicas) + }, { name = "GUARDIAN_DB_POOL_MAX_SIZE" value = tostring(local.effective_guardian_db_pool_max_size) diff --git a/infra/variables.tf b/infra/variables.tf index a65cffe3..a0be5978 100644 --- a/infra/variables.tf +++ b/infra/variables.tf @@ -302,6 +302,27 @@ variable "guardian_rate_per_min" { default = null } +variable "guardian_max_replicas" { + description = <<-EOT + Optional override for GUARDIAN_MAX_REPLICAS, the maximum replica capacity the + server divides global rate limits by. Defaults to the effective autoscaling + max capacity. Drives rate-limit partitioning only (coordination mode is + backend-derived). A value below the real max would let the aggregate exceed + the global limit, so an explicit override is clamped up to the autoscaling + max in data.tf and can only ever raise the divisor, never lower it. 
+ EOT + type = number + default = null + + validation { + condition = ( + var.guardian_max_replicas == null || + (var.guardian_max_replicas >= 1 && floor(var.guardian_max_replicas) == var.guardian_max_replicas) + ) + error_message = "guardian_max_replicas must be an integer >= 1 when set." + } +} + variable "guardian_rate_limit_enabled" { description = "Optional override to enable or disable Guardian HTTP rate limiting" type = bool diff --git a/packages/miden-multisig-client/.gitignore b/packages/miden-multisig-client/.gitignore index 04c01ba7..11059a0b 100644 --- a/packages/miden-multisig-client/.gitignore +++ b/packages/miden-multisig-client/.gitignore @@ -1,2 +1,3 @@ node_modules/ -dist/ \ No newline at end of file +dist/ +test-results/ \ No newline at end of file diff --git a/speckit/features/010-horizontal-scaling/contracts/config-contract.md b/speckit/features/010-horizontal-scaling/contracts/config-contract.md new file mode 100644 index 00000000..9d0fb972 --- /dev/null +++ b/speckit/features/010-horizontal-scaling/contracts/config-contract.md @@ -0,0 +1,102 @@ +# Contract: Configuration & Startup Guards + +**Feature**: 010-horizontal-scaling + +Operator-facing configuration contract. These are the only externally visible +behavior changes (no client wire contract changes). + +## Environment variables + +| Variable | Status | Behavior | +|---|---|---| +| `GUARDIAN_ENV` | **reused** | Stage signal. `prod` (case-insensitive) activates HA fail-fast guards. Already set from Terraform `var.deployment_stage` (`infra/ecs.tf:128-129`). Currently only gates ACK secrets (`ack/mod.rs:139-145`); `is_prod_environment()` is promoted to a shared `config/stage.rs` helper. | +| `GUARDIAN_DASHBOARD_CURSOR_SECRET` | **enforcement changed** | 64-hex (32-byte) shared secret. Optional in every stage: if unset, warn and fall back to an ephemeral per-process secret (boots, never fails startup). 
A missing shared secret degrades only dashboard pagination across replicas, so it is not a startup guard (`dashboard/state.rs`). | +| `GUARDIAN_MAX_REPLICAS` | **new** | Positive integer = the deployment's autoscaling **max** capacity. Drives **rate limiting only**: divides `GUARDIAN_RATE_BURST_PER_SEC`/`GUARDIAN_RATE_PER_MIN` per replica (`global / GUARDIAN_MAX_REPLICAS`) so aggregate stays at or below the global limit (over-throttles below max capacity). Defaults from Terraform `effective_server_autoscaling_max_capacity` (see below); overridable, but a value **below** the real max makes per-replica caps too high so the aggregate can exceed the global limit (too loose) — so Terraform clamps an explicit override **up** to the effective max (see "Terraform wiring" below). A value above the real max over-throttles. Unset or `1` => current per-process rate-limit behavior. **Does NOT affect coordination mode** — that is backend-derived (FR-020). | +| `DATABASE_URL` | unchanged | Required for the Postgres backend (which the prod image uses). | +| `GUARDIAN_DB_POOL_MAX_SIZE` / `GUARDIAN_METADATA_DB_POOL_MAX_SIZE` | unchanged | Per-replica pool sizes; runbook adds guidance: total ≈ size x replicas x pools must stay under Postgres `max_connections`. | +| `GUARDIAN_RATE_LIMIT_ENABLED` / `GUARDIAN_RATE_BURST_PER_SEC` / `GUARDIAN_RATE_PER_MIN` | unchanged | Now interpreted as global limits when `GUARDIAN_MAX_REPLICAS > 1`. | + +Optional (implementation may add, with documented defaults): lease TTL / renew +interval overrides (e.g. `GUARDIAN_CANON_LEASE_TTL_SECS`, +`GUARDIAN_CANON_LEASE_RENEW_SECS`). Default to safe values if absent. The lease +TTL is sized for renew/failover only and is independent of the canonicalization +`submission_grace_period_seconds`. 
+ +## Terraform wiring (default ships from infra) + +`GUARDIAN_MAX_REPLICAS` MUST default from the deployment's autoscaling max +capacity rather than a manually maintained value: + +- `infra/data.tf` already computes + `local.effective_server_autoscaling_max_capacity` (prod = `max(desired, 6)`). +- `infra/ecs.tf` already injects the server env block (after + `GUARDIAN_RATE_PER_MIN`). Add: + ```hcl + { + name = "GUARDIAN_MAX_REPLICAS" + value = tostring(local.effective_guardian_max_replicas) + } + ``` + where `local.effective_guardian_max_replicas = var.guardian_max_replicas != null ? max(var.guardian_max_replicas, local.effective_server_autoscaling_max_capacity) : local.effective_server_autoscaling_max_capacity` + (new `var.guardian_max_replicas` defaults to `null`, i.e. derive from max + capacity; an explicit override is clamped **up** to the autoscaling max so it + can only ever raise the divisor, never lower it below the real fleet size). + +This keeps the default correct on every deploy with no operator action; the +runbook documents the override, not a required value. + +## Startup guards (fail-fast, prod only) + +The server MUST refuse to start, with a clear actionable error naming the +variable and remedy, when `GUARDIAN_ENV=prod` and any of: + +1. The active storage backend is the **filesystem** backend (US5/FR-012). Remedy: + build/run with the Postgres backend and set `DATABASE_URL`. +2. An enabled global rate limit partitions to **zero** requests per replica + (`GUARDIAN_RATE_BURST_PER_SEC`/`GUARDIAN_RATE_PER_MIN` below + `GUARDIAN_MAX_REPLICAS`, FR-013). Remedy: raise the global limit or lower + `GUARDIAN_MAX_REPLICAS`. + +In non-prod, condition (1) is allowed (dev default) and (2) warns but starts. A +missing `GUARDIAN_DASHBOARD_CURSOR_SECRET` is NOT a startup guard in any stage: +it warns and boots with an ephemeral per-process secret (FR-008), because it +degrades pagination only. 
+ +## Error message contract + +Each guard error MUST: name the offending variable/backend, state the +consequence under multiple replicas, and give the exact remedy. Errors are +startup/config errors (process exits non-zero), not request-path errors — no +change to HTTP/gRPC boundary error shapes. + +## Startup mode log line (FR-019) + +On startup the server logs exactly one coordination-mode line reflecting the +**resolved** state (never operator intent): + +```text +coordination mode=shared backend=postgres stage=prod max_replicas=6 cursor_secret=configured +coordination mode=single-process backend=filesystem stage=dev max_replicas=1 cursor_secret=ephemeral +``` + +`mode=shared` iff coordination is backed by the external store (Postgres); +`mode=single-process` for the in-memory impls (filesystem). This is the +discoverable signal that replaces an explicit `DISTRIBUTED_MODE` toggle — +coordination is determined by the resolved storage backend alone, not a flag and +not a tunable, so the line cannot disagree with reality. (`max_replicas` is shown +for the rate-limit context; it does not affect the mode.) + +## Documentation surface (US6) + +- `docs/runbooks/horizontal-scaling.md` (new) — required env vars, state-store + dependency (shared Postgres), pool sizing vs `max_connections`, + `GUARDIAN_MAX_REPLICAS` guidance (rate-limit partitioning only; + over-throttling/keep-alive tolerance below max capacity; a too-low override can + let aggregate limits exceed the global limit, a too-high override over-throttles), + coordination mode is backend-derived (Postgres = shared), + filesystem = dev-only, failover behavior of the canonicalization lease. +- `docs/CONFIGURATION.md` — add `GUARDIAN_MAX_REPLICAS` (default from autoscaling + max capacity; rate-limiting effect only — does not change coordination mode), + document the prod guards. 
+- `docs/SERVER_AWS_DEPLOY.md` — HA notes referencing the existing prod profile + (`infra/data.tf` desired 2 / max 6) and `GUARDIAN_MAX_REPLICAS` sourced from it. diff --git a/speckit/features/010-horizontal-scaling/contracts/coordination-traits.md b/speckit/features/010-horizontal-scaling/contracts/coordination-traits.md new file mode 100644 index 00000000..45ba4887 --- /dev/null +++ b/speckit/features/010-horizontal-scaling/contracts/coordination-traits.md @@ -0,0 +1,178 @@ +# Contract: Coordination Traits + +**Feature**: 010-horizontal-scaling + +These are the **internal** server traits introduced by this feature. They are not +part of any client (HTTP/gRPC) wire contract — no proto, payload, status enum, or +error surface changes. The traits exist so that shared coordination has two +interchangeable implementations selected with the storage backend: + +| Trait | In-memory impl (filesystem/dev) | Postgres impl (prod) | Backs table | +|---|---|---|---| +| `SessionStore` | `InMemorySessionStore` | `PgSessionStore` | `auth_sessions` | +| `ChallengeStore` | `InMemoryChallengeStore` | `PgChallengeStore` | `auth_challenges` | +| `LeaderElector` | `AlwaysLeader` | `PgLeaseElector` | `worker_leases` | + +All methods are `async` and return the crate's existing error type; auth-facing +errors MUST map to the **same** boundary errors operators/clients see today +(Constitution IV — no error-surface drift). + +## `SessionStore` + +Each store instance is **realm-bound at construction** (operator vs evm), so the +methods carry no realm. `StoredSession { subject: SessionSubject, issued_at, +expires_at }`; `SessionSubject` is `Operator { operator_id, commitment }` | +`Evm { address }` (no permissions — re-resolved per request). 
+ +```text +trait SessionStore { + async fn insert(&self, key: [u8;32], session: StoredSession) -> Result<()>; + async fn get(&self, key: &[u8;32], now) -> Result<Option<StoredSession>>; + async fn revoke(&self, key: &[u8;32]) -> Result<Option<StoredSession>>; // logout; returns prior for logging + async fn sweep_expired(&self, now) -> Result<usize>; +} +``` + +Behavioral contract: +- `get` returns `Some` only when the session is unrevoked and `now < expires_at`. +- Validity is evaluated against the store's clock (DB clock for Postgres). +- `revoke` returns the prior session (for logout logging) and, once revoked, `get` + MUST reject it on every replica until natural expiry. The Postgres impl marks + `revoked_at` and keeps the row until expiry; the in-memory impl removes it. +- Replaces the `DashboardState` and `EvmSessionState` session maps without + changing the outcome of `authenticate_session` (permissions still re-resolved + from the live allowlist at call time). + +## `ChallengeStore` + +Each store instance is **realm-scoped** at construction (operator vs evm), so the +trait methods don't take a realm. 
The stored challenge carries a realm-appropriate +`key` and `payload`: + +```text +trait ChallengeStore { + async fn issue(&self, principal: &str, challenge: StoredChallenge, max_outstanding: usize, now) -> Result<()>; + async fn active_for(&self, principal: &str, now) -> Result<Vec<StoredChallenge>>; + async fn consume(&self, principal: &str, key: &str, now) -> Result<bool>; // true => this caller won the single-use claim + async fn sweep_expired(&self, now) -> Result<usize>; +} +struct StoredChallenge { key: String, payload: ChallengePayload, issued_at, expires_at } +enum ChallengePayload { OperatorDigest(Word), EvmChallenge { address, nonce, issued_at, expires_at } } +``` + +**Why match-in-Rust, not match-in-store**: the two realms verify differently and +neither check is expressible in SQL — operator does a Falcon +`public_key.verify(signing_digest, sig)` (`dashboard/state.rs:228-230`), EVM does +a nonce compare then `recover_session_address(challenge, sig)` +(`evm/session.rs:112-127`). So the store returns candidate payloads +(`active_for`), the caller matches one, then `consume(principal, key)` atomically +claims it. `key` is the signing-digest hex (operator) or the nonce (EVM); see +data-model.md table `auth_challenges` `(realm, challenge_key)`. + +Behavioral contract: +- `consume(principal, key)` atomically sets `consumed_at` and returns `true` only + if the challenge was unconsumed and unexpired — a replay (or a lost race) on any + replica returns `false` (FR-003). +- Issue-on-replica-A / match+consume-on-replica-B succeeds (FR-001). 
+ +## `LeaderElector` + +```text +trait LeaderElector { + async fn try_acquire(&self, lease: &str, holder_id: &str, ttl: Duration) -> Result<Option<Lease>>; + async fn renew(&self, lease: &Lease) -> Result<bool>; // false => lease lost + async fn verify_held(&self, lease: &Lease) -> Result<bool>; // fence-checked ownership at submission boundary + async fn release(&self, lease: Lease) -> Result<()>; // graceful shutdown +} +struct Lease { name, holder_id, fence_token, expires_at } +``` + +Behavioral contract: +- At most one holder satisfies `renew` at any instant (atomic conditional write + + DB-clock TTL). +- `AlwaysLeader` always returns a lease, always renews `true`, and `verify_held` + always returns `true` (single replica). + +**Renewal concurrency (resolves the long-pass split-brain)**: lease renewal MUST +run on its **own timer** (`renew_interval`, e.g. 5s) in a task **concurrent with** +the canonicalization pass — NOT at tick boundaries. A pass may run longer than the +check interval; renewal at tick boundaries would let the lease expire mid-pass +while a renewal was still pending, allowing another replica to claim it. The +worker therefore becomes: one renewal task + the pass, sharing a cancellation +signal (e.g. `tokio_util::sync::CancellationToken` or a `watch` channel). + +**Cooperative cancellation (makes "abort the current pass" mechanical)**: today +`process_all_accounts()` is a single awaited call with no cancellation hook +(`worker.rs:27-44`). This feature adds a cancellation check that the processor +polls **between accounts** (and before each on-chain submission). When `renew` +returns `false`, the renewal task trips the cancellation signal and the pass stops +at the next checkpoint. "Abort the current pass" is thus a concrete mechanism, not +just a requirement. + +**Fence enforcement at the submission boundary (MUST, not may)**: `fence_token` +advances on every (re)acquisition, so it always changes when the holder changes (a steal). 
Because cancellation is cooperative +there is a window between losing the lease and the pass actually stopping, so the +processor MUST call `verify_held` (fence/ownership re-check) immediately before +**every** state-mutating write — canonical `submit_state`/`submit_delta` **and** +the retry/discard writes — and MUST skip the write if it returns `false`. + +The fence is **advisory**, not atomic: `verify_held` is a separate round-trip, so +in principle the lease could be stolen between the check and the write (TOCTOU). +This is acceptable because the writes are **idempotent** — canonical promotion is +a deterministic upsert (same delta → identical bytes) and retry/discard are +idempotent per candidate — so a brief two-leader overlap can at most re-apply the +same transition, never corrupt state. The fence + idempotency + cooperative +cancellation together strongly mitigate split-brain; TTL + voluntary abort alone +is not relied on. + +## Selection rule (wiring) + +`builder/storage.rs` already chooses the storage backend. The same decision point +selects the coordination family, keying on the **storage backend alone**: + +- `feature = "postgres"` + `DATABASE_URL` set => Postgres impls (share the + storage/metadata pool or a dedicated small pool — decided at implementation). +- filesystem backend => in-memory impls + `AlwaysLeader`. + +Coordination is **not** gated on `GUARDIAN_MAX_REPLICAS` or any other tunable: +a Postgres deployment always uses shared coordination. This is deliberate and +default-safe — a missing/mis-set tunable must never silently revert a multi-replica +deployment to per-process state (the #242 bug). The single-instance +session-lookup optimization is deferred to a future explicit, guarded opt-in. + +Coordination availability therefore can never diverge from where shared state +lives. 
Because the session/challenge stores are realm-bound, they are owned by +their realm's consumer, not shared on `AppState`: the builder constructs an +operator-realm `SessionStore`+`ChallengeStore` pair injected into `DashboardState` +and an evm-realm pair injected into `EvmSessionState`. `AppState` +(`builder/state.rs`) carries only `Arc<dyn LeaderElector>` (used by the +canonicalization worker). + +## Availability & performance trade-offs (explicit behavior changes) + +These are deliberate consequences of moving auth state into Postgres. They are +behavior changes from today's always-available in-memory maps and are stated here +so they are not surprises. + +**Shared-store outage => auth fails closed**: with the Postgres impls, a `get`, +`consume`, `issue`, or `insert` that errors because Postgres is briefly unavailable +results in the authenticated request / login being **rejected** (fail-closed), not +allowed through. This is the safe choice for a custody system: a DB blip must +never grant access. It is a change from today, where the in-memory map is always +available and never fails for store reasons. The boundary error returned MUST stay +within the existing auth/transient-error surface (no new error shape); operators +see auth failures during a DB outage, which is expected and documented in the +runbook. The canonicalization lease likewise fails closed: a renewal that errors +is treated as a lost lease (the holder steps down), so an outage stalls +canonicalization rather than risking double-processing — it resumes when the DB +returns. + +**Per-request DB lookup is a deliberate trade-off vs. caching**: FR-003 requires +logout/expiry to be honored on **every** replica **immediately**, which rules out +a local per-replica session cache (a cache would serve revoked sessions until its +TTL). The accepted consequence is that every authenticated request performs one +indexed Postgres `SELECT` (by the `(realm, token_digest)` PK) where today it is an in-memory +map hit. 
Immediate revocation is chosen over lower per-request latency. This adds +per-request DB load and reinforces the connection-pool sizing concern (see the +horizontal-scaling runbook). Challenges are touched only during login (low +volume); the per-request cost is the session lookup. diff --git a/speckit/features/010-horizontal-scaling/contracts/db-schema.md b/speckit/features/010-horizontal-scaling/contracts/db-schema.md new file mode 100644 index 00000000..87b87ded --- /dev/null +++ b/speckit/features/010-horizontal-scaling/contracts/db-schema.md @@ -0,0 +1,69 @@ +# Contract: Database Schema (new migrations) + +**Feature**: 010-horizontal-scaling + +Three new Diesel migrations under `crates/server/migrations/`, embedded and run at +startup. Postgres backend only. Column details and lifecycle rules are in +[data-model.md](../data-model.md); this file fixes the migration contract. + +## Migration: `_auth_sessions` + +`up.sql` creates `auth_sessions` (composite PK `(realm TEXT, token_digest +BYTEA)` so operator and EVM sessions are namespaced rather than relying on token +randomness, `subject JSONB`, `issued_at`, `expires_at`, `revoked_at` nullable) + +index on `expires_at` and `(realm, expires_at)`. `down.sql` drops it. + +## Migration: `_auth_challenges` + +`up.sql` creates `auth_challenges` with composite PK `(realm TEXT, challenge_key +TEXT)` — `challenge_key` is the operator signing-digest hex or the EVM nonce — +plus `principal TEXT`, `payload JSONB` (realm-specific match/recover fields, see +data-model.md), `issued_at`, `expires_at`, `consumed_at` nullable + index on +`(realm, principal)` and `expires_at`. `down.sql` drops it. + +## Migration: `_worker_leases` + +`up.sql` creates `worker_leases` (PK `lease_name TEXT`, `holder_id TEXT`, +`acquired_at`, `renewed_at`, `expires_at`, `fence_token BIGINT NOT NULL DEFAULT +0`). `down.sql` drops it. 
+ +## Migration execution under concurrent replica startup — REQUIRED + +All replicas run the embedded migrations against one Postgres at boot. The runner +(`storage/postgres.rs`) MUST wrap `run_pending_migrations` in a Postgres +**session-level advisory lock** on a fixed key, acquired with a **bounded wait**: +poll `SELECT pg_try_advisory_lock($key)` until it succeeds or a timeout elapses -> +migrate -> `SELECT pg_advisory_unlock($key)`. One replica migrates; the rest poll, +then find nothing pending. The bounded wait (vs. an unbounded `pg_advisory_lock`) +means a replica stuck mid-migration fails the others fast rather than wedging the +fleet on boot. Without the lock, simultaneous first-deploy boots can race/deadlock +on identical migrations. (Acceptable here — short, single-connection — unlike the +canonicalization lease, which spans pool churn and uses a lease row instead.) + +## Constraints on the migration set + +- **Additive only**: no changes to existing tables (`states`, `deltas`, + `delta_proposals`, `account_metadata`, `admin_actions`). No FKs to custody + tables (keeps append-only delta lineage isolated — Constitution III). +- **Reversible**: every `up.sql` has a matching `down.sql`. +- **No data migration / backfill**: these tables start empty; sessions and + challenges are ephemeral and challenges/sessions in flight at deploy time + simply require a re-login (acceptable, documented in the runbook). +- **Filesystem backend creates none of these** — it uses in-memory stores. + +## Atomic operations the impls rely on + +- Challenge single-use consume: conditional `UPDATE ... SET consumed_at = now() + WHERE realm = $1 AND challenge_key = $2 AND consumed_at IS NULL AND now() < + expires_at` — affected-row-count `1` => this caller won the claim, `0` => + already consumed/expired. +- Lease acquire/steal: `INSERT ... ON CONFLICT (lease_name) DO UPDATE ... WHERE + worker_leases.expires_at < now() OR worker_leases.holder_id = excluded.holder_id`. 
+- Lease renew: `UPDATE ... WHERE lease_name = $1 AND holder_id = $2 AND now() < + expires_at`. +- Lease fence verify (submission boundary, mandatory): `SELECT 1 FROM + worker_leases WHERE lease_name = $1 AND holder_id = $2 AND fence_token = $3 AND + now() < expires_at`. + +These must be single round-trip statements (no read-modify-write races across +replicas). diff --git a/speckit/features/010-horizontal-scaling/data-model.md b/speckit/features/010-horizontal-scaling/data-model.md new file mode 100644 index 00000000..c6b68c73 --- /dev/null +++ b/speckit/features/010-horizontal-scaling/data-model.md @@ -0,0 +1,199 @@ +# Phase 1 Data Model: Horizontal Scaling Correctness + +**Feature**: 010-horizontal-scaling | **Date**: 2026-06-20 + +Three new Postgres tables, added as Diesel migrations under +`crates/server/migrations/` (embedded via `embed_migrations!`, run at startup — +`crates/server/src/storage/postgres.rs:29-47`). All three exist **only** in the +Postgres backend; the filesystem/dev backend uses in-memory equivalents and +creates no tables. + +All timestamps are `TIMESTAMPTZ` and all expiry comparisons use the database +clock (`now()`), giving a single authoritative clock across replicas. + +## Migration concurrency (multi-replica startup) — REQUIRED + +With 2-6 replicas booting simultaneously (ECS rolling deploy or cold start), +every replica runs `embed_migrations!` against the **one** shared Postgres at the +same time. Diesel's embedded runner does not serialize concurrent runners safely +by default, so first-deploy startup can race or deadlock applying the same +migration. 
This feature MUST guard migration with a **Postgres session-level +advisory lock**: + +```text +run_migrations(conn): + until pg_try_advisory_lock(<key>) or deadline: -- bounded wait, polls; on deadline, fail startup + sleep(poll_interval) + run_pending_migrations(conn); -- no-op for replicas that lose the race + SELECT pg_advisory_unlock(<key>); +``` + +The first replica to grab the lock migrates; the others poll `pg_try_advisory_lock` +until it frees, then find no pending migrations and proceed. The wait is **bounded +by a timeout** (rather than `pg_advisory_lock`'s unbounded block) so a replica +stuck mid-migration fails the others fast instead of wedging the whole fleet on +boot; the holder still releases on unlock and on connection drop. This is a short, +single-connection critical section — not across request/pool churn (unlike the +canonicalization lease). This change lives in `storage/postgres.rs` +(`run_migrations`). See the matching edge case in spec.md and the db-schema.md +contract. + +--- + +## Entity: Auth Session → table `auth_sessions` + +Replaces the per-process `Arc`-wrapped in-memory session map keyed by the `[u8;32]` token digest +(`dashboard/state.rs:30`) and the EVM equivalent (`evm/session.rs`). + +| Column | Type | Notes | +|---|---|---| +| `realm` | `TEXT` NOT NULL | `operator` \| `evm` (discriminator); part of the composite PK | +| `token_digest` | `BYTEA` (32) NOT NULL | SHA-256 of the session token (never store plaintext; matches current `[u8;32]` keying); part of the composite PK | +| `subject` | `JSONB` NOT NULL | Realm-specific identity: operator `AuthenticatedOperator` or EVM `address`. 
Permissions are re-resolved from the live allowlist at use time (preserves `authenticate_session` behavior, `dashboard/state.rs:290-332`) | +| `issued_at` | `TIMESTAMPTZ` NOT NULL | | +| `expires_at` | `TIMESTAMPTZ` NOT NULL | indexed for TTL sweep | +| `revoked_at` | `TIMESTAMPTZ` NULL | set on logout; a non-null value => session rejected on every replica (FR-003) | + +**Indexes**: composite PK on `(realm, token_digest)`; index on `expires_at` +(sweep); index on `(realm, expires_at)`. + +**Lifecycle / validation**: +- Created on successful `verify`. +- Valid iff `revoked_at IS NULL AND now() < expires_at`. +- Logout sets `revoked_at = now()` (idempotent). +- A revoked row is **kept until its original `expires_at`** so the revocation is + honored across every replica for as long as the token would otherwise have been + valid; setting `revoked_at` (not deleting) is what makes logout effective + fleet-wide. The sweep then deletes any row where `expires_at < now()` (covers both + naturally expired and revoked-then-expired rows). There is no separate + "revocation grace" — a revoked token is rejected immediately via `revoked_at` + and the row is reclaimed at natural expiry. +- **Invariant**: stored subject identity is authoritative, but authorization + (permissions) is always recomputed from the current allowlist — no stale + permission capture. + +--- + +## Entity: Auth Challenge → table `auth_challenges` + +Replaces the per-process `Arc`-wrapped in-memory per-principal challenge lists +(`dashboard/state.rs:29`) and the EVM equivalent. Supports issue-on-A / +verify-on-B (FR-001). + +| Column | Type | Notes | +|---|---|---| +| `realm` | `TEXT` NOT NULL | `operator` \| `evm` (part of PK) | +| `challenge_key` | `TEXT` NOT NULL | per-challenge unique key **within a realm** (part of PK). Operator: the signing digest hex. EVM: the challenge nonce. This is what `consume` targets. 
| +| `principal` | `TEXT` NOT NULL | operator commitment (`dashboard/state.rs:108-168`) or EVM address; indexed for `active_for` lookup | +| `payload` | `JSONB` NOT NULL | realm-specific fields needed to match/recover at verify time (see below) | +| `issued_at` | `TIMESTAMPTZ` NOT NULL | | +| `expires_at` | `TIMESTAMPTZ` NOT NULL | indexed | +| `consumed_at` | `TIMESTAMPTZ` NULL | set when `verify` succeeds; single-use across replicas | + +**Realm-aware payload (resolves the EVM modeling gap)**: the two realms verify +differently, so a single `signing_digest` column does not model both: +- **Operator** matches by Falcon-verifying the stored signing digest (a `Word`) + against the submitted signature (`dashboard/state.rs:228-230`). `payload` = + `{ "signing_digest": "<hex>" }`; `challenge_key` = that hex. +- **EVM** matches by **nonce**, then recovers the signer from the **full original + challenge** (`address`, `nonce`, `issued_at`, `expires_at`) via + `recover_session_address` (`evm/session.rs:112-127`). `payload` = + `{ "address", "nonce", "issued_at", "expires_at" }`; `challenge_key` = the nonce. + +Verification matching (Falcon verify / nonce compare + ECDSA recover) runs in +Rust, not SQL: `active_for(principal)` returns the unexpired, unconsumed payloads; +the caller matches one; then `consume(principal, challenge_key)` atomically claims it. + +**Primary key**: `(realm, challenge_key)`. **Indexes**: PK; index on +`(realm, principal)` for `active_for`; index on `expires_at` for the sweep. + +**Lifecycle / validation**: +- Created by `issue_challenge` (per-principal cap via `max_outstanding`, oldest + pruned, matching today's `Vec` cap). +- Consumable iff `consumed_at IS NULL AND now() < expires_at`; `consume` + conditionally sets `consumed_at = now()` and reports whether it won the race + (single-use; a replay on any replica fails — FR-003, US1 scenario 3). +- Multiple pending challenges per principal allowed; the `(realm, principal)` + index supports `active_for`. 
+ +--- + +## Entity: Worker Lease → table `worker_leases` + +Backs single-owner canonicalization (FR-004/005/006, US2). Generic enough for +future background workers. + +| Column | Type | Notes | +|---|---|---| +| `lease_name` | `TEXT` PRIMARY KEY | e.g. `canonicalization` | +| `holder_id` | `TEXT` NOT NULL | replica identity (e.g. hostname/task-id + random suffix, generated at boot) | +| `acquired_at` | `TIMESTAMPTZ` NOT NULL | when current holder first took the lease | +| `renewed_at` | `TIMESTAMPTZ` NOT NULL | last heartbeat | +| `expires_at` | `TIMESTAMPTZ` NOT NULL | `renewed_at + ttl`; another replica may claim only when `now() >= expires_at` | +| `fence_token` | `BIGINT` NOT NULL | monotonically incremented on each (re)acquisition; guards against a stale holder acting after losing the lease | + +**Acquire / renew (single atomic statement)**: +- Acquire/steal: `INSERT ... ON CONFLICT (lease_name) DO UPDATE SET holder_id = + excluded.holder_id, acquired_at = now(), renewed_at = now(), expires_at = now() + + ttl, fence_token = worker_leases.fence_token + 1 WHERE worker_leases.expires_at + < now() OR worker_leases.holder_id = excluded.holder_id` (claim only if expired + or already mine). +- Renew: `UPDATE ... SET renewed_at = now(), expires_at = now() + ttl WHERE + lease_name = $1 AND holder_id = $2 AND now() < expires_at` — runs on its **own + timer concurrent with the pass** (not at tick boundaries); a failed renew (0 + rows) means the lease was lost; the renewal task trips the pass's cancellation + signal (see coordination-traits.md "Renewal concurrency"). +- Verify-held (fence check at submission boundary): `SELECT 1 FROM worker_leases + WHERE lease_name = $1 AND holder_id = $2 AND fence_token = $3 AND now() < + expires_at` — the processor MUST run this immediately before any on-chain + submission / canonical promotion and skip the write if it returns no row. + +**Timing constraints**: +- `renew_interval << ttl` (e.g. renew every 5s, ttl 30s). 
+- The TTL is sized **solely** for renew/failover: it must comfortably exceed one + renew interval (a healthy holder never loses its lease) and it sets the failover + bound — another replica may claim only after `ttl` elapses without a renew, so + failover (SC-003) happens within `ttl` of the holder dying. +- The TTL is **independent of** the canonicalization `submission_grace_period` + (600s default) and the check interval (10s); those govern delta promotion + timing, not lease ownership. Do not couple them. + +**Invariant (no split-brain)**: at most one `holder_id` can satisfy the renew +predicate at a time because acquisition is a single atomic conditional write +against the DB clock. The cooperative-cancellation abort path has a small window +between lease loss and the pass stopping; the **mandatory** fence check +(`verify_held`) immediately before every state-mutating write strongly mitigates +that window. Note the fence is **advisory** (a separate round-trip, TOCTOU): a +lease could in principle be stolen between the check and the write. That residual +window is benign here because the canonical writes are **idempotent deterministic +upserts** — the same delta produces identical state/delta bytes regardless of +which replica writes — and retry/discard writes are likewise idempotent for a +given candidate. So a brief overlap cannot corrupt state; it can at most +re-apply the same transition. TTL + voluntary abort alone is NOT relied on. + +--- + +## In-memory equivalents (filesystem/dev backend) + +No tables. The `coordination` module provides: +- `InMemorySessionStore` / `InMemoryChallengeStore` — the current + `Arc`-wrapped in-memory map behavior, byte-for-byte. +- `AlwaysLeader` — `try_acquire`/`renew`/`verify_held` always succeed (single + replica is always the leader). + +Selected when the filesystem backend is active (backend-derived selection, R9 — +**not** gated on `GUARDIAN_MAX_REPLICAS`). A Postgres deployment always uses the +shared (table-backed) impls. 
This keeps single-replica/dev behavior identical to +today and requires no database (FR-014; constitution dev-default invariant). + +--- + +## Relationships & boundaries + +- `auth_sessions` / `auth_challenges` are independent of the custody record + tables (`states`, `deltas`, `delta_proposals`, `account_metadata`) — no FKs, + no impact on append-only delta lineage (Constitution III). +- `worker_leases` is pure coordination metadata; it never participates in or + alters the pending->candidate->canonical/discarded transitions — it only gates + **which replica** executes them. +- None of these tables are exposed on any client wire contract. diff --git a/speckit/features/010-horizontal-scaling/spec.md b/speckit/features/010-horizontal-scaling/spec.md new file mode 100644 index 00000000..65e3633e --- /dev/null +++ b/speckit/features/010-horizontal-scaling/spec.md @@ -0,0 +1,480 @@ +# Feature Specification: Horizontal Scaling Correctness Across Multiple Guardian Instances + +**Feature Branch**: `010-horizontal-scaling` +**Created**: 2026-06-20 +**Status**: Draft +**Input**: User description: "Ensure horizontal scaling works correctly across multiple Guardian instances (issue #242)" +**Tracking issue**: [#242](https://github.com/OpenZeppelin/guardian/issues/242) + +## Overview + +The production deployment runs the Guardian server as 2-6 ECS tasks behind a +round-robin load balancer. Several subsystems were written under an implicit +single-instance assumption, so a request that begins on one replica and +continues on another can fail, and background work runs redundantly on every +replica. This feature makes the server correct under horizontal scaling: any +request may land on any replica, replicas may be added or removed at any time, +and operators have a documented configuration for a highly-available (HA) +deployment. + +The scope is **correctness and operability under multiple replicas**, not new +end-user functionality. 
Each subsystem below is independently testable and +independently shippable. + +## User Scenarios & Testing *(mandatory)* + +### User Story 1 - Operator login succeeds with multiple replicas (Priority: P1) + +An operator authenticates to the dashboard while the load balancer routes the +challenge request and the verification request to different replicas. The login +must complete successfully regardless of which replica handles each step, and an +established session must be honored by every replica. + +**Why this priority**: Authentication is the entry point to all operator +functionality. Today the auth challenge and the session record live in +per-process memory, so a login or any subsequent authenticated call that lands +on a different replica than the one that issued the challenge/session fails. +This makes the dashboard effectively unusable with more than one replica - the +highest-impact breakage in the issue. + +**Independent Test**: Run 2+ replicas behind the load balancer and complete the +full challenge -> sign -> verify -> authenticated-request flow, forcing each step +onto a different replica. Login completes and the session is accepted on every +replica. + +**Acceptance Scenarios**: + +1. **Given** 2+ replicas behind the load balancer, **When** an operator requests + a login challenge from replica A and submits the signed response to replica B, + **Then** verification succeeds and a session is established. +2. **Given** an established operator session, **When** an authenticated request + is routed to any replica, **Then** the session is recognized and the request + is authorized without re-login. +3. **Given** a pending challenge issued by one replica, **When** the operator + never completes it, **Then** the challenge expires consistently and cannot be + replayed on any replica after expiry. +4. 
**Given** an operator logs out on one replica, **When** a subsequent request + with the same session token reaches any other replica, **Then** the session + is rejected. + +--- + +### User Story 2 - A delta is canonicalized exactly once (Priority: P1) + +The background canonicalization worker promotes pending candidate deltas to +canonical state after verifying them against on-chain state. With multiple +replicas, each pending candidate must be processed exactly once, regardless of +how many replicas are running. + +**Why this priority**: The canonicalization worker currently runs on every +replica with no leader election or shared lock, so every replica independently +re-processes the same candidates. This causes duplicate work and races on state +transitions (promote/discard/retry-budget), which can corrupt the proposal +nonce sequence and lead to permanent state-commitment mismatches. Correctness of +custody state is paramount. + +**Independent Test**: Run 2+ replicas, create pending candidates, and confirm +each candidate transitions exactly once (one promotion or one discard), with no +duplicate submissions or double-counted retries, across the full replica set. + +**Acceptance Scenarios**: + +1. **Given** N replicas running and a pending candidate delta, **When** the + canonicalization interval elapses, **Then** exactly one replica processes the + candidate and it is promoted or discarded exactly once. +2. **Given** the replica currently performing canonicalization stops or crashes, + **When** the next interval elapses, **Then** another replica takes over + canonicalization with no manual intervention. +3. **Given** a candidate's retry budget, **When** processing fails, **Then** the + retry count is incremented exactly once per interval across the whole fleet + (not once per replica). +4. **Given** only a single replica is running, **When** canonicalization runs, + **Then** behavior is unchanged from today (no regression). 
+ +--- + +### User Story 3 - Pagination cursors are valid across all replicas (Priority: P2) + +An operator pages through dashboard list results (e.g. accounts, deltas) where +successive page requests are routed to different replicas. Cursors returned by +one replica remain valid on every other replica. + +**Why this priority**: Cursors are signed/verified with a secret that, when +unset, is generated randomly per process. Across replicas this silently breaks +pagination (a cursor from replica A fails verification on replica B). It is +high-frequency operator pain but degrades to "start over" rather than corrupting +state, so it ranks below auth and canonicalization. + +**Independent Test**: With 2+ replicas and a shared cursor secret configured, +request page 1 from one replica and page 2 (using the returned cursor) from +another; the second page returns the correct continuation. With the secret +unset in a multi-replica configuration, startup surfaces the misconfiguration +with a warning and still boots. + +**Acceptance Scenarios**: + +1. **Given** a shared cursor secret configured on all replicas, **When** a cursor + issued by one replica is submitted to another, **Then** it verifies and + returns the correct next page. +2. **Given** a multi-replica configuration with no shared cursor secret, **When** + the server starts, **Then** the operator is clearly warned that pagination + will break across replicas and the server boots (in every stage) with an + ephemeral per-process secret, rather than silently proceeding without notice. +3. **Given** a tampered or expired cursor, **When** it is submitted to any + replica, **Then** it is rejected consistently. 
+ +--- + +### User Story 4 - Rate limits are enforced consistently across replicas (Priority: P2) + +A client making requests that are spread across replicas by the load balancer is +subject to rate limits that reflect total traffic, within a documented +tolerance - not per-replica limits that multiply with replica count. + +**Why this priority**: The rate limiter is per-process, so the effective limit +scales with replica count (e.g. 2 replicas ~ 2x the configured burst). This +weakens an abuse-prevention control, but it fails open (more lenient) rather +than blocking legitimate traffic, so it ranks below correctness-critical items. + +**Independent Test**: With 2+ replicas and `GUARDIAN_MAX_REPLICAS` set to the +autoscaling max capacity, drive traffic exceeding the global limit through the +load balancer and confirm the aggregate accepted rate stays at or below the +global limit (stricter when running below max capacity), rather than scaling by +replica count. + +**Acceptance Scenarios**: + +1. **Given** a configured global request limit and 2+ replicas, **When** a client + exceeds the limit across replicas, **Then** excess requests are throttled so + the aggregate accepted rate stays at or below the global limit, regardless of + how the load balancer distributes them. +2. **Given** rate limiting is disabled by configuration, **When** running with + multiple replicas, **Then** no throttling occurs (no regression). +3. **Given** `GUARDIAN_MAX_REPLICAS` is set to the autoscaling max capacity, + **When** fewer than that many replicas are running, **Then** aggregate + enforcement is stricter than the global limit (never looser), and no request + depends on an external coordination service. 
+ +--- + +### User Story 5 - Filesystem backend is refused in the prod stage (Priority: P3) + +When the server is configured for the production stage, it refuses to start with +the filesystem storage backend, because filesystem storage is local to a single +task and cannot be shared across replicas. The filesystem backend remains fully +supported for local development. + +**Why this priority**: The filesystem backend cannot back a multi-replica +deployment (each replica would have divergent local state and audit events are +not persisted). Refusing it in prod prevents a silent, dangerous +misconfiguration. It is a guardrail rather than core multi-replica plumbing, and +the published prod image is already built with the Postgres backend, so it ranks +P3. + +**Independent Test**: Start the server in the prod stage with the filesystem +backend selected and confirm it fails fast with a clear, actionable error. Start +the same configuration in a non-prod stage and confirm it starts (dev-only path +preserved). + +**Acceptance Scenarios**: + +1. **Given** the server is configured for the prod stage, **When** it would use + the filesystem storage backend, **Then** startup fails with an error + identifying the misconfiguration and the required remedy (use a shared + database backend). +2. **Given** the server is configured for a non-prod stage, **When** it uses the + filesystem backend, **Then** it starts normally (development workflow + unaffected). +3. **Given** the prod stage with a shared database backend, **When** the server + starts, **Then** there is no filesystem-related failure. + +--- + +### User Story 6 - Operators have an HA configuration runbook (Priority: P3) + +An operator deploying multiple replicas can follow a single runbook listing +every environment variable and external state-store dependency required for a +correct HA deployment, and understands what breaks if each is omitted. 
+ +**Why this priority**: Several of the fixes above depend on operator +configuration (shared secrets, shared state stores, stage selection). Without +documentation the feature is not safely usable, but it depends on the other +stories being defined first, so it is sequenced last. + +**Independent Test**: A reviewer follows only the runbook to configure a 2+ +replica deployment and all P1/P2 acceptance scenarios pass without consulting +source code. + +**Acceptance Scenarios**: + +1. **Given** the operator runbook, **When** an operator configures an HA + deployment using only the runbook, **Then** all required environment + variables and state-store dependencies are covered. +2. **Given** the runbook, **When** an operator reads it, **Then** each HA-related + setting documents the consequence of omitting it. +3. **Given** the runbook, **When** an operator reviews stage guidance, **Then** + the dev-only status of the filesystem backend is clearly stated. + +### Edge Cases + +- **Replica added mid-session**: A newly started replica must immediately honor + existing sessions, challenges, cursors, and the elected canonicalization owner + without restart of the fleet. +- **Replica removed mid-flight**: When the replica that holds canonicalization + leadership disappears, leadership must transfer within a bounded time so + canonicalization is not stalled. +- **Clock skew between replicas**: Challenge/session expiry and lease/lock + timing must remain correct (or fail safe) when replica clocks differ within a + reasonable bound. +- **Concurrent migrations on simultaneous startup**: When 2-6 replicas boot at + once (rolling deploy / cold start) they all run schema migrations against the + one shared store at the same time. Applying migrations MUST be serialized (one + replica migrates, the rest wait then proceed) so first-deploy startup cannot + race or deadlock. See FR-017. 
+- **Shared store outage**: If the shared coordination/state store is temporarily + unavailable, each affected subsystem MUST have a defined, documented behavior + rather than undefined behavior or a crash loop. Specifically: authenticated + requests and login **fail closed** (auth rejected, never bypassed) and the + canonicalization leader **steps down** (work stalls, never double-processes), + both recovering automatically when the store returns. See FR-018. +- **Split brain during leadership handoff**: Two replicas must never both believe + they own canonicalization long enough to double-process a candidate. +- **Mixed configuration across replicas**: When replicas are configured with + different shared secrets (e.g. one missing the cursor secret), the failure mode + must be detectable rather than silent. +- **Single-replica deployments**: All changes must preserve current behavior when + exactly one replica runs (no new mandatory infrastructure for dev/local). + +## Requirements *(mandatory)* + +### Functional Requirements + +- **FR-001**: Operator dashboard auth challenges MUST be resolvable by any + replica, so a challenge issued by one replica can be verified by another. +- **FR-002**: Operator (and EVM, where applicable) sessions MUST be recognized by + any replica, so an authenticated request succeeds on any replica without + re-login. +- **FR-003**: Session and challenge lifecycle events (issuance, consumption, + expiry, logout/revocation) MUST be consistent across replicas; a logged-out or + expired session/challenge MUST be rejected on every replica. +- **FR-004**: Canonicalization of any pending candidate MUST occur exactly once + across the entire fleet per processing interval, regardless of replica count. +- **FR-005**: The system MUST elect, or otherwise coordinate, a single owner for + canonicalization at any given time, and MUST transfer ownership automatically + when the current owner becomes unavailable.
Ownership renewal MUST run + concurrently with (not gated on) the canonicalization pass, the pass MUST be + cooperatively cancellable so a lost owner can stop promptly, and every + state-mutating write (canonical promotion **and** retry/discard) MUST be gated + by an advisory fencing check so a superseded owner is prevented from committing + during the cancellation window. (The fence is a pre-write ownership re-check; + combined with idempotent writes — same delta ⇒ identical bytes — a brief + two-leader overlap can at most re-apply the same transition, never corrupt + state.) +- **FR-006**: Canonicalization retry budgets and state transitions + (promote/discard) MUST be counted once per interval across the fleet, never + once per replica. +- **FR-007**: Pagination cursors MUST be issued and verified using a shared + secret so a cursor issued by one replica is valid on all replicas. +- **FR-008**: When a shared cursor secret is not configured, the system MUST + surface the misconfiguration at startup with a warning in every stage, rather + than silently generating a per-process secret without notice. The server still + boots, using an ephemeral per-process secret; a missing cursor secret degrades + only dashboard pagination across replicas (a cursor minted on one replica is + rejected on another) and never affects correctness or auth, so it is not a + startup guard. +- **FR-009**: The aggregate request rate enforced across all replicas MUST NOT + exceed the configured global limit. This is achieved by dividing the global + limit by the deployment's **maximum replica capacity** (`GUARDIAN_MAX_REPLICAS`), + so each replica enforces `global_limit / GUARDIAN_MAX_REPLICAS`. When fewer than + the maximum number of replicas are running, aggregate enforcement is stricter + than the global limit (never looser); the resulting tolerance band MUST be + documented. 
`GUARDIAN_MAX_REPLICAS` MUST default from the deployment's + autoscaling max capacity (set by infrastructure), not from a manually maintained + value, and MUST remain operator-overridable. +- **FR-010**: Rate limiting MUST NOT introduce any external coordination + dependency on the request hot path; enforcement is per-process arithmetic over + the partitioned budget and therefore has no shared-store failure mode. Any + future shared/global limiter would have to define and document its + fail-open/fail-closed behavior; none is introduced by this feature. +- **FR-011**: The system MUST provide a single configuration value that + identifies the deployment stage (at minimum distinguishing "prod" from + non-prod) usable by HA guardrails. +- **FR-012**: In the prod stage, the system MUST refuse to start with a storage + backend that cannot be shared across replicas (the filesystem backend), + failing fast with an actionable error. +- **FR-013**: In the prod stage, the system MUST fail fast when a setting is + missing or misconfigured in a way that would make every replica serve + incorrectly (e.g. a global rate limit that partitions to zero requests per + replica); in non-prod the same condition MUST warn but allow startup. A missing + shared cursor secret is explicitly NOT such a setting: it warns but boots in + every stage (FR-008), because it degrades pagination only, not correctness. +- **FR-014**: All HA behaviors MUST preserve existing single-replica behavior; + running exactly one replica MUST NOT require new external infrastructure for + local/dev use. +- **FR-015**: The Rust and TypeScript clients MUST observe no behavior drift as a + result of these changes; the wire contract for clients MUST remain unchanged + unless an explicit, documented contract change is made. 
+- **FR-016**: Operator-facing documentation MUST enumerate every environment + variable and external state-store dependency required for a correct HA + deployment, including the consequence of omitting each, and MUST mark the + filesystem backend as dev-only. +- **FR-017**: Schema migrations MUST be safe under concurrent execution by + multiple replicas starting simultaneously; migration application MUST be + serialized across the fleet so a first deploy cannot race or deadlock, with no + manual "migrate first, then start" operator step required. +- **FR-018**: When the shared state store is briefly unavailable, authentication + (login and authenticated requests) MUST fail closed (rejected, never bypassed) + and the canonicalization owner MUST step down rather than risk double-processing; + both MUST recover automatically when the store returns. This fail-closed auth + behavior is an accepted, documented change from the previous always-available + in-memory behavior. +- **FR-019**: At startup the server MUST emit a single, unambiguous log line + stating which coordination mode is active — "shared" (backed by the external + store, replica-safe) or "single-process" (in-memory, single-replica only) — + together with the effective HA-relevant settings it derives from configuration: + the storage backend, the deployment stage, the maximum replica capacity, and + whether the pagination cursor secret was supplied or generated. This makes the + active mode explicit and diagnosable without inferring it from other logs, and + is the discoverable signal that replaces an explicit mode toggle (coordination + capability is determined by resolved configuration, not a separate flag). The + line MUST reflect the actual resolved state, never operator intent. 
+- **FR-020**: The coordination mode MUST be determined by the **storage backend + alone**: the Postgres backend MUST use shared coordination (sessions, + challenges, leadership) and the filesystem backend MUST use in-memory + coordination. Shared coordination MUST be the default whenever Postgres is + active and MUST NOT be disabled by any tunable — a missing, mis-overridden, or + low `GUARDIAN_MAX_REPLICAS` (or any other knob) MUST NEVER silently reintroduce + per-process auth/canonicalization state on a Postgres deployment. (Skipping the + per-request session lookup for a deployment known to be single-instance is a + possible future optimization behind an explicit, guarded opt-in; it is out of + scope here and MUST NOT be inferred from a rate-limit signal.) + +### Key Entities + +- **Auth Challenge**: A short-lived, one-time login challenge bound to an + operator identity; must be readable and consumable by any replica until it + expires or is consumed. +- **Operator Session**: An authenticated session with an issue and expiry time + and a revocation (logout) state; must be authoritative across replicas. +- **Canonicalization Lease / Leadership**: The right, held by at most one replica + at a time, to run the canonicalization worker; has a holder identity, an + expiry/heartbeat so it can be reclaimed, and a fencing token (advancing on each + steal) re-checked before every state-mutating write so a superseded holder is + prevented from committing (advisory check, made safe by idempotent writes). +- **Pagination Cursor**: An opaque, integrity-protected continuation token whose + validity depends on a secret shared by all replicas. +- **Maximum Replica Capacity** (`GUARDIAN_MAX_REPLICAS`): The + infrastructure-derived signal for how many replicas the deployment can scale to. + It feeds **rate-limit partitioning only** (`global_limit / GUARDIAN_MAX_REPLICAS`). + It MUST NOT influence the coordination mode (which is backend-derived, FR-020). 
+- **Effective Rate-Limit Budget**: The per-replica share of the global limit, + computed as `global_limit / GUARDIAN_MAX_REPLICAS`. Per-client burst/sustained + counters remain per-process; they are partitioned, not aggregated, so total + enforcement stays at or below the global limit. +- **Deployment Stage**: A configuration value identifying the environment (prod + vs. non-prod) that gates HA guardrails. + +## Success Criteria *(mandatory)* + +### Measurable Outcomes + +- **SC-001**: With 2+ replicas behind the load balancer, an operator completes + the full login flow with a 100% success rate across 20 consecutive attempts, + including attempts where challenge and verification are forced onto different + replicas. +- **SC-002**: With 2+ replicas, every pending candidate is canonicalized exactly + once - zero duplicate promotions, discards, or submissions - across a test of + at least 50 candidates. +- **SC-003**: When the replica holding canonicalization leadership is terminated, + canonicalization resumes on another replica within the configured lease TTL + (the failover bound, independent of the delta submission grace period), with no + manual intervention. +- **SC-004**: With 2+ replicas and a shared cursor secret, 100% of pagination + cursors issued by one replica are accepted by other replicas across a paging + test of at least 100 page transitions. +- **SC-005**: With N replicas, the aggregate accepted request rate for a client + exceeding the configured limit stays at or below the configured global limit + (rather than ~ Nx the limit). The documented tolerance band MUST also state the + two sources of imprecision: (a) running below the autoscaling max capacity enforces + stricter than the global limit, and (b) HTTP keep-alive can pin a single client + to one replica, so that client may be throttled at + `global_limit / GUARDIAN_MAX_REPLICAS` (e.g. 1/6 of the global limit) — an over-strict, fail-closed + outcome for that client.
Both are accepted trade-offs of partitioning without + shared hot-path state. +- **SC-006**: A prod-stage server configured with the filesystem backend (or with + a global rate limit that partitions to zero requests per replica) fails to + start 100% of the time with an error that names the misconfiguration and the + remedy. +- **SC-007**: A reviewer who has never seen the code can stand up a correct 2+ + replica deployment using only the operator runbook, and all P1/P2 acceptance + scenarios pass. +- **SC-008**: All existing single-replica test suites pass unchanged, confirming + no regression for dev/local deployments. +- **SC-009**: On startup, the server logs exactly one coordination-mode line that + correctly reports "shared" when backed by the external store and + "single-process" otherwise, including the resolved backend, stage, max replica + capacity, and cursor-secret source; an operator can determine the active mode + from that single line alone (mode follows the storage backend). + +## Assumptions + +- The shipped production image is built with the Postgres storage backend, so a + shared relational database is available to replicas and is the natural shared + coordination/state store for sessions, challenges, and leadership. (Rate + limiting is partitioned per-process, not a shared counter — see FR-010.) No + new infrastructure component (e.g. a separate cache or + queue) is assumed to be mandatory; if one is proposed it will be justified in + planning. +- "Prod stage" is represented by the existing `GUARDIAN_ENV=prod` signal (today + used only for ACK secret sourcing), extended to gate HA guardrails. Confirming + this versus introducing a dedicated stage variable is a planning decision. +- The cursor secret environment variable already exists + (`GUARDIAN_DASHBOARD_CURSOR_SECRET`); this feature changes its enforcement, not + its format. +- The load balancer does not provide sticky sessions; correctness must not depend + on session affinity. 
+- Replica clocks are synchronized within a few seconds (standard for the ECS + environment); expiry/lease logic must tolerate small skew. +- Rate limiting is partitioned conservatively against the autoscaling **max** + capacity (not the current replica count), so it is never silently looser than + the global limit during scale-out and over-throttles (conservatively stricter) + when running below max capacity. A documented tolerance band for this + over-throttling is acceptable, consistent with the issue's "within some + documented tolerance". +- The infrastructure already computes the autoscaling max capacity + (`infra/data.tf` `effective_server_autoscaling_max_capacity`, prod = + `max(desired, 6)`); `GUARDIAN_MAX_REPLICAS` defaults from it via Terraform + rather than a manually maintained value. It drives **rate-limit partitioning + only**; the coordination mode is backend-derived (FR-020). + +## Dependencies + +- Issue [#190](https://github.com/OpenZeppelin/guardian/issues/190) (single + canonicalization owner / no leader election) is subsumed by User Story 2. +- Existing configuration surface: `GUARDIAN_DASHBOARD_CURSOR_SECRET`, + `GUARDIAN_ENV`, `GUARDIAN_RATE_LIMIT_ENABLED`, `GUARDIAN_RATE_BURST_PER_SEC`, + `GUARDIAN_RATE_PER_MIN`, `DATABASE_URL`, `GUARDIAN_STORAGE_PATH`, + `GUARDIAN_METADATA_PATH`. +- New configuration: `GUARDIAN_MAX_REPLICAS` (maximum replica capacity; drives + **rate-limit partitioning only**; defaults from + `effective_server_autoscaling_max_capacity`). +- Infrastructure wiring (in scope): `infra/data.tf` + (`effective_server_autoscaling_max_capacity`) and `infra/ecs.tf` (server env + block) must set `GUARDIAN_MAX_REPLICAS` so the correct default ships without + operator action. +- Operator documentation set (`docs/CONFIGURATION.md`, AWS deploy docs, runbooks) + must be updated per the contributor docs table. 
+ +## Out of Scope + +- Autoscaling policy, ALB/ECS provisioning, or Terraform changes beyond the + `GUARDIAN_MAX_REPLICAS` env-var wiring (in scope above) and documenting required + configuration. +- Skipping shared coordination for a known single-instance Postgres deployment + (a per-request-lookup optimization); if pursued later it MUST be an explicit, + guarded opt-in, never inferred from `GUARDIAN_MAX_REPLICAS` or another tunable. +- Changing the storage backend selection from a compile-time feature to a runtime + switch. +- Multi-region or active/active cross-region deployment. +- End-user-facing (custody client) feature changes; this work is server-side + correctness and operability only.