diff --git a/Cargo.lock b/Cargo.lock index 2087043..00dc4ce 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1733,6 +1733,7 @@ dependencies = [ name = "source-data-proxy" version = "2.1.2" dependencies = [ + "chrono", "console_error_panic_hook", "getrandom 0.4.2", "hmac", diff --git a/Cargo.toml b/Cargo.toml index 6b27c8a..e4c141b 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -26,6 +26,10 @@ path = "tests/backend_auth.rs" name = "authz" path = "tests/authz.rs" +[[test]] +name = "sts_cache" +path = "tests/sts_cache.rs" + [dependencies] # Multistore multistore = { version = "0.6.2", features = ["azure"] } @@ -48,6 +52,12 @@ sha2 = "0.10" # Tracing tracing = "0.1" +# Time — parse the STS response `` for the L2 credential cache TTL. +# Already in the tree transitively (multistore); named directly here so sts_cache +# can use it. `alloc` is enough for `parse_from_rfc3339`; feature unification adds +# whatever the wasm build needs. +chrono = { version = "0.4", default-features = false, features = ["alloc"] } + # Wasm-only dependencies (Cloudflare Workers runtime) [target.'cfg(target_arch = "wasm32")'.dependencies] # Pulled in transitively via object_store -> rand. getrandom doesn't support diff --git a/src/lib.rs b/src/lib.rs index 4660e49..b25f43d 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -16,6 +16,7 @@ mod location; mod pagination; mod source_api; mod sts; +mod sts_cache; use crate::source_api::{ApiAuth, SourceCoopRegistry}; use analytics::log_analytics; @@ -87,6 +88,17 @@ impl HttpExchange for FetchHttpExchange { url: &str, form: &[(&str, &str)], ) -> std::result::Result { + // L2 (cross-isolate, per-colo) cache for the AssumeRoleWithWebIdentity + // response, keyed by RoleArn. On a hit we skip the slow STS round-trip + // entirely — the one thing that stalls the request hot path on a cold + // isolate. multistore's in-isolate cache is L1; this sits under it. + let cache_key = sts_cache::role_arn_from_form(form).map(sts_cache::cache_key); + if let Some(ref key) = cache_key { + if let Some(body) = sts_cache_get(key).await { + return Ok(body); + } + } + let resp = self .client .post(url) @@ -100,9 +112,45 @@ impl HttpExchange for FetchHttpExchange { // body on 4xx/5xx, and multistore's `parse_response` reads the error // (code + message) out of that body. Discarding it on a non-2xx would // lose the diagnostic and the precise ProxyError mapping. - resp.text() + let body = resp + .text() .await - .map_err(|e| OidcProviderError::HttpError(e.to_string())) + .map_err(|e| OidcProviderError::HttpError(e.to_string()))?; + + // Cache only a successful, not-near-expiry credential response — + // `ttl_secs` returns None for an STS error document, so failures are + // never cached. + if let Some(ref key) = cache_key { + let now = (worker::Date::now().as_millis() / 1000) as i64; + if let Some(ttl) = sts_cache::ttl_secs(&body, now) { + sts_cache_put(key, &body, ttl).await; + } + } + Ok(body) + } +} + +/// L2 read. Best-effort: any cache error degrades to a miss (we just call STS). +async fn sts_cache_get(key: &str) -> Option { + let mut resp = worker::Cache::default().get(key, false).await.ok()??; + resp.text().await.ok() +} + +/// L2 write. Best-effort: a failed put just means the next request re-mints. +async fn sts_cache_put(key: &str, body: &str, ttl_secs: u32) { + let headers = worker::Headers::new(); + let _ = headers.set("content-type", "text/xml"); + let _ = headers.set("cache-control", &format!("max-age={ttl_secs}")); + match worker::Response::ok(body) { + Ok(resp) => { + if let Err(e) = worker::Cache::default() + .put(key, resp.with_headers(headers)) + .await + { + tracing::warn!("STS L2 cache put failed: {e}"); + } + } + Err(e) => tracing::warn!("STS L2 cache response build failed: {e}"), } } diff --git a/src/sts_cache.rs b/src/sts_cache.rs new file mode 100644 index 0000000..365a87f --- /dev/null +++ b/src/sts_cache.rs @@ -0,0 +1,68 @@ +//! Cross-isolate (per-colo) L2 cache for the STS `AssumeRoleWithWebIdentity` +//! response, layered UNDER multistore's in-isolate credential cache. +//! +//! The slow part of backend federation is the STS round-trip, and multistore's +//! credential cache lives in per-isolate memory — so every cold isolate re-runs +//! it on the request hot path. Caching the STS response in the Cloudflare Cache +//! API (shared across isolates within a colo) means only one isolate per colo +//! per credential lifetime actually calls STS; the rest serve the cached body. +//! +//! Only the pure helpers live here so they are host-testable via +//! `tests/sts_cache.rs` (the lib is `cdylib` with `test = false`). The Cache API +//! I/O lives in `lib.rs`, where the wasm-only `worker` types are available. + +use percent_encoding::{utf8_percent_encode, NON_ALPHANUMERIC}; + +/// Stop serving a cached response this many seconds before the credential +/// actually expires. Kept >= multistore's own 60s in-isolate refresh lead, so an +/// L2 entry always expires before L1 would consider the derived credential stale +/// — the two tiers never hand out an about-to-expire credential. +pub const REFRESH_LEAD_SECS: i64 = 300; // 5 minutes + +/// The `RoleArn` iff `form` is an `AssumeRoleWithWebIdentity` request — the only +/// exchange we cache (other STS actions and the Azure/GCP bearer flows return +/// `None` and bypass the cache). `RoleArn` is multistore's own L1 cache key, so +/// L2 keys line up with L1 exactly. +pub fn role_arn_from_form<'a>(form: &'a [(&'a str, &'a str)]) -> Option<&'a str> { + let is_assume_role = form + .iter() + .any(|&(k, v)| k == "Action" && v == "AssumeRoleWithWebIdentity"); + if !is_assume_role { + return None; + } + form.iter().find(|&&(k, _)| k == "RoleArn").map(|&(_, v)| v) +} + +/// Cache key: a synthetic, non-routable URL. Cache API entries are only returned +/// when a request URL matches the key, and this host never arrives as a real edge +/// request — so a cached (short-lived, role-scoped) credential is not externally +/// addressable. +pub fn cache_key(role_arn: &str) -> String { + format!( + "https://sts-creds.cache.internal/v1/{}", + utf8_percent_encode(role_arn, NON_ALPHANUMERIC) + ) +} + +/// Seconds the STS response may be cached: time until its `` minus +/// the refresh lead. `None` means **do not cache** — either an STS error document +/// (no parseable ``) or a response already inside the lead window. +/// `now_unix` is injected (rather than read from a clock) so this stays pure and +/// host-testable. +pub fn ttl_secs(sts_response_xml: &str, now_unix: i64) -> Option { + let exp = extract_tag(sts_response_xml, "Expiration")?; + let exp_unix = chrono::DateTime::parse_from_rfc3339(exp.trim()) + .ok()? + .timestamp(); + let ttl = exp_unix - now_unix - REFRESH_LEAD_SECS; + (ttl > 0).then_some(ttl as u32) +} + +/// First-match text of ``. The STS body is a fixed, trusted shape, so +/// a full XML parser is not warranted. +fn extract_tag<'a>(xml: &'a str, tag: &str) -> Option<&'a str> { + let (open, close) = (format!("<{tag}>"), format!("")); + let start = xml.find(&open)? + open.len(); + let end = xml[start..].find(&close)? + start; + Some(&xml[start..end]) +} diff --git a/tests/sts_cache.rs b/tests/sts_cache.rs new file mode 100644 index 0000000..250d3ba --- /dev/null +++ b/tests/sts_cache.rs @@ -0,0 +1,89 @@ +//! Native unit tests for the wasm-free `sts_cache` helpers, included via +//! `#[path]` (the lib itself is `cdylib` with `test = false`). Mirrors the +//! pattern in `tests/backend_auth.rs`. + +#[path = "../src/sts_cache.rs"] +mod sts_cache; + +use sts_cache::{cache_key, role_arn_from_form, ttl_secs, REFRESH_LEAD_SECS}; + +const OK_RESP: &str = "\ + AKID\ + 2026-06-30T01:00:00Z\ + "; + +fn exp_unix() -> i64 { + chrono::DateTime::parse_from_rfc3339("2026-06-30T01:00:00Z") + .unwrap() + .timestamp() +} + +// ── role_arn_from_form ───────────────────────────────────────────── + +#[test] +fn role_arn_only_for_assume_role() { + let assume = [ + ("Action", "AssumeRoleWithWebIdentity"), + ("RoleArn", "arn:aws:iam::1:role/r"), + ("WebIdentityToken", "jwt"), + ]; + assert_eq!(role_arn_from_form(&assume), Some("arn:aws:iam::1:role/r")); +} + +#[test] +fn non_assume_role_action_is_not_cached() { + // A different action must bypass the cache even if a RoleArn is present. + let other = [("Action", "GetCallerIdentity"), ("RoleArn", "arn:x")]; + assert_eq!(role_arn_from_form(&other), None); +} + +#[test] +fn assume_role_without_role_arn_is_none() { + let no_arn = [("Action", "AssumeRoleWithWebIdentity")]; + assert_eq!(role_arn_from_form(&no_arn), None); +} + +// ── cache_key ────────────────────────────────────────────────────── + +#[test] +fn cache_key_is_non_routable_and_encodes_arn() { + let k = cache_key("arn:aws:iam::1:role/r"); + assert!(k.starts_with("https://sts-creds.cache.internal/v1/")); + // The `:` and `/` in the ARN must be percent-encoded so the key is one + // well-formed, collision-free path segment. + assert!(!k + .trim_start_matches("https://sts-creds.cache.internal/v1/") + .contains(':')); + assert_ne!(cache_key("arn:a"), cache_key("arn:b")); +} + +// ── ttl_secs ─────────────────────────────────────────────────────── + +#[test] +fn ttl_is_time_to_expiry_minus_lead() { + let now = exp_unix() - 3600; // one hour before expiry + assert_eq!( + ttl_secs(OK_RESP, now), + Some((3600 - REFRESH_LEAD_SECS) as u32) + ); +} + +#[test] +fn near_expiry_is_not_cached() { + let now = exp_unix() - 60; // inside the 300s lead window + assert_eq!(ttl_secs(OK_RESP, now), None); +} + +#[test] +fn expired_is_not_cached() { + let now = exp_unix() + 10; // already past expiry + assert_eq!(ttl_secs(OK_RESP, now), None); +} + +#[test] +fn sts_error_document_is_not_cached() { + // No → never cache a failure as if it were a credential. + let err = "AccessDenied\ + nope"; + assert_eq!(ttl_secs(err, 0), None); +}