Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

10 changes: 10 additions & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,10 @@ path = "tests/backend_auth.rs"
name = "authz"
path = "tests/authz.rs"

[[test]]
name = "sts_cache"
path = "tests/sts_cache.rs"

[dependencies]
# Multistore
multistore = { version = "0.6.2", features = ["azure"] }
Expand All @@ -48,6 +52,12 @@ sha2 = "0.10"
# Tracing
tracing = "0.1"

# Time — parse the STS response `<Expiration>` for the L2 credential cache TTL.
# Already in the tree transitively (multistore); named directly here so sts_cache
# can use it. `alloc` is enough for `parse_from_rfc3339`; feature unification adds
# whatever the wasm build needs.
chrono = { version = "0.4", default-features = false, features = ["alloc"] }

# Wasm-only dependencies (Cloudflare Workers runtime)
[target.'cfg(target_arch = "wasm32")'.dependencies]
# Pulled in transitively via object_store -> rand. getrandom doesn't support
Expand Down
52 changes: 50 additions & 2 deletions src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ mod location;
mod pagination;
mod source_api;
mod sts;
mod sts_cache;

use crate::source_api::{ApiAuth, SourceCoopRegistry};
use analytics::log_analytics;
Expand Down Expand Up @@ -87,6 +88,17 @@ impl HttpExchange for FetchHttpExchange {
url: &str,
form: &[(&str, &str)],
) -> std::result::Result<String, OidcProviderError> {
// L2 (cross-isolate, per-colo) cache for the AssumeRoleWithWebIdentity
// response, keyed by RoleArn. On a hit we skip the slow STS round-trip
// entirely — the one thing that stalls the request hot path on a cold
// isolate. multistore's in-isolate cache is L1; this sits under it.
let cache_key = sts_cache::role_arn_from_form(form).map(sts_cache::cache_key);
if let Some(ref key) = cache_key {
if let Some(body) = sts_cache_get(key).await {
return Ok(body);
}
}

let resp = self
.client
.post(url)
Expand All @@ -100,9 +112,45 @@ impl HttpExchange for FetchHttpExchange {
// body on 4xx/5xx, and multistore's `parse_response` reads the error
// (code + message) out of that body. Discarding it on a non-2xx would
// lose the diagnostic and the precise ProxyError mapping.
resp.text()
let body = resp
.text()
.await
.map_err(|e| OidcProviderError::HttpError(e.to_string()))
.map_err(|e| OidcProviderError::HttpError(e.to_string()))?;

// Cache only a successful, not-near-expiry credential response —
// `ttl_secs` returns None for an STS error document, so failures are
// never cached.
if let Some(ref key) = cache_key {
let now = (worker::Date::now().as_millis() / 1000) as i64;
if let Some(ttl) = sts_cache::ttl_secs(&body, now) {
sts_cache_put(key, &body, ttl).await;
}
}
Ok(body)
}
}

/// L2 read. Best-effort: any cache error degrades to a miss (we just call STS).
async fn sts_cache_get(key: &str) -> Option<String> {
let mut resp = worker::Cache::default().get(key, false).await.ok()??;
resp.text().await.ok()
}

/// L2 write. Best-effort: a failed put just means the next request re-mints.
async fn sts_cache_put(key: &str, body: &str, ttl_secs: u32) {
let headers = worker::Headers::new();
let _ = headers.set("content-type", "text/xml");
let _ = headers.set("cache-control", &format!("max-age={ttl_secs}"));
match worker::Response::ok(body) {
Ok(resp) => {
if let Err(e) = worker::Cache::default()
.put(key, resp.with_headers(headers))
.await
{
tracing::warn!("STS L2 cache put failed: {e}");
}
}
Err(e) => tracing::warn!("STS L2 cache response build failed: {e}"),
}
}

Expand Down
68 changes: 68 additions & 0 deletions src/sts_cache.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
//! Cross-isolate (per-colo) L2 cache for the STS `AssumeRoleWithWebIdentity`
//! response, layered UNDER multistore's in-isolate credential cache.
//!
//! The slow part of backend federation is the STS round-trip, and multistore's
//! credential cache lives in per-isolate memory — so every cold isolate re-runs
//! it on the request hot path. Caching the STS response in the Cloudflare Cache
//! API (shared across isolates within a colo) means only one isolate per colo
//! per credential lifetime actually calls STS; the rest serve the cached body.
//!
//! Only the pure helpers live here so they are host-testable via
//! `tests/sts_cache.rs` (the lib is `cdylib` with `test = false`). The Cache API
//! I/O lives in `lib.rs`, where the wasm-only `worker` types are available.

use percent_encoding::{utf8_percent_encode, NON_ALPHANUMERIC};

/// Stop serving a cached response this many seconds before the credential
/// actually expires. Kept >= multistore's own 60s in-isolate refresh lead, so an
/// L2 entry always expires before L1 would consider the derived credential stale
/// — the two tiers never hand out an about-to-expire credential.
pub const REFRESH_LEAD_SECS: i64 = 300; // 5 minutes

/// The `RoleArn` iff `form` is an `AssumeRoleWithWebIdentity` request — the only
/// exchange we cache (other STS actions and the Azure/GCP bearer flows return
/// `None` and bypass the cache). `RoleArn` is multistore's own L1 cache key, so
/// L2 keys line up with L1 exactly.
pub fn role_arn_from_form<'a>(form: &'a [(&'a str, &'a str)]) -> Option<&'a str> {
let is_assume_role = form
.iter()
.any(|&(k, v)| k == "Action" && v == "AssumeRoleWithWebIdentity");
if !is_assume_role {
return None;
}
form.iter().find(|&&(k, _)| k == "RoleArn").map(|&(_, v)| v)
}

/// Cache key: a synthetic, non-routable URL. Cache API entries are only returned
/// when a request URL matches the key, and this host never arrives as a real edge
/// request — so a cached (short-lived, role-scoped) credential is not externally
/// addressable.
pub fn cache_key(role_arn: &str) -> String {
format!(
"https://sts-creds.cache.internal/v1/{}",
utf8_percent_encode(role_arn, NON_ALPHANUMERIC)
)
}

/// Seconds the STS response may be cached: time until its `<Expiration>` minus
/// the refresh lead. `None` means **do not cache** — either an STS error document
/// (no parseable `<Expiration>`) or a response already inside the lead window.
/// `now_unix` is injected (rather than read from a clock) so this stays pure and
/// host-testable.
pub fn ttl_secs(sts_response_xml: &str, now_unix: i64) -> Option<u32> {
let exp = extract_tag(sts_response_xml, "Expiration")?;
let exp_unix = chrono::DateTime::parse_from_rfc3339(exp.trim())
.ok()?
.timestamp();
let ttl = exp_unix - now_unix - REFRESH_LEAD_SECS;
(ttl > 0).then_some(ttl as u32)
}

/// First-match text of `<tag>…</tag>`. The STS body is a fixed, trusted shape, so
/// a full XML parser is not warranted.
fn extract_tag<'a>(xml: &'a str, tag: &str) -> Option<&'a str> {
let (open, close) = (format!("<{tag}>"), format!("</{tag}>"));
let start = xml.find(&open)? + open.len();
let end = xml[start..].find(&close)? + start;
Some(&xml[start..end])
}
89 changes: 89 additions & 0 deletions tests/sts_cache.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,89 @@
//! Native unit tests for the wasm-free `sts_cache` helpers, included via
//! `#[path]` (the lib itself is `cdylib` with `test = false`). Mirrors the
//! pattern in `tests/backend_auth.rs`.

#[path = "../src/sts_cache.rs"]
mod sts_cache;

use sts_cache::{cache_key, role_arn_from_form, ttl_secs, REFRESH_LEAD_SECS};

const OK_RESP: &str = "<AssumeRoleWithWebIdentityResponse><Credentials>\
<AccessKeyId>AKID</AccessKeyId>\
<Expiration>2026-06-30T01:00:00Z</Expiration>\
</Credentials></AssumeRoleWithWebIdentityResponse>";

fn exp_unix() -> i64 {
chrono::DateTime::parse_from_rfc3339("2026-06-30T01:00:00Z")
.unwrap()
.timestamp()
}

// ── role_arn_from_form ─────────────────────────────────────────────

#[test]
fn role_arn_only_for_assume_role() {
let assume = [
("Action", "AssumeRoleWithWebIdentity"),
("RoleArn", "arn:aws:iam::1:role/r"),
("WebIdentityToken", "jwt"),
];
assert_eq!(role_arn_from_form(&assume), Some("arn:aws:iam::1:role/r"));
}

#[test]
fn non_assume_role_action_is_not_cached() {
// A different action must bypass the cache even if a RoleArn is present.
let other = [("Action", "GetCallerIdentity"), ("RoleArn", "arn:x")];
assert_eq!(role_arn_from_form(&other), None);
}

#[test]
fn assume_role_without_role_arn_is_none() {
let no_arn = [("Action", "AssumeRoleWithWebIdentity")];
assert_eq!(role_arn_from_form(&no_arn), None);
}

// ── cache_key ──────────────────────────────────────────────────────

#[test]
fn cache_key_is_non_routable_and_encodes_arn() {
let k = cache_key("arn:aws:iam::1:role/r");
assert!(k.starts_with("https://sts-creds.cache.internal/v1/"));
// The `:` and `/` in the ARN must be percent-encoded so the key is one
// well-formed, collision-free path segment.
assert!(!k
.trim_start_matches("https://sts-creds.cache.internal/v1/")
.contains(':'));
assert_ne!(cache_key("arn:a"), cache_key("arn:b"));
}

// ── ttl_secs ───────────────────────────────────────────────────────

#[test]
fn ttl_is_time_to_expiry_minus_lead() {
let now = exp_unix() - 3600; // one hour before expiry
assert_eq!(
ttl_secs(OK_RESP, now),
Some((3600 - REFRESH_LEAD_SECS) as u32)
);
}

#[test]
fn near_expiry_is_not_cached() {
let now = exp_unix() - 60; // inside the 300s lead window
assert_eq!(ttl_secs(OK_RESP, now), None);
}

#[test]
fn expired_is_not_cached() {
let now = exp_unix() + 10; // already past expiry
assert_eq!(ttl_secs(OK_RESP, now), None);
}

#[test]
fn sts_error_document_is_not_cached() {
// No <Expiration> → never cache a failure as if it were a credential.
let err = "<ErrorResponse><Error><Code>AccessDenied</Code>\
<Message>nope</Message></Error></ErrorResponse>";
assert_eq!(ttl_secs(err, 0), None);
}
Loading