From e4b19cecff3f1d81f20c96f6c4cdde62acf003a9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?F=C3=A9lix=20Saparelli?= Date: Fri, 26 Jun 2026 19:21:03 +1200 Subject: [PATCH 1/7] fix(alertd): report version_drift as broken when podman ps fails A failed `podman ps` means the check could not run, so it says nothing about the system. That is a broken check, not a warning (which implies a degraded system). Co-authored-by: Claude Opus 4.8 --- .../alertd/src/doctor/checks/version_drift.rs | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/crates/alertd/src/doctor/checks/version_drift.rs b/crates/alertd/src/doctor/checks/version_drift.rs index bde07a1b..3e69e42e 100644 --- a/crates/alertd/src/doctor/checks/version_drift.rs +++ b/crates/alertd/src/doctor/checks/version_drift.rs @@ -90,12 +90,13 @@ pub async fn run(ctx: CheckContext) -> Check { evaluate_drift(&running, &expected_versions, &expectations) } -/// Warning result for when `podman ps` couldn't be read at all. We can't judge -/// drift, and must not report a pass — that would dress up a blind check as a -/// healthy one. Most often this is alertd lacking access to the root-owned -/// containers (see the podman-socket / privilege notes). +/// Broken result for when `podman ps` couldn't be read at all. The check itself +/// couldn't run, so it says nothing about the system — that's broken, not a +/// warning (which would imply a degraded system) and not a pass (which would +/// dress up a blind check as a healthy one). Most often this is alertd lacking +/// access to the root-owned containers (see the podman-socket / privilege notes). fn unreadable_check(reason: &str) -> Check { - Check::warning( + Check::broken( "version_drift", "could not read running container versions", format!("`podman ps` failed, so version drift can't be checked: {reason}"), @@ -194,10 +195,11 @@ mod tests { } #[test] - fn unreadable_is_warning_not_pass() { - // The regression that motivated this: a blind check must NOT look healthy. + fn unreadable_is_broken_not_pass() { + // The check couldn't run at all, so it says nothing about the system: + // broken, not a pass that would dress up a blind check as healthy. let check = unreadable_check("podman not found on PATH"); - assert!(matches!(check.status, CheckStatus::Warning(_)), "{check:?}"); + assert!(matches!(check.status, CheckStatus::Broken(_)), "{check:?}"); } #[test] From 460d9162ec943df3ebaf9dfa41e2b9f2c84aa3e0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?F=C3=A9lix=20Saparelli?= Date: Fri, 26 Jun 2026 19:25:23 +1200 Subject: [PATCH 2/7] fix(alertd): report caddy_version as broken when caddy can't be read caddy exiting non-zero or emitting unparseable output means the check couldn't determine a version, so it says nothing about the system. That's a broken check, not a skip (which implies a precondition like caddy not being installed). Co-authored-by: Claude Opus 4.8 --- crates/alertd/src/doctor/checks/caddy_version.rs | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/crates/alertd/src/doctor/checks/caddy_version.rs b/crates/alertd/src/doctor/checks/caddy_version.rs index 4b2f6d94..8f095421 100644 --- a/crates/alertd/src/doctor/checks/caddy_version.rs +++ b/crates/alertd/src/doctor/checks/caddy_version.rs @@ -39,8 +39,11 @@ pub async fn run(_ctx: SweepContext) -> Check { let output = match Command::new(caddy::program()).arg("version").output().await { Ok(o) if o.status.success() => o, Ok(o) => { + // caddy ran but couldn't tell us its version, so the check couldn't + // run — that's broken, not a skip (which would imply a precondition + // like caddy not being installed). let stderr = String::from_utf8_lossy(&o.stderr); - return Check::skip( + return Check::broken( CHECK_NAME, "caddy version failed", format!("caddy exited {}: {}", o.status, stderr.trim()), @@ -59,7 +62,9 @@ pub async fn run(_ctx: SweepContext) -> Check { let version = match parse_caddy_version(&stdout) { Some(v) => v, None => { - return Check::skip( + // caddy answered but with output we can't read a version from, so + // the check couldn't run: broken, not skip. + return Check::broken( CHECK_NAME, "caddy version unparseable", format!("could not parse semver from `caddy version` output: {stdout:?}"), From 739127e39e8d66e8e11095b3b90e07c3808201ed Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?F=C3=A9lix=20Saparelli?= Date: Fri, 26 Jun 2026 19:25:36 +1200 Subject: [PATCH 3/7] fix(alertd): report inodes as broken when df fails to run A non-zero `df` exit means inode usage couldn't be read, so the check couldn't run and says nothing about the system: broken, not skip (skip stays for df not being on PATH). Co-authored-by: Claude Opus 4.8 --- crates/alertd/src/doctor/checks/inodes.rs | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/crates/alertd/src/doctor/checks/inodes.rs b/crates/alertd/src/doctor/checks/inodes.rs index 41fbcba5..8bd44ee3 100644 --- a/crates/alertd/src/doctor/checks/inodes.rs +++ b/crates/alertd/src/doctor/checks/inodes.rs @@ -33,7 +33,10 @@ pub async fn run(_ctx: SweepContext) -> Check { let output = match Command::new("df").args(["-P", "-i", "-T"]).output().await { Ok(o) if o.status.success() => String::from_utf8_lossy(&o.stdout).into_owned(), Ok(o) => { - return Check::skip( + // df ran but failed, so we couldn't read inode usage at all — the + // check couldn't run. That's broken, not a skip (which is for df not + // being present at all, handled below). + return Check::broken( NAME, "df failed", format!( From d2e636cb20ff0019aff1afb48f1fb94647bb5c24 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?F=C3=A9lix=20Saparelli?= Date: Fri, 26 Jun 2026 19:25:57 +1200 Subject: [PATCH 4/7] fix(alertd): report kopia_backup as broken on undecodable output When kopia exits cleanly but emits output we can't parse, the check couldn't run and says nothing about backups: broken, not skip. The non-zero-exit arms stay skip because on Linux they legitimately catch sudo/elevation denial, a privilege precondition. Co-authored-by: Claude Opus 4.8 --- crates/alertd/src/doctor/checks/kopia_backup.rs | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/crates/alertd/src/doctor/checks/kopia_backup.rs b/crates/alertd/src/doctor/checks/kopia_backup.rs index f866e8a4..290fcc5e 100644 --- a/crates/alertd/src/doctor/checks/kopia_backup.rs +++ b/crates/alertd/src/doctor/checks/kopia_backup.rs @@ -91,7 +91,9 @@ async fn run_linux() -> Check { let snapshots: Vec = match serde_json::from_slice(&output.stdout) { Ok(s) => s, Err(err) => { - return Check::skip( + // kopia succeeded but handed us output we can't decode, so the check + // couldn't run — broken, not skip. + return Check::broken( CHECK_NAME, "kopia output unparseable", format!("could not decode kopia snapshot list JSON: {err}"), @@ -158,7 +160,9 @@ async fn run_windows() -> Check { let snapshots: Vec = match serde_json::from_slice(&output.stdout) { Ok(s) => s, Err(err) => { - return Check::skip( + // kopia succeeded but handed us output we can't decode, so the check + // couldn't run — broken, not skip. + return Check::broken( CHECK_NAME, "kopia output unparseable", format!("could not decode kopia snapshot list JSON: {err}"), From 7c6b19cc4921053a22607013ba2df6ab33b9047a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?F=C3=A9lix=20Saparelli?= Date: Fri, 26 Jun 2026 19:26:07 +1200 Subject: [PATCH 5/7] fix(alertd): report migrations as broken on row decode failure MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit A row that comes back but won't decode is a schema/check mismatch, not a system fault — mirror how query_error_check treats 42xxx schema errors and report broken rather than failed. Co-authored-by: Claude Opus 4.8 --- crates/alertd/src/doctor/checks/migrations.rs | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/crates/alertd/src/doctor/checks/migrations.rs b/crates/alertd/src/doctor/checks/migrations.rs index 9151f1c4..66a278a4 100644 --- a/crates/alertd/src/doctor/checks/migrations.rs +++ b/crates/alertd/src/doctor/checks/migrations.rs @@ -11,7 +11,10 @@ pub async fn run(ctx: CheckContext) -> Check { Ok(Some(row)) => match row.try_get::<_, String>(0) { Ok(name) => Check::pass("migrations", format!("last: {name}")) .with_detail("last_migration", name), - Err(err) => Check::fail("migrations", "row decode failed", err.to_string()), + // The query ran and returned a row, but its column didn't decode as + // expected — a schema/check mismatch, not a system fault. Mirrors how + // `query_error_check` treats 42xxx schema errors. + Err(err) => Check::broken("migrations", "row decode failed", err.to_string()), }, Ok(None) => Check::warning( "migrations", From 35b36e032eacbd5b6dc9581d60e536a786cec0b2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?F=C3=A9lix=20Saparelli?= Date: Fri, 26 Jun 2026 19:26:16 +1200 Subject: [PATCH 6/7] fix(alertd): report db_version as broken on row decode failure MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit A version() row that won't decode as text is a check mismatch, not a database fault — mirror query_error_check's handling of 42xxx schema errors and report broken rather than failed. Co-authored-by: Claude Opus 4.8 --- crates/alertd/src/doctor/checks/db_version.rs | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/crates/alertd/src/doctor/checks/db_version.rs b/crates/alertd/src/doctor/checks/db_version.rs index e8caedca..73aeca56 100644 --- a/crates/alertd/src/doctor/checks/db_version.rs +++ b/crates/alertd/src/doctor/checks/db_version.rs @@ -9,7 +9,10 @@ pub async fn run(ctx: CheckContext) -> Check { match client.query_one("SELECT version()", &[]).await { Ok(row) => match row.try_get::<_, String>(0) { Ok(v) => Check::pass("db_version", v.clone()).with_detail("pg_version", v), - Err(err) => Check::fail("db_version", "row decode failed", err.to_string()), + // version() returned a row that didn't decode as text — a check + // mismatch, not a database fault. Mirrors how `query_error_check` + // treats 42xxx schema errors. + Err(err) => Check::broken("db_version", "row decode failed", err.to_string()), }, Err(err) => query_error_check("db_version", &err), } From 88a0e4279f813735136b9c5ffeed5f9c03d03027 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?F=C3=A9lix=20Saparelli?= Date: Fri, 26 Jun 2026 19:33:10 +1200 Subject: [PATCH 7/7] fix(alertd): split external_users skip from broken Enumeration failures (who/quser ran but errored) previously collapsed into the same skip as the platform-unsupported fallback, hiding a check that couldn't run behind a precondition skip. Split CollectOutcome into Failed (broken: the tool ran but we got no answer) and Unsupported (skip: no enumeration on this platform). Co-authored-by: Claude Opus 4.8 --- .../src/doctor/checks/external_users.rs | 32 ++++++++++++++----- 1 file changed, 24 insertions(+), 8 deletions(-) diff --git a/crates/alertd/src/doctor/checks/external_users.rs b/crates/alertd/src/doctor/checks/external_users.rs index 68057a4e..71757ebe 100644 --- a/crates/alertd/src/doctor/checks/external_users.rs +++ b/crates/alertd/src/doctor/checks/external_users.rs @@ -72,11 +72,17 @@ struct ExternalUser { pub async fn run(_ctx: SweepContext) -> Check { let mut users = match collect_users().await { Ok(CollectOutcome::Users(u)) => u, - Ok(CollectOutcome::Unavailable(reason)) => { - return Check::skip("external_users", "could not enumerate logins", reason); + Ok(CollectOutcome::Unsupported(reason)) => { + return Check::skip("external_users", "login enumeration unsupported", reason); + } + Ok(CollectOutcome::Failed(reason)) => { + // The enumeration tool ran but errored, so we couldn't read the login + // list — the check couldn't run. That's broken, not a skip (which + // would imply a precondition like the platform not being supported). + return Check::broken("external_users", "could not enumerate logins", reason); } Err(err) => { - return Check::skip( + return Check::broken( "external_users", "could not enumerate logins", err.to_string(), @@ -279,14 +285,24 @@ fn humanise_age(d: Duration) -> String { /// Outcome of trying to enumerate sessions. /// -/// `Unavailable` is distinct from `Users(empty)`: on Windows, `quser` exits +/// `Failed` is distinct from `Users(empty)`: on Windows, `quser` exits /// non-zero (typically with "No User exists for *") both when there genuinely /// are no sessions *and* when the caller doesn't have the privilege to list /// them. Treating both the same way silently turned a permission failure into /// a falsely cheerful PASS, which is the opposite of what the operator needs. +/// +/// `Failed` (the enumeration tool ran but errored) is in turn distinct from +/// `Unsupported` (no enumeration implemented for this platform): the former is +/// a broken check — we couldn't get an answer we expected to — while the latter +/// is a genuine skip, a precondition that was never going to be met here. enum CollectOutcome { Users(Vec), - Unavailable(String), + Failed(String), + #[cfg_attr( + any(unix, windows), + expect(dead_code, reason = "only constructed on platforms without who/quser") + )] + Unsupported(String), } #[cfg(unix)] @@ -305,7 +321,7 @@ async fn collect_users() -> miette::Result { if !output.status.success() { let stderr = String::from_utf8_lossy(&output.stderr).trim().to_string(); debug!(status = ?output.status, %stderr, "who returned non-zero"); - return Ok(CollectOutcome::Unavailable(format!( + return Ok(CollectOutcome::Failed(format!( "who returned non-zero{}", if stderr.is_empty() { String::new() @@ -580,7 +596,7 @@ async fn collect_users() -> miette::Result { if stderr.to_lowercase().contains("no user exists for *") { return Ok(CollectOutcome::Users(Vec::new())); } - return Ok(CollectOutcome::Unavailable(format!( + return Ok(CollectOutcome::Failed(format!( "quser returned non-zero{}", if stderr.trim().is_empty() { String::new() @@ -596,7 +612,7 @@ async fn collect_users() -> miette::Result { #[cfg(not(any(unix, windows)))] async fn collect_users() -> miette::Result { - Ok(CollectOutcome::Unavailable( + Ok(CollectOutcome::Unsupported( "session enumeration not implemented for this platform".into(), )) }