Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 7 additions & 2 deletions crates/alertd/src/doctor/checks/caddy_version.rs
Original file line number Diff line number Diff line change
Expand Up @@ -39,8 +39,11 @@ pub async fn run(_ctx: SweepContext) -> Check {
let output = match Command::new(caddy::program()).arg("version").output().await {
Ok(o) if o.status.success() => o,
Ok(o) => {
// caddy ran but couldn't tell us its version, so the check couldn't
// run β€” that's broken, not a skip (which would imply a precondition
// like caddy not being installed).
let stderr = String::from_utf8_lossy(&o.stderr);
return Check::skip(
return Check::broken(
CHECK_NAME,
"caddy version failed",
format!("caddy exited {}: {}", o.status, stderr.trim()),
Expand All @@ -59,7 +62,9 @@ pub async fn run(_ctx: SweepContext) -> Check {
let version = match parse_caddy_version(&stdout) {
Some(v) => v,
None => {
return Check::skip(
// caddy answered but with output we can't read a version from, so
// the check couldn't run: broken, not skip.
return Check::broken(
CHECK_NAME,
"caddy version unparseable",
format!("could not parse semver from `caddy version` output: {stdout:?}"),
Expand Down
5 changes: 4 additions & 1 deletion crates/alertd/src/doctor/checks/db_version.rs
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,10 @@ pub async fn run(ctx: CheckContext) -> Check {
match client.query_one("SELECT version()", &[]).await {
Ok(row) => match row.try_get::<_, String>(0) {
Ok(v) => Check::pass("db_version", v.clone()).with_detail("pg_version", v),
Err(err) => Check::fail("db_version", "row decode failed", err.to_string()),
// version() returned a row that didn't decode as text β€” a check
// mismatch, not a database fault. Mirrors how `query_error_check`
// treats 42xxx schema errors.
Err(err) => Check::broken("db_version", "row decode failed", err.to_string()),
},
Err(err) => query_error_check("db_version", &err),
}
Expand Down
32 changes: 24 additions & 8 deletions crates/alertd/src/doctor/checks/external_users.rs
Original file line number Diff line number Diff line change
Expand Up @@ -72,11 +72,17 @@ struct ExternalUser {
pub async fn run(_ctx: SweepContext) -> Check {
let mut users = match collect_users().await {
Ok(CollectOutcome::Users(u)) => u,
Ok(CollectOutcome::Unavailable(reason)) => {
return Check::skip("external_users", "could not enumerate logins", reason);
Ok(CollectOutcome::Unsupported(reason)) => {
return Check::skip("external_users", "login enumeration unsupported", reason);
}
Ok(CollectOutcome::Failed(reason)) => {
// The enumeration tool ran but errored, so we couldn't read the login
// list β€” the check couldn't run. That's broken, not a skip (which
// would imply a precondition like the platform not being supported).
return Check::broken("external_users", "could not enumerate logins", reason);
}
Err(err) => {
return Check::skip(
return Check::broken(
"external_users",
"could not enumerate logins",
err.to_string(),
Expand Down Expand Up @@ -279,14 +285,24 @@ fn humanise_age(d: Duration) -> String {

/// Outcome of trying to enumerate sessions.
///
/// `Unavailable` is distinct from `Users(empty)`: on Windows, `quser` exits
/// `Failed` is distinct from `Users(empty)`: on Windows, `quser` exits
/// non-zero (typically with "No User exists for *") both when there genuinely
/// are no sessions *and* when the caller doesn't have the privilege to list
/// them. Treating both the same way silently turned a permission failure into
/// a falsely cheerful PASS, which is the opposite of what the operator needs.
///
/// `Failed` (the enumeration tool ran but errored) is in turn distinct from
/// `Unsupported` (no enumeration implemented for this platform): the former is
/// a broken check β€” we couldn't get an answer we expected to β€” while the latter
/// is a genuine skip, a precondition that was never going to be met here.
enum CollectOutcome {
Users(Vec<ExternalUser>),
Unavailable(String),
Failed(String),
#[cfg_attr(
any(unix, windows),
expect(dead_code, reason = "only constructed on platforms without who/quser")
)]
Unsupported(String),
}

#[cfg(unix)]
Expand All @@ -305,7 +321,7 @@ async fn collect_users() -> miette::Result<CollectOutcome> {
if !output.status.success() {
let stderr = String::from_utf8_lossy(&output.stderr).trim().to_string();
debug!(status = ?output.status, %stderr, "who returned non-zero");
return Ok(CollectOutcome::Unavailable(format!(
return Ok(CollectOutcome::Failed(format!(
"who returned non-zero{}",
if stderr.is_empty() {
String::new()
Expand Down Expand Up @@ -580,7 +596,7 @@ async fn collect_users() -> miette::Result<CollectOutcome> {
if stderr.to_lowercase().contains("no user exists for *") {
return Ok(CollectOutcome::Users(Vec::new()));
}
return Ok(CollectOutcome::Unavailable(format!(
return Ok(CollectOutcome::Failed(format!(
"quser returned non-zero{}",
if stderr.trim().is_empty() {
String::new()
Expand All @@ -596,7 +612,7 @@ async fn collect_users() -> miette::Result<CollectOutcome> {

#[cfg(not(any(unix, windows)))]
async fn collect_users() -> miette::Result<CollectOutcome> {
Ok(CollectOutcome::Unavailable(
Ok(CollectOutcome::Unsupported(
"session enumeration not implemented for this platform".into(),
))
}
Expand Down
5 changes: 4 additions & 1 deletion crates/alertd/src/doctor/checks/inodes.rs
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,10 @@ pub async fn run(_ctx: SweepContext) -> Check {
let output = match Command::new("df").args(["-P", "-i", "-T"]).output().await {
Ok(o) if o.status.success() => String::from_utf8_lossy(&o.stdout).into_owned(),
Ok(o) => {
return Check::skip(
// df ran but failed, so we couldn't read inode usage at all β€” the
// check couldn't run. That's broken, not a skip (which is for df not
// being present at all, handled below).
return Check::broken(
NAME,
"df failed",
format!(
Expand Down
8 changes: 6 additions & 2 deletions crates/alertd/src/doctor/checks/kopia_backup.rs
Original file line number Diff line number Diff line change
Expand Up @@ -91,7 +91,9 @@ async fn run_linux() -> Check {
let snapshots: Vec<Snapshot> = match serde_json::from_slice(&output.stdout) {
Ok(s) => s,
Err(err) => {
return Check::skip(
// kopia succeeded but handed us output we can't decode, so the check
// couldn't run β€” broken, not skip.
return Check::broken(
CHECK_NAME,
"kopia output unparseable",
format!("could not decode kopia snapshot list JSON: {err}"),
Expand Down Expand Up @@ -158,7 +160,9 @@ async fn run_windows() -> Check {
let snapshots: Vec<Snapshot> = match serde_json::from_slice(&output.stdout) {
Ok(s) => s,
Err(err) => {
return Check::skip(
// kopia succeeded but handed us output we can't decode, so the check
// couldn't run β€” broken, not skip.
return Check::broken(
CHECK_NAME,
"kopia output unparseable",
format!("could not decode kopia snapshot list JSON: {err}"),
Expand Down
5 changes: 4 additions & 1 deletion crates/alertd/src/doctor/checks/migrations.rs
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,10 @@ pub async fn run(ctx: CheckContext) -> Check {
Ok(Some(row)) => match row.try_get::<_, String>(0) {
Ok(name) => Check::pass("migrations", format!("last: {name}"))
.with_detail("last_migration", name),
Err(err) => Check::fail("migrations", "row decode failed", err.to_string()),
// The query ran and returned a row, but its column didn't decode as
// expected β€” a schema/check mismatch, not a system fault. Mirrors how
// `query_error_check` treats 42xxx schema errors.
Err(err) => Check::broken("migrations", "row decode failed", err.to_string()),
},
Ok(None) => Check::warning(
"migrations",
Expand Down
18 changes: 10 additions & 8 deletions crates/alertd/src/doctor/checks/version_drift.rs
Original file line number Diff line number Diff line change
Expand Up @@ -90,12 +90,13 @@ pub async fn run(ctx: CheckContext) -> Check {
evaluate_drift(&running, &expected_versions, &expectations)
}

/// Warning result for when `podman ps` couldn't be read at all. We can't judge
/// drift, and must not report a pass β€” that would dress up a blind check as a
/// healthy one. Most often this is alertd lacking access to the root-owned
/// containers (see the podman-socket / privilege notes).
/// Broken result for when `podman ps` couldn't be read at all. The check itself
/// couldn't run, so it says nothing about the system β€” that's broken, not a
/// warning (which would imply a degraded system) and not a pass (which would
/// dress up a blind check as a healthy one). Most often this is alertd lacking
/// access to the root-owned containers (see the podman-socket / privilege notes).
fn unreadable_check(reason: &str) -> Check {
Check::warning(
Check::broken(
"version_drift",
"could not read running container versions",
format!("`podman ps` failed, so version drift can't be checked: {reason}"),
Expand Down Expand Up @@ -194,10 +195,11 @@ mod tests {
}

#[test]
fn unreadable_is_warning_not_pass() {
// The regression that motivated this: a blind check must NOT look healthy.
fn unreadable_is_broken_not_pass() {
// The check couldn't run at all, so it says nothing about the system:
// broken, not a pass that would dress up a blind check as healthy.
let check = unreadable_check("podman not found on PATH");
assert!(matches!(check.status, CheckStatus::Warning(_)), "{check:?}");
assert!(matches!(check.status, CheckStatus::Broken(_)), "{check:?}");
}

#[test]
Expand Down
Loading