Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions crates/database/src/backup/refs.rs
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,12 @@ pub const RECONCILE_REPORT_GAP: &str = "backup-reconcile-report-gap";
/// maintenance-cadence threshold. Group-scoped, `Error`.
pub const MAINTENANCE_STALE: &str = "backup-maintenance-stale";

/// Issue ref for a group whose most recently *finished* maintenance run
/// failed. Distinct from [`MAINTENANCE_STALE`] (which fires on absence of
/// success): this fires when maintenance is running but erroring.
/// Group-scoped, `Error`. Clears when a newer run finishes successfully.
pub const MAINTENANCE_ERROR: &str = "backup-maintenance-error";

/// A run reported success but no matching repo snapshot landed (the device
/// lied or the upload didn't persist). Group-scoped, `Error`.
pub const RECONCILE_MISSING: &str = "backup-reconcile-missing";
Expand Down
62 changes: 57 additions & 5 deletions crates/database/src/backup/staleness.rs
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,10 @@
use std::collections::HashMap;

use commons_errors::Result;
use commons_types::{backup::BackupType, issue::Severity};
use commons_types::{
backup::{BackupType, RunOutcome},
issue::Severity,
};
use diesel::prelude::*;
use diesel_async::{AsyncPgConnection, RunQueryDsl};
use jiff::{SignedDuration, Span, SpanRelativeTo, SpanRound, Timestamp, Unit};
Expand Down Expand Up @@ -326,10 +329,17 @@ pub async fn sweep(db: &mut AsyncPgConnection, rows: &[ScanRow]) -> Result<usize
Ok(filed)
}

/// Group-level maintenance staleness: a `status='ready'` group whose latest
/// successful maintenance run (any kind) is older than [`MAINTENANCE_STALE_AFTER`]
/// (or has none at all, past the threshold from its config creation) fires
/// `backup-maintenance-stale`; a fresh success clears it.
/// Group-level maintenance health, per `status='ready'` group:
///
/// - **Staleness** ([`refs::MAINTENANCE_STALE`]): latest *successful* run (any
///   kind) older than [`MAINTENANCE_STALE_AFTER`] — or none at all, past the
///   threshold from config creation. A fresh success clears it.
/// - **Failure** ([`refs::MAINTENANCE_ERROR`]): the most recently *finished*
///   run failed. Distinct from staleness — maintenance can run on cadence yet
///   error every time. A newer successful run clears it.
///
/// Both are `Error` severity (open an incident + page). A group that is both
/// stale and erroring files both, independently keyed.
async fn sweep_maintenance(db: &mut AsyncPgConnection, now: Timestamp) -> Result<usize> {
use crate::schema::{backup_maintenance_runs as mr, server_group_backup_config as cfg};

Expand Down Expand Up @@ -385,6 +395,48 @@ async fn sweep_maintenance(db: &mut AsyncPgConnection, now: Timestamp) -> Result
.await?;
filed += 1;
}

// Failure leg: a group can run maintenance on cadence yet error every
// time, which staleness (absence-of-success) never catches. Key off the
// most recently *finished* run.
let latest_completed =
crate::backups::BackupMaintenanceRun::latest_completed_for_group(db, group_id).await?;
let err_active = open_group_issue_active(db, group_id, refs::MAINTENANCE_ERROR).await?;
match latest_completed {
Some(run) if run.outcome == Some(RunOutcome::Failure) => {
raise_group_event(
db,
group_id,
refs::MAINTENANCE_ERROR,
Severity::Error,
None,
&format!(
"Repo maintenance ({}) failed: {}",
run.kind,
run.error.as_deref().unwrap_or("(no detail reported)"),
),
true,
)
.await?;
filed += 1;
}
// Most recent finished run succeeded (or there is none): clear any
// open failure issue.
_ if err_active => {
raise_group_event(
db,
group_id,
refs::MAINTENANCE_ERROR,
Severity::Info,
None,
"Repo maintenance completed successfully again",
false,
)
.await?;
filed += 1;
}
_ => {}
}
}
Ok(filed)
}
Expand Down
19 changes: 19 additions & 0 deletions crates/database/src/backups.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1062,6 +1062,25 @@ impl BackupMaintenanceRun {
.map_err(AppError::from)
}

/// Fetch the group's newest maintenance run that has already concluded.
///
/// Rows with `outcome IS NULL` (runs still in flight) are filtered out; the
/// remaining rows are ordered by `finished_at` descending and the first one
/// is returned whatever its outcome. Yields `Ok(None)` when no row matches.
/// The detection sweep uses this to decide whether the latest concluded run
/// failed.
pub async fn latest_completed_for_group(
    db: &mut AsyncPgConnection,
    group_id: Uuid,
) -> Result<Option<Self>> {
    use crate::schema::backup_maintenance_runs::dsl as runs;

    // Concluded runs for this group, newest finish first.
    let concluded_newest_first = runs::backup_maintenance_runs
        .filter(runs::group_id.eq(group_id))
        .filter(runs::outcome.is_not_null())
        .order(runs::finished_at.desc());

    concluded_newest_first
        .first(db)
        .await
        .optional()
        .map_err(AppError::from)
}

/// Whether a run row still exists and is open (`outcome IS NULL`). Used by
/// the scheduler's crash-detection to mark a run failed when its Job
/// finished without ever reporting.
Expand Down
155 changes: 155 additions & 0 deletions crates/database/tests/backup_detection.rs
Original file line number Diff line number Diff line change
Expand Up @@ -107,6 +107,31 @@ async fn insert_ready_config(conn: &mut AsyncPgConnection, group_id: Uuid, age:
.expect("backdate config created_at");
}

/// Test helper: insert one already-finished `backup_maintenance_runs` row.
///
/// `started_at` and `finished_at` are both set to `NOW()` minus
/// `finished_age` (in whole seconds), so the run appears to have concluded
/// that long ago. `kind`, `outcome` and `error` are stored as given.
/// Panics if the insert fails.
async fn insert_maintenance_run(
    conn: &mut AsyncPgConnection,
    group_id: Uuid,
    kind: &str,
    outcome: &str,
    error: Option<&str>,
    finished_age: SignedDuration,
) {
    // Bound as text ($5) and concatenated into an INTERVAL by the query itself.
    let age_seconds = finished_age.as_secs().to_string();

    let insert = sql_query(
        "INSERT INTO backup_maintenance_runs (group_id, kind, started_at, finished_at, outcome, error) \
         VALUES ($1, $2, NOW() - ($5 || ' seconds')::INTERVAL, NOW() - ($5 || ' seconds')::INTERVAL, $3, $4)",
    )
    .bind::<sql_types::Uuid, _>(group_id)
    .bind::<sql_types::Text, _>(kind)
    .bind::<sql_types::Text, _>(outcome)
    .bind::<sql_types::Nullable<sql_types::Text>, _>(error)
    .bind::<sql_types::Text, _>(age_seconds);

    insert.execute(conn).await.expect("insert maintenance run");
}

async fn insert_schedule(
conn: &mut AsyncPgConnection,
group_id: Uuid,
Expand Down Expand Up @@ -774,3 +799,133 @@ async fn group_event_pages_even_when_all_members_unmonitored() {
})
.await;
}

// ===========================================================================
// Case 6 — maintenance failure (backup-maintenance-error)
// ===========================================================================

#[tokio::test(flavor = "multi_thread")]
async fn sweep_files_maintenance_error_when_latest_run_failed_then_clears_on_success() {
TestDb::run(|mut conn, _url| async move {
let group_id = insert_group(&mut conn, "g").await;
// Freshly-created config so maintenance-STALE does NOT also fire β€” this
// isolates the failure signal from absence-of-success.
insert_ready_config(&mut conn, group_id, SignedDuration::from_hours(1)).await;

// The most recently finished run failed an hour ago.
insert_maintenance_run(
&mut conn,
group_id,
"full",
"failure",
Some("kopia maintenance: connection refused"),
SignedDuration::from_hours(1),
)
.await;

let rows = database::backup::staleness::scan_rows(&mut conn)
.await
.expect("scan");
database::backup::staleness::sweep(&mut conn, &rows)
.await
.expect("sweep");

let issue = group_issue(&mut conn, group_id, refs::MAINTENANCE_ERROR)
.await
.expect("maintenance-error issue filed");
assert_eq!(issue.severity, Severity::Error.to_string());
assert!(issue.active, "failure issue is active");
assert_eq!(
group_issue_open_links(&mut conn, group_id, refs::MAINTENANCE_ERROR).await,
1,
"maintenance failure opens an incident",
);
// Staleness must NOT fire for a freshly-created group.
assert!(
group_issue(&mut conn, group_id, refs::MAINTENANCE_STALE)
.await
.is_none(),
"a recent config is not maintenance-stale",
);

// A newer successful run is now the latest finished run β†’ clears it.
insert_maintenance_run(
&mut conn,
group_id,
"full",
"success",
None,
SignedDuration::from_secs(0),
)
.await;
database::backup::staleness::sweep(&mut conn, &rows)
.await
.expect("re-sweep");

let cleared = group_issue(&mut conn, group_id, refs::MAINTENANCE_ERROR)
.await
.expect("issue row persists");
assert!(
!cleared.active,
"failure issue cleared after a successful run",
);
assert_eq!(
group_issue_open_links(&mut conn, group_id, refs::MAINTENANCE_ERROR).await,
0,
"recovery removes the failure issue from its incident",
);
})
.await;
}

/// An open `backup-maintenance-error` issue stays active when the only newer
/// run is still in flight: a started-but-unfinished run is not evidence of
/// recovery.
#[tokio::test(flavor = "multi_thread")]
async fn in_flight_run_does_not_clear_an_open_maintenance_error() {
    use database::backup::staleness;

    TestDb::run(|mut conn, _url| async move {
        let group_id = insert_group(&mut conn, "g").await;
        insert_ready_config(&mut conn, group_id, SignedDuration::from_hours(1)).await;
        insert_maintenance_run(
            &mut conn,
            group_id,
            "full",
            "failure",
            Some("boom"),
            SignedDuration::from_hours(1),
        )
        .await;

        let scanned = staleness::scan_rows(&mut conn).await.expect("scan");
        staleness::sweep(&mut conn, &scanned).await.expect("sweep");

        let before = group_issue(&mut conn, group_id, refs::MAINTENANCE_ERROR)
            .await
            .expect("error filed");
        assert!(before.active);

        // Open a run and leave it unfinished (outcome NULL); the sweep must
        // ignore it rather than treat it as a recovery.
        database::BackupMaintenanceRun::start(
            &mut conn,
            group_id,
            commons_types::backup::MaintenanceKind::Full,
        )
        .await
        .expect("start in-flight run");
        staleness::sweep(&mut conn, &scanned)
            .await
            .expect("re-sweep");

        let after = group_issue(&mut conn, group_id, refs::MAINTENANCE_ERROR)
            .await
            .expect("error still present");
        assert!(
            after.active,
            "an in-flight run must not clear the failure issue",
        );
    })
    .await;
}
Loading