From 0e50a7522cbd360aed858b174f6ac3144e12f55b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?F=C3=A9lix=20Saparelli?= Date: Sat, 27 Jun 2026 05:53:34 +1200 Subject: [PATCH 1/3] feat(backups): surface repo maintenance + alert on failed runs Adds a "Repo maintenance" panel to the group backup page: an at-a-glance health summary (last successful maintenance / failed / running) plus a table of recent kopia maintenance cycles, mirroring the recent-runs panel. recent_maintenance was already returned by the stats endpoint but never rendered. Adds a group-level backup-maintenance-error incident (Error, paging) raised when the most recently *finished* maintenance run failed, cleared by a later successful run. This complements the existing backup-maintenance-stale absence-of-success check, which catches a different failure mode (maintenance not running at all). Co-Authored-By: Claude Opus 4.8 --- crates/database/src/backup/refs.rs | 6 + crates/database/src/backup/staleness.rs | 62 +++++++- crates/database/src/backups.rs | 19 +++ crates/database/tests/backup_detection.rs | 155 ++++++++++++++++++++ private-web/e2e/backups.spec.ts | 105 ++++++++++++++ private-web/e2e/seed.ts | 32 +++++ private-web/src/routes/BackupPanel.tsx | 168 ++++++++++++++++++++++ 7 files changed, 542 insertions(+), 5 deletions(-) diff --git a/crates/database/src/backup/refs.rs b/crates/database/src/backup/refs.rs index 6e837818..bcc81c82 100644 --- a/crates/database/src/backup/refs.rs +++ b/crates/database/src/backup/refs.rs @@ -33,6 +33,12 @@ pub const RECONCILE_REPORT_GAP: &str = "backup-reconcile-report-gap"; /// maintenance-cadence threshold. Group-scoped, `Error`. pub const MAINTENANCE_STALE: &str = "backup-maintenance-stale"; +/// A group whose most recently *finished* maintenance run failed. Distinct +/// from [`MAINTENANCE_STALE`] (which fires on absence of success): this fires +/// when maintenance is running but erroring. Group-scoped, `Error`. Clears +/// when a newer run finishes successfully. +pub const MAINTENANCE_ERROR: &str = "backup-maintenance-error"; + /// A run reported success but no matching repo snapshot landed (the device /// lied or the upload didn't persist). Group-scoped, `Error`. pub const RECONCILE_MISSING: &str = "backup-reconcile-missing"; diff --git a/crates/database/src/backup/staleness.rs b/crates/database/src/backup/staleness.rs index 8d800f0d..c23a2be7 100644 --- a/crates/database/src/backup/staleness.rs +++ b/crates/database/src/backup/staleness.rs @@ -12,7 +12,10 @@ use std::collections::HashMap; use commons_errors::Result; -use commons_types::{backup::BackupType, issue::Severity}; +use commons_types::{ + backup::{BackupType, RunOutcome}, + issue::Severity, +}; use diesel::prelude::*; use diesel_async::{AsyncPgConnection, RunQueryDsl}; use jiff::{SignedDuration, Span, SpanRelativeTo, SpanRound, Timestamp, Unit}; @@ -326,10 +329,17 @@ pub async fn sweep(db: &mut AsyncPgConnection, rows: &[ScanRow]) -> Result Result { use crate::schema::{backup_maintenance_runs as mr, server_group_backup_config as cfg}; @@ -385,6 +395,48 @@ async fn sweep_maintenance(db: &mut AsyncPgConnection, now: Timestamp) -> Result .await?; filed += 1; } + + // Failure leg: a group can run maintenance on cadence yet error every + // time, which staleness (absence-of-success) never catches. Key off the + // most recently *finished* run. + let latest_completed = + crate::backups::BackupMaintenanceRun::latest_completed_for_group(db, group_id).await?; + let err_active = open_group_issue_active(db, group_id, refs::MAINTENANCE_ERROR).await?; + match latest_completed { + Some(run) if run.outcome == Some(RunOutcome::Failure) => { + raise_group_event( + db, + group_id, + refs::MAINTENANCE_ERROR, + Severity::Error, + None, + &format!( + "Repo maintenance ({}) failed: {}", + run.kind, + run.error.as_deref().unwrap_or("(no detail reported)"), + ), + true, + ) + .await?; + filed += 1; + } + // Most recent finished run succeeded (or there is none): clear any + // open failure issue. + _ if err_active => { + raise_group_event( + db, + group_id, + refs::MAINTENANCE_ERROR, + Severity::Info, + None, + "Repo maintenance completed successfully again", + false, + ) + .await?; + filed += 1; + } + _ => {} + } } Ok(filed) } diff --git a/crates/database/src/backups.rs b/crates/database/src/backups.rs index 90e3182c..7639ee62 100644 --- a/crates/database/src/backups.rs +++ b/crates/database/src/backups.rs @@ -1062,6 +1062,25 @@ impl BackupMaintenanceRun { .map_err(AppError::from) } + /// The most recently *finished* maintenance run for the group (any + /// outcome), ignoring runs still in flight (`outcome IS NULL`). Used by the + /// detection sweep to decide whether the latest concluded run failed. + pub async fn latest_completed_for_group( + db: &mut AsyncPgConnection, + group_id: Uuid, + ) -> Result> { + use crate::schema::backup_maintenance_runs::dsl; + + dsl::backup_maintenance_runs + .filter(dsl::group_id.eq(group_id)) + .filter(dsl::outcome.is_not_null()) + .order(dsl::finished_at.desc()) + .first(db) + .await + .optional() + .map_err(AppError::from) + } + /// Whether a run row still exists and is open (`outcome IS NULL`). Used by /// the scheduler's crash-detection to mark a run failed when its Job /// finished without ever reporting. diff --git a/crates/database/tests/backup_detection.rs b/crates/database/tests/backup_detection.rs index 04808392..fd2e2d59 100644 --- a/crates/database/tests/backup_detection.rs +++ b/crates/database/tests/backup_detection.rs @@ -107,6 +107,31 @@ async fn insert_ready_config(conn: &mut AsyncPgConnection, group_id: Uuid, age: .expect("backdate config created_at"); } +/// Insert a finished `backup_maintenance_runs` row, backdating both +/// `started_at` and `finished_at` by `finished_age`. +async fn insert_maintenance_run( + conn: &mut AsyncPgConnection, + group_id: Uuid, + kind: &str, + outcome: &str, + error: Option<&str>, + finished_age: SignedDuration, +) { + let secs = finished_age.as_secs().to_string(); + sql_query( + "INSERT INTO backup_maintenance_runs (group_id, kind, started_at, finished_at, outcome, error) \ + VALUES ($1, $2, NOW() - ($5 || ' seconds')::INTERVAL, NOW() - ($5 || ' seconds')::INTERVAL, $3, $4)", + ) + .bind::(group_id) + .bind::(kind) + .bind::(outcome) + .bind::, _>(error) + .bind::(secs) + .execute(conn) + .await + .expect("insert maintenance run"); +} + async fn insert_schedule( conn: &mut AsyncPgConnection, group_id: Uuid, @@ -774,3 +799,133 @@ async fn group_event_pages_even_when_all_members_unmonitored() { }) .await; } + +// =========================================================================== +// Case 6 — maintenance failure (backup-maintenance-error) +// =========================================================================== + +#[tokio::test(flavor = "multi_thread")] +async fn sweep_files_maintenance_error_when_latest_run_failed_then_clears_on_success() { + TestDb::run(|mut conn, _url| async move { + let group_id = insert_group(&mut conn, "g").await; + // Freshly-created config so maintenance-STALE does NOT also fire — this + // isolates the failure signal from absence-of-success. + insert_ready_config(&mut conn, group_id, SignedDuration::from_hours(1)).await; + + // The most recently finished run failed an hour ago. + insert_maintenance_run( + &mut conn, + group_id, + "full", + "failure", + Some("kopia maintenance: connection refused"), + SignedDuration::from_hours(1), + ) + .await; + + let rows = database::backup::staleness::scan_rows(&mut conn) + .await + .expect("scan"); + database::backup::staleness::sweep(&mut conn, &rows) + .await + .expect("sweep"); + + let issue = group_issue(&mut conn, group_id, refs::MAINTENANCE_ERROR) + .await + .expect("maintenance-error issue filed"); + assert_eq!(issue.severity, Severity::Error.to_string()); + assert!(issue.active, "failure issue is active"); + assert_eq!( + group_issue_open_links(&mut conn, group_id, refs::MAINTENANCE_ERROR).await, + 1, + "maintenance failure opens an incident", + ); + // Staleness must NOT fire for a freshly-created group. + assert!( + group_issue(&mut conn, group_id, refs::MAINTENANCE_STALE) + .await + .is_none(), + "a recent config is not maintenance-stale", + ); + + // A newer successful run is now the latest finished run → clears it. + insert_maintenance_run( + &mut conn, + group_id, + "full", + "success", + None, + SignedDuration::from_secs(0), + ) + .await; + database::backup::staleness::sweep(&mut conn, &rows) + .await + .expect("re-sweep"); + + let cleared = group_issue(&mut conn, group_id, refs::MAINTENANCE_ERROR) + .await + .expect("issue row persists"); + assert!( + !cleared.active, + "failure issue cleared after a successful run", + ); + assert_eq!( + group_issue_open_links(&mut conn, group_id, refs::MAINTENANCE_ERROR).await, + 0, + "recovery removes the failure issue from its incident", + ); + }) + .await; +} + +#[tokio::test(flavor = "multi_thread")] +async fn in_flight_run_does_not_clear_an_open_maintenance_error() { + TestDb::run(|mut conn, _url| async move { + let group_id = insert_group(&mut conn, "g").await; + insert_ready_config(&mut conn, group_id, SignedDuration::from_hours(1)).await; + insert_maintenance_run( + &mut conn, + group_id, + "full", + "failure", + Some("boom"), + SignedDuration::from_hours(1), + ) + .await; + + let rows = database::backup::staleness::scan_rows(&mut conn) + .await + .expect("scan"); + database::backup::staleness::sweep(&mut conn, &rows) + .await + .expect("sweep"); + assert!( + group_issue(&mut conn, group_id, refs::MAINTENANCE_ERROR) + .await + .expect("error filed") + .active + ); + + // A run that has started but not finished (outcome NULL) must be ignored: + // it is not evidence that the failure recovered. + database::BackupMaintenanceRun::start( + &mut conn, + group_id, + commons_types::backup::MaintenanceKind::Full, + ) + .await + .expect("start in-flight run"); + database::backup::staleness::sweep(&mut conn, &rows) + .await + .expect("re-sweep"); + + assert!( + group_issue(&mut conn, group_id, refs::MAINTENANCE_ERROR) + .await + .expect("error still present") + .active, + "an in-flight run must not clear the failure issue", + ); + }) + .await; +} diff --git a/private-web/e2e/backups.spec.ts b/private-web/e2e/backups.spec.ts index a4b65433..5cad59bf 100644 --- a/private-web/e2e/backups.spec.ts +++ b/private-web/e2e/backups.spec.ts @@ -2,6 +2,7 @@ import { expect, test } from "./test-fixtures"; import { resetSeededTables, seedBackupCredentialIssuance, + seedBackupMaintenanceRun, seedBackupRepoStats, seedBackupRun, seedDevice, @@ -833,3 +834,107 @@ test.describe("server backup capabilities", () => { await expect(toggle).toBeChecked(); }); }); + +test.describe("backups ready: repo maintenance panel", () => { + test.beforeEach(async ({ sql }) => { + await resetSeededTables(sql); + }); + + test("zero-state shows maintenance has never run", async ({ page, sql }) => { + const group = await seedServerGroup(sql, { name: "maint-empty" }); + await seedServerGroupBackupConfig(sql, { + groupId: group.id, + status: "ready", + }); + + await page.goto(`/groups/${group.id}/backups`); + const panel = page + .getByRole("heading", { name: /repo maintenance/i }) + .locator(".."); + await expect(panel.getByText(/no maintenance has run yet/i)).toBeVisible(); + }); + + test("a successful run shows Healthy, last-success time, and reclaimed bytes", async ({ + page, + sql, + }) => { + const group = await seedServerGroup(sql, { name: "maint-ok" }); + await seedServerGroupBackupConfig(sql, { + groupId: group.id, + status: "ready", + }); + await seedBackupMaintenanceRun(sql, { + groupId: group.id, + kind: "full", + outcome: "success", + bytesReclaimed: 1048576, // 1.0 MiB + finishedAgoSecs: 3600, + }); + + await page.goto(`/groups/${group.id}/backups`); + const panel = page + .getByRole("heading", { name: /repo maintenance/i }) + .locator(".."); + await expect(panel.getByText("Healthy")).toBeVisible(); + await expect(panel.getByText(/last successful maintenance/i)).toBeVisible(); + await expect(panel.getByText("Full")).toBeVisible(); + await expect(panel.getByText("success")).toBeVisible(); + await expect(panel.getByText("1.0 MiB")).toBeVisible(); + }); + + test("a failed latest run shows the failure and expands to its error", async ({ + page, + sql, + }) => { + const group = await seedServerGroup(sql, { name: "maint-failed" }); + await seedServerGroupBackupConfig(sql, { + groupId: group.id, + status: "ready", + }); + // An older success, then a newer failure — the panel reads the latest. + await seedBackupMaintenanceRun(sql, { + groupId: group.id, + kind: "full", + outcome: "success", + finishedAgoSecs: 7 * 86400, + }); + await seedBackupMaintenanceRun(sql, { + groupId: group.id, + kind: "full", + outcome: "failure", + error: "kopia maintenance: connection refused", + finishedAgoSecs: 3600, + }); + + await page.goto(`/groups/${group.id}/backups`); + const panel = page + .getByRole("heading", { name: /repo maintenance/i }) + .locator(".."); + await expect(panel.getByText(/last run failed/i)).toBeVisible(); + // Error detail is hidden until the failed row is expanded. + await expect(page.getByText(/connection refused/i)).toBeHidden(); + await panel.getByRole("button", { name: /show error/i }).click(); + await expect(page.getByText(/connection refused/i)).toBeVisible(); + }); + + test("an in-flight run renders as running", async ({ page, sql }) => { + const group = await seedServerGroup(sql, { name: "maint-running" }); + await seedServerGroupBackupConfig(sql, { + groupId: group.id, + status: "ready", + }); + await seedBackupMaintenanceRun(sql, { + groupId: group.id, + kind: "quick", + outcome: null, // still in flight + finishedAgoSecs: 60, + }); + + await page.goto(`/groups/${group.id}/backups`); + const panel = page + .getByRole("heading", { name: /repo maintenance/i }) + .locator(".."); + await expect(panel.getByText("running")).toBeVisible(); + await expect(panel.getByText("Quick")).toBeVisible(); + }); +}); diff --git a/private-web/e2e/seed.ts b/private-web/e2e/seed.ts index 748bb084..988990da 100644 --- a/private-web/e2e/seed.ts +++ b/private-web/e2e/seed.ts @@ -443,6 +443,38 @@ export async function seedBackupRun( return { id }; } +/** Seed a `backup_maintenance_runs` row. `finishedAgoSecs` backdates both + * `started_at` and `finished_at`; omit `outcome` for an in-flight run. */ +export async function seedBackupMaintenanceRun( + sql: Sql, + opts: { + groupId: string; + kind?: "quick" | "full"; + outcome?: "success" | "failure" | null; + error?: string | null; + bytesReclaimed?: number | null; + finishedAgoSecs?: number; + }, +): Promise { + const ago = String(opts.finishedAgoSecs ?? 0); + const outcome = opts.outcome ?? null; + await sql.query( + `INSERT INTO backup_maintenance_runs + (group_id, kind, started_at, finished_at, outcome, error, bytes_reclaimed) + VALUES ($1, $2, NOW() - ($3 || ' seconds')::interval, + CASE WHEN $4::text IS NULL THEN NULL ELSE NOW() - ($3 || ' seconds')::interval END, + $4, $5, $6)`, + [ + opts.groupId, + opts.kind ?? "full", + ago, + outcome, + opts.error ?? null, + opts.bytesReclaimed ?? null, + ], + ); +} + /** Seed a `backup_credential_issuances` row. `issuedAgoSecs` controls how long * ago the creds were issued (default 0 = now); `ttlSecs` their lifetime. */ export async function seedBackupCredentialIssuance( diff --git a/private-web/src/routes/BackupPanel.tsx b/private-web/src/routes/BackupPanel.tsx index 0ada8ae9..7a718d4a 100644 --- a/private-web/src/routes/BackupPanel.tsx +++ b/private-web/src/routes/BackupPanel.tsx @@ -49,6 +49,7 @@ import { BACKUP_STATUS_LABEL, type BackupConfigStatus, type BackupConfigView, + type BackupMaintenanceRun, type BackupRun, type ServerInfo, } from "../types"; @@ -213,6 +214,7 @@ export default function BackupPanel() { <> + )} @@ -860,6 +862,172 @@ function RecentRunsPanel({ ); } +const MAINT_KIND_LABEL: Record = { + quick: "Quick", + full: "Full", +}; + +/// One row of the maintenance table. Failed runs get an expand toggle that +/// reveals the error in a collapsible sub-row (mirrors RunRow). +function MaintRow({ run }: { run: BackupMaintenanceRun }) { + const [open, setOpen] = useState(false); + const hasError = Boolean(run.error); + const running = run.outcome == null; + return ( + <> + *": { borderBottom: "unset" } } : undefined} + > + + {hasError && ( + setOpen((o) => !o)} + > + {open ? : } + + )} + + + + + {MAINT_KIND_LABEL[run.kind] ?? run.kind} + + {running ? ( + + ) : ( + + )} + + + {run.finished_at ? : "—"} + + + {run.bytes_reclaimed == null ? "—" : formatBytes(run.bytes_reclaimed)} + + + {hasError && ( + + + + + + {run.error} + + + + + + )} + + ); +} + +/// At-a-glance "has maintenance run, and did it succeed" indicator derived from +/// the recent runs. The authoritative overdue/failed alerting is the group +/// incident (backup-maintenance-stale / backup-maintenance-error); this is the +/// quick read for an operator looking at the panel. +function MaintenanceSummary({ runs }: { runs: BackupMaintenanceRun[] }) { + if (runs.length === 0) { + return ( + + No maintenance has run yet. + + ); + } + const lastFinished = runs.find((r) => r.outcome != null); + const lastSuccess = runs.find((r) => r.outcome === "success"); + const failing = lastFinished?.outcome === "failure"; + return ( + + + {failing ? ( + + ) : lastFinished ? ( + + ) : ( + + )} + + + {lastSuccess ? ( + <> + Last successful maintenance{" "} + + + ) : ( + "No successful maintenance recorded yet." + )} + + + ); +} + +/// Repo maintenance: the at-a-glance health summary plus recent kopia +/// maintenance cycles (full maintenance is what expires/reclaims; quick is the +/// lighter compaction). Failed runs expand to their error. +function MaintenancePanel({ groupId }: { groupId: string }) { + const stats = useApi( + "backups", + "stats", + { server_group_id: groupId }, + [groupId], + ); + return ( + + + Repo maintenance + + {stats.status === "loading" || stats.status === "idle" ? ( + + ) : stats.status === "error" ? ( + {stats.error.message} + ) : ( + + + {stats.data.recent_maintenance.length > 0 && ( + + + + + + Started + Kind + Outcome + Finished + Reclaimed + + + + {stats.data.recent_maintenance.map((m) => ( + + ))} + +
+
+ )} +
+ )} +
+ ); +} + /// The group's servers and their backup types: per (server, type) the schedule /// state, when the next backup is expected (per-server, so a lagging member /// isn't masked by a freshly-backed-up sibling), the latest snapshot, and the From e0af17be8b7cb05ef82504f2c96129d43569aa28 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?F=C3=A9lix=20Saparelli?= Date: Sat, 27 Jun 2026 06:04:41 +1200 Subject: [PATCH 2/3] test(backups): fix strict-mode matches in maintenance e2e getByText("success"/"running") also matched the "Last successful maintenance" caption / the "Running" summary chip. Use exact matches. Co-Authored-By: Claude Opus 4.8 --- private-web/e2e/backups.spec.ts | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/private-web/e2e/backups.spec.ts b/private-web/e2e/backups.spec.ts index 5cad59bf..f4e361a5 100644 --- a/private-web/e2e/backups.spec.ts +++ b/private-web/e2e/backups.spec.ts @@ -878,7 +878,8 @@ test.describe("backups ready: repo maintenance panel", () => { await expect(panel.getByText("Healthy")).toBeVisible(); await expect(panel.getByText(/last successful maintenance/i)).toBeVisible(); await expect(panel.getByText("Full")).toBeVisible(); - await expect(panel.getByText("success")).toBeVisible(); + // exact: the "Last successful maintenance" caption also contains "success". + await expect(panel.getByText("success", { exact: true })).toBeVisible(); await expect(panel.getByText("1.0 MiB")).toBeVisible(); }); @@ -934,7 +935,8 @@ test.describe("backups ready: repo maintenance panel", () => { const panel = page .getByRole("heading", { name: /repo maintenance/i }) .locator(".."); - await expect(panel.getByText("running")).toBeVisible(); + // exact: the summary chip reads "Running" (capitalised); the row chip "running". + await expect(panel.getByText("running", { exact: true })).toBeVisible(); await expect(panel.getByText("Quick")).toBeVisible(); }); }); From 5b83b7a7fb6b87b176cdd47436873ea296fe9fac Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?F=C3=A9lix=20Saparelli?= Date: Sat, 27 Jun 2026 06:43:50 +1200 Subject: [PATCH 3/3] docs(backups): correct expiry wording in maintenance panel comment Snapshot expiry (kopia snapshot expire --delete) runs on every maintenance cycle, not just full; full additionally reclaims the freed space. Co-Authored-By: Claude Opus 4.8 --- private-web/src/routes/BackupPanel.tsx | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/private-web/src/routes/BackupPanel.tsx b/private-web/src/routes/BackupPanel.tsx index 7a718d4a..0186f3e5 100644 --- a/private-web/src/routes/BackupPanel.tsx +++ b/private-web/src/routes/BackupPanel.tsx @@ -980,8 +980,10 @@ function MaintenanceSummary({ runs }: { runs: BackupMaintenanceRun[] }) { } /// Repo maintenance: the at-a-glance health summary plus recent kopia -/// maintenance cycles (full maintenance is what expires/reclaims; quick is the -/// lighter compaction). Failed runs expand to their error. +/// maintenance cycles. Every cycle expires snapshots per the retention policy +/// (`kopia snapshot expire --delete`); full maintenance additionally reclaims +/// the freed space, while quick is the lighter compaction. Failed runs expand +/// to their error. function MaintenancePanel({ groupId }: { groupId: string }) { const stats = useApi( "backups",