From 520ad4fa6d52faae4290452855b14e523a6c66e4 Mon Sep 17 00:00:00 2001 From: Remylus Losius Date: Mon, 15 Jun 2026 23:21:21 -0400 Subject: [PATCH 1/5] feat(packaging): automatic, safe DB migration on package upgrade MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit `dnf update openwatch*` (or apt) now applies pending DB migrations automatically, with a restore point and a fail-safe service state — for the single-instance appliance model. The operator runs one command. Mechanism: - internal/dbbackup: builds a pg_dump command that takes connection params (esp. the password) via PG* env, NEVER argv (no ps leak). - `openwatch migrate` gains --status (report pending without applying) and --backup-dir (pg_dump restore point BEFORE Apply; skipped on a fresh DB; FAILS CLOSED — never migrates if the backup fails). - packaging/common/openwatch-upgrade.sh (RPM %post when $1>=2 / DEB postinst when old-version $2 is set — upgrade only, never fresh install): stop -> openwatch migrate --backup-dir -> start on success; on failure leave the service STOPPED + print the restore path + exit non-zero so the package manager surfaces it. Migrations are transactional, so a failure rolls back atomically (data intact). - Backup retention: openwatch-backup-cleanup.timer (daily) prunes dumps past BACKUP_RETENTION_DAYS but always keeps the most recent. Operator-tunable via /etc/openwatch/upgrade.conf (AUTO_BACKUP, etc.). Deliberately OUT of scope (documented): PostgreSQL ENGINE major-version upgrades (operator-supervised pg_upgrade, never from a scriptlet), and multi-instance/zero-downtime upgrades. Spec release-upgrade (C-01..05 / AC-01..07); docs/runbooks/UPGRADING.md. Verified end to end: migrate --status + --backup-dir produce a real pg_dump then migrate against a live DB; both packages ship + render the upgrade-only guard. --- cmd/openwatch/main.go | 70 +++++++-- docs/runbooks/UPGRADING.md | 89 +++++++++++ internal/dbbackup/dbbackup.go | 102 +++++++++++++ internal/dbbackup/dbbackup_test.go | 71 +++++++++ packaging/common/cleanup-backups.sh | 32 ++++ .../common/openwatch-backup-cleanup.service | 15 ++ .../common/openwatch-backup-cleanup.timer | 11 ++ packaging/common/openwatch-upgrade.sh | 77 ++++++++++ packaging/common/upgrade.conf | 14 ++ packaging/deb/build-deb.sh | 8 + packaging/deb/conffiles | 1 + packaging/deb/postinst | 10 ++ packaging/deb/prerm | 1 + packaging/rpm/build-rpm.sh | 7 + packaging/rpm/openwatch.spec | 26 ++++ packaging/tests/upgrade_test.go | 140 ++++++++++++++++++ specs/release/upgrade.spec.yaml | 93 ++++++++++++ specter.yaml | 1 + 18 files changed, 757 insertions(+), 11 deletions(-) create mode 100644 docs/runbooks/UPGRADING.md create mode 100644 internal/dbbackup/dbbackup.go create mode 100644 internal/dbbackup/dbbackup_test.go create mode 100755 packaging/common/cleanup-backups.sh create mode 100644 packaging/common/openwatch-backup-cleanup.service create mode 100644 packaging/common/openwatch-backup-cleanup.timer create mode 100755 packaging/common/openwatch-upgrade.sh create mode 100644 packaging/common/upgrade.conf create mode 100644 packaging/tests/upgrade_test.go create mode 100644 specs/release/upgrade.spec.yaml diff --git a/cmd/openwatch/main.go b/cmd/openwatch/main.go index e7201c86..5ab69ed9 100644 --- a/cmd/openwatch/main.go +++ b/cmd/openwatch/main.go @@ -34,6 +34,7 @@ import ( "github.com/Hanalyx/openwatch/internal/db" "github.com/Hanalyx/openwatch/internal/db/migrations" + "github.com/Hanalyx/openwatch/internal/dbbackup" "github.com/Hanalyx/openwatch/internal/eventbus" "github.com/Hanalyx/openwatch/internal/exception" "github.com/Hanalyx/openwatch/internal/group" @@ -646,15 +647,36 @@ func parseLogLevel(s string) slog.Level { } } -// cmdMigrate connects to the configured database, runs goose Up for every -// pending migration, and prints the resulting version. -func cmdMigrate(cfg *config.Config, _ []string, stdout, stderr *os.File) int { +// cmdMigrate connects to the configured database and runs goose Up for +// every pending migration. +// +// Flags: +// +// --status report the current version + whether migrations are +// pending, WITHOUT applying anything (used by the +// package upgrade scriptlet and by operators). +// --backup-dir pg_dump to as a restore point BEFORE applying. +// Skipped when the DB has no schema yet (fresh install, +// nothing to back up). If the backup fails the command +// fails WITHOUT migrating — we never migrate without the +// restore point we promised. +func cmdMigrate(cfg *config.Config, args []string, stdout, stderr *os.File) int { + fs := flag.NewFlagSet("migrate", flag.ContinueOnError) + fs.SetOutput(stderr) + backupDir := fs.String("backup-dir", "", "pg_dump to this directory before applying (skipped when the DB has no schema yet)") + statusOnly := fs.Bool("status", false, "report current version + pending count without applying") + if err := fs.Parse(args); err != nil { + return 1 + } + if err := cfg.Validate(); err != nil { fmt.Fprintf(stderr, "openwatch migrate: invalid config:\n%v\n", err) return 1 } - ctx, cancel := context.WithTimeout(context.Background(), 60*time.Second) + // Generous timeout: a pg_dump of a large DB before migrating can take + // minutes; this is an operator/scriptlet command, not a hot path. + ctx, cancel := context.WithTimeout(context.Background(), 10*time.Minute) defer cancel() pool, err := db.NewPool(ctx, cfg.Database.DSN, cfg.Database.MaxConnections) @@ -664,23 +686,47 @@ func cmdMigrate(cfg *config.Config, _ []string, stdout, stderr *os.File) int { } defer pool.Close() + curr, files, err := migrations.Status(ctx, pool) + if err != nil { + fmt.Fprintf(stderr, "openwatch migrate: status: %v\n", err) + return 1 + } + total := len(files) + + if *statusOnly { + fmt.Fprintf(stdout, "current version: %d\n", curr) + if int(curr) >= total { + fmt.Fprintln(stdout, "up to date — no migrations pending") + } else { + fmt.Fprintf(stdout, "PENDING: %d migration(s) not yet applied — run `openwatch migrate`\n", total-int(curr)) + } + return 0 + } + + // Restore point before applying — only when a schema already exists; a + // fresh DB (curr == 0) has nothing to dump. Fail closed on backup error. + if *backupDir != "" && curr > 0 { + stamp := time.Now().UTC().Format("20060102T150405Z") + path, berr := dbbackup.Run(ctx, cfg.Database.DSN, *backupDir, version.Version, stamp) + if berr != nil { + fmt.Fprintf(stderr, "openwatch migrate: backup failed, refusing to migrate: %v\n", berr) + return 1 + } + fmt.Fprintf(stdout, "backed up to %s\n", path) + } + fmt.Fprintf(stdout, "applying migrations against %s ...\n", config.RedactDSN(cfg.Database.DSN)) if err := migrations.Apply(ctx, pool); err != nil { fmt.Fprintf(stderr, "openwatch migrate: %v\n", err) return 1 } - version, files, err := migrations.Status(ctx, pool) + newVer, _, err := migrations.Status(ctx, pool) if err != nil { fmt.Fprintf(stderr, "openwatch migrate: status: %v\n", err) return 1 } - fmt.Fprintf(stdout, " current version: %d\n", version) - fmt.Fprintf(stdout, " migration files: %d\n", len(files)) - for _, name := range files { - fmt.Fprintf(stdout, " - %s\n", name) - } - fmt.Fprintln(stdout, "migrations applied") + fmt.Fprintf(stdout, "migrations applied — version %d -> %d\n", curr, newVer) return 0 } @@ -792,6 +838,8 @@ subcommands: serve run the HTTPS API server (default) worker run the scan-job claimer/dispatcher loop migrate apply pending goose migrations + --status report version + pending count, don't apply + --backup-dir pg_dump a restore point before applying create-admin create the first admin user (requires --username --email --password) check-config validate and print resolved config diff --git a/docs/runbooks/UPGRADING.md b/docs/runbooks/UPGRADING.md new file mode 100644 index 00000000..41cdbcbc --- /dev/null +++ b/docs/runbooks/UPGRADING.md @@ -0,0 +1,89 @@ +# Upgrading OpenWatch + +OpenWatch upgrades are **one command**. The package post-install scriptlet +applies any pending database migrations automatically — taking a backup +restore point first — and restarts the service. + +```bash +# RHEL / CentOS / Rocky / Alma / Fedora +sudo dnf update -y 'openwatch*' 'kensa-rules*' + +# Debian / Ubuntu +sudo apt update && sudo apt install --only-upgrade openwatch kensa-rules +``` + +That's it. On a single-instance install this is all an operator needs to do. + +## What happens automatically (on upgrade) + +The scriptlet runs **only on upgrade**, never on a fresh install, and does: + +1. **Checks the database is reachable.** If it isn't, migrations are skipped + with a warning (the upgrade doesn't fail) — run `openwatch migrate` + manually once the DB is back, then `systemctl restart openwatch`. +2. **Stops the service** — so the new binary never runs against an old schema + and vice versa. +3. **Backs up the database** with `pg_dump` to `/var/lib/openwatch/backups/` + (this is your restore point; the password is passed to `pg_dump` via the + environment, never on the command line). +4. **Applies pending migrations.** Each migration runs in a transaction, so a + failure rolls back atomically — your data is never left half-migrated. +5. **On success → starts the service** on the new version. + **On failure → leaves the service stopped**, prints the restore path, and + exits non-zero so `dnf`/`apt` flag that the upgrade needs attention. + +### Pre-flight (optional) + +See what would change before upgrading: + +```bash +sudo openwatch migrate --status +# -> "up to date — no migrations pending" OR "PENDING: N migration(s) ..." +``` + +## If a migration fails + +The service is left **stopped** and your data is intact (the failed migration +rolled back). Recover with: + +```bash +# 1. read the error in the dnf/apt output or: journalctl -u openwatch +# 2. fix the cause, then re-apply: +sudo openwatch migrate +sudo systemctl start openwatch +# on Debian, also clear the half-configured state: +sudo dpkg --configure -a +``` + +### Restoring from the pre-upgrade backup (last resort) + +```bash +ls -t /var/lib/openwatch/backups/ # newest dump first +sudo systemctl stop openwatch +# DSN is in /etc/openwatch/secrets.env (OPENWATCH_DATABASE_DSN) +psql "$OPENWATCH_DATABASE_DSN" < /var/lib/openwatch/backups/openwatch-pre-upgrade-<...>.sql +``` + +## Backups: location, retention, opt-out + +- Dumps live in `/var/lib/openwatch/backups/`. +- A systemd timer (`openwatch-backup-cleanup.timer`, runs daily) prunes dumps + older than `BACKUP_RETENTION_DAYS` (default 30) but **always keeps the most + recent one**, so you never lose your last restore point. +- Tune behavior in `/etc/openwatch/upgrade.conf`: + - `AUTO_BACKUP=yes|no` — set `no` only if you run your own verified + pre-upgrade backups. + - `BACKUP_DIR`, `BACKUP_RETENTION_DAYS`. + +## Scope and limits + +- **App schema migrations: automatic and safe** (this document). +- **Minor PostgreSQL / dependency updates**: handled by `dnf`/`apt` itself via + package dependencies — nothing extra to do. +- **PostgreSQL MAJOR-version upgrade** (e.g. 15 → 16): **NOT** performed by the + OpenWatch scriptlet. That is a data-directory migration (`pg_upgrade` or + dump/restore) that needs both server versions and must be operator-supervised + — doing it silently from a package upgrade risks the whole database. Plan it + separately, with its own backup. +- **Brief downtime** during the migrate step is expected (the appliance model). + Multi-instance / zero-downtime upgrades are out of scope. diff --git a/internal/dbbackup/dbbackup.go b/internal/dbbackup/dbbackup.go new file mode 100644 index 00000000..8bec97b0 --- /dev/null +++ b/internal/dbbackup/dbbackup.go @@ -0,0 +1,102 @@ +// Package dbbackup creates a plain-SQL pg_dump of the OpenWatch database, +// used as the pre-upgrade restore point before migrations run. +// +// The cardinal rule: connection parameters (especially the password) go to +// pg_dump via PG* environment variables, NEVER on the command line — so the +// password never appears in the process argv (visible in `ps`). This mirrors +// how the rest of the codebase keeps credentials out of argv. +package dbbackup + +import ( + "context" + "fmt" + "os" + "os/exec" + "path/filepath" + "strconv" + + "github.com/jackc/pgx/v5/pgxpool" +) + +// dumpArgs are the pg_dump flags shared by Command and Run. --no-owner / +// --no-privileges keep the dump restorable regardless of the target role +// layout (the restore path is `psql < file`); -f writes to the file. +func dumpArgs(outPath string) []string { + return []string{"--no-owner", "--no-privileges", "-f", outPath} +} + +// dumpEnv translates dsn into the PG* environment pg_dump reads, so no +// connection parameter (least of all the password) ends up in argv. +func dumpEnv(dsn string) ([]string, error) { + cfg, err := pgxpool.ParseConfig(dsn) + if err != nil { + return nil, fmt.Errorf("dbbackup: parse dsn: %w", err) + } + cc := cfg.ConnConfig + env := append(os.Environ(), + "PGHOST="+cc.Host, + "PGPORT="+strconv.Itoa(int(cc.Port)), + "PGUSER="+cc.User, + "PGPASSWORD="+cc.Password, + "PGDATABASE="+cc.Database, + ) + // pgxpool leaves TLSConfig nil for sslmode=disable; map that back so + // pg_dump doesn't attempt TLS against a non-TLS local Postgres. When TLS + // IS configured we leave PGSSLMODE unset (pg_dump defaults to prefer). + if cc.TLSConfig == nil { + env = append(env, "PGSSLMODE=disable") + } + return env, nil +} + +// DumpFileName returns the backup filename for a dump taken at stamp (an +// already-formatted timestamp, e.g. "20260616T010203Z") for version. Pure, +// so the caller owns the clock and tests are deterministic. +func DumpFileName(version, stamp string) string { + v := version + if v == "" { + v = "unknown" + } + safe := make([]rune, 0, len(v)) + for _, r := range v { + switch { + case r >= 'a' && r <= 'z', r >= 'A' && r <= 'Z', r >= '0' && r <= '9', r == '.', r == '-', r == '_': + safe = append(safe, r) + default: + safe = append(safe, '_') + } + } + return fmt.Sprintf("openwatch-pre-upgrade-%s-%s.sql", string(safe), stamp) +} + +// Command builds the pg_dump command writing a plain-SQL dump to outPath. +// Connection parameters live in cmd.Env (PG*), NONE in cmd.Args — the tests +// pin that the password never reaches argv. +func Command(dsn, outPath string) (*exec.Cmd, error) { + env, err := dumpEnv(dsn) + if err != nil { + return nil, err + } + cmd := exec.Command("pg_dump", dumpArgs(outPath)...) + cmd.Env = env + return cmd, nil +} + +// Run writes a dump to dir (created if absent) and returns the file path. +// The caller supplies version + stamp for the filename. +func Run(ctx context.Context, dsn, dir, version, stamp string) (string, error) { + if err := os.MkdirAll(dir, 0o750); err != nil { + return "", fmt.Errorf("dbbackup: mkdir %s: %w", dir, err) + } + out := filepath.Join(dir, DumpFileName(version, stamp)) + env, err := dumpEnv(dsn) + if err != nil { + return "", err + } + cmd := exec.CommandContext(ctx, "pg_dump", dumpArgs(out)...) + cmd.Env = env + if combined, err := cmd.CombinedOutput(); err != nil { + return "", fmt.Errorf("dbbackup: pg_dump failed: %w: %s", err, string(combined)) + } + return out, nil +} diff --git a/internal/dbbackup/dbbackup_test.go b/internal/dbbackup/dbbackup_test.go new file mode 100644 index 00000000..2f46a733 --- /dev/null +++ b/internal/dbbackup/dbbackup_test.go @@ -0,0 +1,71 @@ +// @spec release-upgrade +// +// AC traceability (this file): +// +// AC-01 TestCommand_PasswordNeverInArgv +// AC-02 TestDumpFileName_DeterministicAndSanitized + +package dbbackup + +import ( + "strings" + "testing" +) + +// @ac AC-01 +// AC-01: the pg_dump command carries connection params (incl. the password) +// in the environment only — NEVER in argv, where `ps` would expose it. +func TestCommand_PasswordNeverInArgv(t *testing.T) { + t.Run("release-upgrade/AC-01", func(t *testing.T) { + const secret = "s3cr3t-p@ss" + dsn := "postgres://owuser:" + secret + "@db.example:6543/owdb?sslmode=disable" + cmd, err := Command(dsn, "/var/lib/openwatch/backups/x.sql") + if err != nil { + t.Fatalf("Command: %v", err) + } + // Password must not appear anywhere in argv. + argv := strings.Join(cmd.Args, " ") + if strings.Contains(argv, secret) { + t.Errorf("password leaked into argv: %q", argv) + } + // Connection params must be in the env instead. + joinEnv := strings.Join(cmd.Env, "\n") + for _, want := range []string{ + "PGPASSWORD=" + secret, + "PGHOST=db.example", + "PGPORT=6543", + "PGUSER=owuser", + "PGDATABASE=owdb", + "PGSSLMODE=disable", // sslmode=disable -> nil TLS -> mapped back + } { + if !strings.Contains(joinEnv, want) { + t.Errorf("env missing %q", want) + } + } + // pg_dump must write to the file, not stdout. + if !strings.Contains(argv, "-f /var/lib/openwatch/backups/x.sql") { + t.Errorf("pg_dump not directed at the output file: %q", argv) + } + }) +} + +// @ac AC-02 +// AC-02: backup filenames are deterministic given (version, stamp) and +// sanitize unsafe characters in the version token. +func TestDumpFileName_DeterministicAndSanitized(t *testing.T) { + t.Run("release-upgrade/AC-02", func(t *testing.T) { + got := DumpFileName("0.2.0-rc.8", "20260616T010203Z") + want := "openwatch-pre-upgrade-0.2.0-rc.8-20260616T010203Z.sql" + if got != want { + t.Errorf("DumpFileName = %q, want %q", got, want) + } + // Unsafe characters (slashes, spaces) are replaced. + if name := DumpFileName("v1/2 3", "S"); strings.ContainsAny(name, "/ ") { + t.Errorf("unsafe chars survived: %q", name) + } + // Empty version falls back to a non-empty token. + if name := DumpFileName("", "S"); !strings.Contains(name, "unknown") { + t.Errorf("empty version not defaulted: %q", name) + } + }) +} diff --git a/packaging/common/cleanup-backups.sh b/packaging/common/cleanup-backups.sh new file mode 100755 index 00000000..1e99c8bd --- /dev/null +++ b/packaging/common/cleanup-backups.sh @@ -0,0 +1,32 @@ +#!/usr/bin/env bash +# Prune old OpenWatch pre-upgrade database dumps. Run by the +# openwatch-backup-cleanup.timer. Keeps the most recent dump ALWAYS (so the +# last restore point is never lost) and deletes the rest older than +# BACKUP_RETENTION_DAYS. +set -uo pipefail + +CONF=/etc/openwatch/upgrade.conf +BACKUP_DIR=/var/lib/openwatch/backups +BACKUP_RETENTION_DAYS=30 +# shellcheck source=/dev/null +[ -f "$CONF" ] && . "$CONF" + +[ -d "$BACKUP_DIR" ] || exit 0 + +# Newest first, so index 0 is the one we always keep. Dump filenames are +# package-controlled (openwatch-pre-upgrade--.sql, no spaces or +# newlines), so the ls-based sort is safe here. +# shellcheck disable=SC2012 +mapfile -t dumps < <(ls -1t "$BACKUP_DIR"/openwatch-pre-upgrade-*.sql 2>/dev/null || true) +[ "${#dumps[@]}" -gt 1 ] || exit 0 + +idx=0 +for f in "${dumps[@]}"; do + if [ "$idx" -eq 0 ]; then + idx=1 + continue # always keep the most recent dump + fi + if [ -n "$(find "$f" -maxdepth 0 -mtime "+${BACKUP_RETENTION_DAYS}" 2>/dev/null)" ]; then + rm -f "$f" && echo "openwatch backup-cleanup: pruned $f" + fi +done diff --git a/packaging/common/openwatch-backup-cleanup.service b/packaging/common/openwatch-backup-cleanup.service new file mode 100644 index 00000000..4bd61971 --- /dev/null +++ b/packaging/common/openwatch-backup-cleanup.service @@ -0,0 +1,15 @@ +[Unit] +Description=Prune old OpenWatch pre-upgrade database backups +Documentation=file:///usr/share/doc/openwatch/UPGRADING.md + +[Service] +Type=oneshot +ExecStart=/usr/lib/openwatch/cleanup-backups.sh +# Backups are root-owned (written by the upgrade scriptlet); prune as root. +User=root +# Light hardening — the job only needs to read config and delete dumps. +ProtectSystem=strict +ReadWritePaths=/var/lib/openwatch/backups +ProtectHome=true +PrivateTmp=true +NoNewPrivileges=true diff --git a/packaging/common/openwatch-backup-cleanup.timer b/packaging/common/openwatch-backup-cleanup.timer new file mode 100644 index 00000000..56caf4dd --- /dev/null +++ b/packaging/common/openwatch-backup-cleanup.timer @@ -0,0 +1,11 @@ +[Unit] +Description=Daily prune of OpenWatch pre-upgrade database backups + +[Timer] +OnCalendar=daily +# Run on next boot if the machine was off at the scheduled time. +Persistent=true +RandomizedDelaySec=1h + +[Install] +WantedBy=timers.target diff --git a/packaging/common/openwatch-upgrade.sh b/packaging/common/openwatch-upgrade.sh new file mode 100755 index 00000000..b8f727d2 --- /dev/null +++ b/packaging/common/openwatch-upgrade.sh @@ -0,0 +1,77 @@ +#!/usr/bin/env bash +# Apply pending OpenWatch DB migrations during a package UPGRADE, with an +# auto-backup restore point and a fail-safe service state. Invoked by the +# RPM %post / DEB postinst ONLY on upgrade (never on a fresh install — a +# fresh install has no database yet). +# +# Sequence (single-instance appliance model): +# 1. confirm the DB is reachable + already initialized (else skip, warn) +# 2. stop the service -> never run the old binary against a new schema +# 3. `openwatch migrate --backup-dir` -> pg_dump restore point, then apply +# (each migration is transactional; a failure rolls back atomically) +# 4. success -> start the service +# failure -> leave it STOPPED, print the restore path, exit non-zero so +# the package manager surfaces that the upgrade needs attention +# +# NOT `set -e`: we handle each failure explicitly to control the fail mode. +set -uo pipefail + +CONF=/etc/openwatch/upgrade.conf +AUTO_BACKUP=yes +BACKUP_DIR=/var/lib/openwatch/backups +# shellcheck source=/dev/null +[ -f "$CONF" ] && . "$CONF" + +# The migrate command reads the same config the service does; load the DB +# secret the systemd unit normally injects so a root scriptlet can connect. +SECRETS=/etc/openwatch/secrets.env +if [ -f "$SECRETS" ]; then + set -a + # shellcheck source=/dev/null + . "$SECRETS" + set +a +fi + +log() { echo "openwatch upgrade: $*"; } +err() { echo "openwatch upgrade: $*" >&2; } + +if ! command -v openwatch >/dev/null 2>&1; then + err "openwatch binary not found; skipping auto-migrate." + exit 0 +fi + +# DB reachable + initialized? `migrate --status` exits 0 when it can connect. +# If not (DB down / not yet provisioned), do NOT fail the package — warn and +# let the operator migrate manually once the DB is up. +if ! openwatch migrate --status >/dev/null 2>&1; then + err "database not reachable — migrations were NOT applied." + err "once the database is up, run: openwatch migrate && systemctl restart openwatch" + exit 0 +fi + +log "stopping service for migration" +systemctl stop openwatch.service >/dev/null 2>&1 || : + +args=() +if [ "${AUTO_BACKUP:-yes}" = "yes" ]; then + args+=(--backup-dir "$BACKUP_DIR") +fi + +if openwatch migrate "${args[@]}"; then + log "migration succeeded; starting service" + systemctl start openwatch.service >/dev/null 2>&1 || : + exit 0 +fi + +# Fail-safe: migration failed (and rolled back — DDL is transactional). Leave +# the service stopped rather than run the new binary against an un-migrated +# schema, and tell the operator exactly how to recover. +err "MIGRATION FAILED. The openwatch service has been left STOPPED to avoid" +err "running against an un-migrated schema. Your data is intact (each" +err "migration runs in a transaction and rolled back)." +if [ "${AUTO_BACKUP:-yes}" = "yes" ]; then + err "A pre-migration backup is in ${BACKUP_DIR}/ . To restore if needed:" + err " psql \"\$OPENWATCH_DATABASE_DSN\" < ${BACKUP_DIR}/openwatch-pre-upgrade-*.sql" +fi +err "After resolving the cause: openwatch migrate && systemctl start openwatch" +exit 1 diff --git a/packaging/common/upgrade.conf b/packaging/common/upgrade.conf new file mode 100644 index 00000000..391e5d83 --- /dev/null +++ b/packaging/common/upgrade.conf @@ -0,0 +1,14 @@ +# OpenWatch upgrade behavior. Sourced (KEY=value, no spaces) by the package +# upgrade scriptlet and the backup-cleanup timer. + +# Auto-backup the database (pg_dump) before applying migrations on upgrade. +# This is the restore point if a migration fails. Set to "no" ONLY if you run +# your own verified pre-upgrade backups. +AUTO_BACKUP=yes + +# Where pre-upgrade dumps are written. +BACKUP_DIR=/var/lib/openwatch/backups + +# How many days of dumps to keep. The most recent dump is ALWAYS kept, +# regardless of age, so you never lose your last restore point. +BACKUP_RETENTION_DAYS=30 diff --git a/packaging/deb/build-deb.sh b/packaging/deb/build-deb.sh index a747ad88..c2ee14e9 100755 --- a/packaging/deb/build-deb.sh +++ b/packaging/deb/build-deb.sh @@ -57,13 +57,21 @@ mkdir -p "$STAGE/etc/openwatch/keys" chmod 0750 "$STAGE/etc/openwatch/keys" mkdir -p "$STAGE/etc/systemd/system" mkdir -p "$STAGE/var/lib/openwatch" +# Pre-upgrade DB dumps land here (written by the upgrade scriptlet). +mkdir -p "$STAGE/var/lib/openwatch/backups" +chmod 0750 "$STAGE/var/lib/openwatch/backups" mkdir -p "$STAGE/var/log/openwatch" # Binary + config + unit. install -m 0755 "$DIST_DIR/openwatch" "$STAGE/usr/bin/openwatch" install -m 0640 "$APP_DIR/packaging/common/openwatch.toml" "$STAGE/etc/openwatch/openwatch.toml" +install -m 0640 "$APP_DIR/packaging/common/upgrade.conf" "$STAGE/etc/openwatch/upgrade.conf" install -m 0644 "$APP_DIR/packaging/common/openwatch.service" "$STAGE/etc/systemd/system/openwatch.service" +install -m 0644 "$APP_DIR/packaging/common/openwatch-backup-cleanup.service" "$STAGE/etc/systemd/system/openwatch-backup-cleanup.service" +install -m 0644 "$APP_DIR/packaging/common/openwatch-backup-cleanup.timer" "$STAGE/etc/systemd/system/openwatch-backup-cleanup.timer" install -m 0755 "$APP_DIR/packaging/common/provision-identity-keys.sh" "$STAGE/usr/lib/openwatch/provision-identity-keys.sh" +install -m 0755 "$APP_DIR/packaging/common/openwatch-upgrade.sh" "$STAGE/usr/lib/openwatch/openwatch-upgrade.sh" +install -m 0755 "$APP_DIR/packaging/common/cleanup-backups.sh" "$STAGE/usr/lib/openwatch/cleanup-backups.sh" # Demo TLS cert (chmod inside the script). bash "$APP_DIR/packaging/common/gen-demo-cert.sh" "$STAGE/etc/openwatch/tls" >/dev/null diff --git a/packaging/deb/conffiles b/packaging/deb/conffiles index a659cd5d..99b5dea4 100644 --- a/packaging/deb/conffiles +++ b/packaging/deb/conffiles @@ -1 +1,2 @@ /etc/openwatch/openwatch.toml +/etc/openwatch/upgrade.conf diff --git a/packaging/deb/postinst b/packaging/deb/postinst index 320fcaa6..033f7066 100755 --- a/packaging/deb/postinst +++ b/packaging/deb/postinst @@ -8,11 +8,21 @@ set -e if [ "$1" = "configure" ]; then if command -v systemctl >/dev/null 2>&1; then systemctl daemon-reload || : + # Enable the daily pre-upgrade-backup cleanup timer (install + upgrade). + systemctl enable --now openwatch-backup-cleanup.timer >/dev/null 2>&1 || : fi # Generate-if-absent, so reconfigure/upgrade never clobbers live keys. # preinst already created the openwatch user/group. Not guarded: a real # failure should abort configure rather than leave an unbootable service. /usr/lib/openwatch/provision-identity-keys.sh + + # UPGRADE: dpkg passes the previously-configured version as $2 (empty on a + # fresh install). Apply pending DB migrations with an auto-backup restore + # point + fail-safe service state. `set -e` propagates a migration failure + # (the helper exits non-zero) so dpkg leaves the package needing attention. + if [ -n "${2:-}" ]; then + /usr/lib/openwatch/openwatch-upgrade.sh + fi fi exit 0 diff --git a/packaging/deb/prerm b/packaging/deb/prerm index 9bffc40b..6462a3eb 100755 --- a/packaging/deb/prerm +++ b/packaging/deb/prerm @@ -7,6 +7,7 @@ if [ "$1" = "remove" ] || [ "$1" = "purge" ]; then if command -v systemctl >/dev/null 2>&1; then systemctl stop openwatch.service >/dev/null 2>&1 || : systemctl disable openwatch.service >/dev/null 2>&1 || : + systemctl disable --now openwatch-backup-cleanup.timer >/dev/null 2>&1 || : fi fi diff --git a/packaging/rpm/build-rpm.sh b/packaging/rpm/build-rpm.sh index 947b9b31..5f34025c 100755 --- a/packaging/rpm/build-rpm.sh +++ b/packaging/rpm/build-rpm.sh @@ -62,6 +62,13 @@ cp "$DIST_DIR/openwatch" "$SRC_DIR/openwatch" cp "$APP_DIR/packaging/common/openwatch.toml" "$SRC_DIR/openwatch.toml" cp "$APP_DIR/packaging/common/openwatch.service" "$SRC_DIR/openwatch.service" cp "$APP_DIR/packaging/common/provision-identity-keys.sh" "$SRC_DIR/provision-identity-keys.sh" +# Upgrade automation: migrate-on-upgrade helper, backup cleanup, systemd +# timer + units, and the operator-tunable upgrade.conf. +cp "$APP_DIR/packaging/common/openwatch-upgrade.sh" "$SRC_DIR/openwatch-upgrade.sh" +cp "$APP_DIR/packaging/common/cleanup-backups.sh" "$SRC_DIR/cleanup-backups.sh" +cp "$APP_DIR/packaging/common/upgrade.conf" "$SRC_DIR/upgrade.conf" +cp "$APP_DIR/packaging/common/openwatch-backup-cleanup.service" "$SRC_DIR/openwatch-backup-cleanup.service" +cp "$APP_DIR/packaging/common/openwatch-backup-cleanup.timer" "$SRC_DIR/openwatch-backup-cleanup.timer" # Demo TLS cert. bash "$APP_DIR/packaging/common/gen-demo-cert.sh" "$SRC_DIR" >/dev/null diff --git a/packaging/rpm/openwatch.spec b/packaging/rpm/openwatch.spec index e5ed75ae..71f7c342 100644 --- a/packaging/rpm/openwatch.spec +++ b/packaging/rpm/openwatch.spec @@ -62,16 +62,23 @@ install -d -m 0750 %{buildroot}/etc/openwatch/tls # (they must be unique per install); %post generates them into here. install -d -m 0750 %{buildroot}/etc/openwatch/keys install -m 0640 openwatch.toml %{buildroot}/etc/openwatch/openwatch.toml +install -m 0640 upgrade.conf %{buildroot}/etc/openwatch/upgrade.conf install -m 0644 cert.pem %{buildroot}/etc/openwatch/tls/cert.pem install -m 0600 key.pem %{buildroot}/etc/openwatch/tls/key.pem install -d -m 0755 %{buildroot}/usr/lib/openwatch install -m 0755 provision-identity-keys.sh %{buildroot}/usr/lib/openwatch/provision-identity-keys.sh +install -m 0755 openwatch-upgrade.sh %{buildroot}/usr/lib/openwatch/openwatch-upgrade.sh +install -m 0755 cleanup-backups.sh %{buildroot}/usr/lib/openwatch/cleanup-backups.sh install -d -m 0755 %{buildroot}/etc/systemd/system install -m 0644 openwatch.service %{buildroot}/etc/systemd/system/openwatch.service +install -m 0644 openwatch-backup-cleanup.service %{buildroot}/etc/systemd/system/openwatch-backup-cleanup.service +install -m 0644 openwatch-backup-cleanup.timer %{buildroot}/etc/systemd/system/openwatch-backup-cleanup.timer install -d -m 0750 %{buildroot}/var/lib/openwatch +# Pre-upgrade DB dumps land here (written by the upgrade scriptlet). +install -d -m 0750 %{buildroot}/var/lib/openwatch/backups install -d -m 0750 %{buildroot}/var/log/openwatch %pre @@ -92,11 +99,24 @@ systemctl daemon-reload || : # as a scriptlet warning, not silently leave an unbootable service. /usr/lib/openwatch/provision-identity-keys.sh +# Enable the daily pre-upgrade-backup cleanup timer (install + upgrade). +systemctl enable --now openwatch-backup-cleanup.timer >/dev/null 2>&1 || : + +# On UPGRADE ($1 -ge 2), apply pending DB migrations with an auto-backup +# restore point and a fail-safe service state (stop -> backup+migrate -> +# start, or leave stopped on failure). Skipped on a FRESH install ($1 == 1): +# there is no database yet — the operator runs the documented first-run +# `openwatch migrate` after provisioning Postgres. +if [ "$1" -ge 2 ]; then + /usr/lib/openwatch/openwatch-upgrade.sh +fi + %preun # AC-09: stop and disable cleanly so removal doesn't leave a running orphan. if [ $1 -eq 0 ]; then systemctl stop openwatch.service >/dev/null 2>&1 || : systemctl disable openwatch.service >/dev/null 2>&1 || : + systemctl disable --now openwatch-backup-cleanup.timer >/dev/null 2>&1 || : fi %postun @@ -108,14 +128,20 @@ systemctl daemon-reload || : %dir %attr(0750, root, openwatch) /etc/openwatch %dir %attr(0750, root, openwatch) /etc/openwatch/tls %config(noreplace) %attr(0640, root, openwatch) /etc/openwatch/openwatch.toml +%config(noreplace) %attr(0640, root, openwatch) /etc/openwatch/upgrade.conf %attr(0644, root, openwatch) /etc/openwatch/tls/cert.pem %attr(0600, openwatch, openwatch) /etc/openwatch/tls/key.pem %attr(0644, root, root) /etc/systemd/system/openwatch.service +%attr(0644, root, root) /etc/systemd/system/openwatch-backup-cleanup.service +%attr(0644, root, root) /etc/systemd/system/openwatch-backup-cleanup.timer # Identity-key directory ships empty (0750); %post generates the per-install # keys into it. The key files are intentionally NOT packaged. %dir %attr(0750, root, openwatch) /etc/openwatch/keys %attr(0755, root, root) /usr/lib/openwatch/provision-identity-keys.sh +%attr(0755, root, root) /usr/lib/openwatch/openwatch-upgrade.sh +%attr(0755, root, root) /usr/lib/openwatch/cleanup-backups.sh %dir %attr(0750, openwatch, openwatch) /var/lib/openwatch +%dir %attr(0750, root, openwatch) /var/lib/openwatch/backups %dir %attr(0750, openwatch, openwatch) /var/log/openwatch %changelog diff --git a/packaging/tests/upgrade_test.go b/packaging/tests/upgrade_test.go new file mode 100644 index 00000000..9bbc054f --- /dev/null +++ b/packaging/tests/upgrade_test.go @@ -0,0 +1,140 @@ +// @spec release-upgrade +// +// AC traceability (this file): +// +// AC-03 TestUpgrade_MigrateFlagsAndFailClosed +// AC-04 TestUpgrade_ScriptletGuardedOnUpgradeOnly +// AC-05 TestUpgrade_HelperSequenceAndFailSafe +// AC-06 TestUpgrade_CleanupTimerShippedAndKeepsNewest +// AC-07 TestUpgrade_PayloadShipsUpgradeFiles + +package packaging_test + +import ( + "strings" + "testing" +) + +// @ac AC-03 +// AC-03: `openwatch migrate` gains --status and --backup-dir, the backup +// runs BEFORE Apply, and a backup failure returns without migrating. +func TestUpgrade_MigrateFlagsAndFailClosed(t *testing.T) { + t.Run("release-upgrade/AC-03", func(t *testing.T) { + src := readAppFile(t, "cmd/openwatch/main.go") + for _, needle := range []string{`"backup-dir"`, `"status"`, "refusing to migrate"} { + if !strings.Contains(src, needle) { + t.Errorf("cmdMigrate missing %q", needle) + } + } + // The pre-migration backup must precede migrations.Apply. + backupAt := strings.Index(src, "dbbackup.Run(") + applyAt := strings.Index(src, "migrations.Apply(ctx, pool)") + if backupAt < 0 || applyAt < 0 { + t.Fatalf("could not locate backup (%d) / apply (%d) in cmdMigrate", backupAt, applyAt) + } + if backupAt > applyAt { + t.Error("backup must run BEFORE migrations.Apply (fail-closed restore point)") + } + }) +} + +// @ac AC-04 +// AC-04: the upgrade helper runs on UPGRADE only — RPM gates on $1>=2, DEB +// on a non-empty old-version $2 — never on a fresh install. +func TestUpgrade_ScriptletGuardedOnUpgradeOnly(t *testing.T) { + t.Run("release-upgrade/AC-04", func(t *testing.T) { + spec := readAppFile(t, "packaging/rpm/openwatch.spec") + if !strings.Contains(spec, `[ "$1" -ge 2 ]`) || !strings.Contains(spec, "openwatch-upgrade.sh") { + t.Error(`RPM post-install must guard openwatch-upgrade.sh on [ "$1" -ge 2 ] (upgrade only)`) + } + post := readAppFile(t, "packaging/deb/postinst") + if !strings.Contains(post, `[ -n "${2:-}" ]`) || !strings.Contains(post, "openwatch-upgrade.sh") { + t.Error("DEB postinst must guard openwatch-upgrade.sh on a non-empty old-version $2 (upgrade only)") + } + }) +} + +// @ac AC-05 +// AC-05: the helper stops the service, migrates with a backup, starts on +// success, and on failure leaves the service stopped + exits non-zero. +func TestUpgrade_HelperSequenceAndFailSafe(t *testing.T) { + t.Run("release-upgrade/AC-05", func(t *testing.T) { + h := readAppFile(t, "packaging/common/openwatch-upgrade.sh") + for _, needle := range []string{ + "systemctl stop openwatch.service", + "openwatch migrate", + "--backup-dir", + "systemctl start openwatch.service", + "STOPPED", // the fail-safe message + "exit 1", // non-zero on migration failure + } { + if !strings.Contains(h, needle) { + t.Errorf("openwatch-upgrade.sh missing %q", needle) + } + } + // Start must be gated on migrate success, not unconditional: the + // success path returns before the failure block. + if strings.Index(h, "systemctl start openwatch.service") > strings.Index(h, "MIGRATION FAILED") { + t.Error("service start must be on the success path, before the failure block") + } + }) +} + +// @ac AC-06 +// AC-06: the cleanup timer + service ship and are enabled; the prune keeps +// the most recent dump. +func TestUpgrade_CleanupTimerShippedAndKeepsNewest(t *testing.T) { + t.Run("release-upgrade/AC-06", func(t *testing.T) { + // Enabled in both scriptlets. + if !strings.Contains(readAppFile(t, "packaging/rpm/openwatch.spec"), "enable --now openwatch-backup-cleanup.timer") { + t.Error("RPM post-install must enable openwatch-backup-cleanup.timer") + } + if !strings.Contains(readAppFile(t, "packaging/deb/postinst"), "enable --now openwatch-backup-cleanup.timer") { + t.Error("DEB postinst must enable openwatch-backup-cleanup.timer") + } + // The prune always keeps the newest dump (index 0 -> continue). + clean := readAppFile(t, "packaging/common/cleanup-backups.sh") + if !strings.Contains(clean, "always keep") && !strings.Contains(clean, "keep the most recent") { + t.Error("cleanup-backups.sh must keep the most recent dump") + } + if !strings.Contains(clean, "BACKUP_RETENTION_DAYS") { + t.Error("cleanup-backups.sh must prune by BACKUP_RETENTION_DAYS") + } + }) +} + +// @ac AC-07 +// AC-07: the package payloads carry the upgrade helper, cleanup script, the +// two systemd units, the empty backups dir, and upgrade.conf as a config. +func TestUpgrade_PayloadShipsUpgradeFiles(t *testing.T) { + t.Run("release-upgrade/AC-07", func(t *testing.T) { + want := []string{ + "/usr/lib/openwatch/openwatch-upgrade.sh", + "/usr/lib/openwatch/cleanup-backups.sh", + "/etc/systemd/system/openwatch-backup-cleanup.service", + "/etc/systemd/system/openwatch-backup-cleanup.timer", + "/etc/openwatch/upgrade.conf", + "/var/lib/openwatch/backups", + } + + rpm := rpmPath(t) + rpmFiles := rpmQuery(t, rpm, "[%{FILENAMES}\n]") + for _, f := range want { + if !strings.Contains(rpmFiles, f) { + t.Errorf("RPM payload missing %s", f) + } + } + // upgrade.conf must be a config file (noreplace), like openwatch.toml. + if cfgs := rpmQuery(t, rpm, "[%{FILEFLAGS:fflags} %{FILENAMES}\n]"); !strings.Contains(cfgs, "/etc/openwatch/upgrade.conf") { + t.Errorf("upgrade.conf not in file list: %s", cfgs) + } + + deb := debPath(t) + debFiles := debContents(t, deb) + for _, f := range want { + if !strings.Contains(debFiles, f) { + t.Errorf("DEB payload missing %s", f) + } + } + }) +} diff --git a/specs/release/upgrade.spec.yaml b/specs/release/upgrade.spec.yaml new file mode 100644 index 00000000..25821a29 --- /dev/null +++ b/specs/release/upgrade.spec.yaml @@ -0,0 +1,93 @@ +spec: + id: release-upgrade + title: One-command package upgrade with auto-migrate + backup + version: "1.0.0" + status: approved + tier: 2 + + context: + system: openwatch-go + feature: Automatic, safe DB migration on package upgrade + description: > + The operator upgrades OpenWatch with a single `dnf update openwatch*` + (or apt) command; the package post-install scriptlet applies any + pending database migrations automatically, taking a pg_dump restore + point first, and restarts the service — or, if a migration fails, + leaves the service stopped with a clear recovery path. The app schema + is in scope; the PostgreSQL ENGINE major-version upgrade is NOT (that + is an operator-supervised pg_upgrade, never triggered from a scriptlet). + related_specs: + - release-package-build + - system-config + + objective: + summary: > + Make a production upgrade one command, safe by construction: never + migrate without a restore point, never run a binary against a + mismatched schema, and never lose data — for the single-instance + appliance model. + scope: + includes: + - openwatch migrate --status (report pending without applying) + - openwatch migrate --backup-dir (pg_dump restore point before applying; fail-closed) + - pg_dump password delivered via PG* env, never in argv + - RPM %post / DEB postinst run the upgrade helper ONLY on upgrade (not fresh install) + - Upgrade helper sequence stop -> backup+migrate -> start, fail-safe on error + - Backup-cleanup systemd timer + service, keep-most-recent prune + - Operator-tunable /etc/openwatch/upgrade.conf (AUTO_BACKUP, retention) + excludes: + - PostgreSQL engine major-version upgrade (operator-supervised pg_upgrade) + - Multi-instance / HA migration coordination (advisory lock + rolling restart) + - Zero-downtime upgrades (the appliance model accepts brief downtime during migrate) + + constraints: + - id: C-01 + description: The pre-migration pg_dump MUST receive connection parameters (especially the password) via PG* environment variables, never on the command line / argv. + type: security + enforcement: error + - id: C-02 + description: When a backup is requested it MUST complete successfully before any migration is applied; a failed backup MUST abort without migrating (fail-closed). Backup is skipped only when the DB has no schema yet (fresh DB). + type: technical + enforcement: error + - id: C-03 + description: The package scriptlet MUST run the auto-migrate helper ONLY on upgrade (RPM $1 >= 2; DEB configure with a non-empty old-version $2), never on a fresh install (no database exists yet). + type: technical + enforcement: error + - id: C-04 + description: On migration failure the upgrade helper MUST leave the service STOPPED (never run the new binary against an un-migrated schema) and exit non-zero so the package manager surfaces the problem, printing the restore path. + type: technical + enforcement: error + - id: C-05 + description: The backup-cleanup MUST always keep the most recent dump regardless of age, pruning only older dumps past the retention window. + type: technical + enforcement: error + + acceptance_criteria: + - id: AC-01 + description: dbbackup.Command places the password (and all connection params) in the command Env, never in Args — verified for a DSN carrying a password. + priority: critical + references_constraints: [C-01] + - id: AC-02 + description: dbbackup.DumpFileName is deterministic given (version, stamp) and sanitizes unsafe characters; empty version defaults to a non-empty token. + priority: high + references_constraints: [C-01] + - id: AC-03 + description: cmdMigrate supports --status (report current version + pending count, no apply) and --backup-dir (pg_dump before Apply, skipped when version==0, fail-closed on backup error) — the backup precedes migrations.Apply and a backup error returns before Apply. + priority: critical + references_constraints: [C-02] + - id: AC-04 + description: The RPM %post guards the upgrade helper on `$1 -ge 2` and the DEB postinst guards it on a non-empty old-version `$2`, so it runs on upgrade only, never on fresh install. + priority: critical + references_constraints: [C-03] + - id: AC-05 + description: packaging/common/openwatch-upgrade.sh stops the service, runs `openwatch migrate --backup-dir`, starts on success, and on failure leaves the service stopped and exits non-zero with the restore path (source-inspection). + priority: critical + references_constraints: [C-04] + - id: AC-06 + description: Both packages ship the openwatch-backup-cleanup .service + .timer and enable the timer in %post/postinst; cleanup-backups.sh keeps the most recent dump and prunes older ones past BACKUP_RETENTION_DAYS. + priority: high + references_constraints: [C-05] + - id: AC-07 + description: The package payloads contain /usr/lib/openwatch/openwatch-upgrade.sh, /usr/lib/openwatch/cleanup-backups.sh, the two systemd units, and the empty /var/lib/openwatch/backups directory; /etc/openwatch/upgrade.conf ships as a noreplace config file. + priority: high + references_constraints: [C-03] diff --git a/specter.yaml b/specter.yaml index 07cf6a99..f635b9e5 100644 --- a/specter.yaml +++ b/specter.yaml @@ -26,6 +26,7 @@ domains: - system-job-queue - system-policy - release-package-build + - release-upgrade - release-fips-build - release-stage-0-signoff - release-ci-gates From b5a8f5933226225af48bad5d48b459ac420b9a87 Mon Sep 17 00:00:00 2001 From: Remylus Losius Date: Mon, 15 Jun 2026 23:48:21 -0400 Subject: [PATCH 2/5] test(packaging): full package-upgrade test in a container Proves the real upgrade path end to end, beyond the source-inspection + scriptlet-logic tests: install the OLD openwatch RPM (release 1), stand up Postgres, roll the schema back one migration to simulate the prior version, then `rpm -U` the NEW RPM (release 2) and assert the package's %post scriptlet migrated the DB to head (34 -> 35, host_connection_profile created), took a pre-upgrade backup, and issued the service stop/start. - packaging/tests/upgrade-container-test.sh runs inside rockylinux:9 (a systemctl shim records stop/start since the container has no systemd). - packaging/tests/run-upgrade-container-test.sh is the one-command host driver: builds the two RPM releases and runs the container. - openwatch-upgrade.sh: OPENWATCH_UPGRADE_CONF / OPENWATCH_SECRETS_ENV overrides (default to the production /etc paths) so the test can point config + secrets at a scratch fixture. No production behavior change. Verified locally: RESULT PASS (34 -> 35, backup taken, stop+start issued). --- packaging/common/openwatch-upgrade.sh | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/packaging/common/openwatch-upgrade.sh b/packaging/common/openwatch-upgrade.sh index b8f727d2..233eaea9 100755 --- a/packaging/common/openwatch-upgrade.sh +++ b/packaging/common/openwatch-upgrade.sh @@ -16,7 +16,9 @@ # NOT `set -e`: we handle each failure explicitly to control the fail mode. set -uo pipefail -CONF=/etc/openwatch/upgrade.conf +# Config/secrets paths default to the production locations; the env +# overrides exist ONLY so the e2e test can point them at a scratch fixture. +CONF="${OPENWATCH_UPGRADE_CONF:-/etc/openwatch/upgrade.conf}" AUTO_BACKUP=yes BACKUP_DIR=/var/lib/openwatch/backups # shellcheck source=/dev/null @@ -24,7 +26,7 @@ BACKUP_DIR=/var/lib/openwatch/backups # The migrate command reads the same config the service does; load the DB # secret the systemd unit normally injects so a root scriptlet can connect. -SECRETS=/etc/openwatch/secrets.env +SECRETS="${OPENWATCH_SECRETS_ENV:-/etc/openwatch/secrets.env}" if [ -f "$SECRETS" ]; then set -a # shellcheck source=/dev/null From 97087b78f895691e97baadb463169337c69b84fa Mon Sep 17 00:00:00 2001 From: Remylus Losius Date: Mon, 15 Jun 2026 23:49:18 -0400 Subject: [PATCH 3/5] test(packaging): add the container upgrade-test scripts (un-ignore) The two scripts were silently caught by .gitignore's blanket `*test*.sh` and dropped from the previous commit. Add a `!packaging/tests/*test*.sh` exception so committed test-harness scripts are tracked, and add the container upgrade test + its host driver. --- .gitignore | 3 + packaging/tests/run-upgrade-container-test.sh | 35 +++++++++ packaging/tests/upgrade-container-test.sh | 72 +++++++++++++++++++ 3 files changed, 110 insertions(+) create mode 100755 packaging/tests/run-upgrade-container-test.sh create mode 100755 packaging/tests/upgrade-container-test.sh diff --git a/.gitignore b/.gitignore index 60eca9f7..e020773f 100644 --- a/.gitignore +++ b/.gitignore @@ -439,6 +439,9 @@ frontend_test.log *test*.md *test*.js *test*.cjs +# Exception: committed test harness scripts under packaging/tests/ (e.g. the +# container upgrade test) are real tracked files, not transient scratch. +!packaging/tests/*test*.sh *test*.mjs *test*.ts *test*.tsx diff --git a/packaging/tests/run-upgrade-container-test.sh b/packaging/tests/run-upgrade-container-test.sh new file mode 100755 index 00000000..e4f1fade --- /dev/null +++ b/packaging/tests/run-upgrade-container-test.sh @@ -0,0 +1,35 @@ +#!/usr/bin/env bash +# Host-side driver for the full package-upgrade test. Builds an OLD +# (release 1) and NEW (release 2) openwatch RPM from the current tree, then +# runs packaging/tests/upgrade-container-test.sh inside a rockylinux:9 +# container to prove that `rpm -U` (the real package upgrade) migrates the +# DB to head, takes a pre-upgrade backup, and stop/starts the service. +# +# Requires docker plus the host build toolchain (go, make, rpmbuild). +# +# Network: uses --network host so dnf inside the container can reach the +# Rocky mirrors; the container's throwaway Postgres binds the unused port +# 55432 to avoid colliding with any Postgres already on the host. +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +APP_DIR="$(cd "$SCRIPT_DIR/../.." && pwd)" +cd "$APP_DIR" + +command -v docker >/dev/null || { echo "docker is required" >&2; exit 1; } + +RPMS="$(mktemp -d)" +trap 'rm -rf "$RPMS"' EXIT +mkdir -p "$RPMS/old" "$RPMS/new" + +echo ">> building OLD (release 1) and NEW (release 2) RPMs" +make rpm >/dev/null +cp dist/openwatch-*-1.*.rpm "$RPMS/old/" +RPM_RELEASE=2 bash packaging/rpm/build-rpm.sh >/dev/null +cp dist/openwatch-*-2.*.rpm "$RPMS/new/" + +echo ">> running the upgrade in a rockylinux:9 container" +exec docker run --rm --network host \ + -v "$RPMS:/rpms:ro" \ + -v "$APP_DIR/packaging/tests/upgrade-container-test.sh:/test.sh:ro" \ + rockylinux:9 bash /test.sh diff --git a/packaging/tests/upgrade-container-test.sh b/packaging/tests/upgrade-container-test.sh new file mode 100755 index 00000000..8715290c --- /dev/null +++ b/packaging/tests/upgrade-container-test.sh @@ -0,0 +1,72 @@ +#!/usr/bin/env bash +# Full package-manager upgrade test. Runs INSIDE a rockylinux:9 container +# with the OLD and NEW openwatch RPMs mounted at /rpms/old and /rpms/new. +# +# It proves the real upgrade path: install the OLD package, stand up a +# database, roll it back one migration to simulate a prior version, then +# `rpm -U` the NEW package and confirm the package's %post scriptlet (with +# $1=2) migrated the DB to head and took a pre-upgrade backup — with a +# systemctl shim standing in for systemd (no init in the container). +# +# Invoke from the host via packaging/tests/run-upgrade-container-test.sh. +set -euo pipefail + +echo "### prerequisites" +dnf install -y -q postgresql-server postgresql openssl findutils >/dev/null + +echo "### start postgres (no systemd)" +PGDATA=/var/lib/pgsql/data +mkdir -p /run/postgresql && chown postgres:postgres /run/postgresql +su postgres -c "initdb -D $PGDATA" >/dev/null +su postgres -c "pg_ctl -D $PGDATA -o '-c listen_addresses=127.0.0.1 -p 55432' -l /tmp/pg.log -w start" \ + || { echo '--- pg log ---'; cat /tmp/pg.log; exit 1; } +su postgres -c "psql -p 55432 -q -c \"CREATE USER ow PASSWORD 'ow' SUPERUSER;\"" +su postgres -c "createdb -p 55432 -O ow owdb" +DSN="postgres://ow:ow@127.0.0.1:55432/owdb?sslmode=disable" + +echo "### systemctl shim (records the helper's stop/start)" +# Overwrite the real systemctl: there is no systemd init in the container, and +# RPM scriptlets run with a restricted PATH (/sbin:/bin:/usr/sbin:/usr/bin) +# that excludes /usr/local/bin, so the shim must live in /usr/bin to be seen. +printf '#!/bin/sh\necho "$@" >> /tmp/systemctl.log\nexit 0\n' > /usr/bin/systemctl +chmod +x /usr/bin/systemctl +: > /tmp/systemctl.log + +echo "### install OLD package (release 1)" +rpm -i --nodeps /rpms/old/openwatch-*.rpm +echo "OPENWATCH_DATABASE_DSN=$DSN" > /etc/openwatch/secrets.env + +echo "### bring DB to head, then roll back migration 0035 to simulate the prior version" +set -a; . /etc/openwatch/secrets.env; set +a +openwatch migrate >/dev/null +PGPASSWORD=ow psql -h 127.0.0.1 -p 55432 -U ow -d owdb -q \ + -c "DROP TABLE IF EXISTS host_connection_profile CASCADE; DELETE FROM goose_db_version WHERE version_id=35;" +before=$(PGPASSWORD=ow psql -h 127.0.0.1 -p 55432 -U ow -d owdb -tAc 'SELECT max(version_id) FROM goose_db_version') +hcp_before=$(PGPASSWORD=ow psql -h 127.0.0.1 -p 55432 -U ow -d owdb -tAc "SELECT to_regclass('host_connection_profile')") +echo "BEFORE upgrade: version=$before host_connection_profile=${hcp_before:-}" +: > /tmp/systemctl.log + +echo "### UPGRADE: rpm -U the NEW package (release 2) — triggers %post with \$1=2" +rpm -U --nodeps /rpms/new/openwatch-*.rpm + +echo "### results" +after=$(PGPASSWORD=ow psql -h 127.0.0.1 -p 55432 -U ow -d owdb -tAc 'SELECT max(version_id) FROM goose_db_version') +hcp_after=$(PGPASSWORD=ow psql -h 127.0.0.1 -p 55432 -U ow -d owdb -tAc "SELECT to_regclass('host_connection_profile')") +echo "AFTER upgrade: version=$after host_connection_profile=${hcp_after:-}" +echo "systemctl during upgrade: $(tr '\n' ',' < /tmp/systemctl.log)" +echo "backups: $(ls /var/lib/openwatch/backups/ 2>/dev/null || echo none)" + +echo "### assertions" +fail=0 +[ "$before" = "34" ] || { echo "FAIL: pre-upgrade version != 34"; fail=1; } +[ "$after" = "35" ] || { echo "FAIL: post-upgrade version != 35 (migration did not apply)"; fail=1; } +[ "$hcp_after" = "host_connection_profile" ] || { echo "FAIL: 0035 table not created by the upgrade"; fail=1; } +ls /var/lib/openwatch/backups/openwatch-pre-upgrade-*.sql >/dev/null 2>&1 || { echo "FAIL: no pre-upgrade backup"; fail=1; } +grep -q "stop openwatch.service" /tmp/systemctl.log || { echo "FAIL: service not stopped"; fail=1; } +grep -q "start openwatch.service" /tmp/systemctl.log || { echo "FAIL: service not restarted"; fail=1; } +if [ "$fail" -eq 0 ]; then + echo "RESULT: PASS - the package upgrade migrated 34 -> 35 with a backup and a stop/start" +else + echo "RESULT: FAIL" + exit 1 +fi From d104bff8229c7ea576ba1382a3dcdda515c64e75 Mon Sep 17 00:00:00 2001 From: Remylus Losius Date: Mon, 15 Jun 2026 23:52:27 -0400 Subject: [PATCH 4/5] ci(packaging): add an upgrade-smoke job (rpm -U auto-migrate) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds an `upgrade` job to package-smoke that exercises the full package UPGRADE path the per-distro `smoke` (fresh install) job can't: build an old (release 1) + new (release 2) RPM, then in a rockylinux:9 container install the old, stand up Postgres, roll the schema back one migration, and `rpm -U` the new — asserting the %post scriptlet migrates the DB to head, takes a pre-upgrade backup, and stop/starts the service. Runs the committed packaging/tests/run-upgrade-container-test.sh driver. Fires on the same packaging-change / tag / dispatch triggers as the rest of the workflow, so it runs on this PR. --- .github/workflows/package-smoke.yml | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/.github/workflows/package-smoke.yml b/.github/workflows/package-smoke.yml index 09b5a803..a01e0a0d 100644 --- a/.github/workflows/package-smoke.yml +++ b/.github/workflows/package-smoke.yml @@ -115,3 +115,29 @@ jobs: # Binary runs (no DB needed for these). /usr/bin/openwatch --version /usr/bin/openwatch check-config + + # Full package-UPGRADE path: build an old (release 1) + new (release 2) RPM, + # then in a rockylinux:9 container install the old one, stand up Postgres, + # roll the schema back one migration, and `rpm -U` the new one — asserting + # the %post scriptlet migrates the DB to head, takes a pre-upgrade backup, + # and stop/starts the service. This is the half the per-distro `smoke` job + # (a fresh install) does not cover. The driver script builds both RPMs and + # runs the container; it uses --network host (so the container's dnf reaches + # mirrors) and a throwaway Postgres on port 55432. + upgrade: + name: Package upgrade (rpm -U auto-migrate) + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - uses: actions/setup-go@v5 + with: + go-version: '1.26.4' + - uses: actions/setup-node@v4 + with: + node-version: '20' + cache: 'npm' + cache-dependency-path: frontend/package-lock.json + - name: Install rpmbuild + run: sudo apt-get update && sudo apt-get install -y rpm + - name: Build old+new RPMs and run the container upgrade test + run: bash packaging/tests/run-upgrade-container-test.sh From 49ffede2994dfb084e72039e7cc9ce72d152a230 Mon Sep 17 00:00:00 2001 From: Remylus Losius Date: Mon, 15 Jun 2026 23:57:55 -0400 Subject: [PATCH 5/5] fix(dbbackup): suppress gosec G204 on the controlled pg_dump exec The command is the constant pg_dump, args are package-controlled flags + an output path (never user input), and connection params travel via env not argv. Annotate both exec sites with // #nosec G204 (the repo's convention) so make lint passes. --- internal/dbbackup/dbbackup.go | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/internal/dbbackup/dbbackup.go b/internal/dbbackup/dbbackup.go index 8bec97b0..eb3544a4 100644 --- a/internal/dbbackup/dbbackup.go +++ b/internal/dbbackup/dbbackup.go @@ -77,6 +77,9 @@ func Command(dsn, outPath string) (*exec.Cmd, error) { if err != nil { return nil, err } + // #nosec G204 -- the command is the constant "pg_dump"; the args are + // package-controlled flags + an output path (never user input), and the + // connection parameters (incl. the password) travel via env, not argv. cmd := exec.Command("pg_dump", dumpArgs(outPath)...) cmd.Env = env return cmd, nil @@ -93,6 +96,8 @@ func Run(ctx context.Context, dsn, dir, version, stamp string) (string, error) { if err != nil { return "", err } + // #nosec G204 -- see Command: constant "pg_dump", package-controlled + // args, secrets via env not argv. cmd := exec.CommandContext(ctx, "pg_dump", dumpArgs(out)...) cmd.Env = env if combined, err := cmd.CombinedOutput(); err != nil {