Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
26 changes: 26 additions & 0 deletions .github/workflows/package-smoke.yml
Original file line number Diff line number Diff line change
Expand Up @@ -115,3 +115,29 @@ jobs:
# Binary runs (no DB needed for these).
/usr/bin/openwatch --version
/usr/bin/openwatch check-config

# Full package-UPGRADE path: build an old (release 1) + new (release 2) RPM,
# then in a rockylinux:9 container install the old one, stand up Postgres,
# roll the schema back one migration, and `rpm -U` the new one — asserting
# the %post scriptlet migrates the DB to head, takes a pre-upgrade backup,
# and stops, then restarts, the service. This is the half the per-distro `smoke` job
# (a fresh install) does not cover. The driver script builds both RPMs and
# runs the container; it uses --network host (so the container's dnf reaches
# mirrors) and a throwaway Postgres on port 55432.
upgrade:
name: Package upgrade (rpm -U auto-migrate)
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
- uses: actions/setup-go@v5
with:
go-version: '1.26.4'
- uses: actions/setup-node@v4
with:
node-version: '20'
cache: 'npm'
cache-dependency-path: frontend/package-lock.json
- name: Install rpmbuild
run: sudo apt-get update && sudo apt-get install -y rpm
- name: Build old+new RPMs and run the container upgrade test
run: bash packaging/tests/run-upgrade-container-test.sh
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -439,6 +439,9 @@ frontend_test.log
*test*.md
*test*.js
*test*.cjs
# Exception: committed test harness scripts under packaging/tests/ (e.g. the
# container upgrade test) are real tracked files, not transient scratch.
!packaging/tests/*test*.sh
*test*.mjs
*test*.ts
*test*.tsx
Expand Down
70 changes: 59 additions & 11 deletions cmd/openwatch/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@ import (

"github.com/Hanalyx/openwatch/internal/db"
"github.com/Hanalyx/openwatch/internal/db/migrations"
"github.com/Hanalyx/openwatch/internal/dbbackup"
"github.com/Hanalyx/openwatch/internal/eventbus"
"github.com/Hanalyx/openwatch/internal/exception"
"github.com/Hanalyx/openwatch/internal/group"
Expand Down Expand Up @@ -646,15 +647,36 @@ func parseLogLevel(s string) slog.Level {
}
}

// cmdMigrate connects to the configured database, runs goose Up for every
// pending migration, and prints the resulting version.
func cmdMigrate(cfg *config.Config, _ []string, stdout, stderr *os.File) int {
// cmdMigrate connects to the configured database and runs goose Up for
// every pending migration.
//
// Flags:
//
// --status report the current version + whether migrations are
// pending, WITHOUT applying anything (used by the
// package upgrade scriptlet and by operators).
// --backup-dir <dir> pg_dump to <dir> as a restore point BEFORE applying.
// Skipped when the DB has no schema yet (fresh install,
// nothing to back up). If the backup fails the command
// fails WITHOUT migrating — we never migrate without the
// restore point we promised.
func cmdMigrate(cfg *config.Config, args []string, stdout, stderr *os.File) int {
fs := flag.NewFlagSet("migrate", flag.ContinueOnError)
fs.SetOutput(stderr)
backupDir := fs.String("backup-dir", "", "pg_dump to this directory before applying (skipped when the DB has no schema yet)")
statusOnly := fs.Bool("status", false, "report current version + pending count without applying")
if err := fs.Parse(args); err != nil {
return 1
}

if err := cfg.Validate(); err != nil {
fmt.Fprintf(stderr, "openwatch migrate: invalid config:\n%v\n", err)
return 1
}

ctx, cancel := context.WithTimeout(context.Background(), 60*time.Second)
// Generous timeout: a pg_dump of a large DB before migrating can take
// minutes; this is an operator/scriptlet command, not a hot path.
ctx, cancel := context.WithTimeout(context.Background(), 10*time.Minute)
defer cancel()

pool, err := db.NewPool(ctx, cfg.Database.DSN, cfg.Database.MaxConnections)
Expand All @@ -664,23 +686,47 @@ func cmdMigrate(cfg *config.Config, _ []string, stdout, stderr *os.File) int {
}
defer pool.Close()

curr, files, err := migrations.Status(ctx, pool)
if err != nil {
fmt.Fprintf(stderr, "openwatch migrate: status: %v\n", err)
return 1
}
total := len(files)

if *statusOnly {
fmt.Fprintf(stdout, "current version: %d\n", curr)
if int(curr) >= total {
fmt.Fprintln(stdout, "up to date — no migrations pending")
} else {
fmt.Fprintf(stdout, "PENDING: %d migration(s) not yet applied — run `openwatch migrate`\n", total-int(curr))
}
return 0
}

// Restore point before applying — only when a schema already exists; a
// fresh DB (curr == 0) has nothing to dump. Fail closed on backup error.
if *backupDir != "" && curr > 0 {
stamp := time.Now().UTC().Format("20060102T150405Z")
path, berr := dbbackup.Run(ctx, cfg.Database.DSN, *backupDir, version.Version, stamp)
if berr != nil {
fmt.Fprintf(stderr, "openwatch migrate: backup failed, refusing to migrate: %v\n", berr)
return 1
}
fmt.Fprintf(stdout, "backed up to %s\n", path)
}

fmt.Fprintf(stdout, "applying migrations against %s ...\n", config.RedactDSN(cfg.Database.DSN))
if err := migrations.Apply(ctx, pool); err != nil {
fmt.Fprintf(stderr, "openwatch migrate: %v\n", err)
return 1
}

version, files, err := migrations.Status(ctx, pool)
newVer, _, err := migrations.Status(ctx, pool)
if err != nil {
fmt.Fprintf(stderr, "openwatch migrate: status: %v\n", err)
return 1
}
fmt.Fprintf(stdout, " current version: %d\n", version)
fmt.Fprintf(stdout, " migration files: %d\n", len(files))
for _, name := range files {
fmt.Fprintf(stdout, " - %s\n", name)
}
fmt.Fprintln(stdout, "migrations applied")
fmt.Fprintf(stdout, "migrations applied — version %d -> %d\n", curr, newVer)
return 0
}

Expand Down Expand Up @@ -792,6 +838,8 @@ subcommands:
serve run the HTTPS API server (default)
worker run the scan-job claimer/dispatcher loop
migrate apply pending goose migrations
--status report version + pending count, don't apply
--backup-dir <dir> pg_dump a restore point before applying
create-admin create the first admin user (requires --username --email --password)
check-config validate and print resolved config

Expand Down
89 changes: 89 additions & 0 deletions docs/runbooks/UPGRADING.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,89 @@
# Upgrading OpenWatch

OpenWatch upgrades are **one command**. The package post-install scriptlet
applies any pending database migrations automatically — taking a backup
restore point first — and restarts the service.

```bash
# RHEL / CentOS / Rocky / Alma / Fedora
sudo dnf update -y 'openwatch*' 'kensa-rules*'

# Debian / Ubuntu
sudo apt update && sudo apt install --only-upgrade openwatch kensa-rules
```

That's it. On a single-instance install this is all an operator needs to do.

## What happens automatically (on upgrade)

The scriptlet runs **only on upgrade**, never on a fresh install, and does:

1. **Checks the database is reachable.** If it isn't, migrations are skipped
with a warning (the upgrade doesn't fail) — run `openwatch migrate`
manually once the DB is back, then `systemctl restart openwatch`.
2. **Stops the service** — so the new binary never runs against an old schema
and vice versa.
3. **Backs up the database** with `pg_dump` to `/var/lib/openwatch/backups/`
(this is your restore point; the password is passed to `pg_dump` via the
environment, never on the command line).
4. **Applies pending migrations.** Each migration runs in a transaction, so a
failure rolls back atomically — your data is never left half-migrated.
5. **On success → starts the service** on the new version.
**On failure → leaves the service stopped**, prints the restore path, and
exits non-zero so `dnf`/`apt` flag that the upgrade needs attention.

### Pre-flight (optional)

See what would change before upgrading:

```bash
sudo openwatch migrate --status
# -> "up to date — no migrations pending" OR "PENDING: N migration(s) ..."
```

## If a migration fails

The service is left **stopped** and your data is intact (the failed migration
rolled back). Recover with:

```bash
# 1. read the error in the dnf/apt output, or run: journalctl -u openwatch
# 2. fix the cause, then re-apply:
sudo openwatch migrate
sudo systemctl start openwatch
# on Debian, also clear the half-configured state:
sudo dpkg --configure -a
```

### Restoring from the pre-upgrade backup (last resort)

```bash
ls -t /var/lib/openwatch/backups/ # newest dump first
sudo systemctl stop openwatch
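# The DSN is stored in /etc/openwatch/secrets.env as OPENWATCH_DATABASE_DSN;
# set that variable in your shell before running psql. The dump is plain SQL
# taken without --clean, so restore it into an empty database (drop and
# recreate the database first if it still contains the old objects).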
psql "$OPENWATCH_DATABASE_DSN" < /var/lib/openwatch/backups/openwatch-pre-upgrade-<...>.sql
```

## Backups: location, retention, opt-out

- Dumps live in `/var/lib/openwatch/backups/`.
- A systemd timer (`openwatch-backup-cleanup.timer`, runs daily) prunes dumps
older than `BACKUP_RETENTION_DAYS` (default 30) but **always keeps the most
recent one**, so you never lose your last restore point.
- Tune behavior in `/etc/openwatch/upgrade.conf`:
- `AUTO_BACKUP=yes|no` — set `no` only if you run your own verified
pre-upgrade backups.
- `BACKUP_DIR`, `BACKUP_RETENTION_DAYS`.

## Scope and limits

- **App schema migrations: automatic and safe** (this document).
- **Minor PostgreSQL / dependency updates**: handled by `dnf`/`apt` itself via
package dependencies — nothing extra to do.
- **PostgreSQL MAJOR-version upgrade** (e.g. 15 → 16): **NOT** performed by the
OpenWatch scriptlet. That is a data-directory migration (`pg_upgrade` or
dump/restore) that needs both server versions and must be operator-supervised
— doing it silently from a package upgrade risks the whole database. Plan it
separately, with its own backup.
- **Brief downtime** during the migrate step is expected (the appliance model).
Multi-instance / zero-downtime upgrades are out of scope.
107 changes: 107 additions & 0 deletions internal/dbbackup/dbbackup.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,107 @@
// Package dbbackup creates a plain-SQL pg_dump of the OpenWatch database,
// used as the pre-upgrade restore point before migrations run.
//
// The cardinal rule: connection parameters (especially the password) go to
// pg_dump via PG* environment variables, NEVER on the command line — so the
// password never appears in the process argv (visible in `ps`). This mirrors
// how the rest of the codebase keeps credentials out of argv.
package dbbackup

import (
"context"
"fmt"
"os"
"os/exec"
"path/filepath"
"strconv"

"github.com/jackc/pgx/v5/pgxpool"
)

// dumpArgs returns the pg_dump argument list used by both Command and Run.
//
// The list is: --no-owner and --no-privileges (so the dump does not depend on
// the role layout of the database it is later restored into with
// `psql < file`), followed by -f and outPath, which tells pg_dump to write
// the dump to that file. No connection parameter is ever part of the list.
func dumpArgs(outPath string) []string {
	args := make([]string, 0, 4)
	args = append(args, "--no-owner", "--no-privileges")
	args = append(args, "-f", outPath)
	return args
}

// dumpEnv builds the environment for a pg_dump child process from dsn.
//
// The DSN is parsed with pgxpool.ParseConfig and its host, port, user,
// password and database are appended to the current process environment as
// PGHOST, PGPORT, PGUSER, PGPASSWORD and PGDATABASE. Passing them through the
// environment keeps every connection parameter (the password above all) out
// of the child's argv.
//
// It returns an error, wrapped with a "dbbackup: parse dsn" prefix, when the
// DSN cannot be parsed.
func dumpEnv(dsn string) ([]string, error) {
	poolCfg, err := pgxpool.ParseConfig(dsn)
	if err != nil {
		return nil, fmt.Errorf("dbbackup: parse dsn: %w", err)
	}
	conn := poolCfg.ConnConfig

	overrides := []string{
		"PGHOST=" + conn.Host,
		"PGPORT=" + strconv.Itoa(int(conn.Port)),
		"PGUSER=" + conn.User,
		"PGPASSWORD=" + conn.Password,
		"PGDATABASE=" + conn.Database,
	}
	// A nil TLSConfig is what pgxpool produces for sslmode=disable, so pin
	// PGSSLMODE=disable to stop pg_dump from attempting TLS against a non-TLS
	// local Postgres. With TLS configured PGSSLMODE is left alone and pg_dump
	// falls back to its own default (prefer).
	// NOTE(review): a stricter sslmode in the DSN (e.g. verify-full) is
	// therefore not carried over to pg_dump — confirm that is acceptable.
	if conn.TLSConfig == nil {
		overrides = append(overrides, "PGSSLMODE=disable")
	}

	// Overrides go after the inherited environment so they are the entries
	// listed last for each PG* key.
	return append(os.Environ(), overrides...), nil
}

// DumpFileName builds the file name of a pre-upgrade dump:
//
//	openwatch-pre-upgrade-<version>-<stamp>.sql
//
// version is sanitized first: every rune other than an ASCII letter, an
// ASCII digit, '.', '-' or '_' is replaced by '_', and an empty version
// becomes "unknown". stamp is an already-formatted timestamp (for example
// "20260616T010203Z") and is inserted verbatim. The function is pure — the
// caller owns the clock — which keeps tests deterministic.
func DumpFileName(version, stamp string) string {
	if version == "" {
		version = "unknown"
	}

	// allowed reports whether r may appear unchanged in the file name.
	allowed := func(r rune) bool {
		if r >= 'a' && r <= 'z' {
			return true
		}
		if r >= 'A' && r <= 'Z' {
			return true
		}
		if r >= '0' && r <= '9' {
			return true
		}
		return r == '.' || r == '-' || r == '_'
	}

	cleaned := []rune(version)
	for i, r := range cleaned {
		if !allowed(r) {
			cleaned[i] = '_'
		}
	}
	return fmt.Sprintf("openwatch-pre-upgrade-%s-%s.sql", string(cleaned), stamp)
}

// Command returns an unstarted pg_dump command that writes a plain-SQL dump
// of the database identified by dsn to outPath.
//
// All connection parameters are placed in the command's Env as PG*
// variables; the argument list holds only the dump flags and the output
// path, so the password never shows up in argv. An error is returned when
// the DSN cannot be translated into that environment.
func Command(dsn, outPath string) (*exec.Cmd, error) {
	pgEnv, envErr := dumpEnv(dsn)
	if envErr != nil {
		return nil, envErr
	}

	// #nosec G204 -- the command is the constant "pg_dump"; the args are
	// package-controlled flags + an output path (never user input), and the
	// connection parameters (incl. the password) travel via env, not argv.
	dump := exec.Command("pg_dump", dumpArgs(outPath)...)
	dump.Env = pgEnv
	return dump, nil
}

// Run writes a plain-SQL pg_dump of the database identified by dsn into dir
// and returns the path of the dump file.
//
// dir is created (mode 0750) if it does not exist. The file name is derived
// from version and stamp via DumpFileName; the caller supplies both. ctx
// bounds the pg_dump process: when it is cancelled or times out the process
// is killed and Run returns an error.
//
// The dump file is pre-created with mode 0600 so that the dump, which holds
// the full database contents, is readable by the owner only, whatever the
// process umask or the permissions of a pre-existing dir are.
//
// On any pg_dump failure the partially written file is removed, so a
// truncated dump can never be mistaken for a usable restore point. The
// returned error wraps the process error and includes pg_dump's combined
// stdout/stderr output.
func Run(ctx context.Context, dsn, dir, version, stamp string) (string, error) {
	if err := os.MkdirAll(dir, 0o750); err != nil {
		return "", fmt.Errorf("dbbackup: mkdir %s: %w", dir, err)
	}
	out := filepath.Join(dir, DumpFileName(version, stamp))

	// Translate the DSN before touching the output file, so an invalid DSN
	// does not leave an empty dump behind.
	env, err := dumpEnv(dsn)
	if err != nil {
		return "", err
	}

	// Pre-create the file owner-only; pg_dump then opens the existing file
	// for writing instead of creating it under the ambient umask.
	// #nosec G304 -- out is dir joined with a sanitized, package-built name.
	f, err := os.OpenFile(out, os.O_WRONLY|os.O_CREATE|os.O_TRUNC, 0o600)
	if err != nil {
		return "", fmt.Errorf("dbbackup: create %s: %w", out, err)
	}
	if err := f.Close(); err != nil {
		return "", fmt.Errorf("dbbackup: create %s: %w", out, err)
	}

	// #nosec G204 -- see Command: constant "pg_dump", package-controlled
	// args, secrets via env not argv.
	cmd := exec.CommandContext(ctx, "pg_dump", dumpArgs(out)...)
	cmd.Env = env
	if combined, err := cmd.CombinedOutput(); err != nil {
		// Best-effort cleanup of the partial dump; the pg_dump error is the
		// one worth reporting, so a failure to remove is deliberately ignored.
		_ = os.Remove(out)
		return "", fmt.Errorf("dbbackup: pg_dump failed: %w: %s", err, string(combined))
	}
	return out, nil
}
Loading
Loading