diff --git a/internal/storage/factory.go b/internal/storage/factory.go index e14f78d..52f2b54 100644 --- a/internal/storage/factory.go +++ b/internal/storage/factory.go @@ -1,6 +1,7 @@ package storage import ( + "context" "fmt" "log" "os" @@ -204,6 +205,13 @@ type MigrateOptions struct { // PartitionLookaheadDays is the number of future daily partitions to // pre-create at boot. Defaults to 3 when zero. PartitionLookaheadDays int + // Timeout, when > 0, bounds the AutoMigrate call. Without it, + // db.AutoMigrate inherits no deadline and an ALTER TABLE waiting on a + // Postgres relation lock can hang startup indefinitely. The timeout is + // applied via db.WithContext to the AutoMigrate call only — pre/post + // hooks (FTS5 triggers, legacy index drops) are not bounded since they + // don't take long-held locks. Zero preserves legacy unbounded behaviour. + Timeout time.Duration } // AutoMigrateModelsWithOptions is the option-driven variant of @@ -251,7 +259,17 @@ func AutoMigrateModelsWithOptions(db *gorm.DB, driver string, opts MigrateOption if !logsPartitioned { migrateModels = append(migrateModels, &Log{}) } - if err := db.AutoMigrate(migrateModels...); err != nil { + // Apply a deadline to the AutoMigrate call when configured so a Postgres + // relation-lock wait cannot hang startup indefinitely. WithContext returns + // a session-scoped *gorm.DB; the parent db is unaffected for the post- + // migration helpers below. + migrator := db + if opts.Timeout > 0 { + ctx, cancel := context.WithTimeout(context.Background(), opts.Timeout) + defer cancel() + migrator = db.WithContext(ctx) + } + if err := migrator.AutoMigrate(migrateModels...); err != nil { return fmt.Errorf("failed to migrate database: %w", err) } diff --git a/internal/storage/migrate_timeout_test.go b/internal/storage/migrate_timeout_test.go new file mode 100644 index 0000000..fc3b0d0 --- /dev/null +++ b/internal/storage/migrate_timeout_test.go @@ -0,0 +1,73 @@ +package storage + +import ( + "testing" + "time" +) + +func TestMigrateTimeoutFromEnv(t *testing.T) { + cases := []struct { + name string + env string + want time.Duration + }{ + {"default when unset", "", 60 * time.Second}, + {"explicit 30s", "30", 30 * time.Second}, + {"explicit 0 = opt out", "0", 0}, + {"negative falls back", "-5", 60 * time.Second}, + {"malformed falls back", "abc", 60 * time.Second}, + {"caps at 1h", "7200", time.Hour}, + {"surrounding whitespace", " 120 ", 120 * time.Second}, + } + for _, tc := range cases { + t.Run(tc.name, func(t *testing.T) { + if tc.env == "" { + t.Setenv("DB_MIGRATE_TIMEOUT_SECS", "") + } else { + t.Setenv("DB_MIGRATE_TIMEOUT_SECS", tc.env) + } + if got := migrateTimeoutFromEnv(); got != tc.want { + t.Errorf("env=%q -> got %v, want %v", tc.env, got, tc.want) + } + }) + } +} + +// TestAutoMigrateModels_RespectsTimeout exercises the timeout-bounded path +// against an in-memory SQLite DB. SQLite migrations are too fast to actually +// trip the deadline, so this test asserts: (a) Timeout > 0 doesn't break +// the happy path, (b) Timeout=0 still works (legacy unbounded behaviour). +// +// A real Postgres lock-contention test would need a live Postgres + advisory +// lock and lives in pg_integration_test.go behind an external-DB build tag. +func TestAutoMigrateModels_RespectsTimeout(t *testing.T) { + t.Run("WithTimeout", func(t *testing.T) { + db, err := NewDatabase("sqlite", ":memory:") + if err != nil { + t.Fatalf("NewDatabase: %v", err) + } + defer func() { sqlDB, _ := db.DB(); _ = sqlDB.Close() }() + opts := MigrateOptions{Timeout: 30 * time.Second} + if err := AutoMigrateModelsWithOptions(db, "sqlite", opts); err != nil { + t.Fatalf("AutoMigrate with timeout: %v", err) + } + if !db.Migrator().HasTable("traces") { + t.Fatalf("expected traces table to be created") + } + }) + + t.Run("ZeroTimeoutLegacyPath", func(t *testing.T) { + db, err := NewDatabase("sqlite", ":memory:") + if err != nil { + t.Fatalf("NewDatabase: %v", err) + } + defer func() { sqlDB, _ := db.DB(); _ = sqlDB.Close() }() + opts := MigrateOptions{Timeout: 0} + if err := AutoMigrateModelsWithOptions(db, "sqlite", opts); err != nil { + t.Fatalf("AutoMigrate without timeout: %v", err) + } + if !db.Migrator().HasTable("traces") { + t.Fatalf("expected traces table to be created") + } + }) +} diff --git a/internal/storage/repository.go b/internal/storage/repository.go index d190662..b76552c 100644 --- a/internal/storage/repository.go +++ b/internal/storage/repository.go @@ -30,6 +30,39 @@ func partitionLookaheadFromEnv() int { return n } +// migrateTimeoutFromEnv reads DB_MIGRATE_TIMEOUT_SECS, defaulting to 60s +// when unset, malformed, or non-positive. The migration timeout bounds +// db.AutoMigrate so a Postgres relation-lock wait cannot hang startup +// indefinitely. Set to 0 explicitly to opt out (legacy unbounded behaviour). +// +// Cap at 1 hour — anyone needing a longer migration window should run +// schema changes out-of-band with versioned migrations and DB_AUTOMIGRATE=false. +func migrateTimeoutFromEnv() time.Duration { + const ( + defaultTimeout = 60 * time.Second + maxTimeout = time.Hour + ) + v := strings.TrimSpace(os.Getenv("DB_MIGRATE_TIMEOUT_SECS")) + if v == "" { + return defaultTimeout + } + n, err := strconv.Atoi(v) + if err != nil { + return defaultTimeout + } + if n == 0 { + return 0 // explicit opt-out — preserves legacy behaviour + } + if n < 0 { + return defaultTimeout + } + d := time.Duration(n) * time.Second + if d > maxTimeout { + return maxTimeout + } + return d +} + // likeOpFor returns the case-insensitive LIKE operator for the given driver. // Postgres LIKE is case-sensitive; SQLite/MySQL LIKE is case-insensitive by default. // Callers should embed the returned token directly into SQL fragments. @@ -109,6 +142,7 @@ func NewRepository(metrics *telemetry.Metrics) (*Repository, error) { opts := MigrateOptions{ PostgresPartitioning: strings.ToLower(strings.TrimSpace(os.Getenv("DB_POSTGRES_PARTITIONING"))), PartitionLookaheadDays: partitionLookaheadFromEnv(), + Timeout: migrateTimeoutFromEnv(), } if err := AutoMigrateModelsWithOptions(db, driver, opts); err != nil { return nil, err