From 84954c71a56b396bae59e36de054611c11744142 Mon Sep 17 00:00:00 2001 From: Amit Kumar Date: Sat, 25 Apr 2026 17:30:53 +0000 Subject: [PATCH] fix(storage): disable FK creation during AutoMigrate to unblock Postgres boot (RAN-49) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit After RAN-21 made trace identity tenant-scoped — replacing the single-column unique on traces.trace_id with a composite (tenant_id, trace_id) — Postgres rejected the auto-generated spans.trace_id FK at CREATE TABLE time with SQLSTATE 42830 ("there is no unique constraint matching given keys for referenced table"), aborting boot on every fresh Postgres deployment. The model already declared `constraint:false`, but gorm v2's relationship parser does not honor that tag — which is why the MySQL migrate path was explicitly dropping fk_traces_spans / fk_traces_logs post-AutoMigrate. The reliable cross-driver suppression is the gorm.Config flag, so set DisableForeignKeyConstraintWhenMigrating=true on the global session. This matches the existing async-ingest design (spans/logs may land before their parent trace) and removes the silent SQLite-hiding-Postgres regression class: SQLite did not validate FK targets at CREATE TABLE time, so the SQLite-only migrate tests passed while Postgres failed. - factory.go: set DisableForeignKeyConstraintWhenMigrating; reword the MySQL drop comments to reflect they are now legacy-only. - pg_integration_test.go: add TestPG_AutoMigrate_FromEmpty_NoSpansTracesFK to lock in the regression class — boots an empty Postgres 16 container, asserts AutoMigrateModels succeeds, then verifies no FK exists from spans/logs back to traces. Verified: full default test suite green; 8/8 PG integration tests pass against a real Postgres 16 container (the unrelated pg_trgm planner-stats flake is pre-existing and reproduces on the prior tree). Co-Authored-By: Paperclip --- internal/storage/factory.go | 32 ++++++++++- internal/storage/pg_integration_test.go | 73 +++++++++++++++++++++++++ 2 files changed, 102 insertions(+), 3 deletions(-) diff --git a/internal/storage/factory.go b/internal/storage/factory.go index 733e142..633fc65 100644 --- a/internal/storage/factory.go +++ b/internal/storage/factory.go @@ -69,6 +69,25 @@ func NewDatabase(driver, dsn string) (*gorm.DB, error) { db, err := gorm.Open(dialector, &gorm.Config{ Logger: logger.Default.LogMode(logger.Error), + // RAN-49: never emit FK constraints during AutoMigrate. + // + // (1) Async ingestion: spans/logs can arrive before their parent trace, + // so a real FK on (spans|logs).trace_id → traces.trace_id would reject + // valid out-of-order writes — which is why the MySQL path explicitly + // drops fk_traces_spans / fk_traces_logs after AutoMigrate. + // + // (2) RAN-21 made trace identity tenant-scoped: traces no longer carries + // a single-column unique on trace_id (only the composite + // (tenant_id, trace_id)). On Postgres, CREATE TABLE spans (... FK + // trace_id REFERENCES traces(trace_id)) then fails at DDL time with + // SQLSTATE 42830 ("there is no unique constraint matching given keys + // for referenced table"), aborting boot on a fresh DB. SQLite hides + // this because it does not validate FK targets at CREATE TABLE time. + // + // The model-level `constraint:false` GORM tag is not honored by + // gorm v2's relationship parser, so the only reliable way to suppress + // FK creation across all drivers is this config flag. + DisableForeignKeyConstraintWhenMigrating: true, }) if err != nil { // Never surface the DSN in error wraps — pgx occasionally embeds connection @@ -168,7 +187,11 @@ func getEnvPoolDuration(key string, fallback time.Duration) time.Duration { func AutoMigrateModels(db *gorm.DB, driver string) error { driver = strings.ToLower(driver) - // Disable FK checks during migration for MySQL + // Disable FK checks during migration for MySQL. + // New databases will not get FKs created (DisableForeignKeyConstraintWhenMigrating + // in NewDatabase), but legacy MySQL DBs may still carry fk_traces_spans / + // fk_traces_logs from before RAN-49 — toggling FK_CHECKS=0 keeps the + // post-migrate DROP statements below safe regardless of legacy state. if driver == "mysql" { db.Exec("SET FOREIGN_KEY_CHECKS = 0") log.Println("🔓 Disabled foreign key checks for migration") @@ -187,12 +210,15 @@ func AutoMigrateModels(db *gorm.DB, driver string) error { log.Printf("⚠️ legacy trace_id unique index drop failed: %v", err) } - // Drop foreign keys that AutoMigrate may have created (MySQL) + // Legacy MySQL cleanup: drop FKs that pre-RAN-49 migrations created. Fresh + // MySQL DBs after RAN-49 won't have these (FK creation is now disabled at + // the gorm.Config layer), but pre-existing deployments still need this + // drop to keep async ingestion non-blocking. if driver == "mysql" { db.Exec("ALTER TABLE spans DROP FOREIGN KEY fk_traces_spans") db.Exec("ALTER TABLE logs DROP FOREIGN KEY fk_traces_logs") db.Exec("SET FOREIGN_KEY_CHECKS = 1") - log.Println("🔓 Dropped FK constraints for async ingestion compatibility") + log.Println("🔓 Dropped legacy FK constraints (no-op on fresh DBs)") } // Postgres: enable pg_trgm and create a GIN index on logs.body for fuzzy ILIKE search. diff --git a/internal/storage/pg_integration_test.go b/internal/storage/pg_integration_test.go index a1a2bbd..46e04b3 100644 --- a/internal/storage/pg_integration_test.go +++ b/internal/storage/pg_integration_test.go @@ -348,6 +348,79 @@ func TestPG_PurgeTracesBatched_OrphanSpanSweep_NOT_IN(t *testing.T) { } } +// TestPG_AutoMigrate_FromEmpty_NoSpansTracesFK is the RAN-49 regression +// guard. It boots a fresh Postgres 16 container, runs AutoMigrateModels +// directly against the empty schema, and asserts: +// +// 1. The migrator completes (no SQLSTATE 42830 from the legacy +// spans.trace_id → traces.trace_id FK that pre-RAN-49 GORM emitted). +// 2. No FK constraint exists on spans or logs that references traces — +// async ingestion can land child rows before their parent trace, and +// RAN-21 made trace identity composite, so a single-column FK on +// trace_id would either reject valid writes or fail at DDL time. +// +// SQLite cannot exercise this path: it does not validate FK targets at +// CREATE TABLE time, which is exactly why RAN-21 silently regressed past +// CI. Keep this test on the integration build tag so the regression +// class can not sneak back in. +func TestPG_AutoMigrate_FromEmpty_NoSpansTracesFK(t *testing.T) { + ctx := context.Background() + + pgContainer, err := postgres.Run(ctx, "postgres:16-alpine", + postgres.WithDatabase("otel_test"), + postgres.WithUsername("otel"), + postgres.WithPassword("otel"), + postgres.BasicWaitStrategies(), + ) + if err != nil { + t.Skipf("docker unavailable, skipping pg integration tests: %v", err) + } + defer func() { _ = pgContainer.Terminate(ctx) }() + + dsn, err := pgContainer.ConnectionString(ctx, "sslmode=disable") + if err != nil { + t.Fatalf("ConnectionString: %v", err) + } + + db, err := NewDatabase("postgres", dsn) + if err != nil { + t.Fatalf("NewDatabase(postgres): %v", err) + } + defer func() { + if sqlDB, dbErr := db.DB(); dbErr == nil { + _ = sqlDB.Close() + } + }() + + if err := AutoMigrateModels(db, "postgres"); err != nil { + t.Fatalf("AutoMigrateModels(postgres) on empty DB failed (RAN-49 regression): %v", err) + } + + // Assert no FK from spans/logs references traces. + rows, err := db.Raw(` + SELECT tc.table_name, tc.constraint_name, ccu.table_name AS foreign_table + FROM information_schema.table_constraints tc + JOIN information_schema.constraint_column_usage ccu + ON tc.constraint_name = ccu.constraint_name + AND tc.table_schema = ccu.table_schema + WHERE tc.constraint_type = 'FOREIGN KEY' + AND tc.table_schema = 'public' + AND tc.table_name IN ('spans','logs') + AND ccu.table_name = 'traces'`).Rows() + if err != nil { + t.Fatalf("information_schema FK lookup: %v", err) + } + defer rows.Close() + + for rows.Next() { + var table, name, foreignTable string + if scanErr := rows.Scan(&table, &name, &foreignTable); scanErr != nil { + t.Fatalf("FK row scan: %v", scanErr) + } + t.Errorf("unexpected FK %s on %s referencing %s — async ingestion expects no FK back to traces (RAN-49)", name, table, foreignTable) + } +} + // TestPG_AutoMigrate_BlobTypesBecomeBytea inspects information_schema to // assert that CompressedText fields are materialized as bytea columns on // Postgres — proves the GormDBDataType dialect mapping survives migration.