diff --git a/internal/api/dbhealth.go b/internal/api/dbhealth.go index 2deaf7b..dbc2fbe 100644 --- a/internal/api/dbhealth.go +++ b/internal/api/dbhealth.go @@ -21,26 +21,42 @@ type DBPinger interface { // atomic boolean. The HTTP middleware below short-circuits /api/* traffic // with a 503 when the flag is false, preventing goroutine pile-up on pool // acquisition when the DB is unreachable. +// +// To avoid false-positive 503 windows under load — e.g. SQLite with +// MaxOpen=1 where a busy write transiently blocks the 2s health ping — +// the poller flips to unhealthy only after failureThreshold consecutive +// failed pings. A single successful ping clears the counter and restores +// healthy immediately. type DBHealth struct { - db DBPinger - driver string - interval time.Duration - healthy atomic.Bool - metrics *telemetry.Metrics - stopCh chan struct{} - doneCh chan struct{} + db DBPinger + driver string + interval time.Duration + healthy atomic.Bool + consecutiveFails atomic.Int32 + failureThreshold int32 + metrics *telemetry.Metrics + stopCh chan struct{} + doneCh chan struct{} } +// defaultFailureThreshold is the number of consecutive failed pings before +// the middleware flips into 503-serving mode. Three @ 5s = ~15s of real +// outage before the gate trips, which comfortably swallows transient +// connection-pool contention on single-writer drivers like SQLite. +const defaultFailureThreshold = 3 + // NewDBHealth constructs a health poller. Default poll interval is 5s; ping -// timeout is 2s per attempt. Start() must be called to begin polling. +// timeout is 2s per attempt; failureThreshold defaults to 3 consecutive +// failed pings before the gate flips. Start() must be called to begin polling. func NewDBHealth(db DBPinger, driver string, metrics *telemetry.Metrics) *DBHealth { h := &DBHealth{ - db: db, - driver: driver, - interval: 5 * time.Second, - metrics: metrics, - stopCh: make(chan struct{}), - doneCh: make(chan struct{}), + db: db, + driver: driver, + interval: 5 * time.Second, + failureThreshold: defaultFailureThreshold, + metrics: metrics, + stopCh: make(chan struct{}), + doneCh: make(chan struct{}), } // Assume healthy until the first ping proves otherwise — avoids a // spurious 503 window at startup. @@ -51,6 +67,16 @@ func NewDBHealth(db DBPinger, driver string, metrics *telemetry.Metrics) *DBHeal return h } +// SetFailureThreshold overrides the number of consecutive failed pings +// before the middleware flips to 503. n <= 0 normalises to 1 (legacy +// behaviour: any single failure trips the gate). +func (h *DBHealth) SetFailureThreshold(n int) { + if n <= 0 { + n = 1 + } + atomic.StoreInt32(&h.failureThreshold, int32(n)) // #nosec G115 -- n bounded above; in test wiring +} + // Start launches the background poller. func (h *DBHealth) Start(ctx context.Context) { go h.loop(ctx) @@ -97,14 +123,34 @@ func (h *DBHealth) ping(parent context.Context) { pctx, cancel := context.WithTimeout(parent, 2*time.Second) defer cancel() err := h.db.PingContext(pctx) - up := err == nil + if err == nil { + // A single success clears the failure streak and restores healthy + // immediately — recovery should not be debounced. + h.consecutiveFails.Store(0) + h.markHealthy(true) + return + } + fails := h.consecutiveFails.Add(1) + threshold := atomic.LoadInt32(&h.failureThreshold) + if threshold < 1 { + threshold = 1 + } + if fails >= threshold { + h.markHealthy(false) + } + // Below threshold: leave the gate where it was. A transient pool-contention + // timeout under SQLite MaxOpen=1 must not produce a user-visible 503 window. +} + +func (h *DBHealth) markHealthy(up bool) { h.healthy.Store(up) - if h.metrics != nil && h.metrics.DBUp != nil { - if up { - h.metrics.DBUp.WithLabelValues(h.driver).Set(1) - } else { - h.metrics.DBUp.WithLabelValues(h.driver).Set(0) - } + if h.metrics == nil || h.metrics.DBUp == nil { + return + } + if up { + h.metrics.DBUp.WithLabelValues(h.driver).Set(1) + } else { + h.metrics.DBUp.WithLabelValues(h.driver).Set(0) } } diff --git a/internal/api/dbhealth_test.go b/internal/api/dbhealth_test.go index ad2788d..67bccff 100644 --- a/internal/api/dbhealth_test.go +++ b/internal/api/dbhealth_test.go @@ -66,6 +66,103 @@ func TestDBHealth_TogglesOnPingFailure(t *testing.T) { } } +// TestDBHealth_SingleFailureDoesNotTripGate exercises the consecutive-failure +// threshold. With the legacy behaviour, a single ping failure flipped +// healthy=false and produced a 503 window for every /api/* request until +// the next successful poll up to 5s later. Under SQLite (MaxOpen=1) + +// real load, a poll competing with an in-flight write is enough to trip +// this — so a brief contended ping must NOT be observable to API clients. +func TestDBHealth_SingleFailureDoesNotTripGate(t *testing.T) { + p := &stubPinger{} + h := NewDBHealth(p, "sqlite", nil) + // Drive ping() directly — no goroutine, no ticker — so we control the + // exact failure pattern without timing flakes. + ctx := context.Background() + + // One success then one failure; threshold is 3, so we must still be healthy. + h.ping(ctx) + if !h.Healthy() { + t.Fatalf("expected healthy=true after first successful ping") + } + p.fail.Store(true) + h.ping(ctx) + if !h.Healthy() { + t.Fatalf("single failure flipped healthy=false; threshold=%d should debounce", + h.failureThreshold) + } + // Two failures total — still under threshold of 3. + h.ping(ctx) + if !h.Healthy() { + t.Fatalf("two failures flipped healthy=false; threshold=%d should debounce", + h.failureThreshold) + } +} + +// TestDBHealth_FlipsAtThreshold asserts that exactly N consecutive failed +// pings (default 3) are required before the gate trips and starts serving +// 503s. Anything less is treated as transient pool contention. +func TestDBHealth_FlipsAtThreshold(t *testing.T) { + p := &stubPinger{fail: atomic.Bool{}} + p.fail.Store(true) + h := NewDBHealth(p, "sqlite", nil) + ctx := context.Background() + + for i := 1; i < int(h.failureThreshold); i++ { + h.ping(ctx) + if !h.Healthy() { + t.Fatalf("flipped after %d failures; expected only after %d", + i, h.failureThreshold) + } + } + // Threshold-th failure: must flip. + h.ping(ctx) + if h.Healthy() { + t.Fatalf("did not flip after %d consecutive failures", h.failureThreshold) + } +} + +// TestDBHealth_SuccessResetsCounter asserts that the streak counter resets +// on the first success: two failures, one success, two more failures must +// NOT trip the gate, because there were never 3-in-a-row. +func TestDBHealth_SuccessResetsCounter(t *testing.T) { + p := &stubPinger{} + h := NewDBHealth(p, "sqlite", nil) + ctx := context.Background() + + p.fail.Store(true) + h.ping(ctx) // fail 1 + h.ping(ctx) // fail 2 + if !h.Healthy() { + t.Fatalf("flipped before threshold") + } + p.fail.Store(false) + h.ping(ctx) // success — counter reset + if !h.Healthy() { + t.Fatalf("expected healthy=true after success") + } + p.fail.Store(true) + h.ping(ctx) // fail 1 (streak) + h.ping(ctx) // fail 2 (streak) + if !h.Healthy() { + t.Fatalf("flipped after only 2 fresh failures; counter did not reset on success") + } +} + +// TestDBHealth_SetFailureThresholdNormalisesNonPositive confirms that a +// non-positive threshold collapses to 1 — restoring the legacy "any single +// failure trips the gate" behaviour for callers that explicitly opt out. +func TestDBHealth_SetFailureThresholdNormalisesNonPositive(t *testing.T) { + p := &stubPinger{} + h := NewDBHealth(p, "sqlite", nil) + + for _, n := range []int{0, -1, -100} { + h.SetFailureThreshold(n) + if got := atomic.LoadInt32(&h.failureThreshold); got != 1 { + t.Fatalf("SetFailureThreshold(%d) → %d, want 1", n, got) + } + } +} + func waitFor(t *testing.T, d time.Duration, cond func() bool) { t.Helper() deadline := time.Now().Add(d)