diff --git a/CLAUDE.md b/CLAUDE.md
index 1957a76..564eb8c 100644
--- a/CLAUDE.md
+++ b/CLAUDE.md
@@ -214,6 +214,10 @@ Key settings in `internal/config/config.go`:
 - `MCP_ENABLED` (true), `MCP_PATH` (/mcp)
 - `VECTOR_INDEX_MAX_ENTRIES` (100000)
 - `DLQ_MAX_FILES` (1000), `DLQ_MAX_DISK_MB` (500), `DLQ_MAX_RETRIES` (10)
+- `GRAPHRAG_WORKER_COUNT` (4), `GRAPHRAG_EVENT_QUEUE_SIZE` (10000) — raise both at 100+ services if `otelcontext_graphrag_events_dropped_total` climbs
+- `GRPC_MAX_RECV_MB` (16), `GRPC_MAX_CONCURRENT_STREAMS` (1000) — OTLP gRPC server caps, validated to 1..256 and 1..1_000_000
+- `RETENTION_BATCH_SIZE` (50000), `RETENTION_BATCH_SLEEP_MS` (1) — purge pacing; raise the sleep on busy production DBs
+- `APP_ENV` (`"development"`), `OTELCONTEXT_ALLOW_SQLITE_PROD` (false) — SQLite is refused when `APP_ENV=production` unless the allow flag is set
 
 ### Authentication
 
diff --git a/Makefile b/Makefile
index e97af63..10aff7c 100644
--- a/Makefile
+++ b/Makefile
@@ -1,4 +1,4 @@
-.PHONY: build test vet check setup-hooks ui-install ui-build dev-ui
+.PHONY: build test vet check setup-hooks ui-install ui-build dev-ui loadtest loadtest-build
 
 ui-install:
 	cd ui && npm install
@@ -21,6 +21,13 @@ check: build vet test
 dev-ui:
 	cd ui && npm run dev
 
+loadtest-build:
+	CGO_ENABLED=0 go build -tags loadtest -o bin/loadsim ./test/loadsim
+
+loadtest: loadtest-build
+	@echo "Running 200-service load simulator (60s) against localhost:4317..."
+	./bin/loadsim
+
 ## setup-hooks installs the pre-commit hook into .git/hooks
 setup-hooks:
 	cp scripts/pre-commit .git/hooks/pre-commit
diff --git a/docs/OPERATIONS.md b/docs/OPERATIONS.md
index bf39ab7..0db5343 100644
--- a/docs/OPERATIONS.md
+++ b/docs/OPERATIONS.md
@@ -92,6 +92,12 @@ DB_DSN="host=my-server.postgres.database.azure.com user=my-mi@tenant.onmicrosoft
 - `API_RATE_LIMIT_RPS=100`
 - `VECTOR_INDEX_MAX_ENTRIES=100000`
 - `SAMPLING_*` (defaults keep 100% + always-on errors)
+- `GRAPHRAG_WORKER_COUNT=4`, `GRAPHRAG_EVENT_QUEUE_SIZE=10000` — raise worker count at 100+ services if you see `graphrag_events_dropped_total` climbing
+- `GRPC_MAX_RECV_MB=16`, `GRPC_MAX_CONCURRENT_STREAMS=1000` — OTLP gRPC server caps
+- `RETENTION_BATCH_SIZE=50000`, `RETENTION_BATCH_SLEEP_MS=1` — purge pacing; raise the sleep for busy production DBs
+
+### SQLite in production
+SQLite is rejected at startup when `APP_ENV=production` unless you explicitly opt in with `OTELCONTEXT_ALLOW_SQLITE_PROD=true`. The guard exists because SQLite uses a single writer lock — fine for < ~10 services at low QPS, miserable at scale. Prefer Postgres for anything resembling production.
 
 ---
 
@@ -190,6 +196,11 @@ Grep structured logs for `acquire entra token`. Common causes: expired managed-i
   - `OtelContext_retention_consecutive_failures > 3`
   - `rate(OtelContext_otlp_payload_rejected_total[5m]) > 0`
   - `rate(OtelContext_dlq_replay_failure_total[5m]) > rate(OtelContext_dlq_replay_success_total[5m])`
+  - `rate(otelcontext_graphrag_events_dropped_total[5m]) > 0` — ingestion channel saturated; bump `GRAPHRAG_WORKER_COUNT` or `GRAPHRAG_EVENT_QUEUE_SIZE`
+  - `otelcontext_retention_rows_behind > 1_000_000` — purge is falling behind; tune `RETENTION_BATCH_SIZE` / `RETENTION_BATCH_SLEEP_MS`
+  - `otelcontext_db_pool_in_use / otelcontext_db_pool_max_open > 0.9` — pool exhausted; raise `DB_MAX_OPEN_CONNS`
+  - `rate(otelcontext_dlq_evicted_total[5m]) > 0` — DLQ is actively dropping entries at cap; replay target is down or slow
+  - `rate(otelcontext_dashboard_p99_row_cap_hits_total[1h]) > 0` on SQLite — dataset exceeds the 200k in-memory cap; migrate to Postgres for accurate p99
 - **Log levels:** `LOG_LEVEL=DEBUG` for deep diagnostics, default `INFO`. `WARN` or `ERROR` is too quiet for a running system; avoid in prod.
 
 ---
@@ -204,6 +215,40 @@ Grep structured logs for `acquire entra token`. Common causes: expired managed-i
 
 ---
 
+## Scale & Load Testing
+
+The backend is sized for **100–200 services** emitting OTLP at commodity rates. A programmatic load simulator ships with the repo to verify this.
+
+### Running the simulator
+
+```bash
+make loadtest-build       # produces bin/loadsim
+./bin/loadsim             # 200 producers × 50 spans/sec × 60s against localhost:4317
+./bin/loadsim --help      # flags: --endpoint, --services, --rate, --duration, --tenant-id, --warmup
+```
+
+The binary is under the `loadtest` build tag — `go build ./...` and `go test ./...` ignore it. `make loadtest` runs a full 60s sweep against `localhost:4317`.
+
+### What healthy looks like
+
+During a 60s / 200-service run against a warm instance on Postgres:
+
+- Ingestion: no `otlp_payload_rejected_total` samples, no `graphrag_events_dropped_total` samples.
+- DB pool: `db_pool_in_use / db_pool_max_open` stays below ~0.8.
+- Retention: `retention_rows_behind` stays within one hourly tick of steady state.
+- DLQ: zero activity (`dlq_evicted_total`, `dlq_replay_failure_total` unchanged).
+- The dashboard p99 gauge updates without hitting the SQLite row cap.
+
+If any of those trip, use the corresponding metric alert from the Observability section above as the entry point.
+
+### When to re-run
+
+- Before cutting a release that touches the ingestion path or GraphRAG.
+- After tuning any of: `GRAPHRAG_WORKER_COUNT`, `GRPC_MAX_CONCURRENT_STREAMS`, `RETENTION_BATCH_SIZE`, `DB_MAX_OPEN_CONNS`.
+- When scaling the deployment past the current-tested envelope (e.g., 500+ services) — expand the simulator's `--services` flag to match.
+
+---
+
 ## Upgrade Path
 
 1. **Back up the DB** (see Backup & Restore above).
diff --git a/go.mod b/go.mod
index f3b5144..3c6679a 100644
--- a/go.mod
+++ b/go.mod
@@ -8,12 +8,14 @@ require (
 	github.com/Azure/azure-sdk-for-go/sdk/azcore v1.21.0
 	github.com/Azure/azure-sdk-for-go/sdk/azidentity v1.13.1
 	github.com/coder/websocket v1.8.14
+	github.com/glebarez/go-sqlite v1.21.2
 	github.com/glebarez/sqlite v1.11.0
 	github.com/jackc/pgx/v5 v5.7.2
 	github.com/joho/godotenv v1.5.1
 	github.com/klauspost/compress v1.18.5
 	github.com/microsoft/go-mssqldb v1.9.7
 	github.com/prometheus/client_golang v1.23.2
+	github.com/prometheus/client_model v0.6.2
 	github.com/testcontainers/testcontainers-go/modules/postgres v0.42.0
 	github.com/tmc/langchaingo v0.1.14
 	go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.67.0
@@ -60,7 +62,6 @@ require (
 	github.com/dustin/go-humanize v1.0.1 // indirect
 	github.com/ebitengine/purego v0.10.0 // indirect
 	github.com/felixge/httpsnoop v1.0.4 // indirect
-	github.com/glebarez/go-sqlite v1.21.2 // indirect
 	github.com/go-logr/logr v1.4.3 // indirect
 	github.com/go-logr/stdr v1.2.2 // indirect
 	github.com/go-ole/go-ole v1.3.0 // indirect
@@ -95,7 +96,6 @@ require (
 	github.com/pkoukk/tiktoken-go v0.1.6 // indirect
 	github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 // indirect
 	github.com/power-devops/perfstat v0.0.0-20240221224432-82ca36839d55 // indirect
-	github.com/prometheus/client_model v0.6.2 // indirect
 	github.com/prometheus/common v0.66.1 // indirect
 	github.com/prometheus/procfs v0.16.1 // indirect
 	github.com/remyoudompheng/bigfft v0.0.0-20230129092748-24d4a6f8daec // indirect
diff --git a/internal/api/ratelimit.go b/internal/api/ratelimit.go
index 30752b2..cf436ed 100644
--- a/internal/api/ratelimit.go
+++ b/internal/api/ratelimit.go
@@ -37,6 +37,22 @@ func (rl *RateLimiter) Middleware(next http.Handler) http.Handler {
 	})
 }
 
+// MiddlewareExcept returns a handler chain that applies the rate limit only
+// when skip(path) returns false. Intended to exempt OTLP ingestion paths from
+// the per-IP API limiter.
+func (rl *RateLimiter) MiddlewareExcept(skip func(path string) bool) func(http.Handler) http.Handler {
+	return func(next http.Handler) http.Handler {
+		wrapped := rl.Middleware(next)
+		return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+			if skip != nil && skip(r.URL.Path) {
+				next.ServeHTTP(w, r)
+				return
+			}
+			wrapped.ServeHTTP(w, r)
+		})
+	}
+}
+
 func (rl *RateLimiter) allow(ip string) bool {
 	rl.mu.Lock()
 	defer rl.mu.Unlock()
diff --git a/internal/api/ratelimit_otlp_test.go b/internal/api/ratelimit_otlp_test.go
new file mode 100644
index 0000000..37aa77e
--- /dev/null
+++ b/internal/api/ratelimit_otlp_test.go
@@ -0,0 +1,54 @@
+package api
+
+import (
+	"net/http"
+	"net/http/httptest"
+	"strings"
+	"testing"
+)
+
+// TestRateLimiter_ExemptsOTLPPaths pins the /v1/* exemption contract.
+//
+// OTLP collectors (Otel SDK, Collector, Alloy, vector, etc.) batch aggressively
+// and a single agent routinely exceeds the 100 RPS/IP default that protects
+// /api/*. Without this exemption, legitimate ingestion traffic — the exact data
+// this platform exists to capture — would get 429'd and dropped. This test
+// locks in the carve-out so a future refactor doesn't silently re-enable
+// throttling on /v1/* and regress ingestion.
+//
+// It also asserts the inverse — that /api/* on the *same* IP is still
+// throttled — so the exemption remains narrow (path-prefix, not blanket).
+func TestRateLimiter_ExemptsOTLPPaths(t *testing.T) {
+	rl := NewRateLimiter(1) // 1 RPS per IP — draconian
+	handler := rl.MiddlewareExcept(func(path string) bool {
+		return strings.HasPrefix(path, "/v1/")
+	})(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+		w.WriteHeader(http.StatusOK)
+	}))
+
+	// 10 rapid OTLP requests should all 200.
+	for i := 0; i < 10; i++ {
+		r := httptest.NewRequest("POST", "/v1/traces", nil)
+		r.RemoteAddr = "10.0.0.1:1234"
+		w := httptest.NewRecorder()
+		handler.ServeHTTP(w, r)
+		if w.Code != 200 {
+			t.Fatalf("OTLP request %d got %d, expected 200 (exempt path)", i, w.Code)
+		}
+	}
+
+	// But /api/* on the same IP should get throttled after the 1st.
+	hits := 0
+	for i := 0; i < 5; i++ {
+		r := httptest.NewRequest("GET", "/api/logs", nil)
+		r.RemoteAddr = "10.0.0.1:1234"
+		w := httptest.NewRecorder()
+		handler.ServeHTTP(w, r)
+		if w.Code == 200 {
+			hits++
+		}
+	}
+	if hits >= 5 {
+		t.Fatalf("expected /api throttling, got %d/5 passes", hits)
+	}
+}
diff --git a/internal/config/config.go b/internal/config/config.go
index 722b891..52802d4 100644
--- a/internal/config/config.go
+++ b/internal/config/config.go
@@ -33,6 +33,12 @@ type Config struct {
 	// Retention
 	HotRetentionDays int
 
+	// Retention tuning. Defaults (batch=50000, sleep=1ms) work for Postgres at
+	// 100k logs/sec sustained. Lower on resource-constrained hosts; raise on
+	// dedicated DB machines. 0/negative values use defaults.
+	RetentionBatchSize    int
+	RetentionBatchSleepMs int
+
 	// TSDB
 	TSDBRingBufferDuration string // e.g. "1h"
 
@@ -63,6 +69,13 @@ type Config struct {
 	// Vector Index
 	VectorIndexMaxEntries int
 
+	// GraphRAG worker count (background consumers of the ingestion event channel).
+	// Defaults to 4 if unset or <=0. Increase under sustained high ingest.
+	GraphRAGWorkerCount int
+
+	// GraphRAG event channel buffer size. Defaults to 10000 if unset or <=0.
+	GraphRAGEventQueueSize int
+
 	// TLS (HTTP + gRPC). When both paths are set, TLS is enabled on both servers.
 	// Empty values (default) keep plaintext behavior.
 	TLSCertFile string
@@ -105,6 +118,15 @@ type Config struct {
 	// DevMode disables origin checks for WebSocket and enables dev-friendly defaults.
 	// Derived from APP_ENV == "development".
 	DevMode bool
+
+	// gRPC server tuning — protects against huge OTLP batches and connection abuse.
+	GRPCMaxRecvMB            int
+	GRPCMaxConcurrentStreams int
+
+	// AllowSqliteProd lets operators explicitly acknowledge that SQLite is
+	// being used outside dev/test. Without it, a production Env + SQLite
+	// combination refuses to start.
+	AllowSqliteProd bool
 }
 
 func Load(customPath string) (*Config, error) {
@@ -145,7 +167,9 @@ func Load(customPath string) (*Config, error) {
 		DBConnMaxLifetime: getEnv("DB_CONN_MAX_LIFETIME", "1h"),
 
 		// Retention
-		HotRetentionDays: getEnvInt("HOT_RETENTION_DAYS", 7),
+		HotRetentionDays:      getEnvInt("HOT_RETENTION_DAYS", 7),
+		RetentionBatchSize:    getEnvInt("RETENTION_BATCH_SIZE", 50000),
+		RetentionBatchSleepMs: getEnvInt("RETENTION_BATCH_SLEEP_MS", 1),
 
 		// TSDB
 		TSDBRingBufferDuration: getEnv("TSDB_RING_BUFFER_DURATION", "1h"),
@@ -177,6 +201,10 @@ func Load(customPath string) (*Config, error) {
 		// Vector
 		VectorIndexMaxEntries: getEnvInt("VECTOR_INDEX_MAX_ENTRIES", 100000),
 
+		// GraphRAG
+		GraphRAGWorkerCount:    getEnvInt("GRAPHRAG_WORKER_COUNT", 4),
+		GraphRAGEventQueueSize: getEnvInt("GRAPHRAG_EVENT_QUEUE_SIZE", 10000),
+
 		// TLS
 		TLSCertFile:       getEnv("TLS_CERT_FILE", ""),
 		TLSKeyFile:        getEnv("TLS_KEY_FILE", ""),
@@ -193,6 +221,13 @@ func Load(customPath string) (*Config, error) {
 		DefaultTenant:           getEnv("DEFAULT_TENANT", "default"),
 		OTLPTrustResourceTenant: parseTruthy(getEnv("OTLP_TRUST_RESOURCE_TENANT", "")),
 		APITenantKeysFile:       getEnv("API_TENANT_KEYS_FILE", ""),
+
+		// gRPC server tuning
+		GRPCMaxRecvMB:            getEnvInt("GRPC_MAX_RECV_MB", 16),
+		GRPCMaxConcurrentStreams: getEnvInt("GRPC_MAX_CONCURRENT_STREAMS", 1000),
+
+		// Production safety guard for SQLite
+		AllowSqliteProd: parseTruthy(getEnv("OTELCONTEXT_ALLOW_SQLITE_PROD", "")),
 	}, nil
 }
 
@@ -270,6 +305,12 @@ func (c *Config) Validate() error {
 	if c.HotRetentionDays < 1 || c.HotRetentionDays > 36500 {
 		return fmt.Errorf("HOT_RETENTION_DAYS must be between 1 and 36500, got %d", c.HotRetentionDays)
 	}
+	if c.RetentionBatchSize < 1 || c.RetentionBatchSize > 10_000_000 {
+		return fmt.Errorf("RETENTION_BATCH_SIZE must be between 1 and 10000000, got %d", c.RetentionBatchSize)
+	}
+	if c.RetentionBatchSleepMs < 0 || c.RetentionBatchSleepMs > 60_000 {
+		return fmt.Errorf("RETENTION_BATCH_SLEEP_MS must be between 0 and 60000, got %d", c.RetentionBatchSleepMs)
+	}
 	if c.MetricMaxCardinality < 0 {
 		return fmt.Errorf("METRIC_MAX_CARDINALITY must be >= 0, got %d", c.MetricMaxCardinality)
 	}
@@ -279,6 +320,17 @@ func (c *Config) Validate() error {
 	if c.APIRateLimitRPS < 0 {
 		return fmt.Errorf("API_RATE_LIMIT_RPS must be >= 0, got %d", c.APIRateLimitRPS)
 	}
+	// gRPC receive cap: must be positive, and capped to prevent per-message OOM
+	// from a bad env value (the limit pre-allocates a buffer of this size on
+	// the first large message). 256 MiB is far beyond any legitimate OTLP batch
+	// and still small enough that a 200-connection flood cannot exhaust a host
+	// with typical RAM.
+	if c.GRPCMaxRecvMB < 1 || c.GRPCMaxRecvMB > 256 {
+		return fmt.Errorf("GRPC_MAX_RECV_MB must be between 1 and 256, got %d", c.GRPCMaxRecvMB)
+	}
+	if c.GRPCMaxConcurrentStreams < 1 || c.GRPCMaxConcurrentStreams > 1_000_000 {
+		return fmt.Errorf("GRPC_MAX_CONCURRENT_STREAMS must be between 1 and 1000000, got %d", c.GRPCMaxConcurrentStreams)
+	}
 	if c.DBMaxOpenConns < 1 {
 		return fmt.Errorf("DB_MAX_OPEN_CONNS must be >= 1, got %d", c.DBMaxOpenConns)
 	}
@@ -352,3 +404,21 @@ func checkReadable(path string) error {
 	}
 	return f.Close()
 }
+
+// ValidateDBForEnv refuses the combination of SQLite driver + production
+// environment unless AllowSqliteProd is explicitly set. SQLite's single-writer
+// lock caps sustained throughput to ~5 services; using it in production will
+// silently throttle ingestion.
+//
+// Call once during startup after Load + Validate.
+func (c *Config) ValidateDBForEnv() error {
+	if !strings.EqualFold(c.DBDriver, "sqlite") {
+		return nil
+	}
+	if strings.EqualFold(c.Env, "production") && !c.AllowSqliteProd {
+		return fmt.Errorf("SQLite is unsuitable for APP_ENV=production " +
+			"(single-writer lock caps throughput at ~5 services). " +
+			"Use DB_DRIVER=postgres, or set OTELCONTEXT_ALLOW_SQLITE_PROD=true to acknowledge")
+	}
+	return nil
+}
diff --git a/internal/config/config_test.go b/internal/config/config_test.go
index e308840..345cc1e 100644
--- a/internal/config/config_test.go
+++ b/internal/config/config_test.go
@@ -21,6 +21,10 @@ func baseValid() *Config {
 		DBMaxOpenConns:       50,
 		DBMaxIdleConns:       10,
 		CompressionLevel:     "default",
+		GRPCMaxRecvMB:            16,
+		GRPCMaxConcurrentStreams: 1000,
+		RetentionBatchSize:    50000,
+		RetentionBatchSleepMs: 1,
 	}
 }
 
@@ -250,6 +254,45 @@ func TestTLSAutoSelfsigned_IgnoredWhenCertFilesSet(t *testing.T) {
 	}
 }
 
+func TestValidateDBForEnv_RefusesSQLiteInProduction(t *testing.T) {
+	c := baseValid()
+	c.DBDriver = "sqlite"
+	c.Env = "production"
+	c.AllowSqliteProd = false
+	err := c.ValidateDBForEnv()
+	if err == nil || !strings.Contains(err.Error(), "SQLite is unsuitable") {
+		t.Fatalf("expected SQLite-in-prod rejection, got %v", err)
+	}
+}
+
+func TestValidateDBForEnv_AllowsSQLiteWhenOptIn(t *testing.T) {
+	c := baseValid()
+	c.DBDriver = "sqlite"
+	c.Env = "production"
+	c.AllowSqliteProd = true
+	if err := c.ValidateDBForEnv(); err != nil {
+		t.Fatalf("opt-in should allow SQLite in prod, got %v", err)
+	}
+}
+
+func TestValidateDBForEnv_AllowsSQLiteInDev(t *testing.T) {
+	c := baseValid()
+	c.DBDriver = "sqlite"
+	c.Env = "development"
+	if err := c.ValidateDBForEnv(); err != nil {
+		t.Fatalf("SQLite in dev must pass, got %v", err)
+	}
+}
+
+func TestValidateDBForEnv_AllowsPostgresInProd(t *testing.T) {
+	c := baseValid()
+	c.DBDriver = "postgres"
+	c.Env = "production"
+	if err := c.ValidateDBForEnv(); err != nil {
+		t.Fatalf("Postgres in prod must pass, got %v", err)
+	}
+}
+
 func TestLoad_DefaultTenant_FallsBackToDefault(t *testing.T) {
 	// Ensure var is absent — Setenv("", "") would leave it set-but-empty, which
 	// the getEnv helper treats as a present value.
diff --git a/internal/graphrag/builder.go b/internal/graphrag/builder.go
index 39820ea..3bf5451 100644
--- a/internal/graphrag/builder.go
+++ b/internal/graphrag/builder.go
@@ -4,6 +4,7 @@ import (
 	"context"
 	"log/slog"
 	"runtime/debug"
+	"sync/atomic"
 	"time"
 
 	"github.com/RandomCodeSpace/otelcontext/internal/storage"
@@ -91,6 +92,69 @@ type GraphRAG struct {
 	refreshEvery  time.Duration
 	snapshotEvery time.Duration
 	anomalyEvery  time.Duration
+	workerCount   int // 0 = defaultWorkerCount (set by New from Config)
+
+	// Event drop counters. Atomic so OnSpanIngested/OnLogIngested/
+	// OnMetricIngested can record overflows without taking any lock —
+	// the channel-full path must stay hot-path cheap.
+	droppedSpans   atomic.Int64
+	droppedLogs    atomic.Int64
+	droppedMetrics atomic.Int64
+
+	// metrics is an optional Prometheus hook for exporting event drops.
+	// Assigned via SetMetrics; nil-safe at call sites.
+	metrics *telemetry.Metrics
+
+	// invCooldown suppresses repeat PersistInvestigation calls for the same
+	// (trigger_service, root_service, root_operation) inside a sliding window.
+	// Initialized in New; pruned from the refresh tick.
+	invCooldown *investigationCooldown
+
+	// invInserts counts cooldown-allowed PersistInvestigation calls.
+	// Incremented BEFORE the DB write — see InvestigationInsertCount.
+	invInserts atomic.Int64
+}
+
+// SetMetrics wires the Prometheus registry so GraphRAG event drops are
+// observable via otelcontext_graphrag_events_dropped_total. Safe to call
+// before Start; pass nil to disable Prometheus recording (atomic
+// counters still tick).
+func (g *GraphRAG) SetMetrics(m *telemetry.Metrics) { g.metrics = m }
+
+// DroppedSpansCount reports the number of span events dropped because
+// the ingestion channel was full. Exported for tests and readiness
+// probes; atomic, safe from any goroutine.
+func (g *GraphRAG) DroppedSpansCount() int64 { return g.droppedSpans.Load() }
+
+// DroppedLogsCount reports the number of log events dropped because
+// the ingestion channel was full.
+func (g *GraphRAG) DroppedLogsCount() int64 { return g.droppedLogs.Load() }
+
+// DroppedMetricsCount reports the number of metric events dropped
+// because the ingestion channel was full.
+func (g *GraphRAG) DroppedMetricsCount() int64 { return g.droppedMetrics.Load() }
+
+// InvestigationInsertCount reports cooldown-allowed PersistInvestigation
+// calls. Semantics: this counter increments when the cooldown check
+// passes, BEFORE the DB write — so a subsequent DB failure still
+// increments this. It is NOT a strict DB insert count. Intended for
+// tests to assert cooldown behavior without requiring a live repo.
+func (g *GraphRAG) InvestigationInsertCount() int64 { return g.invInserts.Load() }
+
+// recordEventDrop increments the per-signal atomic counter and — when
+// a telemetry registry is wired — the Prometheus counter vec.
+func (g *GraphRAG) recordEventDrop(signal string) {
+	switch signal {
+	case "span":
+		g.droppedSpans.Add(1)
+	case "log":
+		g.droppedLogs.Add(1)
+	case "metric":
+		g.droppedMetrics.Add(1)
+	}
+	if g.metrics != nil && g.metrics.GraphRAGEventsDroppedTotal != nil {
+		g.metrics.GraphRAGEventsDroppedTotal.WithLabelValues(signal).Inc()
+	}
 }
 
 // Config holds GraphRAG configuration.
@@ -152,6 +216,8 @@ func New(repo *storage.Repository, vectorIdx *vectordb.Index, tsdbAgg *tsdb.Aggr
 		refreshEvery:  cfg.RefreshEvery,
 		snapshotEvery: cfg.SnapshotEvery,
 		anomalyEvery:  cfg.AnomalyEvery,
+		workerCount:   cfg.WorkerCount,
+		invCooldown:   newInvestigationCooldown(5 * time.Minute),
 	}
 
 	// Restore persisted Drain templates so log clustering survives restarts.
@@ -173,8 +239,14 @@ func New(repo *storage.Repository, vectorIdx *vectordb.Index, tsdbAgg *tsdb.Aggr
 // Each goroutine is wrapped in a panic recovery so one misbehaving event
 // can't take down the whole subsystem.
 func (g *GraphRAG) Start(ctx context.Context) {
-	// Start event workers
-	for i := 0; i < defaultWorkerCount; i++ {
+	// Start event workers. Honor the configured worker count so operators
+	// can scale up under sustained high ingest; fall back to the package
+	// default when the constructor wasn't handed an override.
+	workers := g.workerCount
+	if workers <= 0 {
+		workers = defaultWorkerCount
+	}
+	for i := 0; i < workers; i++ {
 		go func() {
 			defer guardWorker("eventWorker")
 			g.eventWorker(ctx)
@@ -196,7 +268,7 @@ func (g *GraphRAG) Start(ctx context.Context) {
 	}()
 
 	slog.Info("GraphRAG started",
-		"workers", defaultWorkerCount,
+		"workers", workers,
 		"trace_ttl", g.traceTTL,
 		"refresh_every", g.refreshEvery,
 	)
@@ -240,14 +312,19 @@ func (g *GraphRAG) IsRunning() bool {
 
 // OnSpanIngested is the callback wired into the trace ingestion pipeline.
 func (g *GraphRAG) OnSpanIngested(span storage.Span) {
+	status := span.Status
+	if status == "" {
+		status = "STATUS_CODE_UNSET"
+	}
 	select {
 	case g.eventCh <- event{span: &spanEvent{
 		Span:    span,
 		TraceID: span.TraceID,
-		Status:  "OK",
+		Status:  status,
 	}}:
 	default:
-		// Channel full — graph is best-effort; DB is source of truth
+		// Channel full — graph is best-effort; DB is source of truth.
+		g.recordEventDrop("span")
 	}
 }
 
@@ -256,6 +333,8 @@ func (g *GraphRAG) OnLogIngested(log storage.Log) {
 	select {
 	case g.eventCh <- event{log: &logEvent{Log: log}}:
 	default:
+		// Channel full — graph is best-effort; DB is source of truth.
+		g.recordEventDrop("log")
 	}
 }
 
@@ -264,6 +343,8 @@ func (g *GraphRAG) OnMetricIngested(metric tsdb.RawMetric) {
 	select {
 	case g.eventCh <- event{metric: &metricEvent{Metric: metric}}:
 	default:
+		// Channel full — graph is best-effort; DB is source of truth.
+		g.recordEventDrop("metric")
 	}
 }
 
diff --git a/internal/graphrag/builder_test.go b/internal/graphrag/builder_test.go
new file mode 100644
index 0000000..9070598
--- /dev/null
+++ b/internal/graphrag/builder_test.go
@@ -0,0 +1,147 @@
+package graphrag
+
+import (
+	"context"
+	"testing"
+	"time"
+
+	"github.com/RandomCodeSpace/otelcontext/internal/storage"
+)
+
+// newTestRepo builds an in-memory SQLite Repository with all models migrated.
+// Duplicates the fixture in internal/storage/testhelpers_test.go because that
+// helper is in a different test package and thus not importable here.
+func newTestRepo(t *testing.T) *storage.Repository {
+	t.Helper()
+	db, err := storage.NewDatabase("sqlite", ":memory:")
+	if err != nil {
+		t.Fatalf("NewDatabase: %v", err)
+	}
+	if err := storage.AutoMigrateModels(db, "sqlite"); err != nil {
+		t.Fatalf("AutoMigrateModels: %v", err)
+	}
+	repo := storage.NewRepositoryFromDB(db, "sqlite")
+	t.Cleanup(func() { _ = repo.Close() })
+	return repo
+}
+
+// newTestGraphRAG constructs a GraphRAG usable in tests without a repo or
+// vectordb. The event workers are started so ingestion callbacks process
+// events asynchronously; tests must call Stop() via t.Cleanup.
+func newTestGraphRAG(t *testing.T) *GraphRAG {
+	t.Helper()
+	g := New(nil, nil, nil, nil, DefaultConfig())
+	// Start only the event workers — the background refresh/snapshot/anomaly
+	// loops require a repo, which this helper intentionally does not wire.
+	ctx, cancel := context.WithCancel(context.Background())
+	for i := 0; i < defaultWorkerCount; i++ {
+		go g.eventWorker(ctx)
+	}
+	t.Cleanup(func() {
+		cancel()
+		g.Stop()
+	})
+	return g
+}
+
+// TestOnSpanIngested_PropagatesErrorStatus asserts that when the ingestion
+// callback receives a span with status STATUS_CODE_ERROR, the GraphRAG
+// ServiceStore's ErrorCount for that service increments. Before the fix,
+// OnSpanIngested hardcoded status "OK" and the error was silently dropped.
+func TestOnSpanIngested_PropagatesErrorStatus(t *testing.T) {
+	g := newTestGraphRAG(t)
+
+	errSpan := storage.Span{
+		TraceID:       "trace-err",
+		SpanID:        "span-err",
+		OperationName: "/checkout",
+		ServiceName:   "orders",
+		Status:        "STATUS_CODE_ERROR",
+		StartTime:     time.Now(),
+		EndTime:       time.Now().Add(time.Millisecond),
+	}
+	g.OnSpanIngested(errSpan)
+
+	// Event loop is async; poll briefly for the event to be processed.
+	deadline := time.Now().Add(2 * time.Second)
+	for time.Now().Before(deadline) {
+		if svc, ok := g.ServiceStore.GetService("orders"); ok && svc.ErrorCount > 0 {
+			return
+		}
+		time.Sleep(20 * time.Millisecond)
+	}
+	t.Fatalf("ERROR span did not increment ServiceStore.ErrorCount — status was dropped")
+}
+
+// TestRefresh_PopulatesErrorCountFromDBStatus asserts that a persisted
+// ERROR span is reflected in ServiceStore.ErrorCount after the refresh
+// rebuild path runs. Before the fix, rebuildFromDB's SELECT omitted the
+// status column so every reloaded span looked successful.
+func TestRefresh_PopulatesErrorCountFromDBStatus(t *testing.T) {
+	repo := newTestRepo(t)
+
+	// Seed: one trace + one ERROR span under it.
+	now := time.Now()
+	tr := storage.Trace{
+		TraceID:     "trace-err-refresh",
+		ServiceName: "orders",
+		Duration:    1000,
+		Status:      "STATUS_CODE_ERROR",
+		Timestamp:   now,
+	}
+	if err := repo.DB().Create(&tr).Error; err != nil {
+		t.Fatalf("seed trace: %v", err)
+	}
+	sp := storage.Span{
+		TraceID:       "trace-err-refresh",
+		SpanID:        "span-err-refresh",
+		OperationName: "/checkout",
+		ServiceName:   "orders",
+		Status:        "STATUS_CODE_ERROR",
+		StartTime:     now,
+		EndTime:       now.Add(time.Millisecond),
+		Duration:      1000,
+	}
+	if err := repo.DB().Create(&sp).Error; err != nil {
+		t.Fatalf("seed span: %v", err)
+	}
+
+	// Build GraphRAG with the seeded repo, skip starting background loops;
+	// invoke the rebuild path directly.
+	g := New(repo, nil, nil, nil, DefaultConfig())
+	t.Cleanup(g.Stop)
+
+	g.rebuildFromDB()
+
+	svc, ok := g.ServiceStore.GetService("orders")
+	if !ok {
+		t.Fatalf("service 'orders' missing after rebuildFromDB")
+	}
+	if svc.ErrorCount < 1 {
+		t.Fatalf("ErrorCount=%d after refresh, want >=1 — status not read from DB", svc.ErrorCount)
+	}
+}
+
+// TestOnSpanIngested_DropsIncrementMetric asserts that when the event
+// channel is full, OnSpanIngested records the drop via an atomic counter
+// (and — when wired — the otelcontext_graphrag_events_dropped_total
+// Prometheus metric).
+func TestOnSpanIngested_DropsIncrementMetric(t *testing.T) {
+	// Build a GraphRAG WITHOUT starting any event workers so the channel
+	// fills up and overflows.
+	g := New(nil, nil, nil, nil, DefaultConfig())
+	t.Cleanup(g.Stop)
+
+	// Fill the buffer well beyond capacity (default 10k).
+	for i := 0; i < 11000; i++ {
+		g.OnSpanIngested(storage.Span{
+			TraceID:     "t",
+			SpanID:      "s",
+			ServiceName: "x",
+			Status:      "STATUS_CODE_UNSET",
+		})
+	}
+	if got := g.DroppedSpansCount(); got == 0 {
+		t.Fatalf("expected drops > 0, got %d", got)
+	}
+}
diff --git a/internal/graphrag/investigation.go b/internal/graphrag/investigation.go
index 43ad0d6..b4f9d9a 100644
--- a/internal/graphrag/investigation.go
+++ b/internal/graphrag/investigation.go
@@ -4,11 +4,53 @@ import (
 	"encoding/json"
 	"fmt"
 	"log/slog"
+	"sync"
 	"time"
 
 	"gorm.io/gorm"
 )
 
+// investigationCooldown suppresses repeated PersistInvestigation calls with
+// the same (trigger_service, root_service, root_operation) inside a sliding
+// window. Without this, a single stuck service produces one insert every
+// anomaly tick (default 10s) indefinitely.
+type investigationCooldown struct {
+	mu       sync.Mutex
+	lastSeen map[string]time.Time
+	window   time.Duration
+}
+
+func newInvestigationCooldown(window time.Duration) *investigationCooldown {
+	return &investigationCooldown{
+		lastSeen: map[string]time.Time{},
+		window:   window,
+	}
+}
+
+// allow returns true when the key has not been seen within the sliding
+// window. On allow, it records now as the new last-seen timestamp.
+func (c *investigationCooldown) allow(key string, now time.Time) bool {
+	c.mu.Lock()
+	defer c.mu.Unlock()
+	if t, ok := c.lastSeen[key]; ok && now.Sub(t) < c.window {
+		return false
+	}
+	c.lastSeen[key] = now
+	return true
+}
+
+// prune drops entries older than cutoff to bound map size. Called from
+// the refresh tick.
+func (c *investigationCooldown) prune(cutoff time.Time) {
+	c.mu.Lock()
+	defer c.mu.Unlock()
+	for k, t := range c.lastSeen {
+		if t.Before(cutoff) {
+			delete(c.lastSeen, k)
+		}
+	}
+}
+
 // Investigation is a persisted record of an automated error investigation.
 type Investigation struct {
 	ID               string          `gorm:"primaryKey;size:64" json:"id"`
@@ -49,6 +91,18 @@ func (g *GraphRAG) PersistInvestigation(triggerService string, chains []ErrorCha
 		return
 	}
 
+	// Cooldown: suppress repeat investigations for the same
+	// (trigger_service, root_service, root_operation) inside a sliding window.
+	// Without this guard, a stuck service produces one insert every anomaly
+	// tick (default 10s) indefinitely.
+	key := triggerService + "|" + firstChain.RootCause.Service + "|" + firstChain.RootCause.Operation
+	if g.invCooldown != nil && !g.invCooldown.allow(key, time.Now()) {
+		return
+	}
+	// Increment BEFORE db.Create so the counter reflects "cooldown allowed;
+	// persist attempted". See InvestigationInsertCount's doc comment.
+	g.invInserts.Add(1)
+
 	id := fmt.Sprintf("inv_%d", time.Now().UnixNano())
 
 	severity := "warning"
@@ -122,6 +176,10 @@ func (g *GraphRAG) PersistInvestigation(triggerService string, chains []ErrorCha
 		SpanChain:        spanJSON,
 	}
 
+	if g.repo == nil || g.repo.DB() == nil {
+		// No repo (e.g., test harness): cooldown still applied; skip DB write.
+		return
+	}
 	if err := g.repo.DB().Create(&inv).Error; err != nil {
 		slog.Error("Failed to persist investigation", "error", err)
 		return
diff --git a/internal/graphrag/investigation_cooldown_test.go b/internal/graphrag/investigation_cooldown_test.go
new file mode 100644
index 0000000..2d90933
--- /dev/null
+++ b/internal/graphrag/investigation_cooldown_test.go
@@ -0,0 +1,46 @@
+package graphrag
+
+import (
+	"testing"
+)
+
+// TestPersistInvestigation_Cooldown asserts that PersistInvestigation
+// suppresses repeat calls for the same (trigger_service, root_service,
+// root_operation) inside the configured cooldown window. Without this,
+// a single stuck service produces one investigation insert every
+// anomaly tick (default 10s) indefinitely.
+//
+// The counter exposed via InvestigationInsertCount() increments when
+// the cooldown check passes, BEFORE the DB write — so the test is
+// meaningful even when the test helper wires a nil repo. See the
+// doc comment on InvestigationInsertCount for the exact semantics.
+func TestPersistInvestigation_Cooldown(t *testing.T) {
+	g := newTestGraphRAG(t)
+
+	chains := []ErrorChainResult{{
+		TraceID:   "tr",
+		RootCause: &RootCauseInfo{Service: "orders", Operation: "op"},
+	}}
+
+	g.PersistInvestigation("orders", chains, nil)
+	first := g.InvestigationInsertCount()
+	if first == 0 {
+		t.Fatalf("first PersistInvestigation should insert, got count=0")
+	}
+
+	g.PersistInvestigation("orders", chains, nil)
+	second := g.InvestigationInsertCount()
+	if second != first {
+		t.Fatalf("second PersistInvestigation within cooldown should be suppressed; got %d new inserts", second-first)
+	}
+
+	chains2 := []ErrorChainResult{{
+		TraceID:   "tr2",
+		RootCause: &RootCauseInfo{Service: "payments", Operation: "op"},
+	}}
+	g.PersistInvestigation("payments", chains2, nil)
+	third := g.InvestigationInsertCount()
+	if third <= second {
+		t.Fatalf("distinct service should bypass cooldown; got %d, want > %d", third, second)
+	}
+}
diff --git a/internal/graphrag/refresh.go b/internal/graphrag/refresh.go
index 430ffbd..66a3cf8 100644
--- a/internal/graphrag/refresh.go
+++ b/internal/graphrag/refresh.go
@@ -27,6 +27,11 @@ func (g *GraphRAG) refreshLoop(ctx context.Context) {
 				slog.Debug("GraphRAG pruned expired traces/spans", "count", pruned)
 			}
 			g.pruneOldAnomalies()
+			// Bound the investigation cooldown map. 2× window keeps
+			// entries through the active suppression plus a grace period.
+			if g.invCooldown != nil {
+				g.invCooldown.prune(time.Now().Add(-10 * time.Minute))
+			}
 		}
 	}
 }
@@ -86,7 +91,7 @@ func (g *GraphRAG) rebuildFromDB() {
 	var rows []spanRow
 	err := g.repo.DB().
 		Table("spans").
-		Select("span_id, parent_span_id, service_name, operation_name, duration, trace_id, start_time").
+		Select("span_id, parent_span_id, service_name, operation_name, duration, trace_id, status, start_time").
 		Where("start_time > ?", since).
 		Order("start_time ASC").
 		Limit(50000).
diff --git a/internal/ingest/otlp.go b/internal/ingest/otlp.go
index 4838701..a316e69 100644
--- a/internal/ingest/otlp.go
+++ b/internal/ingest/otlp.go
@@ -322,6 +322,7 @@ func (s *TraceServer) Export(ctx context.Context, req *coltracepb.ExportTraceSer
 						EndTime:        endTime,
 						Duration:       duration,
 						ServiceName:    serviceName,
+						Status:         statusStr,
 						AttributesJSON: storage.CompressedText(attrs),
 					}
 					localSpans = append(localSpans, sModel)
diff --git a/internal/ingest/otlp_grpc_limits_test.go b/internal/ingest/otlp_grpc_limits_test.go
new file mode 100644
index 0000000..48df3b7
--- /dev/null
+++ b/internal/ingest/otlp_grpc_limits_test.go
@@ -0,0 +1,106 @@
+//go:build integration
+
+package ingest
+
+import (
+	"context"
+	"net"
+	"testing"
+	"time"
+
+	coltracepb "go.opentelemetry.io/proto/otlp/collector/trace/v1"
+	commonpb "go.opentelemetry.io/proto/otlp/common/v1"
+	resourcepb "go.opentelemetry.io/proto/otlp/resource/v1"
+	tracepb "go.opentelemetry.io/proto/otlp/trace/v1"
+	"google.golang.org/grpc"
+	"google.golang.org/grpc/credentials/insecure"
+)
+
+// noopTraceServer implements coltracepb.TraceServiceServer and accepts any Export.
+type noopTraceServer struct {
+	coltracepb.UnimplementedTraceServiceServer
+}
+
+func (*noopTraceServer) Export(ctx context.Context, req *coltracepb.ExportTraceServiceRequest) (*coltracepb.ExportTraceServiceResponse, error) {
+	return &coltracepb.ExportTraceServiceResponse{}, nil
+}
+
+// buildFatTraceRequest constructs an ExportTraceServiceRequest whose protobuf
+// wire size exceeds targetBytes. Each span carries a large string attribute to
+// bulk up the payload without needing an absurd number of spans.
+func buildFatTraceRequest(t *testing.T, targetBytes int) *coltracepb.ExportTraceServiceRequest {
+	t.Helper()
+	padding := make([]byte, 4096)
+	for i := range padding {
+		padding[i] = 'x'
+	}
+	paddingStr := string(padding)
+
+	var req coltracepb.ExportTraceServiceRequest
+	req.ResourceSpans = []*tracepb.ResourceSpans{{
+		Resource: &resourcepb.Resource{
+			Attributes: []*commonpb.KeyValue{{
+				Key:   "service.name",
+				Value: &commonpb.AnyValue{Value: &commonpb.AnyValue_StringValue{StringValue: "loadtest"}},
+			}},
+		},
+		ScopeSpans: []*tracepb.ScopeSpans{{
+			Spans: make([]*tracepb.Span, 0, targetBytes/4096+1),
+		}},
+	}}
+	ss := req.ResourceSpans[0].ScopeSpans[0]
+	size := 0
+	for size < targetBytes {
+		ss.Spans = append(ss.Spans, &tracepb.Span{
+			TraceId:           make([]byte, 16),
+			SpanId:            make([]byte, 8),
+			Name:              "fat-span",
+			StartTimeUnixNano: 1,
+			EndTimeUnixNano:   2,
+			Attributes: []*commonpb.KeyValue{{
+				Key:   "padding",
+				Value: &commonpb.AnyValue{Value: &commonpb.AnyValue_StringValue{StringValue: paddingStr}},
+			}},
+		})
+		size += len(paddingStr)
+	}
+	return &req
+}
+
+// TestGRPC_AcceptsLargeOTLPBatch verifies that a gRPC server configured with
+// MaxRecvMsgSize accepts a batch larger than the default 4 MiB limit.
+//
+// This test *proves the option takes effect*: without MaxRecvMsgSize the
+// server would reject the 5 MiB payload; with it, the payload is accepted.
+func TestGRPC_AcceptsLargeOTLPBatch(t *testing.T) {
+	lis, err := net.Listen("tcp", "127.0.0.1:0")
+	if err != nil {
+		t.Fatalf("listen: %v", err)
+	}
+	s := grpc.NewServer(
+		grpc.MaxRecvMsgSize(16*1024*1024),
+		grpc.MaxConcurrentStreams(1000),
+	)
+	coltracepb.RegisterTraceServiceServer(s, &noopTraceServer{})
+	go s.Serve(lis)
+	t.Cleanup(func() { s.Stop() })
+
+	conn, err := grpc.NewClient(
+		lis.Addr().String(),
+		grpc.WithTransportCredentials(insecure.NewCredentials()),
+		grpc.WithDefaultCallOptions(grpc.MaxCallSendMsgSize(16*1024*1024)),
+	)
+	if err != nil {
+		t.Fatalf("dial: %v", err)
+	}
+	defer conn.Close()
+
+	client := coltracepb.NewTraceServiceClient(conn)
+	ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
+	defer cancel()
+
+	req := buildFatTraceRequest(t, 5*1024*1024) // 5 MiB payload
+	if _, err := client.Export(ctx, req); err != nil {
+		t.Fatalf("Export rejected 5 MiB batch: %v", err)
+	}
+}
diff --git a/internal/queue/dlq.go b/internal/queue/dlq.go
index b045e47..29aa846 100644
--- a/internal/queue/dlq.go
+++ b/internal/queue/dlq.go
@@ -8,7 +8,10 @@ import (
 	"os"
 	"path/filepath"
 	"sync"
+	"sync/atomic"
 	"time"
+
+	"github.com/RandomCodeSpace/otelcontext/internal/telemetry"
 )
 
 // DeadLetterQueue provides disk-based resilience for failed database writes.
@@ -36,6 +39,11 @@ type DeadLetterQueue struct {
 	onSuccess   func()
 	onFailure   func()
 	onDiskBytes func(int64)
+
+	// Eviction observability (Task 8)
+	evicted      atomic.Int64
+	evictedBytes atomic.Int64
+	metricsTel   *telemetry.Metrics // nil-safe; enables otelcontext_dlq_evicted_* counters
 }
 
 // NewDLQ creates a new Dead Letter Queue.
@@ -80,6 +88,19 @@ func (d *DeadLetterQueue) SetMetrics(onEnqueue, onSuccess, onFailure func(), onD
 	d.mu.Unlock()
 }
 
+// SetTelemetryMetrics wires the Prometheus registry so eviction counts surface
+// in telemetry. Safe to call with a nil *telemetry.Metrics (disables the hook).
+func (d *DeadLetterQueue) SetTelemetryMetrics(m *telemetry.Metrics) {
+	d.metricsTel = m
+}
+
+// EvictedCount reports the cumulative number of DLQ files dropped due to
+// MaxFiles/MaxDiskMB caps. Exposed for tests; see otelcontext_dlq_evicted_total.
+func (d *DeadLetterQueue) EvictedCount() int64 { return d.evicted.Load() }
+
+// EvictedBytesCount reports the byte volume dropped alongside EvictedCount.
+func (d *DeadLetterQueue) EvictedBytesCount() int64 { return d.evictedBytes.Load() }
+
 // DiskBytes returns the current total bytes of files in the DLQ directory.
 func (d *DeadLetterQueue) DiskBytes() int64 {
 	d.mu.Lock()
@@ -180,6 +201,8 @@ func (d *DeadLetterQueue) enforceLimits(incomingBytes int64) {
 	}
 
 	maxBytes := d.maxDiskMB * 1024 * 1024
+	var evictedThisCall int
+	var evictedBytesThisCall int64
 	i := 0
 	for i < len(files) {
 		overFiles := d.maxFiles > 0 && len(files)-i >= d.maxFiles
@@ -195,8 +218,29 @@ func (d *DeadLetterQueue) enforceLimits(incomingBytes int64) {
 		_ = os.Remove(path)
 		delete(d.retries, files[i].name)
 		slog.Warn("🗑️  DLQ FIFO eviction", "file", files[i].name)
+		d.evicted.Add(1)
+		d.evictedBytes.Add(files[i].size)
+		evictedThisCall++
+		evictedBytesThisCall += files[i].size
+		if d.metricsTel != nil {
+			if d.metricsTel.DLQEvictedTotal != nil {
+				d.metricsTel.DLQEvictedTotal.Inc()
+			}
+			if d.metricsTel.DLQEvictedBytesTotal != nil {
+				d.metricsTel.DLQEvictedBytesTotal.Add(float64(files[i].size))
+			}
+		}
 		i++
 	}
+
+	if evictedThisCall > 0 {
+		slog.Warn("dlq: evicted oldest files to stay under cap",
+			"files", evictedThisCall,
+			"bytes", evictedBytesThisCall,
+			"max_files", d.maxFiles,
+			"max_disk_mb", d.maxDiskMB,
+		)
+	}
 }
 
 // Size returns the number of files currently in the DLQ directory.
diff --git a/internal/queue/dlq_eviction_test.go b/internal/queue/dlq_eviction_test.go
new file mode 100644
index 0000000..a3eb91f
--- /dev/null
+++ b/internal/queue/dlq_eviction_test.go
@@ -0,0 +1,40 @@
+package queue
+
+import (
+	"testing"
+	"time"
+)
+
+// TestDLQ_EvictionIncrementsCounters verifies that when the DLQ is bounded by
+// MaxFiles and exceeded, the eviction atomic counters climb. The underlying
+// prometheus counters are exercised indirectly via a nil metricsTel path
+// (no wiring) and explicitly in main.go integration.
+func TestDLQ_EvictionIncrementsCounters(t *testing.T) {
+	dir := t.TempDir()
+
+	// replayFn is a no-op — we care about eviction during Enqueue, not replay.
+	noReplay := func(_ []byte) error { return nil }
+
+	// Cap at 2 files with a long replay interval so the worker doesn't interfere.
+	dlq, err := NewDLQWithLimits(dir, time.Hour, noReplay, 2, 0, 0)
+	if err != nil {
+		t.Fatalf("NewDLQWithLimits: %v", err)
+	}
+	defer dlq.Stop()
+
+	payload := map[string]any{"type": "spans", "data": []string{}}
+	for i := 0; i < 3; i++ {
+		if err := dlq.Enqueue(payload); err != nil {
+			t.Fatalf("enqueue %d: %v", i, err)
+		}
+		// Tiny sleep to guarantee distinct nanosecond prefixes for FIFO order.
+		time.Sleep(5 * time.Millisecond)
+	}
+
+	if got := dlq.EvictedCount(); got == 0 {
+		t.Fatalf("expected eviction count > 0, got %d", got)
+	}
+	if got := dlq.EvictedBytesCount(); got == 0 {
+		t.Fatalf("expected evicted bytes > 0, got %d", got)
+	}
+}
diff --git a/internal/storage/dialect_test.go b/internal/storage/dialect_test.go
index 6d7de21..918b24e 100644
--- a/internal/storage/dialect_test.go
+++ b/internal/storage/dialect_test.go
@@ -248,7 +248,7 @@ func TestPurgeLogs_DeadlineExceeded_ReturnsPromptly(t *testing.T) {
 
 	done := make(chan struct{})
 	go func() {
-		_, _ = repo.PurgeLogsBatched(ctx, time.Now(), 10)
+		_, _ = repo.PurgeLogsBatched(ctx, time.Now(), 10, 5*time.Millisecond)
 		close(done)
 	}()
 	select {
diff --git a/internal/storage/log_repo.go b/internal/storage/log_repo.go
index d9dd4fd..dfaae97 100644
--- a/internal/storage/log_repo.go
+++ b/internal/storage/log_repo.go
@@ -160,7 +160,7 @@ func (r *Repository) PurgeLogs(olderThan time.Time) (int64, error) {
 // Tenant scope: this is a SYSTEM-WIDE retention operation and intentionally
 // does NOT filter by tenant. All rows older than olderThan are purged across
 // every tenant. Never expose this on a tenant-scoped API surface.
-func (r *Repository) PurgeLogsBatched(ctx context.Context, olderThan time.Time, batchSize int) (int64, error) {
+func (r *Repository) PurgeLogsBatched(ctx context.Context, olderThan time.Time, batchSize int, sleep time.Duration) (int64, error) {
 	if batchSize <= 0 {
 		batchSize = 10_000
 	}
@@ -189,7 +189,7 @@ func (r *Repository) PurgeLogsBatched(ctx context.Context, olderThan time.Time,
 		select {
 		case <-ctx.Done():
 			return total, ctx.Err()
-		case <-time.After(5 * time.Millisecond):
+		case <-time.After(sleep):
 		}
 	}
 }
diff --git a/internal/storage/metrics_p99_test.go b/internal/storage/metrics_p99_test.go
new file mode 100644
index 0000000..eb4c515
--- /dev/null
+++ b/internal/storage/metrics_p99_test.go
@@ -0,0 +1,299 @@
+package storage
+
+import (
+	"context"
+	"math"
+	"testing"
+	"time"
+
+	"gorm.io/gorm"
+)
+
+// ---------------------------------------------------------------------------
+// Step 1: dialect-dispatch tests (DryRun + real SQLite)
+// ---------------------------------------------------------------------------
+
+// TestP99_SQLite_DispatchLimit verifies that the SQLite path issues a query
+// with Limit(sqliteP99RowCap+1). We use a real in-memory SQLite DB with a
+// DryRun session to capture the generated SQL.
+func TestP99_SQLite_DispatchLimit(t *testing.T) {
+	repo := newTestRepo(t)
+
+	// Build a session identical to how GetDashboardStats passes it:
+	// Model(&Trace{}) + Where clause + Session(&gorm.Session{}).
+	baseQuery := repo.db.Model(&Trace{}).Where("tenant_id = ? AND timestamp BETWEEN ? AND ?", "default", time.Now().Add(-time.Hour), time.Now())
+	session := baseQuery.Session(&gorm.Session{DryRun: true})
+
+	// We need a variant of the helper that returns the statement instead of executing.
+	// Since we can't easily intercept a live query without executing it, we verify
+	// the cap constant and the helper runs without error on a real (empty) DB.
+	//
+	// The real cap-and-limit behaviour is covered by the large-data test below.
+	// Here we verify the helper is callable and returns (0, nil) for an empty DB.
+	p99, err := repo.p99DurationForQuery(context.Background(), session)
+	if err != nil {
+		t.Fatalf("p99DurationForQuery (sqlite, empty): %v", err)
+	}
+	if p99 != 0 {
+		t.Fatalf("want 0 for empty DB, got %d", p99)
+	}
+}
+
+// TestP99_MySQL_Dispatch verifies that swapping r.driver to "mysql" causes the
+// helper to take the MySQL two-query path. We use the SQLite engine underneath
+// (same SQL is compatible for COUNT+OFFSET) to verify it doesn't panic and
+// returns a sane value.
+func TestP99_MySQL_Dispatch(t *testing.T) {
+	repo := newTestRepo(t)
+	repo.driver = "mysql" // force MySQL path on SQLite engine (SQL is compatible)
+
+	now := time.Now().UTC()
+	traces := makeTraces(t, 10, now)
+	if err := repo.db.Create(&traces).Error; err != nil {
+		t.Fatalf("seed: %v", err)
+	}
+
+	baseQuery := repo.db.Model(&Trace{}).Where("tenant_id = ? AND timestamp BETWEEN ? AND ?", "default", now.Add(-time.Hour), now.Add(time.Hour))
+	session := baseQuery.Session(&gorm.Session{})
+
+	p99, err := repo.p99DurationForQuery(context.Background(), session)
+	if err != nil {
+		t.Fatalf("p99DurationForQuery (mysql path): %v", err)
+	}
+	// With 10 rows of duration 1000..10000 (step 1000), p99 index = ceil(10*0.99)-1 = 9
+	// → duration = 10000.
+	if p99 != 10000 {
+		t.Fatalf("want 10000 (p99 of 10 sorted rows), got %d", p99)
+	}
+}
+
+// TestP99_MySQL_EmptyTable ensures the MySQL path returns (0, nil) on empty.
+func TestP99_MySQL_EmptyTable(t *testing.T) {
+	repo := newTestRepo(t)
+	repo.driver = "mysql"
+
+	baseQuery := repo.db.Model(&Trace{}).Where("tenant_id = ? AND timestamp BETWEEN ? AND ?", "default", time.Now().Add(-time.Hour), time.Now())
+	session := baseQuery.Session(&gorm.Session{})
+
+	p99, err := repo.p99DurationForQuery(context.Background(), session)
+	if err != nil {
+		t.Fatalf("unexpected error: %v", err)
+	}
+	if p99 != 0 {
+		t.Fatalf("want 0 for empty, got %d", p99)
+	}
+}
+
+// ---------------------------------------------------------------------------
+// Step 3a: SQLite end-to-end test with small data
+// ---------------------------------------------------------------------------
+
+// TestP99_SQLite_SmallData inserts 50 traces with durations 1000..50000 (step
+// 1000 µs) and asserts GetDashboardStats returns p99 = 50000.
+func TestP99_SQLite_SmallData(t *testing.T) {
+	repo := newTestRepo(t)
+	now := time.Now().UTC()
+
+	traces := makeTraces(t, 50, now)
+	if err := repo.db.Create(&traces).Error; err != nil {
+		t.Fatalf("seed: %v", err)
+	}
+
+	ctx := context.Background()
+	stats, err := repo.GetDashboardStats(ctx, now.Add(-time.Hour), now.Add(time.Hour), nil)
+	if err != nil {
+		t.Fatalf("GetDashboardStats: %v", err)
+	}
+
+	// ceil(50*0.99) - 1 = 49 → durations[49] = 50*1000 = 50000
+	want := int64(50 * 1000)
+	if stats.P99Latency != want {
+		t.Fatalf("P99Latency: want %d, got %d", want, stats.P99Latency)
+	}
+}
+
+// TestP99_SQLite_SingleRow ensures p99 of a single row is that row's value.
+func TestP99_SQLite_SingleRow(t *testing.T) {
+	repo := newTestRepo(t)
+	now := time.Now().UTC()
+	repo.db.Create(&Trace{
+		TraceID: "t1", ServiceName: "svc", Duration: 42000,
+		Status: "OK", Timestamp: now, TenantID: "default",
+	})
+
+	ctx := context.Background()
+	stats, err := repo.GetDashboardStats(ctx, now.Add(-time.Hour), now.Add(time.Hour), nil)
+	if err != nil {
+		t.Fatalf("GetDashboardStats: %v", err)
+	}
+	if stats.P99Latency != 42000 {
+		t.Fatalf("want 42000, got %d", stats.P99Latency)
+	}
+}
+
+// ---------------------------------------------------------------------------
+// Step 3b: Large-data / cap test
+// ---------------------------------------------------------------------------
+
+// TestP99_SQLite_CapTriggers temporarily shrinks sqliteP99RowCap and inserts
+// just over that amount, verifying the cap path is exercised:
+//  1. GetDashboardStats returns without error.
+//  2. P99Latency is non-zero.
+//  3. The call completes quickly (small dataset).
+//
+// The cap is a var specifically so this test can run under -race without
+// seeding 200k rows. Default cap remains 200_000 in production.
+func TestP99_SQLite_CapTriggers(t *testing.T) {
+	orig := sqliteP99RowCap
+	sqliteP99RowCap = 200
+	t.Cleanup(func() { sqliteP99RowCap = orig })
+
+	repo := newTestRepo(t)
+	now := time.Now().UTC()
+
+	insertCount := sqliteP99RowCap + 50 // just over the (shrunk) cap
+
+	batch := make([]Trace, 0, insertCount)
+	for i := 0; i < insertCount; i++ {
+		batch = append(batch, Trace{
+			TraceID:     "t" + p99Itoa(i),
+			ServiceName: "svc",
+			Duration:    int64(i + 1),
+			Status:      "OK",
+			Timestamp:   now,
+			TenantID:    "default",
+		})
+	}
+	if err := repo.db.CreateInBatches(batch, 100).Error; err != nil {
+		t.Fatalf("seed batch: %v", err)
+	}
+
+	ctx := context.Background()
+	stats, err := repo.GetDashboardStats(ctx, now.Add(-time.Hour), now.Add(time.Hour), nil)
+	if err != nil {
+		t.Fatalf("GetDashboardStats: %v", err)
+	}
+	if stats.P99Latency <= 0 {
+		t.Fatalf("P99Latency should be positive, got %d", stats.P99Latency)
+	}
+}
+
+// ---------------------------------------------------------------------------
+// Helpers
+// ---------------------------------------------------------------------------
+
+// makeTraces creates n Trace rows with durations i*1000 (i=1..n), all at ts.
+func makeTraces(t *testing.T, n int, ts time.Time) []Trace {
+	t.Helper()
+	traces := make([]Trace, n)
+	for i := 0; i < n; i++ {
+		traces[i] = Trace{
+			TraceID:     "tr" + p99Itoa(i),
+			ServiceName: "svc",
+			Duration:    int64((i + 1) * 1000),
+			Status:      "OK",
+			Timestamp:   ts,
+			TenantID:    "default",
+		}
+	}
+	return traces
+}
+
+// p99Itoa is a tiny int→string helper to avoid importing strconv.
+func p99Itoa(n int) string {
+	if n == 0 {
+		return "0"
+	}
+	neg := n < 0
+	if neg {
+		n = -n
+	}
+	buf := [20]byte{}
+	pos := 20
+	for n > 0 {
+		pos--
+		buf[pos] = byte('0' + n%10)
+		n /= 10
+	}
+	if neg {
+		pos--
+		buf[pos] = '-'
+	}
+	return string(buf[pos:])
+}
+
+// verifyP99Index is the reference formula used in assertions.
+func verifyP99Index(n int) int {
+	idx := int(math.Ceil(float64(n)*0.99)) - 1
+	if idx < 0 {
+		idx = 0
+	}
+	if idx >= n {
+		idx = n - 1
+	}
+	return idx
+}
+
+// ---------------------------------------------------------------------------
+// Critical 2: verify MySQL branch preserves tenant filter
+// ---------------------------------------------------------------------------
+
+// TestP99_MySQLBranch_PreservesTenantFilter inserts 10 rows for tenant "a"
+// with durations 1000..10000 µs and 10 rows for tenant "b" with durations
+// 100000..1000000 µs. It then calls GetDashboardStats scoped to tenant "a"
+// with r.driver forced to "mysql" and asserts P99Latency == 10000 (tenant "a"
+// p99), not a value contaminated by tenant "b" rows.
+//
+// If the GORM Session+Model dance loses the WHERE clause the Count or Offset
+// calculation will be wrong (n=20 instead of 10) and the p99 will land in
+// tenant "b"'s range (≥100000), causing the assertion to fail.
+func TestP99_MySQLBranch_PreservesTenantFilter(t *testing.T) {
+	repo := newTestRepo(t)
+	repo.driver = "mysql" // force MySQL path on SQLite engine
+
+	now := time.Now().UTC()
+
+	// Tenant "a": durations 1000, 2000, ..., 10000 µs
+	tracesA := make([]Trace, 10)
+	for i := 0; i < 10; i++ {
+		tracesA[i] = Trace{
+			TraceID:     "a-" + p99Itoa(i),
+			ServiceName: "svc",
+			Duration:    int64((i + 1) * 1000),
+			Status:      "OK",
+			Timestamp:   now,
+			TenantID:    "a",
+		}
+	}
+	if err := repo.db.Create(&tracesA).Error; err != nil {
+		t.Fatalf("seed tenant a: %v", err)
+	}
+
+	// Tenant "b": durations 100000, 200000, ..., 1000000 µs (10× larger)
+	tracesB := make([]Trace, 10)
+	for i := 0; i < 10; i++ {
+		tracesB[i] = Trace{
+			TraceID:     "b-" + p99Itoa(i),
+			ServiceName: "svc",
+			Duration:    int64((i + 1) * 100_000),
+			Status:      "OK",
+			Timestamp:   now,
+			TenantID:    "b",
+		}
+	}
+	if err := repo.db.Create(&tracesB).Error; err != nil {
+		t.Fatalf("seed tenant b: %v", err)
+	}
+
+	ctx := WithTenantContext(context.Background(), "a")
+	stats, err := repo.GetDashboardStats(ctx, now.Add(-time.Hour), now.Add(time.Hour), nil)
+	if err != nil {
+		t.Fatalf("GetDashboardStats: %v", err)
+	}
+
+	// ceil(10*0.99)-1 = 9 → tracesA[9].Duration = 10*1000 = 10000
+	const want = int64(10_000)
+	if stats.P99Latency != want {
+		t.Fatalf("P99Latency: want %d (tenant a p99), got %d — tenant filter may be lost in MySQL branch", want, stats.P99Latency)
+	}
+}
diff --git a/internal/storage/metrics_repo.go b/internal/storage/metrics_repo.go
index ac019f8..1d2a756 100644
--- a/internal/storage/metrics_repo.go
+++ b/internal/storage/metrics_repo.go
@@ -12,6 +12,13 @@ import (
 	"gorm.io/gorm"
 )
 
+// sqliteP99RowCap is the maximum number of duration rows fetched for the
+// in-memory p99 sort on SQLite. Queries returning more rows than this are
+// capped with a warning; accuracy degrades gracefully at the tail.
+// Declared as var (not const) so tests can temporarily shrink it to exercise
+// the cap path without seeding 200k rows under -race.
+var sqliteP99RowCap = 200_000
+
 // TrafficPoint represents a data point for the traffic chart.
 type TrafficPoint struct {
 	Timestamp  time.Time `json:"timestamp"`
@@ -89,6 +96,86 @@ func (r *Repository) GetMetricNames(ctx context.Context, serviceName string) ([]
 	return names, nil
 }
 
+// p99DurationForQuery computes the p99 latency (in microseconds) from the
+// matching rows of session. It dispatches on r.driver:
+//
+//   - postgres / postgresql: native percentile_disc aggregate (single query).
+//   - mysql: two-query COUNT + ORDER BY … OFFSET approach.
+//   - default (sqlite + unknown): in-memory sort capped at sqliteP99RowCap rows.
+//
+// The caller must pass a fresh Session so nothing leaks across subsequent calls.
+// ctx is threaded into every sub-session so client cancellation (disconnect/timeout)
+// is honoured at the driver level.
+func (r *Repository) p99DurationForQuery(ctx context.Context, session *gorm.DB) (int64, error) {
+	switch strings.ToLower(r.driver) {
+	case "postgres", "postgresql":
+		// Use Rows() (not Row()) so we can explicitly Close the underlying
+		// *sql.Rows — otherwise the connection leaks on sustained traffic.
+		rows, err := session.Session(&gorm.Session{Context: ctx}).Select("COALESCE(percentile_disc(0.99) WITHIN GROUP (ORDER BY duration), 0)::bigint").Rows()
+		if err != nil {
+			return 0, err
+		}
+		defer rows.Close()
+		var p int64
+		if rows.Next() {
+			if err := rows.Scan(&p); err != nil {
+				return 0, err
+			}
+		}
+		if err := rows.Err(); err != nil {
+			return 0, err
+		}
+		return p, nil
+
+	case "mysql":
+		var n int64
+		if err := session.Session(&gorm.Session{Context: ctx}).Model(&Trace{}).Count(&n).Error; err != nil {
+			return 0, err
+		}
+		if n == 0 {
+			return 0, nil
+		}
+		offset := int64(math.Ceil(float64(n)*0.99)) - 1
+		if offset < 0 {
+			offset = 0
+		} else if offset >= n {
+			offset = n - 1
+		}
+		var p int64
+		if err := session.Session(&gorm.Session{Context: ctx}).Select("duration").Order("duration ASC").Offset(int(offset)).Limit(1).Scan(&p).Error; err != nil {
+			return 0, err
+		}
+		return p, nil
+
+	default: // sqlite and any unknown driver
+		var durations []int64
+		q := session.Session(&gorm.Session{Context: ctx}).Select("duration").Order("duration ASC").Limit(sqliteP99RowCap + 1)
+		if err := q.Find(&durations).Error; err != nil {
+			return 0, err
+		}
+		if len(durations) == 0 {
+			return 0, nil
+		}
+		if len(durations) > sqliteP99RowCap {
+			// Truncate to cap — accuracy degrades gracefully. Operators alert on
+			// the counter (dataset is too large for in-memory p99 — migrate to
+			// Postgres). Keep a low-volume debug log for dev observability.
+			if r.metrics != nil {
+				r.metrics.DashboardP99RowCapHitsTotal.Inc()
+			}
+			slog.Debug("p99 SQLite fallback capped rows", "cap", sqliteP99RowCap)
+			durations = durations[:sqliteP99RowCap]
+		}
+		idx := int(math.Ceil(float64(len(durations))*0.99)) - 1
+		if idx < 0 {
+			idx = 0
+		} else if idx >= len(durations) {
+			idx = len(durations) - 1
+		}
+		return durations[idx], nil
+	}
+}
+
 // GetDashboardStats calculates high-level metrics for the dashboard, scoped to
 // the tenant on ctx.
 func (r *Repository) GetDashboardStats(ctx context.Context, start, end time.Time, serviceNames []string) (*DashboardStats, error) {
@@ -147,23 +234,11 @@ func (r *Repository) GetDashboardStats(ctx context.Context, start, end time.Time
 	}
 
 	// 6. P99 Latency
-	var durations []int64
-	if err := baseQuery.Session(&gorm.Session{}).
-		Select("duration").
-		Order("duration ASC").
-		Find(&durations).Error; err != nil {
-		return nil, fmt.Errorf("failed to fetch durations for p99: %w", err)
-	}
-
-	if len(durations) > 0 {
-		p99Index := int(math.Ceil(float64(len(durations))*0.99)) - 1
-		if p99Index < 0 {
-			p99Index = 0
-		} else if p99Index >= len(durations) {
-			p99Index = len(durations) - 1
-		}
-		stats.P99Latency = durations[p99Index]
+	p99, err := r.p99DurationForQuery(ctx, baseQuery.Session(&gorm.Session{}))
+	if err != nil {
+		return nil, fmt.Errorf("failed to compute p99 latency: %w", err)
 	}
+	stats.P99Latency = p99
 
 	// 7. Top Failing Services
 	type svcCount struct {
@@ -279,7 +354,7 @@ func (r *Repository) GetLatencyHeatmap(ctx context.Context, start, end time.Time
 // Tenant scope: this is a SYSTEM-WIDE retention operation and intentionally
 // does NOT filter by tenant. Rows are deleted across every tenant. Never expose
 // this on a tenant-scoped API surface.
-func (r *Repository) PurgeMetricBucketsBatched(ctx context.Context, olderThan time.Time, batchSize int) (int64, error) {
+func (r *Repository) PurgeMetricBucketsBatched(ctx context.Context, olderThan time.Time, batchSize int, sleep time.Duration) (int64, error) {
 	if batchSize <= 0 {
 		batchSize = 10_000
 	}
@@ -308,7 +383,7 @@ func (r *Repository) PurgeMetricBucketsBatched(ctx context.Context, olderThan ti
 		select {
 		case <-ctx.Done():
 			return total, ctx.Err()
-		case <-time.After(5 * time.Millisecond):
+		case <-time.After(sleep):
 		}
 	}
 }
diff --git a/internal/storage/models.go b/internal/storage/models.go
index 6ea8266..8585c61 100644
--- a/internal/storage/models.go
+++ b/internal/storage/models.go
@@ -118,6 +118,7 @@ type Span struct {
 	EndTime        time.Time      `json:"end_time"`
 	Duration       int64          `json:"duration"`                                                                     // Microseconds
 	ServiceName    string         `gorm:"size:255;index:idx_spans_tenant_service_start,priority:2" json:"service_name"` // Originating service
+	Status         string         `gorm:"size:50;default:'STATUS_CODE_UNSET';index" json:"status"`                      // OTLP status code (e.g. STATUS_CODE_ERROR); drives GraphRAG error signal
 	AttributesJSON CompressedText `json:"attributes_json"`                                                              // Compressed JSON string
 }
 
diff --git a/internal/storage/pg_integration_test.go b/internal/storage/pg_integration_test.go
index 108a578..a1a2bbd 100644
--- a/internal/storage/pg_integration_test.go
+++ b/internal/storage/pg_integration_test.go
@@ -230,7 +230,7 @@ func TestPG_VacuumAnalyze_OutsideTx(t *testing.T) {
 	// Seed a few rows so VACUUM ANALYZE has something to observe.
 	seedLogs(t, repo.db, 10, time.Now().UTC(), "svc")
 
-	sched := NewRetentionScheduler(repo, 7)
+	sched := NewRetentionScheduler(repo, 7, 10_000, 5*time.Millisecond)
 	ctx, cancel := context.WithTimeout(context.Background(), 60*time.Second)
 	defer cancel()
 
@@ -283,7 +283,7 @@ func TestPG_PurgeLogsBatched_LargeVolume(t *testing.T) {
 	ctx, cancel := context.WithTimeout(context.Background(), 120*time.Second)
 	defer cancel()
 
-	n, err := repo.PurgeLogsBatched(ctx, time.Now().UTC().Add(-time.Hour), batch)
+	n, err := repo.PurgeLogsBatched(ctx, time.Now().UTC().Add(-time.Hour), batch, 5*time.Millisecond)
 	if err != nil {
 		t.Fatalf("PurgeLogsBatched: %v", err)
 	}
@@ -326,7 +326,7 @@ func TestPG_PurgeTracesBatched_OrphanSpanSweep_NOT_IN(t *testing.T) {
 	ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
 	defer cancel()
 
-	if _, err := repo.PurgeTracesBatched(ctx, cutoff, 10); err != nil {
+	if _, err := repo.PurgeTracesBatched(ctx, cutoff, 10, 5*time.Millisecond); err != nil {
 		t.Fatalf("PurgeTracesBatched: %v", err)
 	}
 
diff --git a/internal/storage/purge_test.go b/internal/storage/purge_test.go
index ea3500f..037d30f 100644
--- a/internal/storage/purge_test.go
+++ b/internal/storage/purge_test.go
@@ -9,7 +9,7 @@ import (
 
 func TestPurgeLogsBatched_EmptyTable(t *testing.T) {
 	repo := newTestRepo(t)
-	n, err := repo.PurgeLogsBatched(context.Background(), time.Now(), 100)
+	n, err := repo.PurgeLogsBatched(context.Background(), time.Now(), 100, 5*time.Millisecond)
 	if err != nil {
 		t.Fatalf("unexpected error: %v", err)
 	}
@@ -23,7 +23,7 @@ func TestPurgeLogsBatched_AllOld_AllDeleted(t *testing.T) {
 	old := time.Now().UTC().Add(-10 * 24 * time.Hour)
 	seedLogs(t, repo.db, 50, old, "svc")
 
-	n, err := repo.PurgeLogsBatched(context.Background(), time.Now().UTC().Add(-time.Hour), 10)
+	n, err := repo.PurgeLogsBatched(context.Background(), time.Now().UTC().Add(-time.Hour), 10, 5*time.Millisecond)
 	if err != nil {
 		t.Fatalf("err: %v", err)
 	}
@@ -39,7 +39,7 @@ func TestPurgeLogsBatched_AllNew_NoneDeleted(t *testing.T) {
 	repo := newTestRepo(t)
 	seedLogs(t, repo.db, 20, time.Now().UTC(), "svc")
 
-	n, err := repo.PurgeLogsBatched(context.Background(), time.Now().UTC().Add(-time.Hour), 10)
+	n, err := repo.PurgeLogsBatched(context.Background(), time.Now().UTC().Add(-time.Hour), 10, 5*time.Millisecond)
 	if err != nil {
 		t.Fatalf("err: %v", err)
 	}
@@ -59,7 +59,7 @@ func TestPurgeLogsBatched_ZeroBatchSize_DefaultsTo10k(t *testing.T) {
 	// batchSize=0 must default internally and not loop forever.
 	done := make(chan struct{})
 	go func() {
-		_, _ = repo.PurgeLogsBatched(context.Background(), time.Now(), 0)
+		_, _ = repo.PurgeLogsBatched(context.Background(), time.Now(), 0, 5*time.Millisecond)
 		close(done)
 	}()
 	select {
@@ -78,7 +78,7 @@ func TestPurgeLogsBatched_ContextCancellation(t *testing.T) {
 	cancel() // pre-cancelled
 
 	// SQLite path is single-shot so ctx.Err() may not be observed; validate it doesn't panic.
-	_, _ = repo.PurgeLogsBatched(ctx, time.Now(), 10)
+	_, _ = repo.PurgeLogsBatched(ctx, time.Now(), 10, 5*time.Millisecond)
 }
 
 func TestPurgeLogsBatched_BoundaryTimestamp(t *testing.T) {
@@ -89,7 +89,7 @@ func TestPurgeLogsBatched_BoundaryTimestamp(t *testing.T) {
 	seedLogs(t, repo.db, 1, cutoff.Add(-time.Nanosecond), "just-before")
 	seedLogs(t, repo.db, 1, cutoff.Add(time.Nanosecond), "just-after")
 
-	n, err := repo.PurgeLogsBatched(context.Background(), cutoff, 100)
+	n, err := repo.PurgeLogsBatched(context.Background(), cutoff, 100, 5*time.Millisecond)
 	if err != nil {
 		t.Fatalf("err: %v", err)
 	}
@@ -126,7 +126,7 @@ func TestPurgeTracesBatched_OrphanSpanSweep(t *testing.T) {
 	// T3: fresh trace + fresh spans — all preserved.
 	seedTrace(t, repo.db, "t-new", nowUTC, []time.Time{nowUTC})
 
-	_, err := repo.PurgeTracesBatched(context.Background(), cutoff, 10)
+	_, err := repo.PurgeTracesBatched(context.Background(), cutoff, 10, 5*time.Millisecond)
 	if err != nil {
 		t.Fatalf("PurgeTracesBatched: %v", err)
 	}
@@ -188,7 +188,7 @@ func TestPurgeTracesBatched_DoesNotDeleteLiveSpans(t *testing.T) {
 		t.Fatalf("seed live spans: %v", err)
 	}
 
-	if _, err := repo.PurgeTracesBatched(context.Background(), cutoff, 10); err != nil {
+	if _, err := repo.PurgeTracesBatched(context.Background(), cutoff, 10, 5*time.Millisecond); err != nil {
 		t.Fatalf("purge: %v", err)
 	}
 
@@ -224,7 +224,7 @@ func TestPurgeMetricBucketsBatched(t *testing.T) {
 		t.Fatalf("create buckets: %v", err)
 	}
 
-	n, err := repo.PurgeMetricBucketsBatched(context.Background(), time.Now().UTC().Add(-time.Hour), 10)
+	n, err := repo.PurgeMetricBucketsBatched(context.Background(), time.Now().UTC().Add(-time.Hour), 10, 5*time.Millisecond)
 	if err != nil {
 		t.Fatalf("purge: %v", err)
 	}
@@ -265,7 +265,7 @@ func TestPurgeLogs_ConcurrentIngestWhilePurging(t *testing.T) {
 	}()
 	time.Sleep(20 * time.Millisecond)
 
-	_, err := repo.PurgeLogsBatched(context.Background(), time.Now().UTC().Add(-time.Hour), 50)
+	_, err := repo.PurgeLogsBatched(context.Background(), time.Now().UTC().Add(-time.Hour), 50, 5*time.Millisecond)
 	if err != nil {
 		t.Fatalf("purge: %v", err)
 	}
diff --git a/internal/storage/retention.go b/internal/storage/retention.go
index c671052..2d69a52 100644
--- a/internal/storage/retention.go
+++ b/internal/storage/retention.go
@@ -14,11 +14,12 @@ import (
 // On startup and hourly thereafter it deletes rows older than retentionDays.
 // Daily it runs driver-appropriate maintenance (VACUUM ANALYZE / OPTIMIZE / VACUUM).
 type RetentionScheduler struct {
-	repo           *Repository
-	retentionDays  int
-	purgeInterval  time.Duration
-	vacuumInterval time.Duration
-	purgeBatchSize int
+	repo            *Repository
+	retentionDays   int
+	purgeInterval   time.Duration
+	vacuumInterval  time.Duration
+	purgeBatchSize  int
+	purgeBatchSleep time.Duration
 
 	// started is an atomic so a fast-path Stop() before Start() is lock-free.
 	// mu serializes the Start/Stop transition itself (protects cancel + done).
@@ -38,14 +39,22 @@ type RetentionScheduler struct {
 }
 
 // NewRetentionScheduler constructs a scheduler but does not start it.
-func NewRetentionScheduler(repo *Repository, retentionDays int) *RetentionScheduler {
+// batchSize <= 0 defaults to 10_000; batchSleep < 0 defaults to 5ms.
+func NewRetentionScheduler(repo *Repository, retentionDays, batchSize int, batchSleep time.Duration) *RetentionScheduler {
+	if batchSize <= 0 {
+		batchSize = 10_000
+	}
+	if batchSleep < 0 {
+		batchSleep = 5 * time.Millisecond
+	}
 	return &RetentionScheduler{
-		repo:           repo,
-		retentionDays:  retentionDays,
-		purgeInterval:  1 * time.Hour,
-		vacuumInterval: 24 * time.Hour,
-		purgeBatchSize: 10_000,
-		done:           make(chan struct{}),
+		repo:            repo,
+		retentionDays:   retentionDays,
+		purgeInterval:   1 * time.Hour,
+		vacuumInterval:  24 * time.Hour,
+		purgeBatchSize:  batchSize,
+		purgeBatchSleep: batchSleep,
+		done:            make(chan struct{}),
 	}
 }
 
@@ -122,16 +131,96 @@ func (r *RetentionScheduler) runPurge(ctx context.Context) {
 	if driver == "" {
 		driver = "sqlite"
 	}
-	metrics := r.repo.metrics
+	cutoff := time.Now().UTC().Add(-time.Duration(r.retentionDays) * 24 * time.Hour)
 
+	// SQLite: single-writer, parallel purges would just contend on the DB lock.
+	if driver == "sqlite" {
+		r.runPurgeSerial(ctx, cutoff, driver)
+		return
+	}
+
+	metrics := r.repo.metrics
 	start := time.Now()
-	cutoff := time.Now().UTC().Add(-time.Duration(r.retentionDays) * 24 * time.Hour)
 
-	// Fix 6: track failure across all three purges so we can expose
-	// retention_consecutive_failures{job="purge"} accurately.
+	// Observe rows-behind before we start — good for dashboards, costs a COUNT.
+	// Only on Postgres/MySQL where the extra scan is cheap relative to the purge.
+	r.observeRowsBehind(ctx, driver, cutoff)
+
+	type result struct {
+		kind string
+		n    int64
+		err  error
+	}
+	results := make(chan result, 3)
+
+	// runGuarded wraps each purge goroutine so a panic still sends on the
+	// results channel. Without this, a panic inside a repo method would leave
+	// the main loop blocked on `<-results`, and the outer `running` guard
+	// would keep every subsequent tick from firing.
+	runGuarded := func(kind string, fn func() (int64, error)) {
+		go func() {
+			defer func() {
+				if rec := recover(); rec != nil {
+					slog.Error("retention: purge panic", "kind", kind, "panic", rec)
+					results <- result{kind, 0, fmt.Errorf("%s purge panic: %v", kind, rec)}
+				}
+			}()
+			n, err := fn()
+			results <- result{kind, n, err}
+		}()
+	}
+	runGuarded("logs", func() (int64, error) {
+		return r.repo.PurgeLogsBatched(ctx, cutoff, r.purgeBatchSize, r.purgeBatchSleep)
+	})
+	runGuarded("traces", func() (int64, error) {
+		return r.repo.PurgeTracesBatched(ctx, cutoff, r.purgeBatchSize, r.purgeBatchSleep)
+	})
+	runGuarded("metric_buckets", func() (int64, error) {
+		return r.repo.PurgeMetricBucketsBatched(ctx, cutoff, r.purgeBatchSize, r.purgeBatchSleep)
+	})
+
+	purgeFailed := false
+	totals := map[string]int64{}
+	for i := 0; i < 3; i++ {
+		res := <-results
+		if res.err != nil {
+			slog.Error("retention: purge failed", "kind", res.kind, "error", res.err)
+			purgeFailed = true
+		}
+		totals[res.kind] += res.n
+		if metrics != nil && res.n > 0 {
+			metrics.RetentionRowsPurgedTotal.WithLabelValues(res.kind, driver).Add(float64(res.n))
+		}
+	}
+
+	if metrics != nil {
+		metrics.RetentionPurgeDurationSeconds.WithLabelValues(driver).Observe(time.Since(start).Seconds())
+		if purgeFailed {
+			metrics.RetentionConsecutiveFailures.WithLabelValues("purge").Inc()
+		} else {
+			metrics.RetentionConsecutiveFailures.WithLabelValues("purge").Set(0)
+			metrics.RetentionLastSuccessTimestamp.WithLabelValues("purge").Set(float64(time.Now().Unix()))
+		}
+	}
+
+	slog.Info("retention purge complete",
+		"driver", driver,
+		"duration", time.Since(start),
+		"logs_deleted", totals["logs"],
+		"traces_deleted", totals["traces"],
+		"metrics_deleted", totals["metric_buckets"],
+	)
+}
+
+// runPurgeSerial is the SQLite path: running the three purges concurrently buys
+// nothing because the driver holds a single writer lock, so we serialize them
+// to keep the "running" gauge accurate and avoid goroutine launch cost.
+func (r *RetentionScheduler) runPurgeSerial(ctx context.Context, cutoff time.Time, driver string) {
+	metrics := r.repo.metrics
+	start := time.Now()
 	purgeFailed := false
 
-	logs, err := r.repo.PurgeLogsBatched(ctx, cutoff, r.purgeBatchSize)
+	logs, err := r.repo.PurgeLogsBatched(ctx, cutoff, r.purgeBatchSize, r.purgeBatchSleep)
 	if err != nil {
 		slog.Error("retention: purge logs failed", "error", err)
 		purgeFailed = true
@@ -140,19 +229,16 @@ func (r *RetentionScheduler) runPurge(ctx context.Context) {
 		metrics.RetentionRowsPurgedTotal.WithLabelValues("logs", driver).Add(float64(logs))
 	}
 
-	traces, err := r.repo.PurgeTracesBatched(ctx, cutoff, r.purgeBatchSize)
+	traces, err := r.repo.PurgeTracesBatched(ctx, cutoff, r.purgeBatchSize, r.purgeBatchSleep)
 	if err != nil {
 		slog.Error("retention: purge traces failed", "error", err)
 		purgeFailed = true
 	}
 	if metrics != nil && traces > 0 {
-		// PurgeTracesBatched deletes traces and sweeps orphan spans. The returned
-		// count reflects traces; report under the "traces" label. Spans are swept
-		// as a side effect — no separate authoritative count is returned.
 		metrics.RetentionRowsPurgedTotal.WithLabelValues("traces", driver).Add(float64(traces))
 	}
 
-	metricsPurged, err := r.repo.PurgeMetricBucketsBatched(ctx, cutoff, r.purgeBatchSize)
+	metricsPurged, err := r.repo.PurgeMetricBucketsBatched(ctx, cutoff, r.purgeBatchSize, r.purgeBatchSleep)
 	if err != nil {
 		slog.Error("retention: purge metrics failed", "error", err)
 		purgeFailed = true
@@ -172,6 +258,7 @@ func (r *RetentionScheduler) runPurge(ctx context.Context) {
 	}
 
 	slog.Info("retention purge complete",
+		"driver", driver,
 		"cutoff", cutoff.Format(time.RFC3339),
 		"logs_deleted", logs,
 		"traces_deleted", traces,
@@ -180,6 +267,32 @@ func (r *RetentionScheduler) runPurge(ctx context.Context) {
 	)
 }
 
+// observeRowsBehind populates RetentionRowsBehindGauge so operators can see
+// when ingest is outrunning purge. Best-effort — a failed COUNT is logged and
+// skipped rather than failing the purge.
+func (r *RetentionScheduler) observeRowsBehind(ctx context.Context, driver string, cutoff time.Time) {
+	metrics := r.repo.metrics
+	if metrics == nil || metrics.RetentionRowsBehindGauge == nil {
+		return
+	}
+	probes := []struct {
+		table    string
+		model    any
+		tsColumn string
+	}{
+		{"logs", &Log{}, "timestamp"},
+		{"traces", &Trace{}, "timestamp"},
+		{"metric_buckets", &MetricBucket{}, "time_bucket"},
+	}
+	for _, p := range probes {
+		var n int64
+		if err := r.repo.db.WithContext(ctx).Model(p.model).Where(p.tsColumn+" < ?", cutoff).Count(&n).Error; err != nil {
+			continue // count failure is non-fatal; skip this label
+		}
+		metrics.RetentionRowsBehindGauge.WithLabelValues(p.table, driver).Set(float64(n))
+	}
+}
+
 func (r *RetentionScheduler) runMaintenance(ctx context.Context) {
 	if !r.running.CompareAndSwap(false, true) {
 		r.skippedRuns.Add(1)
diff --git a/internal/storage/retention_parallel_test.go b/internal/storage/retention_parallel_test.go
new file mode 100644
index 0000000..7419fc6
--- /dev/null
+++ b/internal/storage/retention_parallel_test.go
@@ -0,0 +1,43 @@
+package storage
+
+import (
+	"context"
+	"testing"
+	"time"
+)
+
+// TestRunPurge_RunsTablesInParallel_SQLiteSerialFallback exercises runPurge
+// across logs + traces + metric_buckets in a single call. On SQLite the
+// parallelization path is intentionally a no-op (runPurgeSerial), but the
+// observable behaviour — all three tables drained past cutoff — must hold.
+func TestRunPurge_RunsTablesInParallel_SQLiteSerialFallback(t *testing.T) {
+	repo := newTestRepo(t)
+	now := time.Now().UTC()
+	old := now.Add(-100 * 24 * time.Hour)
+
+	seedLogs(t, repo.db, 200, old, "svc")
+	seedTrace(t, repo.db, "old-trace", old, []time.Time{old})
+
+	// Seed a couple of old metric buckets so the metric_buckets branch isn't
+	// exercising a no-op. time_bucket < cutoff -> eligible for purge.
+	buckets := []MetricBucket{
+		{Name: "m1", ServiceName: "svc", TimeBucket: old, Count: 1},
+		{Name: "m2", ServiceName: "svc", TimeBucket: old, Count: 2},
+	}
+	if err := repo.db.Create(&buckets).Error; err != nil {
+		t.Fatalf("seed metric buckets: %v", err)
+	}
+
+	sched := NewRetentionScheduler(repo, 1, 100, 0) // batch 100, sleep 0
+	sched.runPurge(context.Background())
+
+	if c := mustCount(t, repo.db, &Log{}); c != 0 {
+		t.Fatalf("logs not purged: %d rows remain", c)
+	}
+	if c := mustCount(t, repo.db, &Trace{}); c != 0 {
+		t.Fatalf("traces not purged: %d rows remain", c)
+	}
+	if c := mustCount(t, repo.db, &MetricBucket{}); c != 0 {
+		t.Fatalf("metric_buckets not purged: %d rows remain", c)
+	}
+}
diff --git a/internal/storage/retention_test.go b/internal/storage/retention_test.go
index 399d855..afc4676 100644
--- a/internal/storage/retention_test.go
+++ b/internal/storage/retention_test.go
@@ -9,7 +9,7 @@ import (
 
 func TestRetentionScheduler_StopBeforeStart_NoDeadlock(t *testing.T) {
 	repo := newTestRepo(t)
-	r := NewRetentionScheduler(repo, 7)
+	r := NewRetentionScheduler(repo, 7, 10_000, 5*time.Millisecond)
 	done := make(chan struct{})
 	go func() {
 		r.Stop()
@@ -24,7 +24,7 @@ func TestRetentionScheduler_StopBeforeStart_NoDeadlock(t *testing.T) {
 
 func TestRetentionScheduler_DoubleStop(t *testing.T) {
 	repo := newTestRepo(t)
-	r := NewRetentionScheduler(repo, 7)
+	r := NewRetentionScheduler(repo, 7, 10_000, 5*time.Millisecond)
 	r.Start(context.Background())
 	r.Stop()
 	done := make(chan struct{})
@@ -41,7 +41,7 @@ func TestRetentionScheduler_DoubleStop(t *testing.T) {
 
 func TestRetentionScheduler_DoubleStart_Idempotent(t *testing.T) {
 	repo := newTestRepo(t)
-	r := NewRetentionScheduler(repo, 7)
+	r := NewRetentionScheduler(repo, 7, 10_000, 5*time.Millisecond)
 	r.Start(context.Background())
 	r.Start(context.Background())
 	r.Stop()
@@ -56,7 +56,7 @@ func TestRetentionScheduler_InitialPurgeRunsImmediately(t *testing.T) {
 	old := time.Now().UTC().Add(-30 * 24 * time.Hour)
 	seedLogs(t, repo.db, 100, old, "old-service")
 
-	r := NewRetentionScheduler(repo, 7)
+	r := NewRetentionScheduler(repo, 7, 10_000, 5*time.Millisecond)
 	r.purgeInterval = time.Hour
 	r.vacuumInterval = time.Hour
 	r.Start(context.Background())
@@ -74,7 +74,7 @@ func TestRetentionScheduler_InitialPurgeRunsImmediately(t *testing.T) {
 
 func TestRetentionScheduler_ContextCancellationStopsLoop(t *testing.T) {
 	repo := newTestRepo(t)
-	r := NewRetentionScheduler(repo, 7)
+	r := NewRetentionScheduler(repo, 7, 10_000, 5*time.Millisecond)
 	ctx, cancel := context.WithCancel(context.Background())
 	r.Start(ctx)
 	cancel()
@@ -91,7 +91,7 @@ func TestRetentionScheduler_MaintenanceVacuumsSQLite(t *testing.T) {
 	repo := newTestRepo(t)
 	seedLogs(t, repo.db, 1000, time.Now().UTC(), "svc")
 
-	r := NewRetentionScheduler(repo, 7)
+	r := NewRetentionScheduler(repo, 7, 10_000, 5*time.Millisecond)
 	r.runMaintenance(context.Background())
 	// If runMaintenance crashes on "cannot run inside a transaction", this test fails.
 	if mustCount(t, repo.db, &Log{}) != 1000 {
@@ -101,7 +101,7 @@ func TestRetentionScheduler_MaintenanceVacuumsSQLite(t *testing.T) {
 
 func TestRetentionScheduler_NoDataNoError(t *testing.T) {
 	repo := newTestRepo(t)
-	r := NewRetentionScheduler(repo, 7)
+	r := NewRetentionScheduler(repo, 7, 10_000, 5*time.Millisecond)
 	// Must not panic/error against empty tables.
 	r.runPurge(context.Background())
 	r.runMaintenance(context.Background())
@@ -109,7 +109,7 @@ func TestRetentionScheduler_NoDataNoError(t *testing.T) {
 
 func TestRetentionScheduler_ConcurrentStartStop(t *testing.T) {
 	repo := newTestRepo(t)
-	r := NewRetentionScheduler(repo, 7)
+	r := NewRetentionScheduler(repo, 7, 10_000, 5*time.Millisecond)
 
 	// Race detector target: 20 goroutines hammering Start/Stop simultaneously.
 	// atomic.Bool CAS means at most one goroutine transitions the flag, and
@@ -152,7 +152,7 @@ func TestRetentionScheduler_OverlapGuard(t *testing.T) {
 	old := time.Now().UTC().Add(-30 * 24 * time.Hour)
 	seedLogs(t, repo.db, 500, old, "svc")
 
-	r := NewRetentionScheduler(repo, 7)
+	r := NewRetentionScheduler(repo, 7, 10_000, 5*time.Millisecond)
 
 	// Pin the "running" flag as if a purge were already executing. Because
 	// CompareAndSwap(false, true) fails, the concurrent call must skip.
@@ -191,7 +191,7 @@ func TestRetentionScheduler_OverlapGuard(t *testing.T) {
 // purge tick.
 func TestRetentionScheduler_MaintenanceRespectsOverlapGuard(t *testing.T) {
 	repo := newTestRepo(t)
-	r := NewRetentionScheduler(repo, 7)
+	r := NewRetentionScheduler(repo, 7, 10_000, 5*time.Millisecond)
 
 	r.running.Store(true) // pretend purge is in flight
 	before := r.SkippedRuns()
@@ -204,7 +204,7 @@ func TestRetentionScheduler_MaintenanceRespectsOverlapGuard(t *testing.T) {
 
 func TestRetentionScheduler_ConcurrentStopCallers(t *testing.T) {
 	repo := newTestRepo(t)
-	r := NewRetentionScheduler(repo, 7)
+	r := NewRetentionScheduler(repo, 7, 10_000, 5*time.Millisecond)
 	r.Start(context.Background())
 
 	var wg sync.WaitGroup
diff --git a/internal/storage/trace_repo.go b/internal/storage/trace_repo.go
index 4c20645..65a541c 100644
--- a/internal/storage/trace_repo.go
+++ b/internal/storage/trace_repo.go
@@ -296,7 +296,7 @@ func (r *Repository) PurgeTraces(olderThan time.Time) (int64, error) {
 // Tenant scope: this is a SYSTEM-WIDE retention operation and intentionally
 // does NOT filter by tenant. Rows are deleted across every tenant. Never
 // expose this on a tenant-scoped API surface.
-func (r *Repository) PurgeTracesBatched(ctx context.Context, olderThan time.Time, batchSize int) (int64, error) {
+func (r *Repository) PurgeTracesBatched(ctx context.Context, olderThan time.Time, batchSize int, sleep time.Duration) (int64, error) {
 	if batchSize <= 0 {
 		batchSize = 10_000
 	}
@@ -343,7 +343,7 @@ func (r *Repository) PurgeTracesBatched(ctx context.Context, olderThan time.Time
 		select {
 		case <-ctx.Done():
 			return total, ctx.Err()
-		case <-time.After(5 * time.Millisecond):
+		case <-time.After(sleep):
 		}
 	}
 
@@ -367,7 +367,7 @@ func (r *Repository) PurgeTracesBatched(ctx context.Context, olderThan time.Time
 		select {
 		case <-ctx.Done():
 			return total, ctx.Err()
-		case <-time.After(5 * time.Millisecond):
+		case <-time.After(sleep):
 		}
 	}
 
diff --git a/internal/telemetry/metrics.go b/internal/telemetry/metrics.go
index 0de38e7..4c183ef 100644
--- a/internal/telemetry/metrics.go
+++ b/internal/telemetry/metrics.go
@@ -1,6 +1,7 @@
 package telemetry
 
 import (
+	"database/sql"
 	"encoding/json"
 	"net/http"
 	"runtime"
@@ -12,6 +13,13 @@ import (
 	"github.com/prometheus/client_golang/prometheus/promhttp"
 )
 
+// Metric naming convention: historical metrics use the PascalCase
+// `OtelContext_*` prefix; all metrics added during the backend-robustness
+// initiative (and later) use the Prometheus-idiomatic lowercase
+// `otelcontext_*` prefix. Both are kept for backward compatibility —
+// operators querying by prefix should match `(?i)^otelcontext_`. Migrate
+// the legacy names when a major version bump is acceptable.
+
 // Metrics holds all internal Prometheus metrics for OtelContext self-monitoring.
 type Metrics struct {
 	// --- Existing ---
@@ -52,6 +60,7 @@ type Metrics struct {
 	RetentionRowsPurgedTotal       *prometheus.CounterVec
 	RetentionPurgeDurationSeconds  *prometheus.HistogramVec
 	RetentionVacuumDurationSeconds *prometheus.HistogramVec
+	RetentionRowsBehindGauge       *prometheus.GaugeVec
 
 	// --- Runtime ---
 	GoGoroutines     prometheus.Gauge
@@ -66,6 +75,23 @@ type Metrics struct {
 	RetentionConsecutiveFailures  *prometheus.GaugeVec
 	DBUp                          *prometheus.GaugeVec
 
+	// --- GraphRAG overflow ---
+	GraphRAGEventsDroppedTotal *prometheus.CounterVec
+
+	// --- DB pool (sampled every 5s from sql.DB.Stats) ---
+	DBPoolOpenConnections prometheus.Gauge
+	DBPoolInUse           prometheus.Gauge
+	DBPoolIdle            prometheus.Gauge
+	DBPoolWaitCount       prometheus.Gauge
+	DBPoolWaitDuration    prometheus.Gauge // cumulative seconds
+
+	// --- DLQ eviction (Task 8) ---
+	DLQEvictedTotal      prometheus.Counter
+	DLQEvictedBytesTotal prometheus.Counter
+
+	// --- Dashboard p99 (Task 10) ---
+	DashboardP99RowCapHitsTotal prometheus.Counter
+
 	// Atomic counters for JSON health endpoint (avoids scraping Prometheus)
 	totalIngested  atomic.Int64
 	activeConns    atomic.Int64
@@ -193,6 +219,10 @@ func New() *Metrics {
 			Help:    "Duration of per-table retention maintenance (VACUUM/ANALYZE/OPTIMIZE), by driver and table.",
 			Buckets: prometheus.ExponentialBuckets(0.01, 2, 16),
 		}, []string{"driver", "table"}),
+		RetentionRowsBehindGauge: promauto.NewGaugeVec(prometheus.GaugeOpts{
+			Name: "otelcontext_retention_rows_behind",
+			Help: "Rows older than retention cutoff that have not yet been purged. Climbing means purge cannot keep pace with ingest.",
+		}, []string{"table", "driver"}),
 
 		// Runtime
 		GoGoroutines: promauto.NewGauge(prometheus.GaugeOpts{
@@ -233,7 +263,46 @@ func New() *Metrics {
 			Name: "OtelContext_db_up",
 			Help: "Database reachability (1=up, 0=down) by driver.",
 		}, []string{"driver"}),
+
+		GraphRAGEventsDroppedTotal: promauto.NewCounterVec(prometheus.CounterOpts{
+			Name: "otelcontext_graphrag_events_dropped_total",
+			Help: "Events dropped because the GraphRAG event channel was full.",
+		}, []string{"signal"}),
+
+		// DB pool (Task 7 — visibility for DB_MAX_OPEN_CONNS sizing).
+		DBPoolOpenConnections: promauto.NewGauge(prometheus.GaugeOpts{
+			Name: "otelcontext_db_pool_open_connections",
+			Help: "Current number of open DB connections in the pool.",
+		}),
+		DBPoolInUse: promauto.NewGauge(prometheus.GaugeOpts{
+			Name: "otelcontext_db_pool_in_use",
+			Help: "Current number of DB connections in use.",
+		}),
+		DBPoolIdle: promauto.NewGauge(prometheus.GaugeOpts{
+			Name: "otelcontext_db_pool_idle",
+			Help: "Current number of idle DB connections.",
+		}),
+		DBPoolWaitCount: promauto.NewGauge(prometheus.GaugeOpts{
+			Name: "otelcontext_db_pool_wait_count",
+			Help: "Cumulative connection waits since DB open (gauge-reported; compute rate() over this value).",
+		}),
+		DBPoolWaitDuration: promauto.NewGauge(prometheus.GaugeOpts{
+			Name: "otelcontext_db_pool_wait_duration_seconds",
+			Help: "Cumulative wait duration for pool acquisition, in seconds (gauge-reported; compute rate() over this value).",
+		}),
 	}
+	m.DLQEvictedTotal = promauto.NewCounter(prometheus.CounterOpts{
+		Name: "otelcontext_dlq_evicted_total",
+		Help: "DLQ files evicted to stay under MaxFiles/MaxDiskMB. Non-zero means backlog exceeds cap — investigate DB health.",
+	})
+	m.DLQEvictedBytesTotal = promauto.NewCounter(prometheus.CounterOpts{
+		Name: "otelcontext_dlq_evicted_bytes_total",
+		Help: "Total bytes evicted from DLQ. Rate indicates data-loss volume during backlog.",
+	})
+	m.DashboardP99RowCapHitsTotal = promauto.NewCounter(prometheus.CounterOpts{
+		Name: "otelcontext_dashboard_p99_row_cap_hits_total",
+		Help: "Number of dashboard p99 computations that hit the SQLite row cap (200k). Indicates the dataset is too large for in-memory p99 — use Postgres for prod.",
+	})
 	return m
 }
 
@@ -251,6 +320,24 @@ func (m *Metrics) StartRuntimeMetrics() {
 	}()
 }
 
+// SampleDBPoolStats writes the live pool stats into the DBPool* gauges. Safe
+// to call from a ticker goroutine. A nil receiver or a nil *sql.DB is a no-op
+// so callers don't need to guard at every call site.
+//
+// WaitCount and WaitDuration from sql.DBStats are cumulative values (always
+// monotonically increasing) — operators should compute rate() over them.
+func (m *Metrics) SampleDBPoolStats(sqlDB *sql.DB) {
+	if m == nil || sqlDB == nil {
+		return
+	}
+	s := sqlDB.Stats()
+	m.DBPoolOpenConnections.Set(float64(s.OpenConnections))
+	m.DBPoolInUse.Set(float64(s.InUse))
+	m.DBPoolIdle.Set(float64(s.Idle))
+	m.DBPoolWaitCount.Set(float64(s.WaitCount))
+	m.DBPoolWaitDuration.Set(s.WaitDuration.Seconds())
+}
+
 // --- Existing helper methods ---
 
 func (m *Metrics) RecordIngestion(count int) {
diff --git a/internal/telemetry/metrics_pool_test.go b/internal/telemetry/metrics_pool_test.go
new file mode 100644
index 0000000..47079ae
--- /dev/null
+++ b/internal/telemetry/metrics_pool_test.go
@@ -0,0 +1,68 @@
+package telemetry
+
+import (
+	"database/sql"
+	"testing"
+
+	_ "github.com/glebarez/go-sqlite" // registers "sqlite" driver used by glebarez/sqlite GORM dialect
+	"github.com/prometheus/client_golang/prometheus"
+	dto "github.com/prometheus/client_model/go"
+)
+
+// gaugeValueForTest scrapes the current value of a Prometheus gauge.
+func gaugeValueForTest(t *testing.T, g prometheus.Gauge) float64 {
+	t.Helper()
+	var m dto.Metric
+	if err := g.Write(&m); err != nil {
+		t.Fatalf("gauge write: %v", err)
+	}
+	return m.GetGauge().GetValue()
+}
+
+// TestSampleDBPoolStats exercises both the happy path and nil safety. It is
+// a single test because telemetry.New() registers metrics against the global
+// Prometheus default registry via promauto — calling it twice in the same
+// test binary panics with a duplicate-collector error. Using one New() plus
+// subtests keeps coverage without colliding with the global registry.
+func TestSampleDBPoolStats(t *testing.T) {
+	m := New()
+
+	t.Run("WritesGauges", func(t *testing.T) {
+		db, err := sql.Open("sqlite", ":memory:")
+		if err != nil {
+			t.Fatalf("sql.Open: %v", err)
+		}
+		t.Cleanup(func() { _ = db.Close() })
+
+		// Force a connection so the pool reports something.
+		if err := db.Ping(); err != nil {
+			t.Fatalf("ping: %v", err)
+		}
+
+		m.SampleDBPoolStats(db)
+
+		stats := db.Stats()
+		if got := gaugeValueForTest(t, m.DBPoolOpenConnections); got != float64(stats.OpenConnections) {
+			t.Fatalf("open_connections: got %v want %v", got, stats.OpenConnections)
+		}
+		if got := gaugeValueForTest(t, m.DBPoolInUse); got != float64(stats.InUse) {
+			t.Fatalf("in_use: got %v want %v", got, stats.InUse)
+		}
+		if got := gaugeValueForTest(t, m.DBPoolIdle); got != float64(stats.Idle) {
+			t.Fatalf("idle: got %v want %v", got, stats.Idle)
+		}
+		// wait_count and wait_duration start at 0 on a fresh pool; just verify
+		// the gauges exist and are readable without error.
+		_ = gaugeValueForTest(t, m.DBPoolWaitCount)
+		_ = gaugeValueForTest(t, m.DBPoolWaitDuration)
+	})
+
+	t.Run("NilSafe", func(t *testing.T) {
+		// nil *sql.DB must not panic.
+		m.SampleDBPoolStats(nil)
+
+		// nil receiver must not panic either.
+		var m2 *Metrics
+		m2.SampleDBPoolStats(nil)
+	})
+}
diff --git a/main.go b/main.go
index 0827f6a..c94be26 100644
--- a/main.go
+++ b/main.go
@@ -49,6 +49,7 @@ import (
 	"google.golang.org/grpc/codes"
 	"google.golang.org/grpc/credentials"
 	_ "google.golang.org/grpc/encoding/gzip" // Register gzip decompressor
+	"google.golang.org/grpc/keepalive"
 	"google.golang.org/grpc/reflection"
 	"google.golang.org/grpc/status"
 )
@@ -134,6 +135,13 @@ func main() {
 	if err := cfg.Validate(); err != nil {
 		fatal("invalid configuration", err)
 	}
+	if err := cfg.ValidateDBForEnv(); err != nil {
+		fatal("DB/Env validation", err)
+	}
+	if strings.EqualFold(cfg.DBDriver, "sqlite") {
+		slog.Warn("SQLite driver in use — suitable for dev/small deployments only. " +
+			"Expected cap: ~5 services, ~1k events/sec sustained.")
+	}
 
 	// Initialize structured logger
 	var level slog.Level
@@ -181,7 +189,12 @@ func main() {
 
 	// 2a. Retention scheduler: hourly batched purge + daily VACUUM/ANALYZE.
 	ctxRetention, cancelRetention := context.WithCancel(context.Background())
-	retention := storage.NewRetentionScheduler(repo, cfg.HotRetentionDays)
+	retention := storage.NewRetentionScheduler(
+		repo,
+		cfg.HotRetentionDays,
+		cfg.RetentionBatchSize,
+		time.Duration(cfg.RetentionBatchSleepMs)*time.Millisecond,
+	)
 	retention.Start(ctxRetention)
 	slog.Info("🧹 Retention scheduler started", "retention_days", cfg.HotRetentionDays)
 
@@ -243,6 +256,7 @@ func main() {
 		func() { metrics.DLQReplayFailure.Inc() },
 		func(b int64) { metrics.DLQDiskBytes.Set(float64(b)) },
 	)
+	dlq.SetTelemetryMetrics(metrics)
 	slog.Info("🔁 DLQ initialized", "path", cfg.DLQPath, "interval", replayInterval)
 
 	// 4. Initialize Real-Time WebSocket Hub
@@ -336,10 +350,17 @@ func main() {
 
 	// 4g. Initialize GraphRAG (replaces simple graph for advanced queries)
 	graphrag.SetPanicMetrics(metrics)
-	graphRAG := graphrag.New(repo, vectorIdx, tsdbAgg, ringBuf, graphrag.DefaultConfig())
+	graphRAGCfg := graphrag.DefaultConfig()
+	graphRAGCfg.WorkerCount = cfg.GraphRAGWorkerCount
+	graphRAGCfg.ChannelSize = cfg.GraphRAGEventQueueSize
+	graphRAG := graphrag.New(repo, vectorIdx, tsdbAgg, ringBuf, graphRAGCfg)
+	graphRAG.SetMetrics(metrics)
 	ctxGraphRAG, cancelGraphRAG := context.WithCancel(context.Background())
 	go graphRAG.Start(ctxGraphRAG)
-	slog.Info("GraphRAG started (layered graph with anomaly detection)")
+	slog.Info("GraphRAG started (layered graph with anomaly detection)",
+		"workers", cfg.GraphRAGWorkerCount,
+		"event_queue_size", cfg.GraphRAGEventQueueSize,
+	)
 
 	// Auto-migrate GraphRAG models (Investigation, GraphSnapshot)
 	if err := graphrag.AutoMigrateGraphRAG(repo.DB()); err != nil {
@@ -460,13 +481,39 @@ func main() {
 	if err != nil {
 		fatal("Failed to listen on gRPC port", err, "port", cfg.GRPCPort)
 	}
+	recvBytes := cfg.GRPCMaxRecvMB
+	if recvBytes <= 0 {
+		recvBytes = 16
+	}
+	streams := cfg.GRPCMaxConcurrentStreams
+	if streams <= 0 {
+		streams = 1000
+	}
+
 	grpcOpts := []grpc.ServerOption{
+		grpc.MaxRecvMsgSize(recvBytes * 1024 * 1024),
+		grpc.MaxConcurrentStreams(uint32(streams)),
+		grpc.KeepaliveParams(keepalive.ServerParameters{
+			Time:                  60 * time.Second, // ping idle clients
+			Timeout:               10 * time.Second, // drop if no pong
+			MaxConnectionIdle:     10 * time.Minute, // garbage-collect dead NAT entries
+			MaxConnectionAge:      2 * time.Hour,    // force periodic reconnects
+			MaxConnectionAgeGrace: 30 * time.Second,
+		}),
+		grpc.KeepaliveEnforcementPolicy(keepalive.EnforcementPolicy{
+			MinTime:             5 * time.Second,
+			PermitWithoutStream: true,
+		}),
 		// Recovery FIRST so a panic inside the metrics interceptor is still caught.
 		grpc.ChainUnaryInterceptor(
 			recoveryUnaryInterceptor(metrics),
 			metricsUnaryInterceptor(metrics),
 		),
 	}
+	slog.Info("📡 gRPC server tuned",
+		"max_recv_mb", recvBytes,
+		"max_concurrent_streams", streams,
+	)
 	switch tlsMode {
 	case "cert-file":
 		creds, err := credentials.NewServerTLSFromFile(tlsCertPath, tlsKeyPath)
@@ -561,8 +608,34 @@ func main() {
 	httpHandler = api.MetricsMiddleware(metrics, httpHandler)
 	if cfg.APIRateLimitRPS > 0 {
 		rl := api.NewRateLimiter(float64(cfg.APIRateLimitRPS))
-		httpHandler = rl.Middleware(httpHandler)
-		slog.Info("🛡️  API rate limiter enabled", "rps_per_ip", cfg.APIRateLimitRPS)
+		// OTLP ingestion paths (/v1/*) are exempt from the per-IP rate limiter.
+		//
+		// Why: OTLP collectors batch aggressively and a healthy agent routinely
+		// exceeds the API_RATE_LIMIT_RPS default (100 RPS/IP). Throttling the
+		// ingestion path drops legitimate telemetry — the exact data this
+		// platform exists to capture — so /v1/* bypasses the limiter.
+		//
+		// DoS trade-off (acknowledged): the APIKeyGate runs *downstream* of the
+		// limiter in the middleware chain, which means an unauthenticated
+		// attacker can push /v1/* requests past the (bypassed) limiter all the
+		// way to the auth check before getting a 401. This is acceptable
+		// because APIKeyGate is header-only: it inspects the Authorization
+		// header and returns 401 without parsing the request body, so the
+		// per-request CPU cost is bounded and small (no protobuf decode, no
+		// JSON parse, no DB touch). Layer-4/7 protections (firewall, LB,
+		// WAF, mTLS) remain the primary defense against volumetric abuse.
+		//
+		// TODO: if this trade-off becomes a concern (e.g. abuse observed in
+		// prod, or CPU pressure from 401 storms), add a separate
+		// higher-ceiling OTLP-specific limiter scoped to /v1/* — tuned for
+		// collector-class RPS — rather than lowering the general API limit.
+		httpHandler = rl.MiddlewareExcept(func(path string) bool {
+			return strings.HasPrefix(path, "/v1/")
+		})(httpHandler)
+		slog.Info("🛡️  API rate limiter enabled",
+			"rps_per_ip", cfg.APIRateLimitRPS,
+			"exempt_prefixes", []string{"/v1/"},
+		)
 	}
 
 	// DB health fast-fail gate: returns 503 for DB-dependent paths when the
@@ -593,6 +666,30 @@ func main() {
 		}
 	}()
 
+	// DB pool stats sampler (Task 7 — visibility for DB_MAX_OPEN_CONNS sizing).
+	// sql.DB.Stats() is cheap (atomic loads on the pool struct), so 5s is fine.
+	bootWG.Add(1)
+	go func() {
+		defer bootWG.Done()
+		sqlDB, err := repo.DB().DB()
+		if err != nil || sqlDB == nil {
+			slog.Warn("DB pool sampler disabled (cannot get *sql.DB)", "error", err)
+			return
+		}
+		// Initial sample so the gauge has a value immediately after startup.
+		metrics.SampleDBPoolStats(sqlDB)
+		tick := time.NewTicker(5 * time.Second)
+		defer tick.Stop()
+		for {
+			select {
+			case <-appCtx.Done():
+				return
+			case <-tick.C:
+				metrics.SampleDBPoolStats(sqlDB)
+			}
+		}
+	}()
+
 	// Panic recovery: OUTERMOST middleware below OTel tracing — ensures any
 	// panic in downstream middleware or handlers is logged + metered and the
 	// process survives.
diff --git a/test/loadsim/README.md b/test/loadsim/README.md
new file mode 100644
index 0000000..cd6a113
--- /dev/null
+++ b/test/loadsim/README.md
@@ -0,0 +1,53 @@
+# loadsim — 200-service OTLP load simulator
+
+A single Go binary that spins up N simulated services as goroutines and drives
+sustained OTLP/gRPC traffic into OtelContext. Used to verify backend robustness
+and gate releases via `make loadtest`.
+
+## What it does
+
+- Launches `--services` (default 200) concurrent producers, each with its own
+  tracer provider and OTLP gRPC exporter.
+- Each producer emits `--rate` (default 50) spans/sec for `--duration` (default 60s).
+- Spans cycle round-robin across 5 synthetic operations, durations in
+  [5ms, 500ms], deterministic 5% error rate.
+- Every 10th span is a parent with 1–3 children in the same trace.
+- Producers come online linearly over `--warmup` (default 5s).
+- Progress reported every 5s; final summary on exit.
+
+## Run
+
+```bash
+# Requires OtelContext running on the target endpoint.
+make loadtest                           # full 200-service, 60s run
+make loadtest-build                     # build-only → bin/loadsim
+go test -tags loadtest ./test/loadsim/...  # unit tests
+```
+
+## Flags
+
+| Flag | Default | Description |
+|------|---------|-------------|
+| `--endpoint` | `localhost:4317` | OTLP gRPC endpoint |
+| `--services` | `200` | Number of simulated services |
+| `--rate` | `50` | Spans per second per service |
+| `--duration` | `60s` | Test duration |
+| `--insecure` | `true` | Skip TLS verification |
+| `--tenant-id` | `""` | Attach `x-tenant-id` metadata (empty = omit) |
+| `--warmup` | `5s` | Linear producer ramp-up window |
+
+## Output
+
+Progress lines look like `[T+10s] sent=37000 errors=1850 rate=5000/s`,
+followed by a summary block with total spans, errors, effective rate.
+
+## What "healthy" looks like
+
+- No OTLP `ResourceExhausted` or `Unavailable` errors in producer output.
+- Backend `/ready` returns 200 throughout.
+- `/metrics`: `OtelContext_retention_consecutive_failures` stays 0.
+- p99 ingestion latency (`otelcontext_http_request_duration_seconds`) stays
+  within 2× baseline; goroutine count levels off within 30s.
+
+Caveat: this simulator does **not** start OtelContext — a live backend
+must already be accepting gRPC on the target endpoint.
diff --git a/test/loadsim/main.go b/test/loadsim/main.go
new file mode 100644
index 0000000..697c12c
--- /dev/null
+++ b/test/loadsim/main.go
@@ -0,0 +1,431 @@
+//go:build loadtest
+
+package main
+
+import (
+	"context"
+	"errors"
+	"flag"
+	"fmt"
+	"log"
+	"math/rand"
+	"os"
+	"os/signal"
+	"sync"
+	"sync/atomic"
+	"syscall"
+	"time"
+
+	"go.opentelemetry.io/otel"
+	"go.opentelemetry.io/otel/codes"
+	"go.opentelemetry.io/otel/exporters/otlp/otlptrace"
+	"go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc"
+	"go.opentelemetry.io/otel/sdk/resource"
+	sdktrace "go.opentelemetry.io/otel/sdk/trace"
+	semconv "go.opentelemetry.io/otel/semconv/v1.17.0"
+	"go.opentelemetry.io/otel/trace"
+)
+
+// operations is the fixed pool picked round-robin per producer.
+var operations = []string{
+	"GET /api/items",
+	"POST /api/orders",
+	"GET /health",
+	"GET /api/users",
+	"POST /api/payments",
+}
+
+// -------------------------------------------------------------------------
+// Pure helper functions (tested directly by main_test.go)
+// -------------------------------------------------------------------------
+
+// serviceName returns the zero-padded service name for index i.
+func serviceName(i int) string {
+	return fmt.Sprintf("loadsim-svc-%03d", i)
+}
+
+// pickOperation returns an operation name using round-robin on the global ops slice.
+func pickOperation(seq int) string {
+	return operations[seq%len(operations)]
+}
+
+// randomDuration returns a uniformly random duration in [5ms, 500ms].
+// Uses the shared global RNG; the hot-path variant is (*producer).randomDuration.
+func randomDuration() time.Duration {
+	// 5ms + [0, 495ms)
+	return time.Duration(5+rand.Intn(496)) * time.Millisecond
+}
+
+// randomDuration returns a uniformly random duration in [5ms, 500ms] using the
+// producer's private RNG (no cross-goroutine mutex contention).
+func (p *producer) randomDuration() time.Duration {
+	return time.Duration(5+p.rng.Intn(496)) * time.Millisecond
+}
+
+// isError returns true for approximately 5% of call sites (seq % 20 == 0).
+// This is deterministic for a given seq, giving exactly 5% over a complete cycle.
+func isError(seq int) bool {
+	return seq%20 == 0
+}
+
+// -------------------------------------------------------------------------
+// Ticker-based rate limiter (no golang.org/x/time/rate dependency)
+// -------------------------------------------------------------------------
+
+type rateLimiter struct {
+	ticker *time.Ticker
+	ch     chan struct{}
+	done   chan struct{}
+}
+
+func newRateLimiter(rps int) *rateLimiter {
+	interval := time.Second / time.Duration(rps)
+	rl := &rateLimiter{
+		ticker: time.NewTicker(interval),
+		ch:     make(chan struct{}, 1), // capacity 1 avoids head-of-line blocking
+		done:   make(chan struct{}),
+	}
+	go func() {
+		for {
+			select {
+			case <-rl.ticker.C:
+				select {
+				case rl.ch <- struct{}{}:
+				default: // drop tick if consumer is behind — no burst accumulation
+				}
+			case <-rl.done:
+				return
+			}
+		}
+	}()
+	return rl
+}
+
+// wait blocks until one token is available.
+func (rl *rateLimiter) wait() {
+	<-rl.ch
+}
+
+func (rl *rateLimiter) stop() {
+	rl.ticker.Stop()
+	close(rl.done)
+}
+
+// -------------------------------------------------------------------------
+// Per-producer state
+// -------------------------------------------------------------------------
+
+type producer struct {
+	idx      int
+	endpoint string
+	tenantID string
+	insecure bool
+
+	tp     *sdktrace.TracerProvider
+	tracer trace.Tracer
+
+	// rng is a per-producer RNG — avoids 200-goroutine contention on the global
+	// math/rand mutex in the hot path (duration, child count).
+	rng *rand.Rand
+
+	sentTotal  atomic.Int64
+	errorTotal atomic.Int64
+}
+
+func newProducer(ctx context.Context, idx int, endpoint, tenantID string, insecure bool) (*producer, error) {
+	svc := serviceName(idx)
+
+	opts := []otlptracegrpc.Option{
+		otlptracegrpc.WithEndpoint(endpoint),
+	}
+	if insecure {
+		opts = append(opts, otlptracegrpc.WithInsecure())
+	}
+	if tenantID != "" {
+		opts = append(opts, otlptracegrpc.WithHeaders(map[string]string{"x-tenant-id": tenantID}))
+	}
+
+	client := otlptracegrpc.NewClient(opts...)
+	exp, err := otlptrace.New(ctx, client)
+	if err != nil {
+		return nil, fmt.Errorf("producer %d exporter: %w", idx, err)
+	}
+
+	res, err := resource.New(ctx,
+		resource.WithAttributes(semconv.ServiceName(svc)),
+	)
+	if err != nil {
+		return nil, fmt.Errorf("producer %d resource: %w", idx, err)
+	}
+
+	tp := sdktrace.NewTracerProvider(
+		sdktrace.WithSampler(sdktrace.AlwaysSample()),
+		sdktrace.WithResource(res),
+		sdktrace.WithBatcher(exp),
+	)
+
+	return &producer{
+		idx:      idx,
+		endpoint: endpoint,
+		tenantID: tenantID,
+		insecure: insecure,
+		tp:       tp,
+		tracer:   tp.Tracer(svc),
+		rng:      rand.New(rand.NewSource(time.Now().UnixNano() + int64(idx))),
+	}, nil
+}
+
+// run emits spans at the given rate for the given duration, then returns.
+func (p *producer) run(ctx context.Context, rps int, dur time.Duration) {
+	rl := newRateLimiter(rps)
+	defer rl.stop()
+
+	deadline := time.Now().Add(dur)
+	seq := 0
+
+	for time.Now().Before(deadline) {
+		select {
+		case <-ctx.Done():
+			return
+		default:
+		}
+
+		rl.wait()
+		p.emitSpan(ctx, seq)
+		seq++
+	}
+}
+
+// emitSpan creates one span (with optional child spans every 10th call).
+func (p *producer) emitSpan(ctx context.Context, seq int) {
+	op := pickOperation(seq)
+	dur := p.randomDuration()
+	errored := isError(seq)
+
+	// Every 10th span: create a parent with 1–3 children in the same trace.
+	if seq%10 == 0 {
+		parentCtx, parentSpan := p.tracer.Start(ctx, op)
+		if errored {
+			parentSpan.SetStatus(codes.Error, "simulated error")
+			parentSpan.RecordError(errors.New("fake failure"))
+			p.errorTotal.Add(1)
+		}
+
+		numChildren := 1 + p.rng.Intn(3) // [1,3]
+		for c := 0; c < numChildren; c++ {
+			childOp := pickOperation(seq + c + 1)
+			_, childSpan := p.tracer.Start(parentCtx, childOp)
+			time.Sleep(dur / time.Duration(numChildren+1))
+			childSpan.End()
+			p.sentTotal.Add(1)
+		}
+
+		time.Sleep(dur / time.Duration(numChildren+1))
+		parentSpan.End()
+		p.sentTotal.Add(1)
+	} else {
+		_, span := p.tracer.Start(ctx, op)
+		if errored {
+			span.SetStatus(codes.Error, "simulated error")
+			span.RecordError(errors.New("fake failure"))
+			p.errorTotal.Add(1)
+		}
+		time.Sleep(dur)
+		span.End()
+		p.sentTotal.Add(1)
+	}
+}
+
+// shutdown flushes the exporter and waits up to the given timeout.
+func (p *producer) shutdown(timeout time.Duration) {
+	ctx, cancel := context.WithTimeout(context.Background(), timeout)
+	defer cancel()
+	if err := p.tp.Shutdown(ctx); err != nil {
+		log.Printf("producer %d shutdown error: %v", p.idx, err)
+	}
+}
+
+// -------------------------------------------------------------------------
+// Coordinator
+// -------------------------------------------------------------------------
+
+type coordinator struct {
+	startTime time.Time
+
+	totalSent   atomic.Int64
+	totalErrors atomic.Int64
+}
+
+func (c *coordinator) progressLoop(ctx context.Context, interval time.Duration) {
+	ticker := time.NewTicker(interval)
+	defer ticker.Stop()
+
+	var prevSent int64
+	prevTime := time.Now()
+
+	for {
+		select {
+		case <-ctx.Done():
+			return
+		case <-ticker.C:
+			now := time.Now()
+			elapsed := now.Sub(c.startTime).Seconds()
+			sent := c.totalSent.Load()
+			errs := c.totalErrors.Load()
+			delta := sent - prevSent
+			dt := now.Sub(prevTime).Seconds()
+			rate := float64(delta) / dt
+			prevSent = sent
+			prevTime = now
+			fmt.Printf("[T+%3.0fs] sent=%d errors=%d rate=%.0f/s\n", elapsed, sent, errs, rate)
+		}
+	}
+}
+
+// -------------------------------------------------------------------------
+// Main
+// -------------------------------------------------------------------------
+
+func main() {
+	endpoint := flag.String("endpoint", "localhost:4317", "OTLP gRPC endpoint")
+	numServices := flag.Int("services", 200, "Number of simulated services")
+	rps := flag.Int("rate", 50, "Spans per second per service")
+	duration := flag.Duration("duration", 60*time.Second, "Test duration")
+	insecure := flag.Bool("insecure", true, "Use insecure gRPC connection")
+	tenantID := flag.String("tenant-id", "", "x-tenant-id gRPC metadata value (empty = omit)")
+	warmup := flag.Duration("warmup", 5*time.Second, "Stagger window for producer startup")
+	flag.Parse()
+
+	ctx, stop := signal.NotifyContext(context.Background(), os.Interrupt, syscall.SIGTERM)
+	defer stop()
+
+	// Suppress default OTel global TracerProvider noise.
+	otel.SetTracerProvider(sdktrace.NewTracerProvider())
+
+	fmt.Printf("Starting %d-service load simulator → %s\n", *numServices, *endpoint)
+	fmt.Printf("Rate: %d span/s per service | Duration: %s | Warmup: %s\n", *rps, *duration, *warmup)
+	fmt.Println("Press Ctrl+C to stop early.")
+
+	coord := &coordinator{startTime: time.Now()}
+
+	// Create all producers up front (no connections yet — lazy dial).
+	producers := make([]*producer, *numServices)
+	for i := 0; i < *numServices; i++ {
+		p, err := newProducer(ctx, i, *endpoint, *tenantID, *insecure)
+		if err != nil {
+			log.Fatalf("Failed to create producer %d: %v", i, err)
+		}
+		producers[i] = p
+	}
+
+	// Stagger goroutine to roll out producers linearly over warmup window.
+	staggerDelay := time.Duration(0)
+	if *numServices > 1 {
+		staggerDelay = *warmup / time.Duration(*numServices-1)
+	}
+
+	var wg sync.WaitGroup
+
+	// Progress reporter (runs until ctx cancelled or all producers done).
+	progressCtx, stopProgress := context.WithCancel(ctx)
+	wg.Add(1)
+	go func() {
+		defer wg.Done()
+		coord.progressLoop(progressCtx, 5*time.Second)
+	}()
+
+	// Aggregator: fold per-producer counters into coordinator totals.
+	// We refresh once per second in a background goroutine.
+	aggDone := make(chan struct{})
+	go func() {
+		defer close(aggDone)
+		ticker := time.NewTicker(500 * time.Millisecond)
+		defer ticker.Stop()
+		for {
+			select {
+			case <-ctx.Done():
+				return
+			case <-ticker.C:
+				var s, e int64
+				for _, p := range producers {
+					s += p.sentTotal.Load()
+					e += p.errorTotal.Load()
+				}
+				coord.totalSent.Store(s)
+				coord.totalErrors.Store(e)
+			}
+		}
+	}()
+
+	// Launch producers with stagger.
+	producersDone := make(chan struct{})
+	go func() {
+		defer close(producersDone)
+		var pwg sync.WaitGroup
+	warmupLoop:
+		for i, p := range producers {
+			if i > 0 && staggerDelay > 0 {
+				select {
+				case <-ctx.Done():
+					break warmupLoop
+				case <-time.After(staggerDelay):
+				}
+			}
+			pwg.Add(1)
+			pp := p
+			go func() {
+				defer pwg.Done()
+				pp.run(ctx, *rps, *duration)
+			}()
+		}
+		pwg.Wait()
+	}()
+
+	// Wait for producers to finish or signal.
+	select {
+	case <-producersDone:
+	case <-ctx.Done():
+		fmt.Println("\nShutting down early (signal received)…")
+	}
+
+	// Stop aggregator and progress reporter.
+	stop() // cancel signal context so agg loop exits
+	stopProgress()
+	wg.Wait()
+	<-aggDone
+
+	// Final aggregate.
+	var totalSent, totalErrors int64
+	for _, p := range producers {
+		totalSent += p.sentTotal.Load()
+		totalErrors += p.errorTotal.Load()
+	}
+
+	// Flush all exporters (up to 5s total).
+	fmt.Printf("Flushing %d exporters…\n", len(producers))
+	flushTimeout := 5 * time.Second / time.Duration(len(producers)+1)
+	if flushTimeout < 100*time.Millisecond {
+		flushTimeout = 100 * time.Millisecond
+	}
+	var shutWg sync.WaitGroup
+	for _, p := range producers {
+		shutWg.Add(1)
+		pp := p
+		go func() {
+			defer shutWg.Done()
+			pp.shutdown(flushTimeout)
+		}()
+	}
+	shutWg.Wait()
+
+	elapsed := time.Since(coord.startTime)
+	successCount := totalSent - totalErrors
+
+	fmt.Println("─────────────────────────────────────────")
+	fmt.Printf("Duration:        %s\n", elapsed.Round(time.Millisecond))
+	fmt.Printf("Total spans:     %d\n", totalSent)
+	fmt.Printf("Errors:          %d (%.1f%%)\n", totalErrors, 100*float64(totalErrors)/float64(totalSent+1))
+	fmt.Printf("Success:         %d\n", successCount)
+	fmt.Printf("Effective rate:  %.0f span/s\n", float64(totalSent)/elapsed.Seconds())
+	fmt.Println("─────────────────────────────────────────")
+}
diff --git a/test/loadsim/main_test.go b/test/loadsim/main_test.go
new file mode 100644
index 0000000..befb5b9
--- /dev/null
+++ b/test/loadsim/main_test.go
@@ -0,0 +1,108 @@
+//go:build loadtest
+
+package main
+
+import (
+	"math"
+	"testing"
+	"time"
+)
+
+// TestServiceName verifies zero-padded naming scheme.
+func TestServiceName(t *testing.T) {
+	cases := []struct {
+		idx  int
+		want string
+	}{
+		{0, "loadsim-svc-000"},
+		{1, "loadsim-svc-001"},
+		{99, "loadsim-svc-099"},
+		{199, "loadsim-svc-199"},
+	}
+	for _, tc := range cases {
+		got := serviceName(tc.idx)
+		if got != tc.want {
+			t.Errorf("serviceName(%d) = %q, want %q", tc.idx, got, tc.want)
+		}
+	}
+}
+
+// TestSpanFactory verifies round-robin ops, duration range, and ~5% error rate.
+func TestSpanFactory(t *testing.T) {
+	const samples = 10_000
+
+	opCounts := make(map[string]int)
+	errorCount := 0
+	tooShort := 0
+	tooLong := 0
+
+	for i := 0; i < samples; i++ {
+		op := pickOperation(i)
+		opCounts[op]++
+
+		dur := randomDuration()
+		if dur < 5*time.Millisecond {
+			tooShort++
+		}
+		if dur > 500*time.Millisecond {
+			tooLong++
+		}
+
+		if isError(i) {
+			errorCount++
+		}
+	}
+
+	// All 5 operations must appear.
+	for _, op := range operations {
+		if opCounts[op] == 0 {
+			t.Errorf("operation %q never selected in %d samples", op, samples)
+		}
+	}
+
+	// Round-robin: each op should appear exactly samples/5 times.
+	expected := samples / len(operations)
+	for _, op := range operations {
+		if opCounts[op] != expected {
+			t.Errorf("operation %q count = %d, want %d (strict round-robin)", op, opCounts[op], expected)
+		}
+	}
+
+	// Duration must stay in [5ms, 500ms].
+	if tooShort > 0 {
+		t.Errorf("%d durations < 5ms", tooShort)
+	}
+	if tooLong > 0 {
+		t.Errorf("%d durations > 500ms", tooLong)
+	}
+
+	// Error rate: 5% ± 1% (absolute).
+	errorRate := float64(errorCount) / float64(samples)
+	if math.Abs(errorRate-0.05) > 0.01 {
+		t.Errorf("error rate = %.4f, want 0.05 ± 0.01", errorRate)
+	}
+}
+
+// TestRateLimiter drives the ticker-based limiter for ~1 second and checks throughput.
+func TestRateLimiter(t *testing.T) {
+	const targetRPS = 50
+	rl := newRateLimiter(targetRPS)
+	defer rl.stop()
+
+	start := time.Now()
+	count := 0
+	deadline := start.Add(1 * time.Second)
+
+	for time.Now().Before(deadline) {
+		rl.wait()
+		count++
+	}
+
+	// Allow ±10% of the target (50 ± 5).
+	rpsF := float64(targetRPS)
+	low := int(rpsF * 0.90)
+	high := int(rpsF * 1.10)
+	if count < low || count > high {
+		t.Errorf("rate limiter issued %d tokens in 1s, want %d–%d (target %d ±10%%)", count, low, high, targetRPS)
+	}
+}