diff --git a/CLAUDE.md b/CLAUDE.md index 3f98896..003e2e1 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -214,7 +214,7 @@ Key settings in `internal/config/config.go`: - `MCP_ENABLED` (true), `MCP_PATH` (/mcp) - `VECTOR_INDEX_MAX_ENTRIES` (100000) - `DLQ_MAX_FILES` (1000), `DLQ_MAX_DISK_MB` (500), `DLQ_MAX_RETRIES` (10) -- `GRAPHRAG_WORKER_COUNT` (4), `GRAPHRAG_EVENT_QUEUE_SIZE` (10000) — raise both at 100+ services if `otelcontext_graphrag_events_dropped_total` climbs +- `GRAPHRAG_WORKER_COUNT` (16), `GRAPHRAG_EVENT_QUEUE_SIZE` (100000) — sized for 100–200 services; raise further if `otelcontext_graphrag_events_dropped_total` climbs - `GRPC_MAX_RECV_MB` (16), `GRPC_MAX_CONCURRENT_STREAMS` (1000) — OTLP gRPC server caps, validated to 1..256 and 1..1_000_000 - `RETENTION_BATCH_SIZE` (50000), `RETENTION_BATCH_SLEEP_MS` (1) — purge pacing; raise the sleep on busy production DBs - `APP_ENV` (`"development"`), `OTELCONTEXT_ALLOW_SQLITE_PROD` (false) — SQLite is refused when `APP_ENV=production` unless the allow flag is set diff --git a/docs/OPERATIONS.md b/docs/OPERATIONS.md index 0db5343..a04e961 100644 --- a/docs/OPERATIONS.md +++ b/docs/OPERATIONS.md @@ -92,7 +92,7 @@ DB_DSN="host=my-server.postgres.database.azure.com user=my-mi@tenant.onmicrosoft - `API_RATE_LIMIT_RPS=100` - `VECTOR_INDEX_MAX_ENTRIES=100000` - `SAMPLING_*` (defaults keep 100% + always-on errors) -- `GRAPHRAG_WORKER_COUNT=4`, `GRAPHRAG_EVENT_QUEUE_SIZE=10000` — raise worker count at 100+ services if you see `graphrag_events_dropped_total` climbing +- `GRAPHRAG_WORKER_COUNT=16`, `GRAPHRAG_EVENT_QUEUE_SIZE=100000` — sized for 100–200 services. Lower for tiny deployments; raise further if `graphrag_events_dropped_total` climbs. - `GRPC_MAX_RECV_MB=16`, `GRPC_MAX_CONCURRENT_STREAMS=1000` — OTLP gRPC server caps - `RETENTION_BATCH_SIZE=50000`, `RETENTION_BATCH_SLEEP_MS=1` — purge pacing; raise the sleep for busy production DBs @@ -249,6 +249,101 @@ If any of those trip, use the corresponding metric alert from the Observability --- +## Edge Pre-processing (OTel Collector) + +OtelContext is an OTLP **destination**, not a collector. Beyond ~150 services emitting unsampled telemetry, put a Collector in front to absorb cardinality, batch efficiently, and drop low-value traces before they hit the DB. SDKs → Collector → OtelContext. + +### When to deploy a Collector in front + +- Aggregate ingest rate exceeds ~30k spans/s — DB writes become the bottleneck before the wire does. +- You need processors OtelContext doesn't run: `tail_sampling`, `batch`, `memory_limiter`, `transform`, `filter`, `attributes`. +- You ingest from non-OTLP sources (Jaeger, Zipkin, Prometheus scrape, Fluent, syslog, Kafka) — OtelContext only speaks OTLP. +- Multi-region: edge Collectors batch + compress before crossing the WAN. + +### Recommended pipeline + +The two highest-impact processors are `tail_sampling` (10–20× volume reduction with full diagnostic value retained) and `batch` (cuts gRPC overhead per span). `memory_limiter` is mandatory in front of any Collector exposed to bursty traffic. + +```yaml +# otelcol-edge.yaml +receivers: + otlp: + protocols: + grpc: + endpoint: 0.0.0.0:4317 + http: + endpoint: 0.0.0.0:4318 + +processors: + memory_limiter: + check_interval: 1s + limit_mib: 1024 + spike_limit_mib: 256 + + tail_sampling: + decision_wait: 10s + num_traces: 50000 + expected_new_traces_per_sec: 5000 + policies: + - name: errors-always + type: status_code + status_code: { status_codes: [ERROR] } + - name: slow-always + type: latency + latency: { threshold_ms: 500 } + - name: probabilistic-healthy + type: probabilistic + probabilistic: { sampling_percentage: 5 } + + batch: + send_batch_size: 8192 + send_batch_max_size: 10000 + timeout: 2s + +exporters: + otlp/otelcontext: + endpoint: otelcontext.internal:4317 + tls: + insecure: false + headers: + authorization: "Bearer ${env:OTELCONTEXT_API_KEY}" + x-tenant-id: "${env:TENANT_ID}" + sending_queue: + enabled: true + queue_size: 10000 + retry_on_failure: + enabled: true + initial_interval: 1s + max_interval: 30s + +service: + pipelines: + traces: + receivers: [otlp] + processors: [memory_limiter, tail_sampling, batch] + exporters: [otlp/otelcontext] + logs: + receivers: [otlp] + processors: [memory_limiter, batch] + exporters: [otlp/otelcontext] + metrics: + receivers: [otlp] + processors: [memory_limiter, batch] + exporters: [otlp/otelcontext] +``` + +### Sampling policy notes + +- **Errors and slow traces are always sampled.** OtelContext's internal sampler does the same; keep parity at the edge so error/diagnostic data is never dropped. +- **5% probabilistic on healthy traces** is the right default for 150–200 services. Adjust based on the volume you can store within `HOT_RETENTION_DAYS`. +- The `tail_sampling` processor needs ~10s of buffer per trace to make the decision after spans have arrived — the `decision_wait` setting. Memory cost: `decision_wait × spans_per_sec × avg_span_size`. Plan for 256 MiB+ on the Collector at 30k spans/s. + +### Don't double-sample + +If the edge Collector applies tail-sampling, set `SAMPLING_RATE=1.0` on OtelContext. The SDK → Collector → OtelContext chain should sample exactly once. Default OtelContext config already keeps 100%, so no change is needed unless you previously tuned it. + +--- + ## Upgrade Path 1. **Back up the DB** (see Backup & Restore above). diff --git a/internal/config/config.go b/internal/config/config.go index 52802d4..6a8f760 100644 --- a/internal/config/config.go +++ b/internal/config/config.go @@ -202,8 +202,8 @@ func Load(customPath string) (*Config, error) { VectorIndexMaxEntries: getEnvInt("VECTOR_INDEX_MAX_ENTRIES", 100000), // GraphRAG - GraphRAGWorkerCount: getEnvInt("GRAPHRAG_WORKER_COUNT", 4), - GraphRAGEventQueueSize: getEnvInt("GRAPHRAG_EVENT_QUEUE_SIZE", 10000), + GraphRAGWorkerCount: getEnvInt("GRAPHRAG_WORKER_COUNT", 16), + GraphRAGEventQueueSize: getEnvInt("GRAPHRAG_EVENT_QUEUE_SIZE", 100000), // TLS TLSCertFile: getEnv("TLS_CERT_FILE", ""), diff --git a/internal/graphrag/builder.go b/internal/graphrag/builder.go index 0de3519..8be781f 100644 --- a/internal/graphrag/builder.go +++ b/internal/graphrag/builder.go @@ -39,8 +39,8 @@ func guardWorker(name string) { } const ( - defaultWorkerCount = 4 - defaultChannelSize = 10000 + defaultWorkerCount = 16 + defaultChannelSize = 100000 defaultTraceTTL = 1 * time.Hour defaultRefreshEvery = 60 * time.Second defaultSnapshotEvery = 15 * time.Minute diff --git a/internal/graphrag/builder_test.go b/internal/graphrag/builder_test.go index e73fe21..3639c96 100644 --- a/internal/graphrag/builder_test.go +++ b/internal/graphrag/builder_test.go @@ -137,8 +137,9 @@ func TestOnSpanIngested_DropsIncrementMetric(t *testing.T) { g := New(nil, nil, nil, nil, DefaultConfig()) t.Cleanup(g.Stop) - // Fill the buffer well beyond capacity (default 10k). - for i := 0; i < 11000; i++ { + // Fill the buffer beyond capacity. Use the package constant so the test + // stays correct if defaultChannelSize is retuned. + for range defaultChannelSize + 1000 { g.OnSpanIngested(storage.Span{ TraceID: "t", SpanID: "s",